# In [None]:
# Q1: Purpose of Grid Search CV in machine learning

# Grid Search CV (Cross-Validation):
# - It is used to find the best hyperparameters for a machine learning model.
# - Works by exhaustively searching through a specified set of hyperparameters.
# - Evaluates each combination using cross-validation to identify the best-performing configuration.

# Example in Python:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Fallback data so the examples in this file run end to end. If X_train,
# X_test, y_train and y_test already exist (e.g. from an earlier notebook
# cell), they are left untouched.
if not {"X_train", "X_test", "y_train", "y_test"}.issubset(globals()):
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    # Mildly imbalanced binary problem (about 80% / 20%), stratified so both
    # splits keep the same class proportions.
    X, y = make_classification(
        n_samples=500, n_features=10, weights=[0.8, 0.2], random_state=42
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

# Define model and parameter grid (3 x 3 = 9 candidate combinations).
# A fixed random_state makes the forest, and therefore the search, reproducible.
model = RandomForestClassifier(random_state=42)
param_grid = {'n_estimators': [50, 100, 150], 'max_depth': [None, 10, 20]}

# Perform Grid Search: every combination is scored with 5-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best parameters
print("Best Parameters:", grid_search.best_params_)

# Q2: Difference between Grid Search CV and Randomized Search CV

# Grid Search CV:
# - Tests all possible combinations of hyperparameters.
# - Computationally expensive for large parameter spaces.

# Randomized Search CV:
# - Randomly samples a fixed number of combinations from the hyperparameter space.
# - More efficient for large or complex parameter spaces.

# Example of Randomized Search:
from sklearn.model_selection import RandomizedSearchCV

# Sample 5 of the 9 combinations in param_grid. n_iter must stay below the
# grid size: when every parameter is given as a list, scikit-learn samples
# without replacement, so asking for more iterations than there are
# combinations just repeats the exhaustive search (and emits a UserWarning).
# random_state makes the sampled subset reproducible.
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=5,
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

print("Best Parameters from Randomized Search:", random_search.best_params_)

# Q3: What is data leakage, and why is it a problem?

# Data leakage occurs when information from outside the training dataset is used to create the model.
# It can lead to overly optimistic performance during training and validation but poor generalization to unseen data.

# Example:
# Including future data (e.g., target values) as features in the training data.

# Q4: How to prevent data leakage?

# 1. Properly split data into training, validation, and testing sets.
# 2. Fit preprocessing steps (e.g., scaling, encoding) on the training data only, then apply the fitted transformers to the validation/testing data.
# 3. Avoid including features that won't be available in real-world scenarios.

# Example: Using scikit-learn's Pipeline to prevent leakage:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Bundle scaling and classification into a single estimator: fitting the
# pipeline fits the scaler on the data passed to fit() and then trains the
# classifier on the scaled output.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("classifier", RandomForestClassifier()),
    ]
)
pipeline.fit(X_train, y_train)

# Q5: What is a confusion matrix?

# A confusion matrix is a table that summarizes the performance of a classification model.
# It shows the counts of True Positives (TP), True Negatives (TN), False Positives (FP), and False Negatives (FN).

# Example:
from sklearn.metrics import confusion_matrix

# Predict labels for X_test using the estimator selected by the grid search
y_pred = grid_search.predict(X_test)
# Rows correspond to true labels, columns to predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print("Confusion Matrix:\n", cm)

# Q6: Precision vs. Recall

# Precision: Proportion of true positive predictions out of all positive predictions.
# Formula: Precision = TP / (TP + FP)

# Recall: Proportion of true positives out of all actual positive cases.
# Formula: Recall = TP / (TP + FN)

# Q7: Interpreting confusion matrix to identify errors

# - FP: Type I Error (false alarms).
# - FN: Type II Error (missed detections).
# Example: Look at the off-diagonal elements of the confusion matrix.

# Q8: Metrics derived from a confusion matrix

# 1. Accuracy: (TP + TN) / Total
# 2. Precision: TP / (TP + FP)
# 3. Recall: TP / (TP + FN)
# 4. F1 Score: 2 * (Precision * Recall) / (Precision + Recall)

from sklearn.metrics import classification_report
# Per-class precision, recall, F1 score and support as one text table
print(classification_report(y_true=y_test, y_pred=y_pred))

# Q9: Relationship between accuracy and confusion matrix

# Accuracy depends on all elements of the confusion matrix.
# Example: High accuracy in imbalanced datasets might be misleading due to TN dominance.

# Q10: Identifying biases or limitations from confusion matrix

# Analyze:
# - High FP: Indicates over-prediction of positive class.
# - High FN: Indicates under-prediction of positive class.
# Bias Example: A model biased towards the majority class will show very few FN for the majority class but many FN for the minority class.

# Addressing bias: Use balanced datasets, resampling techniques, or adjust class weights.

# Example of class imbalance handling with balanced class weights:
from sklearn.linear_model import LogisticRegression

# class_weight="balanced" weights each class inversely to its frequency in
# y_train, so errors on the rarer class cost more during fitting.
model = LogisticRegression(class_weight="balanced")
model.fit(X=X_train, y=y_train)
