# Testing out models (Version Virginia)

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import randint, uniform

# sklearn
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, cohen_kappa_score
from xgboost import XGBClassifier

pd.set_option('display.max_columns', None)

## Get data

In [2]:
# Load the raw hospital readmissions dataset (path is relative to this notebook).
df = pd.read_csv(
    '../raw_data/hospital_readmissions.csv'
)

In [3]:
# Keep only the rows where none of the three diagnosis columns holds the
# 'Missing' placeholder.
df = df[~df[['diag_1', 'diag_2', 'diag_3']].eq('Missing').any(axis=1)]

In [4]:
# The target is 'readmitted', encoded as yes -> 1 / no -> 0; every other column is a feature.
y = df['readmitted'].map({'yes': 1, 'no': 0})
X = df.drop(columns='readmitted')

# Hold out 20% of the rows as the test set.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Split the remaining rows 75/25 into training and validation sets,
# i.e. 60/20/20 overall (0.25 * 0.8 = 0.2).
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

## Preprocessing

In [5]:
# Clean data function
def make_clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of ``df`` with grouped count features.

    The input frame is left untouched: all derived columns are written to a
    copy, so the function can safely be applied to the same split more than
    once (e.g. through a ``FunctionTransformer`` inside a pipeline).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least the columns ``n_lab_procedures``,
        ``n_medications``, ``n_outpatient``, ``n_inpatient``, ``n_emergency``,
        ``medical_specialty`` and ``glucose_test``.

    Returns
    -------
    pd.DataFrame
        A new frame where
        * ``n_lab_procedures_grouped`` = ``n_lab_procedures // 10``,
        * ``n_medications_grouped`` = ``n_medications // 5``,
        * the three visit counters are capped to the classes 0, 1 and 2
          (2 stands for "anything other than 0 or 1"),
        * ``n_lab_procedures``, ``medical_specialty``, ``glucose_test`` and
          ``n_medications`` are dropped.
    """
    # Work on a copy so the caller's DataFrame is never mutated (this also
    # avoids pandas' SettingWithCopyWarning when `df` is a slice of another frame).
    df = df.copy()

    df['n_lab_procedures_grouped'] = (df['n_lab_procedures'] // 10).astype(int)
    df['n_medications_grouped'] = (df['n_medications'] // 5).astype(int)

    # 0 and 1 are kept as-is; every other value maps to NaN and is then filled with 2.
    for visit_col in ('n_outpatient', 'n_inpatient', 'n_emergency'):
        df[visit_col] = df[visit_col].map({0: 0, 1: 1}).fillna(2).astype(int)

    return df.drop(columns=['n_lab_procedures',
                            'medical_specialty',
                            'glucose_test',
                            'n_medications'])

In [6]:
# Custom transformer that label-encodes the 'age' column
class AgeLabelEncoder:
    """Label-encode the ``'age'`` column of a DataFrame.

    ``fit`` stores a ``LabelEncoder`` fitted on ``X['age']`` in ``self.encoder``;
    ``transform`` returns a copy of ``X`` whose ``'age'`` column is replaced by
    the encoded values. All other columns are passed through unchanged.
    """

    def fit(self, X, y=None):
        """Fit the encoder on ``X['age']`` and return ``self`` (``y`` is ignored)."""
        self.encoder = LabelEncoder()
        self.encoder.fit(X['age'])
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``'age'`` replaced by its encoded values."""
        encoded = X.copy()
        encoded['age'] = self.encoder.transform(encoded['age'])
        return encoded

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return the transformed copy of ``X``."""
        self.fit(X, y)
        return self.transform(X)

In [7]:
# Stateless cleaning step (feature grouping + column drops) applied before the ColumnTransformer.
data_cleaner = FunctionTransformer(make_clean_data)

# Encode 'age' with an encoder that is fitted ONCE, on the data the pipeline is fitted on.
# The previous FunctionTransformer(lambda X: AgeLabelEncoder().fit_transform(X)) re-fitted a
# fresh LabelEncoder on every transform() call, so the validation/test splits were encoded
# with their own category mapping instead of the one learned on the training split.
# OrdinalEncoder sorts the categories like LabelEncoder does; it returns floats and, with
# its default settings, raises on a category that was not seen during fit.
age_label_encoder = OrdinalEncoder()

# Numeric preprocessing pipeline
num_preproc = Pipeline([
    ('scaler', MinMaxScaler()),
])

# Categorical preprocessing pipeline (excluding 'age').
# The column list is derived from a cleaned preview of the training split; a copy is
# passed so the preview can never modify X_train, whatever the cleaner does to its argument.
categorical_columns = [
    col
    for col in data_cleaner.transform(X_train.copy()).select_dtypes(include=['object']).columns
    if col != 'age'
]
cat_preproc = Pipeline([
    ('ohe', OneHotEncoder(sparse_output=False, drop="if_binary")),
])

# Column-wise preprocessing; the output is a DataFrame with un-prefixed feature names.
preproc = ColumnTransformer([
    ('age_label_encoder', age_label_encoder, ['age']),
    ('num_transf', num_preproc, make_column_selector(dtype_include='number')),
    ('cat_transf', cat_preproc, categorical_columns),
], verbose_feature_names_out=False).set_output(transform='pandas')

# Full preprocessing pipeline: clean first, then encode/scale.
pipe_preproc = Pipeline([
    ('data_cleaner', data_cleaner),
    ('preprocessor', preproc),
])

pipe_preproc

In [8]:
# Fit the preprocessing pipeline on the training split only, then reuse the
# fitted pipeline to transform the validation and test splits.
X_train_preprocessed = pipe_preproc.fit_transform(X_train)
X_val_preprocessed, X_test_preprocessed = (
    pipe_preproc.transform(split) for split in (X_val, X_test)
)

### Scores

***Precision*** = High precision indicates that when the model predicts a readmission, it is usually correct. <br>
***Recall*** =  High recall indicates that the model correctly identifies a high percentage of actual readmissions. <br>
***F1-Score*** = A higher F1-score indicates a better balance between precision and recall. <br>
***AUC-ROC*** = A higher AUC-ROC value indicates better overall performance.

## Base model (Logistic Regression)

In [9]:
# Baseline: logistic regression fitted on the preprocessed training split.
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_preprocessed, y_train)

# Validation predictions; the accuracy is computed once and stored in logreg_score.
y_val_pred = log_reg.predict(X_val_preprocessed)
logreg_score = accuracy_score(y_val, y_val_pred)

print("Logistic Regression Validation Performance")
print(classification_report(y_val, y_val_pred))
print(f"Validation Accuracy: {logreg_score:.4f}")

Logistic Regression Validation Performance
              precision    recall  f1-score   support

           0       0.59      0.76      0.67      2580
           1       0.62      0.42      0.51      2376

    accuracy                           0.60      4956
   macro avg       0.61      0.59      0.59      4956
weighted avg       0.61      0.60      0.59      4956

Validation Accuracy: 0.6009


## Other models

### SVC 

In [10]:
# Support vector classifier with default hyper-parameters, scored on the validation split.
svc = SVC(random_state=42).fit(X_train_preprocessed, y_train)
y_val_pred_svc = svc.predict(X_val_preprocessed)
svc_score = accuracy_score(y_val, y_val_pred_svc)

print("Support Vector Classifier Validation Performance")
print(classification_report(y_val, y_val_pred_svc))
print(f"Validation Accuracy: {svc_score:.4f}")

Support Vector Classifier Validation Performance
              precision    recall  f1-score   support

           0       0.59      0.77      0.67      2580
           1       0.63      0.43      0.51      2376

    accuracy                           0.61      4956
   macro avg       0.61      0.60      0.59      4956
weighted avg       0.61      0.61      0.59      4956

Validation Accuracy: 0.6051


### Random Forest

In [11]:
# Random forest with default hyper-parameters, scored on the validation split.
rf = RandomForestClassifier(random_state=42).fit(X_train_preprocessed, y_train)
y_val_pred_rf = rf.predict(X_val_preprocessed)
rf_score = accuracy_score(y_val, y_val_pred_rf)

print("Random Forest Validation Performance")
print(classification_report(y_val, y_val_pred_rf))
print(f"Validation Accuracy: {rf_score:.4f}")

Random Forest Validation Performance
              precision    recall  f1-score   support

           0       0.59      0.68      0.63      2580
           1       0.59      0.49      0.54      2376

    accuracy                           0.59      4956
   macro avg       0.59      0.59      0.59      4956
weighted avg       0.59      0.59      0.59      4956

Validation Accuracy: 0.5914


### Gradient Boosting

In [12]:
# Gradient boosting with default hyper-parameters, scored on the validation split.
gb = GradientBoostingClassifier(random_state=42).fit(X_train_preprocessed, y_train)
y_val_pred_gb = gb.predict(X_val_preprocessed)
gb_score = accuracy_score(y_val, y_val_pred_gb)

print("Gradient Boosting Validation Performance")
print(classification_report(y_val, y_val_pred_gb))
print(f"Validation Accuracy: {gb_score:.4f}")

Gradient Boosting Validation Performance
              precision    recall  f1-score   support

           0       0.60      0.74      0.66      2580
           1       0.62      0.46      0.52      2376

    accuracy                           0.60      4956
   macro avg       0.61      0.60      0.59      4956
weighted avg       0.61      0.60      0.60      4956

Validation Accuracy: 0.6043


### XGBoost

In [13]:
# XGBoost classifier (log-loss evaluation metric), scored on the validation split.
xgb = XGBClassifier(
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss',
)
xgb.fit(X_train_preprocessed, y_train)
y_val_pred_xgb = xgb.predict(X_val_preprocessed)
xgb_score = accuracy_score(y_val, y_val_pred_xgb)

print("XGBoost Validation Performance")
print(classification_report(y_val, y_val_pred_xgb))
print(f"Validation Accuracy: {xgb_score:.4f}")

XGBoost Validation Performance
              precision    recall  f1-score   support

           0       0.59      0.68      0.63      2580
           1       0.58      0.50      0.54      2376

    accuracy                           0.59      4956
   macro avg       0.59      0.59      0.58      4956
weighted avg       0.59      0.59      0.59      4956

Validation Accuracy: 0.5896


### DecisionTreeClassifier

In [14]:
# Single decision tree with default hyper-parameters, scored on the validation split.
dt = DecisionTreeClassifier(random_state=42).fit(X_train_preprocessed, y_train)
y_val_pred_dt = dt.predict(X_val_preprocessed)
dt_score = accuracy_score(y_val, y_val_pred_dt)

print("Decision Tree Validation Performance")
print(classification_report(y_val, y_val_pred_dt))
print(f"Validation Accuracy: {dt_score:.4f}")

Decision Tree Validation Performance
              precision    recall  f1-score   support

           0       0.56      0.58      0.57      2580
           1       0.53      0.52      0.52      2376

    accuracy                           0.55      4956
   macro avg       0.55      0.55      0.55      4956
weighted avg       0.55      0.55      0.55      4956

Validation Accuracy: 0.5468


### AdaBoostClassifier

In [15]:
# AdaBoost with default hyper-parameters, scored on the validation split.
ada = AdaBoostClassifier(random_state=42).fit(X_train_preprocessed, y_train)
y_val_pred_ada = ada.predict(X_val_preprocessed)
ada_score = accuracy_score(y_val, y_val_pred_ada)

print("AdaBoost Validation Performance")
print(classification_report(y_val, y_val_pred_ada))
print(f"Validation Accuracy: {ada_score:.4f}")



AdaBoost Validation Performance
              precision    recall  f1-score   support

           0       0.59      0.75      0.67      2580
           1       0.62      0.44      0.52      2376

    accuracy                           0.60      4956
   macro avg       0.61      0.60      0.59      4956
weighted avg       0.61      0.60      0.59      4956

Validation Accuracy: 0.6045


### Test set evaluation with the best model

In [16]:
# Gather every model's validation accuracy and rank the models from best to worst.
accuracy = {
    'Model': ['Logistic Regression', 'SVC', 'Random Forest', 'Gradient Boosting', 'XGBoost', 'Decision Tree', 'AdaBoost'],
    'Accuracy Score': [logreg_score, svc_score, rf_score, gb_score, xgb_score, dt_score, ada_score],
}

df_accuracy = pd.DataFrame(accuracy)
df_accuracy = df_accuracy.sort_values(by='Accuracy Score', ascending=False)
df_accuracy = df_accuracy.reset_index(drop=True)
df_accuracy

Unnamed: 0,Model,Accuracy Score
0,SVC,0.605125
1,AdaBoost,0.60452
2,Gradient Boosting,0.604318
3,Logistic Regression,0.600888
4,Random Forest,0.591404
5,XGBoost,0.589588
6,Decision Tree,0.546812


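SVC performed best on the validation set (accuracy 0.6051), narrowly ahead of AdaBoost (0.6045) and Gradient Boosting (0.6043).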

In [17]:
# Evaluate the model with the highest validation accuracy (the SVC) on the test split.
best_model = svc
y_test_pred = best_model.predict(X_test_preprocessed)

print("Best Model Test Performance", classification_report(y_test, y_test_pred), sep="\n")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred):.4f}")

Best Model Test Performance
              precision    recall  f1-score   support

           0       0.60      0.76      0.67      2559
           1       0.64      0.45      0.53      2397

    accuracy                           0.61      4956
   macro avg       0.62      0.60      0.60      4956
weighted avg       0.62      0.61      0.60      4956

Test Accuracy: 0.6098


## Improving models

### Feature Selection

In [21]:
# Feature selection: rank the preprocessed features by the feature_importances_
# of a random forest fitted on the training split.
rf = RandomForestClassifier(random_state=42).fit(X_train_preprocessed, y_train)

feature_importances = rf.feature_importances_
sorted_idx = feature_importances.argsort()[::-1]  # column positions, most important first

# Print the features from most to least important
print("Feature Importances:")
for idx in sorted_idx:
    feature_name = X_train_preprocessed.columns[idx]
    print(f"{feature_name}: {feature_importances[idx]}")

Feature Importances:
time_in_hospital: 0.12884751203103628
n_lab_procedures_grouped: 0.12624797520263553
n_medications_grouped: 0.09866111485880502
age: 0.09374283664293735
n_procedures: 0.07557675430125879
n_inpatient: 0.05534677859380795
n_outpatient: 0.02733585015011078
change_yes: 0.02721658217557083
diag_3_Other: 0.0245310029121059
diag_2_Other: 0.023594809632032452
diag_3_Circulatory: 0.023493829589252047
n_emergency: 0.02250831369068293
diag_2_Circulatory: 0.02199985963899063
diag_1_Circulatory: 0.02108102652554416
diag_1_Other: 0.02001184218094527
diabetes_med_yes: 0.018372978544288866
diag_3_Diabetes: 0.017872110038462525
diag_2_Respiratory: 0.015459968628364868
diag_1_Respiratory: 0.015249602884206272
A1Ctest_no: 0.014782658394159076
diag_2_Diabetes: 0.014668757405636914
diag_3_Respiratory: 0.013595855785554748
diag_1_Digestive: 0.01329924514041762
A1Ctest_high: 0.011868752153326681
diag_1_Injury: 0.0108592496386813
diag_1_Diabetes: 0.010575327481697056
diag_3_Digestive: 0.00

In [26]:
# Pick the number k of top-ranked features that maximises the SVC's validation accuracy.

# Feature names ordered from most to least important.
ranked_features = X_train_preprocessed.columns[sorted_idx]

# One entry per tried k
k_values = []
validation_accuracies = []

for k in range(1, len(sorted_idx) + 1):
    # Restrict the training and validation frames to the k most important features
    selected_features = ranked_features[:k]
    X_train_selected = X_train_preprocessed[selected_features]
    X_val_selected = X_val_preprocessed[selected_features]

    # Refit a default SVC on this subset and predict the validation split
    svc = SVC(random_state=42)
    svc.fit(X_train_selected, y_train)
    y_val_pred_svc = svc.predict(X_val_selected)

    # Record the validation accuracy obtained with this k
    val_accuracy = accuracy_score(y_val, y_val_pred_svc)
    k_values.append(k)
    validation_accuracies.append(val_accuracy)

# Best k = first k reaching the highest validation accuracy
best_accuracy = max(validation_accuracies)
best_k = k_values[validation_accuracies.index(best_accuracy)]

print(f"Best k: {best_k} with Validation Accuracy: {best_accuracy:.4f}")

Validation Performance with top 1 features
              precision    recall  f1-score   support

           0       0.52      0.93      0.67      2580
           1       0.49      0.07      0.13      2376

    accuracy                           0.52      4956
   macro avg       0.51      0.50      0.40      4956
weighted avg       0.51      0.52      0.41      4956

Validation Accuracy: 0.5194
Validation Performance with top 2 features
              precision    recall  f1-score   support

           0       0.52      0.97      0.68      2580
           1       0.49      0.03      0.06      2376

    accuracy                           0.52      4956
   macro avg       0.51      0.50      0.37      4956
weighted avg       0.51      0.52      0.38      4956

Validation Accuracy: 0.5202
Validation Performance with top 3 features
              precision    recall  f1-score   support

           0       0.55      0.74      0.63      2580
           1       0.54      0.34      0.42      237

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Performance with top 5 features
              precision    recall  f1-score   support

           0       0.52      0.95      0.68      2580
           1       0.54      0.07      0.12      2376

    accuracy                           0.53      4956
   macro avg       0.53      0.51      0.40      4956
weighted avg       0.53      0.53      0.41      4956

Validation Accuracy: 0.5252
Validation Performance with top 6 features
              precision    recall  f1-score   support

           0       0.59      0.76      0.66      2580
           1       0.62      0.43      0.51      2376

    accuracy                           0.60      4956
   macro avg       0.61      0.59      0.59      4956
weighted avg       0.61      0.60      0.59      4956

Validation Accuracy: 0.6013
Validation Performance with top 7 features
              precision    recall  f1-score   support

           0       0.60      0.72      0.65      2580
           1       0.61      0.48      0.53      237

In [27]:
# Refit the SVC on the training split restricted to the best_k top-ranked features,
# then evaluate it on the test split.
best_selected_features = X_train_preprocessed.columns[sorted_idx][:best_k]
X_train_best_selected, X_val_best_selected, X_test_best_selected = (
    split[best_selected_features]
    for split in (X_train_preprocessed, X_val_preprocessed, X_test_preprocessed)
)

svc_best = SVC(random_state=42)
svc_best.fit(X_train_best_selected, y_train)
y_test_pred_svc_best = svc_best.predict(X_test_best_selected)

# Test-split performance
print("Support Vector Classifier Test Performance with Best Selected Features")
print(classification_report(y_test, y_test_pred_svc_best))
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred_svc_best):.4f}")

Support Vector Classifier Test Performance with Best Selected Features
              precision    recall  f1-score   support

           0       0.60      0.75      0.67      2559
           1       0.63      0.46      0.53      2397

    accuracy                           0.61      4956
   macro avg       0.62      0.60      0.60      4956
weighted avg       0.61      0.61      0.60      4956

Test Accuracy: 0.6098


Feature selection did not improve the score: test accuracy is 0.6098 both with all features and with the selected subset. I keep 25 features.

### Model Tuning

In [32]:
# Hyper-parameter grid for the SVC.
# `gamma` only affects the 'poly', 'rbf' and 'sigmoid' kernels, so the linear kernel
# gets its own sub-grid without it. With a single flat grid every linear candidate was
# fitted once per gamma value (4x the same model for each C).
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10, 20, 100],
    },
    {
        'kernel': ['poly', 'rbf', 'sigmoid'],
        'C': [0.1, 1, 10, 20, 100],
        'gamma': [1, 0.1, 0.01, 0.001],
    },
]

# Exhaustive 5-fold cross-validated search on the selected training features.
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5,  n_jobs=-1, scoring='accuracy', verbose=0)
grid_search.fit(X_train_best_selected, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)

In [None]:
# Tuned SVC: the estimator the grid search exposes as its best one.
best_svc = grid_search.best_estimator_

# Validation split performance of the tuned model
y_val_pred_best_svc = best_svc.predict(X_val_best_selected)
print("Best SVC Model Validation Performance", classification_report(y_val, y_val_pred_best_svc), sep="\n")
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_best_svc):.4f}")

# Test split performance of the tuned model
X_test_best_selected = X_test_preprocessed[best_selected_features]
y_test_pred_best_svc = best_svc.predict(X_test_best_selected)
print("Best SVC Model Test Performance", classification_report(y_test, y_test_pred_best_svc), sep="\n")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred_best_svc):.4f}")

# Parameters selected by the grid search
print(f"Best parameters found: {grid_search.best_params_}")

In [None]:
# Search space sampled by RandomizedSearchCV for the random forest.
param_dist_rf = {
    'n_estimators': randint(50, 200),
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5),
    'bootstrap': [True, False]
}

# Randomized search with 3-fold cross-validation.
# The search, the retraining and the evaluation all use the *_best_selected frames
# (the best_k selected features). The previous version used X_train_selected /
# X_val_selected, which are leftovers of the feature-selection loop, and
# X_test_selected, which is never defined and raised a NameError.
random_search_rf = RandomizedSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search_rf.fit(X_train_best_selected, y_train)

# Best parameters from the randomized search
best_params_rf = random_search_rf.best_params_
print(f"Best parameters for RandomForest: {best_params_rf}")

# Retrain the model with the best parameters
rf_best = RandomForestClassifier(random_state=42, **best_params_rf)
rf_best.fit(X_train_best_selected, y_train)

# Evaluate on validation and test sets
y_val_pred_rf_best = rf_best.predict(X_val_best_selected)
y_test_pred_rf_best = rf_best.predict(X_test_best_selected)

print("RandomForest Validation Performance with Best Parameters")
print(classification_report(y_val, y_val_pred_rf_best))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_rf_best):.4f}")

print("RandomForest Test Performance with Best Parameters")
print(classification_report(y_test, y_test_pred_rf_best))
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred_rf_best):.4f}")


In [None]:
# Search space sampled by RandomizedSearchCV for gradient boosting.
param_dist_gb = {
    'n_estimators': randint(50, 200),
    'learning_rate': uniform(0.01, 0.2),  # Randomly sample learning rates between 0.01 and 0.21
    'max_depth': randint(3, 8),
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

# Randomized search with 3-fold cross-validation.
# The search, the retraining and the evaluation all use the *_best_selected frames
# (the best_k selected features). The previous version used X_train_selected /
# X_val_selected, which are leftovers of the feature-selection loop, and
# X_test_selected, which is never defined and raised a NameError.
random_search_gb = RandomizedSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_distributions=param_dist_gb,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search_gb.fit(X_train_best_selected, y_train)

# Best parameters from the randomized search
best_params_gb = random_search_gb.best_params_
print(f"Best parameters for GradientBoosting: {best_params_gb}")

# Retrain the model with the best parameters
gb_best = GradientBoostingClassifier(random_state=42, **best_params_gb)
gb_best.fit(X_train_best_selected, y_train)

# Evaluate on validation and test sets
y_val_pred_gb_best = gb_best.predict(X_val_best_selected)
y_test_pred_gb_best = gb_best.predict(X_test_best_selected)

print("GradientBoosting Validation Performance with Best Parameters")
print(classification_report(y_val, y_val_pred_gb_best))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_gb_best):.4f}")

print("GradientBoosting Test Performance with Best Parameters")
print(classification_report(y_test, y_test_pred_gb_best))
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred_gb_best):.4f}")


In [None]:
# Search space sampled by RandomizedSearchCV for logistic regression.
param_dist_lr = {
    'C': uniform(0.01, 100),  # Randomly sample C values from 0.01 to 100.01
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'penalty': ['l2'],
    'max_iter': [100, 200, 300]
}

# Randomized search with 3-fold cross-validation.
# The search, the retraining and the evaluation all use the *_best_selected frames
# (the best_k selected features). The previous version used X_train_selected /
# X_val_selected, which are leftovers of the feature-selection loop, and
# X_test_selected, which is never defined and raised a NameError.
random_search_lr = RandomizedSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_distributions=param_dist_lr,
    n_iter=50,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search_lr.fit(X_train_best_selected, y_train)

# Best parameters from the randomized search
best_params_lr = random_search_lr.best_params_
print(f"Best parameters for LogisticRegression: {best_params_lr}")

# Retrain the model with the best parameters
lr_best = LogisticRegression(random_state=42, **best_params_lr)
lr_best.fit(X_train_best_selected, y_train)

# Evaluate on validation and test sets
y_val_pred_lr_best = lr_best.predict(X_val_best_selected)
y_test_pred_lr_best = lr_best.predict(X_test_best_selected)

print("LogisticRegression Validation Performance with Best Parameters")
print(classification_report(y_val, y_val_pred_lr_best))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_lr_best):.4f}")

print("LogisticRegression Test Performance with Best Parameters")
print(classification_report(y_test, y_test_pred_lr_best))
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred_lr_best):.4f}")


### Ensemble Methods

In [None]:
# Soft voting averages the members' predict_proba outputs. An SVC only exposes
# predict_proba when it is built with probability=True, and the tuned SVC was not,
# so a copy of it with identical hyper-parameters but probability=True is used here.
# (VotingClassifier refits its members, so the copy does not need to be fitted first.)
svc_for_voting = SVC(**{**best_svc.get_params(), 'probability': True})

# Create an ensemble using VotingClassifier
voting_clf = VotingClassifier(
    estimators=[
        ('svc', svc_for_voting),
        ('rf', rf_best),
        ('gb', gb_best),
        ('lr', lr_best)
    ],
    voting='soft'
)

# Fit the ensemble model
voting_clf.fit(X_train_best_selected, y_train)

# Predict on the validation set
y_val_pred_ensemble = voting_clf.predict(X_val_best_selected)

# Evaluate the performance on the validation set
print("Ensemble Model Validation Performance")
print(classification_report(y_val, y_val_pred_ensemble))
print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred_ensemble):.4f}")

In [None]:
# Score the fitted voting ensemble on the test split.
y_test_pred_ensemble = voting_clf.predict(X_test_best_selected)

print("Ensemble Model Test Performance", classification_report(y_test, y_test_pred_ensemble), sep="\n")
print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred_ensemble):.4f}")

## Error Analysis

#### Confusion Matrix

In [None]:
# Confusion matrices of the ensemble predictions (rows: actual, columns: predicted)
cm_val = confusion_matrix(y_val, y_val_pred_ensemble)
cm_test = confusion_matrix(y_test, y_test_pred_ensemble)

# One annotated heatmap per split: validation first, then test
for matrix, heading in (
    (cm_val, 'Confusion Matrix - Validation Set'),
    (cm_test, 'Confusion Matrix - Test Set'),
):
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['No', 'Yes'], yticklabels=['No', 'Yes'])
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(heading)
    plt.show()

#### Classification Report

In [None]:
# Classification report of the ensemble for the validation split, then the test split
for heading, y_true, y_pred in (
    ("Classification Report - Validation Set", y_val, y_val_pred_ensemble),
    ("Classification Report - Test Set", y_test, y_test_pred_ensemble),
):
    print(heading)
    print(classification_report(y_true, y_pred))

#### Error Analysis on Validation Set

In [None]:
# Identify false positives and false negatives in validation set.
# The misclassification mask is computed once and converted to a plain boolean
# array so it can index the DataFrame, the target Series and the prediction array alike.
val_error_mask = (y_val != y_val_pred_ensemble).to_numpy()

# Explicit copy: the new columns are added to an independent frame, not to a slice
# of X_val (which would trigger pandas' SettingWithCopyWarning).
val_errors = X_val[val_error_mask].copy()
val_errors['actual'] = y_val[val_error_mask]
val_errors['predicted'] = y_val_pred_ensemble[val_error_mask]

print("Validation Set Errors")
print(val_errors.head())

####  Error Analysis on Test Set

In [None]:
# Identify false positives and false negatives in test set.
# The misclassification mask is computed once and converted to a plain boolean
# array so it can index the DataFrame, the target Series and the prediction array alike.
test_error_mask = (y_test != y_test_pred_ensemble).to_numpy()

# Explicit copy: the new columns are added to an independent frame, not to a slice
# of X_test (which would trigger pandas' SettingWithCopyWarning).
test_errors = X_test[test_error_mask].copy()
test_errors['actual'] = y_test[test_error_mask]
test_errors['predicted'] = y_test_pred_ensemble[test_error_mask]

print("Test Set Errors")
print(test_errors.head())

#### Visualize Feature Distributions

In [None]:
# Visualize the feature distributions of the rows the ensemble misclassified.
#
# The plots use the preprocessed, feature-selected frames the ensemble was evaluated on.
# The previous version looked up `selected_features` (preprocessed column names left over
# from the feature-selection loop) in the raw error frames, where those columns do not
# exist, and drew into a fixed 4x2 grid that cannot hold more than 8 features.
n_cols = 2
for split_name, split_features, y_true, y_pred in (
    ('Validation', X_val_best_selected, y_val, y_val_pred_ensemble),
    ('Test', X_test_best_selected, y_test, y_test_pred_ensemble),
):
    # Positional boolean mask of the misclassified rows of this split
    misclassified = split_features[(y_true != y_pred).to_numpy()]

    # Size the subplot grid from the number of features actually plotted
    n_features = misclassified.shape[1]
    n_rows = max(1, int(np.ceil(n_features / n_cols)))
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(14, 3 * n_rows), squeeze=False)
    flat_axes = axes.ravel()

    for ax, feature in zip(flat_axes, misclassified.columns):
        sns.histplot(misclassified[feature], kde=True, ax=ax)
        ax.set_title(f'Distribution of {feature} in {split_name} Errors')

    # Hide the unused cell(s) of the grid when the feature count is not a multiple of n_cols
    for unused_ax in flat_axes[n_features:]:
        unused_ax.set_visible(False)

    fig.tight_layout()
    plt.show()

## Regularization

#### Learning curves

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation scores against the training-set size.

    A new 10x6 figure is created, ``learning_curve`` is run with the given
    arguments, and for both the training and the cross-validation scores the
    mean is drawn as a line with a shaded band of +/- one standard deviation
    (both taken along axis 1 of the score arrays).

    Parameters
    ----------
    estimator : estimator passed to ``learning_curve``.
    title : figure title.
    X, y : data passed to ``learning_curve``.
    ylim : optional ``(low, high)`` pair unpacked into ``plt.ylim``.
    cv, n_jobs, train_sizes : forwarded unchanged to ``learning_curve``.

    Returns
    -------
    The ``matplotlib.pyplot`` module, so the caller can call ``.show()`` on it.
    """
    plt.figure(figsize=(10, 6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes
    )

    # (mean, std, colour, legend label) for each curve, training curve first
    curves = (
        (np.mean(train_scores, axis=1), np.std(train_scores, axis=1), "r", "Training score"),
        (np.mean(test_scores, axis=1), np.std(test_scores, axis=1), "g", "Cross-validation score"),
    )
    plt.grid()

    # Shaded +/- one standard deviation bands first, then the mean lines
    for mean, std, color, _ in curves:
        plt.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=color)
    for mean, _, color, label in curves:
        plt.plot(train_sizes, mean, 'o-', color=color, label=label)

    plt.legend(loc="best")
    return plt

# Learning curve (5-fold cross-validation) for `best_model` on `X_train_selected`
plot_learning_curve(
    estimator=best_model,
    title="Learning Curve for Best Model",
    X=X_train_selected,
    y=y_train,
    cv=5,
)
plt.show()