In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from imblearn.ensemble import EasyEnsembleClassifier

## What question is being answered?

The question being addressed in this project is a binary classification problem, specifically within the context of financial risk management. The target is categorical, aiming to predict whether individual account holders will default on their payment in the upcoming month. The dataset, including payment history and related banking metrics, serves as the basis for modeling this predictive task.

## Dataset EDA

In [None]:
file = 'default of credit card clients.xls'

# The descriptive column names sit on the second sheet row. Using that row as
# the header while reading (header=1) lets pandas infer numeric dtypes; promoting
# it to the header *after* loading leaves every column as `object`, because each
# column then mixes one string with the numeric values.
df = pd.read_excel(file, header=1)

# Drop the first column from the DataFrame
# (presumably a row identifier -- confirm against the workbook)
df = df.drop(columns=df.columns[0])

# Shorten the name of the target column
df = df.rename(columns={"default payment next month": "Y"})

# Display the DataFrame with the final column names
df.head(5)

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame
df.info()

In [None]:
# Per-column summary statistics of the loaded frame
df.describe()

## Feature Engineering / Dataset Cleaning

In [None]:
# Source columns used by the derived features below
months = range(1, 7)
pay_status_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']
bill_cols = [f'BILL_AMT{i}' for i in months]
pay_amt_cols = [f'PAY_AMT{i}' for i in months]

# Make sure every input is numeric before doing arithmetic: on `object` columns
# row-wise statistics are unreliable and a division by zero raises instead of
# producing inf/NaN.
numeric_inputs = ['LIMIT_BAL', 'AGE'] + pay_status_cols + bill_cols + pay_amt_cols
df[numeric_inputs] = df[numeric_inputs].apply(pd.to_numeric)

# Average Payment Delay
df['avg_payment_delay'] = df[pay_status_cols].mean(axis=1)

# Outstanding Balance for each month
outstanding_balance = {
    f'outstanding_balance_{i}': df[f'BILL_AMT{i}'] - df[f'PAY_AMT{i}'] for i in months
}

# Credit Utilization for each month
credit_utilization = {
    f'credit_utilization_{i}': df[f'BILL_AMT{i}'] / df['LIMIT_BAL'] for i in months
}

# Debt Ratio for each month; months with a zero bill get a ratio of 0 instead of
# the inf/NaN the raw division would produce
debt_ratio = {}
for i in months:
    bill = df[f'BILL_AMT{i}']
    ratio = (bill - df[f'PAY_AMT{i}']) / bill
    debt_ratio[f'debt_ratio_{i}'] = ratio.mask(bill == 0, 0)

# Add the derived columns in a fixed order (balances, utilizations, ratios);
# later cells derive the numeric feature list from the column order
df = df.assign(**outstanding_balance, **credit_utilization, **debt_ratio)

# Replace any infinity values with NaN -- done after *all* ratios exist so that
# none of them can slip through
df = df.replace([np.inf, -np.inf], np.nan)

# Age Binning: left-closed decades [20, 30), [30, 40), ..., [70, 80)
age_bins = [20, 30, 40, 50, 60, 70, 80]
age_labels = ['20-30', '30-40', '40-50', '50-60', '60-70', '70-80']
df['age_group'] = pd.cut(df['AGE'], bins=age_bins, labels=age_labels, right=False)

# Consistent Bill Amount: spread (standard deviation) of the six monthly bills
df['consistent_bill_amount'] = df[bill_cols].std(axis=1)

# Raw columns that are now represented by the derived features
columns_to_drop = ['AGE'] + pay_status_cols + bill_cols + pay_amt_cols

# Drop the specified columns
df = df.drop(columns=columns_to_drop)

df.head(10)

In [None]:
# Re-inspect the schema after feature engineering (derived columns added, raw ones dropped)
df.info()

In [None]:
# Number of missing values in each column of the DataFrame
df.isna().sum()

In [None]:
# Count of each class in 'Y' column, ordered by class label. value_counts()
# alone orders by frequency, which would put the bars, their annotations and the
# hard-coded tick labels out of step if class 1 ever became the majority.
y_counts = df['Y'].astype(int).value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 6))  # Set the figure size
sns.barplot(x=y_counts.index, y=y_counts.values, palette="muted", ax=ax)  # Create a barplot

ax.set_title('Default Payment Next Month (Y)', fontsize=14)  # Set the title and its fontsize
ax.set_xlabel('Default Payment', fontsize=12)  # Set x-axis label and its fontsize
ax.set_ylabel('Count', fontsize=12)  # Set y-axis label and its fontsize

# Annotate the heights of bars for better visual understanding
for position, count in enumerate(y_counts.values):
    ax.text(position, count, str(count), ha='center', va='bottom', fontsize=12)

# Replace 0,1 labels with No and Yes for better understanding
ax.set_xticks([0, 1])
ax.set_xticklabels(['(0) No', '(1) Yes'])
plt.show()

In [None]:
# Schema check before the correlation analysis; df has not been modified since the previous df.info() call
df.info()

In [None]:
# Correlation is only defined for numeric data: leave out categorical columns
# (the binned `age_group` labels) and cast the rest to float so that numeric
# values stored in `object` columns are still included.
categorical_columns = df.select_dtypes(include='category').columns
numeric_features = df.drop(columns=categorical_columns).astype(float)

# Compute correlation matrix for all numeric features
correlation_matrix = numeric_features.corr()

# Set up the figure size
fig, ax = plt.subplots(figsize=(18, 15))

# Create a heatmap with reduced font size in annotations
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", annot_kws={'size': 8}, ax=ax)

# Add a title to the plot
ax.set_title('Correlation Heatmap of All Features', fontsize=14)

# Show the plot
plt.show()


In [None]:
# Correlation is only defined for numeric data: leave out categorical columns
# (the binned `age_group` labels) and cast the rest to float so that numeric
# values stored in `object` columns are still included.
categorical_columns = df.select_dtypes(include='category').columns
correlation_matrix = df.drop(columns=categorical_columns).astype(float).corr()

# Set a threshold for significant correlations
threshold = 0.5

# Hide the cells whose absolute correlation is below the threshold
mask = np.abs(correlation_matrix) < threshold

fig, ax = plt.subplots(figsize=(18, 15))

# Create a heatmap with masked insignificant correlations, and reduced font size in annotations
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap="coolwarm", fmt=".2f", annot_kws={'size': 8}, ax=ax)

# The title states the criterion actually applied by the mask (absolute value, inclusive)
ax.set_title(f'Correlation Heatmap of Significant Features (|r| >= {threshold})', fontsize=14)
plt.show()

## Model Selection and Confusion Matrices

In our project, we've opted to use recall as the primary metric for evaluating model performance, a decision guided by the nature of our dataset, which is heavily imbalanced towards the negative class. Recall, also known as sensitivity or true positive rate, focuses on the correct identification of the positive class and is less influenced by the large number of true negatives. In scenarios like ours, where the positive class is of higher importance and relatively rare, accuracy can be misleading, as a model predicting only the negative class would still achieve high accuracy. By prioritizing recall, we're concentrating on the model's ability to correctly identify the minority positive class, ensuring that the imbalanced distribution doesn't overshadow the evaluation of how effectively our models are pinpointing the most crucial insights in our data.

In [None]:
# Baseline classifiers; each key is both the display name and the pipeline step name
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000),
    'Support Vector Machines': LinearSVC(max_iter=10000, C=10),
    'Decision Trees': DecisionTreeClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Random Forest': RandomForestClassifier(),
    'EasyEnsemble': EasyEnsembleClassifier(n_estimators = 200)
}

# Column roles: categorical columns are listed explicitly, every other
# non-target column is treated as numeric
cat_cols = ['SEX', 'EDUCATION', 'MARRIAGE', 'age_group']
numeric_cols = df.columns.drop(cat_cols + ["Y"]).tolist()

# Preprocessing: scale numeric columns, one-hot encode categorical ones
numeric_step = ('num', StandardScaler(), numeric_cols)
categorical_step = ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# 80/20 hold-out split with a fixed seed; the target is cast to integer labels
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('Y', axis=1),
    df['Y'],
    test_size=0.2,
    random_state=42,
)
y_train, y_test = y_train.astype('int'), y_test.astype('int')

for model_name, model in models.items():
    # Fresh copies of the split for this model
    X_train_temp, y_train_temp = X_train.copy(), y_train.copy()
    X_test_temp, y_test_temp = X_test.copy(), y_test.copy()

    # Preprocessing and estimator chained into one pipeline
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               (model_name, model)])

    # Train on the training split, then predict the held-out split
    pipeline.fit(X_train_temp, y_train_temp)
    y_pred = pipeline.predict(X_test_temp)

    # Accuracy
    accuracy = accuracy_score(y_test_temp, y_pred)
    print(f'Accuracy of {model_name}: {accuracy}')

    # Confusion matrix, as text and as a heatmap
    cm = confusion_matrix(y_test_temp, y_pred)
    print(f'Confusion Matrix of {model_name}:\n {cm}')

    ax = sns.heatmap(cm, annot=True, fmt='d')
    ax.set_title(f'Confusion Matrix for {model_name}')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

    # Per-class precision / recall / F1
    report = classification_report(y_test_temp, y_pred)
    print(f'Classification Report of {model_name}:\n {report}')

In this stage of the project, several models were analyzed to identify the most effective approach for predicting the positive class, with a particular emphasis on recall. Logistic Regression and Support Vector Machines, despite having decent overall accuracy, performed poorly in identifying the positive class, making them less suitable for our specific goals. AdaBoost, Decision Trees, and Random Forest exhibited better performance in predicting the positive class with acceptable recall values, making them potential candidates for further exploration. However, EasyEnsemble stands out in this context, with a significantly higher recall for the positive class, albeit at the cost of overall accuracy. Given the project's focus on predicting the positive class correctly, EasyEnsemble might be the most suitable option to pursue further. Other models like AdaBoost and Random Forest could be considered for additional experimentation and tuning, aiming to enhance recall without sacrificing too much overall accuracy. In summary, EasyEnsemble seems to align best with the project's objectives, while AdaBoost and Random Forest may serve as complementary or alternative approaches, depending on the final desired balance between recall and overall performance.

## Using class weights to improve recall for class 1

The original model results showed something that wasn't too surprising. All of the models had pretty good accuracy of at least 70%. This wasn't true for all metrics, especially recall for class 1. As recall for class 1 represents the percentage of people who will default that are predicted correctly, it is an important statistic. The low value of this statistic is mainly due to the class imbalance, where Class 0 outnumbers Class 1 approximately 4 to 1. Of the models tried above, EasyEnsemble is the only model with built-in features to handle this issue, which is why it had the best recall for class 1. We wanted to see if using the other models with a strategy to deal with class imbalance could help. Therefore, below we try the same models, but using class weights to account for the class imbalance.

In [None]:
# Relative misclassification cost per class: class 1 (default) counts four times
# as much as class 0, mirroring the roughly 4:1 class imbalance
class_weights = {0: 1, 1: 4}

models_weights = {
    'Logistic Regression': LogisticRegression(max_iter=10000, class_weight=class_weights),
    'Support Vector Machines': LinearSVC(max_iter=10000, C=10, class_weight=class_weights),
    'Decision Trees': DecisionTreeClassifier(class_weight=class_weights),
    # `estimator` is the current name of the former `base_estimator` argument
    # (renamed in scikit-learn 1.2, old name removed in 1.4)
    'AdaBoost': AdaBoostClassifier(estimator=DecisionTreeClassifier(class_weight=class_weights)),
    'Random Forest': RandomForestClassifier(class_weight=class_weights),
}


for model_name, model in models_weights.items():
    # Every model, AdaBoost included, goes through the same pipeline so that the
    # scaler and encoder are fitted on the training split only (no leakage from
    # the test rows into the preprocessing statistics).
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               (model_name, model)])

    fit_params = {}
    if model_name == 'AdaBoost':
        # AdaBoost has no class_weight option of its own, so the weights are
        # passed per sample, aligned with the training labels.
        # NOTE(review): the base tree also carries class_weight, so the two
        # weightings compound for class 1 -- confirm this is intended.
        fit_params[f'{model_name}__sample_weight'] = y_train.map(class_weights)

    # Fit the model
    pipeline.fit(X_train, y_train, **fit_params)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Print accuracy
    print(f'Accuracy of {model_name}: {accuracy_score(y_test, y_pred)}')

    # Print confusion matrix
    print(f'Confusion Matrix of {model_name}:\n {confusion_matrix(y_test, y_pred)}')

    # Print classification report
    print(f'Classification Report of {model_name}:\n {classification_report(y_test, y_pred)}')


The confusion matrices above reveal the impact of class weighting on the evaluation metrics of our supervised learning models, emphasizing improvements in the recall of Class 1, the minority class. The adaptation of class weights is a strategy to mitigate class imbalance, by attributing greater significance to the minority class in the loss function. The analysis indicates diverse effects across the models. For both Logistic Regression and Support Vector Machines, there is a notable decline in accuracy, offset by a substantial enhancement in the recall for the minority class. This indicates a heightened sensitivity to the minority class but at the expense of increased misclassification of the majority class. Decision Trees exhibit stability in performance, whereas AdaBoost demonstrates a minor decrement in both accuracy and recall, and the Random Forest model also shows a marginal reduction in performance. These trends are corroborated by the confusion matrices, with a general increase in both true positives (TP) and false positives (FP) for the minority class following the adjustment of class weights. In summary, the implementation of class weighting has led to a trade-off between accuracy and recall for the minority class in some models, and compromised performance in others. This underscores the necessity of careful calibration of class weights, considering the specific problem domain, the criticality of accurately identifying instances of the minority class, and the characteristics of the model in question.

## What techniques are being used for modeling? (Justifications on Model Selection from High to Low Bias)

1. Logistic Regression: Logistic Regression is our starting point as it's a simple yet effective algorithm for binary classification problems like ours. Despite its simplicity, Logistic Regression can perform well when features have a linear relationship with the log-odds of the outcome (default or not default). Moreover, logistic regression models are very interpretable, which is a bonus when we are first trying to understand our data.

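2. Support Vector Machines: We move to SVMs when we suspect that the boundaries between default and non-default credit card users might not be linear. SVMs can use kernel functions to handle such non-linearity. Also, SVMs can handle high-dimensional data well, which is relevant given the number of features we might have in a credit card dataset. Note that the implementation used in this notebook, `LinearSVC`, fits a linear decision boundary only; a kernelized SVM (for example `SVC` with an RBF kernel) would be needed to actually capture non-linear boundaries.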

3. Decision Trees: Decision Trees are chosen for their ability to handle non-linear relationships and their interpretability. They are more flexible than both Logistic Regression and SVMs and do not require any assumptions about the relationship between features and the target variable. Moreover, they can handle both numerical and categorical variables which are common in credit card data.

4. AdaBoost: AdaBoost, an ensemble method, is used to potentially improve the performance of our Decision Tree. By combining multiple weak learners (small decision trees), AdaBoost forms a more robust model that can generalize better to unseen data. It is especially useful if some of our features are weakly associated with the outcome but collectively they can predict the outcome well.

5. Random Forest: Random Forest is another ensemble model that creates a bunch of decision trees and aggregates their predictions. It's less likely to overfit than a single decision tree, which makes it a good choice if we have a lot of features and are worried about overfitting. Given the high-dimensional nature of credit card data, Random Forest is likely to improve our predictions.

6. EasyEnsemble: We finally move to EasyEnsemble when tackling the class imbalance in our dataset. Given that defaults are typically less frequent than non-defaults in credit card datasets, EasyEnsemble helps by creating balanced subsets of data and using an ensemble of classifiers (each trained on a different subset). This approach ensures that our model is exposed to enough default examples during training and hence can generalize better to default cases in unseen data.

# PCA Analysis

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric features: standardize, then keep as many principal components as are
# needed to explain 95% of the variance
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler()),
                                      ('pca', PCA(n_components=0.95))])

# One hot encode the categorical features
categorical_transformer = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# The PCA variant gets its own name. Rebinding `preprocessor` here would make
# every later pipeline that references `preprocessor` silently run on principal
# components instead of the scaled dataset features.
pca_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, cat_cols)])

# Fit on the training split only, then apply the fitted transform to the test split
X_train_pca = pca_preprocessor.fit_transform(X_train)
X_test_pca = pca_preprocessor.transform(X_test)

# The fitted PCA lives inside the numeric sub-pipeline of the ColumnTransformer
fitted_pca = pca_preprocessor.named_transformers_['num'].named_steps['pca']
pca_components = fitted_pca.n_components_
explained_variance = fitted_pca.explained_variance_ratio_

print("Number of PCA components: ", pca_components)
print("Explained variance: ", explained_variance)


We utilized Principal Component Analysis (PCA) to transform our original dataset into a new coordinate system. The results revealed that 95% of the variance in the data could be explained by 13 principal components. The first component explained approximately 39.3% of the variance, and the second component contributed an additional 14.3%. Together, the first two components accounted for over 53% of the total variance in the data. This dimensionality reduction technique enabled us to retain the most essential features while minimizing information loss, making the dataset more manageable. Ultimately, using PCA in our project will streamline the modeling process, potentially improving the efficiency and accuracy of predicting credit defaulters.

In [None]:
# Refit each baseline classifier on the PCA-transformed matrices and report the
# same three metrics as before
for model_name, model in models.items():
    # Train on the transformed training split, predict the transformed test split
    y_pred = model.fit(X_train_pca, y_train).predict(X_test_pca)

    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print(f'Accuracy of {model_name}: {accuracy}')
    print(f'Confusion Matrix of {model_name}:\n {matrix}')
    print(f'Classification Report of {model_name}:\n {report}')

In our project to predict credit defaulters, we compared different models both before and after applying Principal Component Analysis (PCA). Initially, models like Logistic Regression, AdaBoost, and Random Forest showed relatively good performance with around 80% accuracy, but were generally weak in recalling the positive class (defaulters). After applying PCA, the results were mixed. For instance, Logistic Regression and Support Vector Machines maintained their accuracy, while Decision Trees and EasyEnsemble experienced slight decreases.

Post-PCA, some models saw a slight increase in true positives, indicating better identification of defaulters, while others remained stagnant or even decreased. The overall accuracy across the models remained relatively consistent, with minor fluctuations. The changes suggest that PCA's transformation may have affected the models' sensitivities differently. While PCA helped maintain or slightly improve the performance in predicting defaulters in some models, further investigation and tuning might be needed to fully leverage PCA for this specific problem.

# Hyperparameter Tuning (Dataset Features)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space for each model, written with bare estimator parameter names
_search_spaces = {
    'Logistic Regression': {
        'C': [0.1, 1.0, 10.0, 100.0],
    },
    'Support Vector Machines': {
        'C': [0.1, 1.0, 10.0, 100.0],
    },
    'Decision Trees': {
        'max_depth': [50, 100, 500, 1000],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 5],
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200, 500],
        'max_depth': [50, 100, 500],
        'min_samples_split': [2, 5, 10, 20],
        'min_samples_leaf': [1, 2, 5, 10],
    },
    'AdaBoost': {
        'n_estimators': [10, 50, 100, 200, 500, 1000],
        'learning_rate': [0.001, 0.01, 0.1, 1.0, 10.0],
    },
    'EasyEnsemble': {
        'n_estimators': [10, 50, 100, 200],
        'sampling_strategy': ['auto', 0.5, 0.7],
    },
}

# Parameter grid for each model, keyed `<step name>__<parameter>`: every
# estimator is tuned as the pipeline step that carries the model's name
param_grid = {
    step_name: {f'{step_name}__{parameter}': candidates
                for parameter, candidates in space.items()}
    for step_name, space in _search_spaces.items()
}

# Define a function to execute grid search on a model
def perform_grid_search(model_name, model, X=None, y=None):
    """Tune one model on the scaled / one-hot encoded dataset features.

    Runs a 10-fold cross-validated grid search that maximises recall, prints
    the best parameters and score, and returns the fitted search.

    Parameters
    ----------
    model_name : str
        Key into ``param_grid``; also used as the pipeline step name, which is
        what the ``<name>__<param>`` keys of the grid refer to.
    model : estimator
        Unfitted estimator to tune (GridSearchCV clones it for each fit).
    X, y : optional
        Training features and labels. Default to the notebook's ``X_train`` and
        ``y_train`` rather than loop variables left over from earlier cells.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    # Build the raw-feature preprocessing here instead of relying on the global
    # `preprocessor`, whose meaning depends on which cells ran before this one
    raw_preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)])

    # Create the pipeline
    pipeline = Pipeline(steps=[('preprocessor', raw_preprocessor),
                               (model_name, model)])

    # Create the KFold object
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)

    # Define the GridSearchCV object; n_jobs=-1 only parallelises the fits
    grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=kfold,
                               scoring='recall', n_jobs=-1)

    # Fit the model to the training data
    grid_search.fit(X, y)

    # Print the best parameters and the corresponding score
    print(f'Model: {model_name}')
    print(f'Best parameters: {grid_search.best_params_}')
    print(f'Best recall: {grid_search.best_score_}\n')

    return grid_search

# Execute grid search on each model
for model_name, model in models.items():
    perform_grid_search(model_name, model)


After performing hyperparameter tuning using grid search on the raw features of the dataset, several models were analyzed. Logistic Regression achieved a best recall of 0.1687. Support Vector Machines had a best recall of 0.1195 but raised convergence warnings, possibly needing more iterations or regularization adjustments. This low recall suggests that the model might not be suitable for this problem. Decision Trees showed a decent performance with a best recall of 0.3940. Random Forest gave a moderate performance with a best recall of 0.2863. EasyEnsemble yielded a strong performance with a best recall of 0.6259, second only to AdaBoost.

The standout performer among the models on the raw features was AdaBoost, achieving the highest recall of 0.7476. This high recall suggests that AdaBoost is effectively capturing the positive class in the dataset, making it the best-performing model in terms of recall on the raw features. The continuous warnings regarding the failure of convergence in the Support Vector Machines indicate that the model may need more attention to properly fit the dataset. Other models, like EasyEnsemble, also show promising results and could be further explored. The analysis on raw features, as opposed to principal components, gives an insight into the models' performance without any dimensionality reduction, providing a baseline for further evaluations.

## Hyperparameter Tuning (PCA)

In [None]:
# Performing grid search on PCA dataset

def perform_grid_search_pca(model_name, model, X=None, y=None):
    """Tune one model on PCA-reduced features.

    Scaling, PCA (95% explained variance) and one-hot encoding are part of the
    cross-validated pipeline, so they are refitted on the training folds of
    every split; the validation fold never contributes to the scaler or PCA
    statistics. Prints the best parameters and recall and returns the fitted
    search.

    Parameters
    ----------
    model_name : str
        Key into ``param_grid``; also used as the pipeline step name.
    model : estimator
        Unfitted estimator to tune (GridSearchCV clones it for each fit).
    X, y : optional
        Untransformed training features and labels; default to the notebook's
        ``X_train`` and ``y_train``.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    # Numeric columns: scale, then project onto the principal components
    numeric_pca = Pipeline(steps=[('scaler', StandardScaler()),
                                  ('pca', PCA(n_components=0.95))])
    pca_features = ColumnTransformer(
        transformers=[
            ('num', numeric_pca, numeric_cols),
            ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)])

    # Create a pipeline with preprocessor and model
    pipeline = Pipeline(steps=[
        ('preprocessor', pca_features),
        (model_name, model)
    ])

    # Creating the KFold object
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)

    # Define GridSearchCV object; n_jobs=-1 only parallelises the fits
    grid_search = GridSearchCV(pipeline, param_grid[model_name], cv=kfold,
                               scoring='recall', n_jobs=-1)

    # Fit to training data
    grid_search.fit(X, y)

    # Print results
    print(f'Model: {model_name}')
    print(f'Best parameters: {grid_search.best_params_}')
    print(f'Best recall: {grid_search.best_score_}\n')

    return grid_search

# Execute grid search on each model
for model_name, model in models.items():
    perform_grid_search_pca(model_name, model)


The results from hyperparameter tuning on the principal components offer an insightful comparison to the performance obtained on raw features. AdaBoost emerged as the best-performing model with a recall of 0.7479, demonstrating strong class identification. Other models, such as EasyEnsemble, Random Forest, and Decision Trees, also performed well but did not reach the same level as AdaBoost.

However, the performance of some models, particularly Support Vector Machines (SVM), was noticeably lower compared to the raw features. In this case, PCA did not seem to benefit SVM, as the best recall achieved was only 0.1365.

In terms of evaluating whether PCA was a beneficial preprocessing step, the results are mixed. While some models like AdaBoost performed admirably, others such as SVM did not benefit from the dimensionality reduction provided by PCA. The decision to utilize PCA would likely be dependent on the specific model and problem domain, considering both computational efficiency and model performance.

In conclusion, the adoption of PCA in this hyperparameter tuning process has shown some promising results, particularly with models like AdaBoost. However, it is not a one-size-fits-all solution, and careful consideration must be given to the choice of model and the characteristics of the data to determine whether PCA will enhance or possibly hinder performance.

## Best Model Performances

In all the previous sections, we tried out various models and saw that some worked better than others. Depending on your goal, some metrics will matter more than others. If you have all this credit card data and are trying to identify as many at-risk accounts as possible, recall for class 1 is very important. However, if you are using this to give new credit cards to people who are unlikely to default, precision of class 0 is most important. 

The best models for class 1 recall were SVM and logistic regression while using class weights. Neither of these models was improved by using PCA, so we won't use PCA in these final models. We will use the hyperparameters found in tuning for these models. Below we run these models, combining the techniques explored earlier in this report.

The model with the highest precision of class 0 was the easy ensemble model. This model had the same class 0 precision both with and without PCA. It had overall higher accuracy without PCA, so we will use that version.

In [None]:
# defining new list of models to test
models = {
    'Logistic Regression': LogisticRegression(max_iter=10000, C = 100.0, class_weight ={0:1, 1:4}),
    'Support Vector Machines': LinearSVC(max_iter=10000, C=1, class_weight ={0:1, 1:4}),
    'AdaBoost': AdaBoostClassifier(n_estimators = 10, learning_rate = 10.0),
    'EasyEnsemble': EasyEnsembleClassifier(n_estimators = 200)
}

# Models evaluated on principal components; all others use the scaled features
models_using_pca = {'AdaBoost'}


def _build_preprocessor(use_pca):
    """Return a fresh, unfitted preprocessor; numeric columns get PCA if `use_pca`."""
    if use_pca:
        # PCA runs after scaling and only on the numeric columns; it cannot be
        # applied to the raw frame, which still holds categorical labels
        numeric = Pipeline(steps=[('scaler', StandardScaler()),
                                  ('pca', PCA(n_components=0.95))])
    else:
        numeric = StandardScaler()
    return ColumnTransformer(
        transformers=[
            ('num', numeric, numeric_cols),
            ('cat', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), cat_cols)])


for model_name, model in models.items():
    # Create preprocessing and training pipeline; the preprocessor is built per
    # model so the result does not depend on what `preprocessor` currently is
    pipeline = Pipeline(steps=[('preprocessor', _build_preprocessor(model_name in models_using_pca)),
                               (model_name, model)])

    # Fit the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Print accuracy
    print(f'Accuracy of {model_name}: {accuracy_score(y_test, y_pred)}')

    # Print classification report
    print(f'Classification Report of {model_name}:\n {classification_report(y_test, y_pred)}')

    # Plot the confusion matrix on an explicit 8x8 inch figure, without grid lines
    fig, ax = plt.subplots(figsize=(8, 8))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred, ax=ax)
    ax.set_title(f'Confusion Matrix for {model_name}')
    ax.grid(False)

    plt.show()


## Complexity of the dataset:

1. **Did we use raw features or apply feature engineering?** 
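We applied feature engineering. We first derived new features from the raw columns (average payment delay, monthly outstanding balance, credit utilization and debt ratio, age groups, and the variability of the bill amounts) and dropped the raw columns they replace. Using PCA, we then changed the data into a set of new features that capture most of the data's variation. This helps when dealing with big datasets by reducing unnecessary details and potentially improving our model's predictions.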

2. **How did we manage the dataset's size and complexity?** 
We used PCA to handle the dataset's complexity. This method finds and uses the main patterns in the data, reducing the number of features. The number of these main patterns or "principal components" we kept changes the dataset's size.
On top of that, we tweaked settings in our machine learning algorithms, a process called hyperparameter tuning. While this doesn't directly reduce data size, it helps our model work better with the transformed data.
By using PCA and tuning hyperparameters, we aim to make our model predict better. Still, it's essential for us to keep checking how well our model works, especially on new data it hasn't seen before.