# Set Up code

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt
import torch
from torch import nn
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,\
    HistGradientBoostingClassifier,GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from torch.utils.data import DataLoader
from xgboost import XGBClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay
%matplotlib inline

  from tqdm.autonotebook import tqdm


In [2]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [3]:
#mouting google drive to the notebook to retrive data files - not necessary if running locally
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Setting a PyTorch seed for the rest of the code
torch.manual_seed(71)
torch.cuda.manual_seed(71)

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [6]:
train_path = r'/content/drive/MyDrive/_shared/Data-Mining/data-clean/train.csv'
# test_path = r'/content/drive/MyDrive/_shared/Data-Mining/data-clean/test.csv' Not used due to lack of labels available
df = pd.read_csv(train_path, low_memory=False)

# Preparing Data for Models

In [7]:
df['Credit_Score'] = df['Credit_Score'].map({'Good': 0, 'Poor': 1, 'Standard': 2})

In [8]:
'''Removing columns that provide any useful information.'''

all_features = df.columns.to_list()
values_to_remove = {'ID', 'Customer_ID', 'Month', 'Name', 'SSN'}
all_features = [x for x in all_features if x not in values_to_remove]
df = df[all_features]
all_features

['Age',
 'Occupation',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Type_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Payment_of_Min_Amount',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance',
 'Credit_Score']

In [9]:
# '''Feature List extracted from Random Forest Classifier feature importance scores in the aonther notebook '''
# feature_list = ['Monthly_Inhand_Salary','Annual_Income', 'Amount_invested_monthly','Num_of_Delayed_Payment', 'Num_Bank_Accounts', 'Interest_Rate','Delay_from_due_date','Changed_Credit_Limit','Outstanding_Debt','Credit_History_Age', 'Credit_Score']

# df = df[feature_list]

In [11]:
obj_to_num_dict={}
obj_to_num_dict['Month']={'January' : 1,
                          'February' : 2,
                          'March' : 3,
                          'April' : 4,
                          'May' : 5,
                          'June' : 6,
                          'July' : 7,
                          'August': 8,
                          'September':9,
                          'October':10,
                          'November':11,
                          'December':12}
num_to_obj_dict={}
def convert_str_to_num(df=df):
    """
    Function to convert all categorical columns in the dataframe to
    numerical values (starting from 0) while storing the mappings.

    Params:
        df - Pandas DataFrame

    Output:
        Returns the modified dataframe with all cotegorical columns coverted to numerical values.
    """

    # df=pd.get_dummies(df,columns=['Occupation'])
    for column in (df.select_dtypes(include='object').columns):

        obj_to_num_dict[column]={}
        num_to_obj_dict[column]={}

        for n,unique_value in enumerate(df[column].unique()):
            df[column]=df[column].replace( unique_value , n)
            obj_to_num_dict[ column][unique_value]= n
            num_to_obj_dict[ column][n+1]= unique_value

        df[column]=df[column].astype('uint8')

    return df


In [12]:
df = convert_str_to_num(df)

In [13]:
df.head(10)

Unnamed: 0,Monthly_Inhand_Salary,Annual_Income,Amount_invested_monthly,Num_of_Delayed_Payment,Num_Bank_Accounts,Interest_Rate,Delay_from_due_date,Changed_Credit_Limit,Outstanding_Debt,Credit_History_Age,Credit_Score
0,1824.843333,19114.12,21.46538,7.0,3.0,3.0,3.0,11.27,809.98,265.0,0
1,1824.843333,19114.12,21.46538,4.0,3.0,3.0,3.0,11.27,809.98,266.0,0
2,1824.843333,19114.12,21.46538,7.0,3.0,3.0,3.0,11.27,809.98,267.0,0
3,1824.843333,19114.12,21.46538,4.0,3.0,3.0,5.0,6.27,809.98,268.0,0
4,1824.843333,19114.12,21.46538,4.0,3.0,3.0,6.0,11.27,809.98,269.0,0
5,1824.843333,19114.12,21.46538,4.0,3.0,3.0,8.0,9.27,809.98,270.0,0
6,1824.843333,19114.12,21.46538,8.0,3.0,3.0,3.0,11.27,809.98,271.0,0
7,1824.843333,19114.12,21.46538,6.0,3.0,3.0,3.0,11.27,809.98,272.0,2
8,3037.986667,34847.84,39.684018,4.0,2.0,6.0,3.0,5.42,605.03,319.0,2
9,3037.986667,34847.84,39.684018,1.0,2.0,6.0,7.0,7.42,605.03,320.0,0


## Applying SMOTE

In [14]:
def split_and_rebalance(dataframe = df, test_size = 0.2):
    """
    Takes in a dataframe and oversamples minority classes using SMOTE,
    based on a ratio provided in wider literature (explained in the report).

    Params:
        df - dataframe with the target data
        test_size - Ratio of the testing data split (0-1)

    Output:
        x_train - Dataframe with the training data
        x_test - Dataframe with the testing data
        y_train - Series with the target values for traning data
        y_test - Series with target values for test data

    """

    y = dataframe['Credit_Score']
    X = dataframe.drop('Credit_Score', axis=1)  # Replace 'label_column_name' with the actual column name of the label

    # splitting with 30% of the data being test data
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=71)

    strategy = {0: 27000, 1: 27000}
    sampler = SMOTE(sampling_strategy=strategy)

    x_train, y_train = sampler.fit_resample(x_train, y_train)
    print(y_train.value_counts())

    return x_train, x_test, y_train, y_test

# Training Classifiers for the Ensemble

In [15]:
def train_model(estimator,df,target='Credit_Score',test_size=0.2):
    """
    Takes in target classifiers, splits and trains on balanced data (using SMOTE)
    and returns the trained classifier and its classification report.

    Params:
        estimator - Name of the Classifier
        df - dataframe with the target data
        target - The target variable to predict
        test_size - Ratio of the testing data split (0-1)
        report - Toggle classication report being returned
        random_state - ,return_pipeline=False

    Output:
        model_pipeline - Trained model pipeline
        report_test - Full classification report proviiding precision, recall, accuracy and F1 score

    """

    x_train, x_test, y_train, y_test = split_and_rebalance(df, test_size)

    model_pipeline=make_pipeline(None,estimator)
    model_pipeline.fit(x_train,y_train)


    print(f'test report:\n{classification_report(model_pipeline.predict(x_test),y_test)}')
    print(f'train report:\n{classification_report(model_pipeline.predict(x_train), y_train)}')


    report_test=pd.DataFrame.from_dict(classification_report(
    model_pipeline.predict(x_test),y_test,output_dict=True))

    return model_pipeline,report_test

In [16]:
models_and_reports = {}

In [17]:
params = {
 'base_score': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 1,
 'learning_rate': 1e-1,
 'max_delta_step': 1,
 'max_depth': 10,
 'min_child_weight': 1,
 'n_estimators': 232,
 'objective': 'binary:logistic',
 'reg_alpha': 9e-1,
 'reg_lambda': 0.8,
 'scale_pos_weight': 1,
 'seed': 21,
 'silent': 1,
 'subsample': 1,
}

xgBoost = XGBClassifier(**params)

In [18]:
models_and_reports['RandomForestClassifier'] = train_model(
    estimator=RandomForestClassifier(max_depth=25,n_estimators=100,random_state=3,
                                     min_samples_split=3,criterion='entropy',
                                     bootstrap=True,n_jobs=-1),
                                        df=df,test_size=0.3)

Credit_Score
2    37344
1    27000
0    27000
Name: count, dtype: int64
test report:
              precision    recall  f1-score   support

           0       0.81      0.68      0.74      6466
           1       0.83      0.79      0.81      9202
           2       0.76      0.84      0.80     14332

    accuracy                           0.79     30000
   macro avg       0.80      0.77      0.78     30000
weighted avg       0.79      0.79      0.79     30000

train report:
              precision    recall  f1-score   support

           0       0.99      0.92      0.96     29047
           1       0.98      0.96      0.97     27556
           2       0.92      0.99      0.95     34741

    accuracy                           0.96     91344
   macro avg       0.96      0.96      0.96     91344
weighted avg       0.96      0.96      0.96     91344



In [19]:
models_and_reports['OneVsRestClassifier'] = train_model(
    estimator=OneVsRestClassifier(RandomForestClassifier(max_depth=22,n_estimators=100,random_state=3,
                                     min_samples_split=3,criterion='entropy',
                                     bootstrap=True,n_jobs=-1)),
                                        df=df,test_size=0.3)  # - F1 -> 0.8012

Credit_Score
2    37344
1    27000
0    27000
Name: count, dtype: int64
test report:
              precision    recall  f1-score   support

           0       0.80      0.66      0.73      6582
           1       0.83      0.79      0.81      9162
           2       0.76      0.84      0.80     14256

    accuracy                           0.79     30000
   macro avg       0.80      0.77      0.78     30000
weighted avg       0.79      0.79      0.79     30000

train report:
              precision    recall  f1-score   support

           0       0.98      0.90      0.94     29513
           1       0.95      0.93      0.94     27606
           2       0.89      0.97      0.92     34225

    accuracy                           0.93     91344
   macro avg       0.94      0.93      0.94     91344
weighted avg       0.94      0.93      0.93     91344



In [20]:
models_and_reports['XGBoost'] = train_model(estimator=xgBoost, df=df,test_size=0.3)

Credit_Score
2    37344
1    27000
0    27000
Name: count, dtype: int64


Parameters: { "scale_pos_weight", "silent" } are not used.



test report:
              precision    recall  f1-score   support

           0       0.77      0.59      0.67      7069
           1       0.78      0.77      0.77      8843
           2       0.72      0.81      0.76     14088

    accuracy                           0.75     30000
   macro avg       0.76      0.72      0.74     30000
weighted avg       0.75      0.75      0.74     30000

train report:
              precision    recall  f1-score   support

           0       0.91      0.80      0.85     30770
           1       0.85      0.85      0.85     26967
           2       0.77      0.86      0.81     33607

    accuracy                           0.83     91344
   macro avg       0.84      0.83      0.84     91344
weighted avg       0.84      0.83      0.84     91344



In [21]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [22]:
from catboost import CatBoostClassifier

models_and_reports['CatBoostClassifier'] = train_model(
    estimator=OneVsRestClassifier(CatBoostClassifier(iterations=300, depth=10, learning_rate=0.01, loss_function='MultiClass')),
                    df=df,test_size=0.2)

Credit_Score
2    42648
0    27000
1    27000
Name: count, dtype: int64
0:	learn: 0.6886313	total: 183ms	remaining: 54.8s
1:	learn: 0.6840636	total: 311ms	remaining: 46.3s
2:	learn: 0.6796233	total: 575ms	remaining: 56.9s
3:	learn: 0.6752734	total: 850ms	remaining: 1m 2s
4:	learn: 0.6709321	total: 972ms	remaining: 57.3s
5:	learn: 0.6666420	total: 1.11s	remaining: 54.3s
6:	learn: 0.6625649	total: 1.23s	remaining: 51.5s
7:	learn: 0.6584840	total: 1.37s	remaining: 50s
8:	learn: 0.6545134	total: 1.5s	remaining: 48.3s
9:	learn: 0.6505684	total: 1.7s	remaining: 49.3s
10:	learn: 0.6467066	total: 2.02s	remaining: 53s
11:	learn: 0.6429129	total: 2.34s	remaining: 56.1s
12:	learn: 0.6391611	total: 2.63s	remaining: 58s
13:	learn: 0.6354982	total: 2.87s	remaining: 58.6s
14:	learn: 0.6319315	total: 3.12s	remaining: 59.2s
15:	learn: 0.6284150	total: 3.4s	remaining: 1m
16:	learn: 0.6249361	total: 3.69s	remaining: 1m 1s
17:	learn: 0.6214967	total: 3.93s	remaining: 1m 1s
18:	learn: 0.6181335	total: 4.14

# Voting Ensemble

In [23]:
def ensemble_predictor(dataset=None, target_labels=None, model_lookup=models_and_reports,
                       predictors=None, use_weights=False, scoring_metric='precision'):
    """
    Performs weighted/mojority ensemble predictions using multiple classifiers. This function aggregates predictions
    from various classifiers defined in `model_lookup` (whcih must also be present in the 'models_and_reports' dictionary)
    based on a specified scoring metric.

    Parameters:
        dataset - Dataframe with the input dataset for which predictions are to be made.
        target_labels - Series with the actual labels for the dataset, used for calculating accuracy.
        model_lookup - A dictionary with model names as keys with tuples (model_pipeline, performance_report) as values.
        predictors - List of model names specifically to be used for prediction. If None, all models in `model_lookup` are employed.
        use_weights - Toggles whether to use weighted predictions based on `scoring_metric`.
        scoring_metric - Performance metric used to determine weights from model performance reports. Default is set to 'precision'.

    Returns:
        Union - A dataframe with weighted or majority voted predictions depending on `use_weights`.
        When `target_labels` is provided, also prints accuracy and classification report.

    """
    baseline_predictions = pd.DataFrame()
    model_weights = pd.DataFrame()
    final_predictions = pd.DataFrame()

    if predictors is None:
        predictors = model_lookup.keys()

    for idx, model in enumerate(predictors):
        if idx == 0:
            baseline_predictions = pd.DataFrame({str(model): model_lookup[model][0].predict(dataset)})
            if use_weights:
                model_weights = pd.DataFrame({str(model): model_lookup[model][-1].T[scoring_metric][:3]})
        else:
            baseline_predictions[str(model)] = model_lookup[model][0].predict(dataset)
            if use_weights:
                model_weights[str(model)] = model_lookup[model][-1].T[scoring_metric][:3]

    if use_weights:
        for model_idx, model_name in enumerate(baseline_predictions):
            model_weighted_factor = pd.get_dummies(baseline_predictions[model_name]) * model_weights[model_name][-1].tolist()
            if model_idx == 0:
                final_predictions = model_weighted_factor
            else:
                final_predictions += model_weighted_factor

        print(final_predictions.head(5))
        weighted_votes = np.argmax(final_predictions.to_numpy(), axis=1)

        if target_labels is not None:
            accuracy = np.mean(weighted_votes == target_labels)
            print(f'Weighted (by {scoring_metric}) prediction accuracy: {accuracy}\n'
                  f'{classification_report(weighted_votes, target_labels)}')

        return weighted_votes

    else:
        majority_votes = baseline_predictions.mode(axis=1)[0].to_numpy()
        if target_labels is not None:
            accuracy = np.mean(majority_votes == target_labels) # comparing baseline predictins with target labels
            print(f'Unweighted prediction accuracy: {accuracy}'
                  f'\n{classification_report(majority_votes, target_labels)}')
        return pd.DataFrame(majority_votes)


In [25]:
main_metric='f1-score'

y = df['Credit_Score']
X = df.drop('Credit_Score', axis=1)  # Replace 'label_column_name' with the actual column name of the label

estim_list = ['RandomForestClassifier','XGBoost','OneVsRestClassifier', 'CatBoostClassifier']
x_train, x_test, y_train, y_test = split_and_rebalance(df, 0.2)
weighted_pre=ensemble_predictor(dataset=x_test, target_labels=y_test, model_lookup=models_and_reports,
                       predictors=estim_list, use_weights=True, scoring_metric=main_metric)

Credit_Score
2    42648
0    27000
1    27000
Name: count, dtype: int64
          0         1         2
0  0.000000  3.082632  0.000000
1  0.000000  1.482713  1.599919
2  3.082632  0.000000  0.000000
3  1.599919  0.000000  1.482713
4  0.000000  0.000000  3.082632
Weighted (by f1-score) prediction accuracy: 0.78905
              precision    recall  f1-score   support

           0       0.81      0.67      0.73      4400
           1       0.83      0.79      0.81      6143
           2       0.76      0.84      0.80      9457

    accuracy                           0.79     20000
   macro avg       0.80      0.77      0.78     20000
weighted avg       0.79      0.79      0.79     20000



In [26]:
# End of the Notebbok