# Set Up code

In [37]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm.autonotebook import tqdm
import matplotlib.pyplot as plt
import torch
from torch import nn
from sklearn.preprocessing import PowerTransformer
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,\
    HistGradientBoostingClassifier,GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from torch.utils.data import DataLoader
from xgboost import XGBClassifier
from imblearn.over_sampling import ADASYN,RandomOverSampler
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay
%matplotlib inline

In [2]:
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

In [3]:
#mouting google drive to the notebook to retrive data files - not necessary if running locally
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#Setting a PyTorch seed for the rest of the code
torch.manual_seed(71)
torch.cuda.manual_seed(71)

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [6]:
pd.set_option('display.max_columns',35)
train_path = r'/content/drive/MyDrive/_shared/Data-Mining/data-clean/train.csv'
# test_path = r'/content/drive/MyDrive/_shared/Data-Mining/data-clean/test.csv' Not used due to lack of labels available
df = pd.read_csv(train_path, low_memory=False)

In [7]:
# Split the dataset into features and the label
X = df.drop('Credit_Score', axis=1)  # Replace 'label_column_name' with the actual column name of the label
y = df['Credit_Score']  # The label column

In [8]:
# Split the data - 80% training, 20% testing
X_train, X_test, labels, test_labels = train_test_split(X, y, test_size=0.2, random_state=71)

In [9]:
len(labels)

80000

In [10]:
X_train.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
52694,84672,48725,7,Kimv,33.0,784169686.0,Accountant,16907.7,1212.944231,6.0,7.0,24.0,8.0,"credit-builder loan,personal loan,home equity ...",53.0,22.0,24.0,9.0,Bad,2887.41,27.213067,74.0,Yes,82.044258,23.883016,Low_spent_Large_value_payments,224.368765
45619,74061,20559,4,Guptap,30.0,235751663.0,Media_Manager,15548.34,1291.695,7.0,10.0,33.0,5.0,"payday loan,debt consolidation loan,not specif...",39.0,25.0,15.71,11.0,Bad,1398.98,24.802695,210.0,Yes,58.154986,33.162773,Low_spent_Large_value_payments,274.329947
35571,58989,34281,4,Anna Driveri,45.0,288627562.0,Musician,20929.295,1704.107917,5.0,3.0,9.0,3.0,"student loan,payday loan,debt consolidation loan",17.0,5.0,1.99,4.0,Good,858.09,26.292352,260.0,No,40.398175,21.651777,Low_spent_Small_value_payments,342.937266
78511,123397,20083,8,Yantoultra Nguic,47.0,493467198.0,Developer,113192.68,9376.723333,3.0,7.0,3.0,0.0,No Data,0.0,2.0,6.15,2.0,Good,1179.64,40.204174,228.0,No,0.0,121.944556,Low_spent_Small_value_payments,735.919556
93071,145237,17260,8,Johnson Jilianq,52.0,807286080.0,Journalist,41942.1,3355.175,0.0,3.0,1.0,1.0,student loan,8.0,12.0,16.26,6.0,Good,1303.34,37.202522,398.0,No,19.64905,34.341542,High_spent_Medium_value_payments,445.273448


In [11]:
X_test.head()

Unnamed: 0,ID,Customer_ID,Month,Name,Age,SSN,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance
88035,137685,27934,4,Steveu,43.0,549546864.0,Mechanic,16144.57,1280.380833,6.0,6.0,27.0,5.0,"personal loan,personal loan,payday loan,studen...",36.0,22.0,8.65,11.0,Bad,1496.51,23.265241,215.0,Yes,34.307808,44.335663,Low_spent_Small_value_payments,301.055453
43377,70699,35117,2,Guip,15.0,148151292.0,Manager,9427.945,921.662083,6.0,6.0,28.0,7.0,"home equity loan,credit-builder loan,student l...",37.0,25.0,9.67,8.0,Bad,2358.57,24.52084,93.0,NM,48.094712,33.17489,Low_spent_Medium_value_payments,271.523174
88253,138011,25667,6,Gernot Hellerl,41.0,846270367.0,Developer,147781.64,12090.136667,0.0,4.0,4.0,0.0,No Data,6.0,8.0,6.83,3.0,Good,484.81,33.896689,230.0,No,0.0,84.405999,High_spent_Medium_value_payments,1060.686258
2004,8638,31796,5,Norihiko Shirouzun,27.0,326054047.0,Entrepreneur,121688.04,10006.67,7.0,3.0,12.0,4.0,"personal loan,mortgage loan,not specified,payd...",20.0,15.0,0.66,4.0,Standard,54.68,40.188421,313.0,No,288.665455,75.129703,High_spent_Large_value_payments,832.570558
87510,136896,44995,7,Alister Bullv,24.0,270737020.0,Developer,14901.88,1362.823333,7.0,7.0,13.0,1.0,personal loan,8.0,20.0,10.38,3.0,Standard,932.33,35.766078,236.0,No,7.29328,54.73228,Low_spent_Small_value_payments,302.383071


# Preparing Data for Models

In [22]:
all_features = df.columns.to_list()
values_to_remove = {'ID', 'Customer_ID', 'Month', 'Name', 'SSN'}
all_features = [x for x in all_features if x not in values_to_remove]
df = df[all_features]
all_features

['Age',
 'Occupation',
 'Annual_Income',
 'Monthly_Inhand_Salary',
 'Num_Bank_Accounts',
 'Num_Credit_Card',
 'Interest_Rate',
 'Num_of_Loan',
 'Type_of_Loan',
 'Delay_from_due_date',
 'Num_of_Delayed_Payment',
 'Changed_Credit_Limit',
 'Num_Credit_Inquiries',
 'Credit_Mix',
 'Outstanding_Debt',
 'Credit_Utilization_Ratio',
 'Credit_History_Age',
 'Payment_of_Min_Amount',
 'Total_EMI_per_month',
 'Amount_invested_monthly',
 'Payment_Behaviour',
 'Monthly_Balance',
 'Credit_Score']

In [35]:
feature_list = ['Monthly_Inhand_Salary','Annual_Income', 'Amount_invested_monthly','Num_of_Delayed_Payment', 'Num_Bank_Accounts', 'Interest_Rate','Delay_from_due_date','Changed_Credit_Limit','Outstanding_Debt','Credit_History_Age', 'Credit_Score']

df = df[all_features]

In [23]:
obj_to_num_dict={}
obj_to_num_dict['Month']={'January' : 1,
                          'February' : 2,
                          'March' : 3,
                          'April' : 4,
                          'May' : 5,
                          'June' : 6,
                          'July' : 7,
                          'August': 8,
                          'September':9,
                          'October':10,
                          'November':11,
                          'December':12}
num_to_obj_dict={}
def convert_str_to_num(df=df):
    # df=pd.get_dummies(df,columns=['Occupation'])
    for column in (df.select_dtypes(include='object').columns):

        obj_to_num_dict[column]={}
        num_to_obj_dict[column]={}

        for n,unique_value in enumerate(df[column].unique()):
            df[column]=df[column].replace( unique_value , n)
            obj_to_num_dict[ column][unique_value]= n
            num_to_obj_dict[ column][n+1]= unique_value

        df[column]=df[column].astype('uint8')

    return df


In [14]:
df = convert_str_to_num(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[column]=df[column].replace( unique_value , n)


In [26]:
df.head(10)

Unnamed: 0,Age,Occupation,Annual_Income,Monthly_Inhand_Salary,Num_Bank_Accounts,Num_Credit_Card,Interest_Rate,Num_of_Loan,Type_of_Loan,Delay_from_due_date,Num_of_Delayed_Payment,Changed_Credit_Limit,Num_Credit_Inquiries,Credit_Mix,Outstanding_Debt,Credit_Utilization_Ratio,Credit_History_Age,Payment_of_Min_Amount,Total_EMI_per_month,Amount_invested_monthly,Payment_Behaviour,Monthly_Balance,Credit_Score
0,23.0,0,19114.12,1824.843333,3.0,4.0,3.0,4.0,0,3.0,7.0,11.27,4.0,0,809.98,26.82262,265.0,0,49.574949,21.46538,0,312.494089,0
1,23.0,0,19114.12,1824.843333,3.0,4.0,3.0,4.0,0,3.0,4.0,11.27,4.0,0,809.98,31.94496,266.0,0,49.574949,21.46538,1,284.629162,0
2,23.0,0,19114.12,1824.843333,3.0,4.0,3.0,4.0,0,3.0,7.0,11.27,4.0,0,809.98,28.609352,267.0,0,49.574949,21.46538,2,331.209863,0
3,23.0,0,19114.12,1824.843333,3.0,4.0,3.0,4.0,0,5.0,4.0,6.27,4.0,0,809.98,31.377862,268.0,0,49.574949,21.46538,3,223.45131,0
4,23.0,0,19114.12,1824.843333,3.0,4.0,3.0,4.0,0,6.0,4.0,11.27,4.0,0,809.98,24.797347,269.0,0,49.574949,21.46538,4,341.489231,0
5,23.0,0,19114.12,1824.843333,3.0,4.0,3.0,4.0,0,8.0,4.0,9.27,4.0,0,809.98,27.262259,270.0,0,49.574949,21.46538,4,340.479212,0
6,23.0,0,19114.12,1824.843333,3.0,4.0,3.0,4.0,0,3.0,8.0,11.27,4.0,0,809.98,22.537593,271.0,0,49.574949,21.46538,3,244.565317,0
7,23.0,0,19114.12,1824.843333,3.0,4.0,3.0,4.0,0,3.0,6.0,11.27,4.0,0,809.98,23.933795,272.0,0,49.574949,21.46538,4,358.124168,1
8,28.0,1,34847.84,3037.986667,2.0,4.0,6.0,1.0,1,3.0,4.0,5.42,2.0,0,605.03,24.464031,319.0,0,18.816215,39.684018,3,470.690627,1
9,28.0,1,34847.84,3037.986667,2.0,4.0,6.0,1.0,1,7.0,1.0,7.42,2.0,0,605.03,38.550848,320.0,0,18.816215,39.684018,5,484.591214,0


# Ensemble Pipeline

In [31]:
df['Credit_Score']

0        0
1        0
2        0
3        0
4        0
        ..
99995    2
99996    2
99997    2
99998    1
99999    2
Name: Credit_Score, Length: 100000, dtype: uint8

In [32]:
def predict_model(estimator,df,target='Credit_Score',test_size=0.2,
                  scaler=None,report=True,random_state=0,
                  imbalance=None,return_pipeline=False):

    y = df['Credit_Score']
    X = df.drop('Credit_Score', axis=1)  # Replace 'label_column_name' with the actual column name of the label


    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=71)

    model_pipeline=make_pipeline(scaler,estimator)
    model_pipeline.fit(x_train,y_train)

    if report:
        print(f'test report:\n{classification_report(model_pipeline.predict(x_test),y_test)}')
        print(f'train report:\n{classification_report(model_pipeline.predict(x_train), y_train)}')

    if return_pipeline:
        report_test=pd.DataFrame.from_dict(classification_report(
            model_pipeline.predict(x_test),y_test,output_dict=True))

        return model_pipeline,report_test

In [33]:
pipelines_and_scores = {}

In [41]:
params = {
 'base_score': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 1,
 'learning_rate': 1e-1,
 'max_delta_step': 1,
 'max_depth': 10,
 'min_child_weight': 1,
 'n_estimators': 232,
 'objective': 'binary:logistic',
 'reg_alpha': 9e-1,
 'reg_lambda': 0.8,
 'scale_pos_weight': 1,
 'seed': 21,
 'silent': 1,
 'subsample': 1,
}

xgBoost = XGBClassifier(**params)

In [34]:
pipelines_and_scores['RandomForestClassifier'] = predict_model(
    estimator=RandomForestClassifier(max_depth=22,n_estimators=100,random_state=3,
                                     min_samples_split=3,criterion='entropy',
                                     bootstrap=True,n_jobs=-1),
    df=df,test_size=0.2,scaler=None,imbalance=RandomOverSampler(random_state=24),
    return_pipeline=True,report=False)

In [40]:
pipelines_and_scores['OneVsRestClassifier'] = predict_model(
    estimator=OneVsRestClassifier(RandomForestClassifier(max_depth=22,n_estimators=100,random_state=3,
                                     min_samples_split=3,criterion='entropy',
                                     bootstrap=True,n_jobs=-1)),
    df=df,test_size=0.2,scaler=None,imbalance=RandomOverSampler(random_state=24),
    return_pipeline=True,report=False)  # - F1 -> 0.8012

In [56]:
pipelines_and_scores['XGBoost'] = predict_model(
    estimator=xgBoost,
    df=df,test_size=0.2,scaler=None,imbalance=RandomOverSampler(random_state=24),
    return_pipeline=True,report=False)

Parameters: { "scale_pos_weight", "silent" } are not used.



In [57]:
pipelines_and_scores['XGBoost']

(Pipeline(steps=[('nonetype', None),
                 ('xgbclassifier',
                  XGBClassifier(base_score=0.5, booster=None, callbacks=None,
                                colsample_bylevel=1, colsample_bynode=None,
                                colsample_bytree=1, device=None,
                                early_stopping_rounds=None,
                                enable_categorical=False, eval_metric=None,
                                feature_types=None, gamma=1, grow_policy=None,
                                importance_type=None,
                                interaction_constraints=None, learning_rate=0.1,
                                max_bin=None, max_cat_threshold=None,
                                max_cat_to_onehot=None, max_delta_step=1,
                                max_depth=10, max_leaves=None,
                                min_child_weight=1, missing=nan,
                                monotone_constraints=None, multi_strategy=None,
       

# Voting Ensemble

In [74]:
def voting_ensemble(data=None,target=False,pipe_dict=pipelines_and_scores,
                    estimators=None,weights=False,
                    weights_method='precision'):

    predict_df=pd.DataFrame()
    weights_df=pd.DataFrame()
    weighted_predict_df=pd.DataFrame()

    if estimators is None:
        estimators=pipe_dict.keys()

    for i,estimator in enumerate(estimators):

        if i==0:
            predict_df=pd.DataFrame({str(estimator):
                                         pipelines_and_scores[estimator][0].predict(data)})
            if weights:
                weights_df=pd.DataFrame({str(estimator):
                                             pipelines_and_scores[estimator][-1].T[weights_method][:3]})

        # elif estimator=='FC_nn':
        #     predict_df['FC_nn']=nn_predict(nn_scale.transform(data),
        #                                    return_prediction=True,return_report=False)

        #     if weights:
        #         weights_df['FC_nn']=pipelines_and_scores['FC_nn'][-1].T[weights_method][:3].to_list()

        else:
            predict_df[str(estimator)]=pipelines_and_scores[estimator][0].predict(data)
            if estimator=='XGBoost':
                predict_df[str(estimator)]+=1

            if weights:
                weights_df[str(estimator)]=pipelines_and_scores[estimator][-1].T[weights_method][:3]


    if weights:
        print(len(predict_df))
        for n_m,model_algo in enumerate(predict_df):
            print(model_algo)
            if n_m==0:
                weighted_predict_df=pd.get_dummies(predict_df
                                                   .loc[:,model_algo])\
                                    *weights_df[model_algo][-1].tolist()
            else:
                weighted_predict_df+=pd.get_dummies(predict_df
                                                    .loc[:,model_algo])\
                                     *weights_df[model_algo][-1].tolist()

    print (weighted_predict_df.head(10))
    if weights:
        weighted_prediction=np.argmax(weighted_predict_df.to_numpy(),axis=1)+1

        if target is not False:
            print(f'Weighted (by {weights_method}) prediction accuracy: '
                  f'{(weighted_prediction==target).sum()/target.shape[0]}\n'
                  f'{classification_report(weighted_prediction,target)}')

        return weighted_prediction

    else:
        unweighted_predictions=predict_df.mode(axis=1)[0].to_numpy()
        print(f'Unweighted prediction accuracy: {(unweighted_predictions==target).sum()/target.shape[0]}'
              f'\n{classification_report(unweighted_predictions,target)}')
        return pd.DataFrame(unweighted_predictions).replace(num_to_obj_dict['Credit_Score'])

In [75]:
weight_method='f1-score'

y = df['Credit_Score']
X = df.drop('Credit_Score', axis=1)  # Replace 'label_column_name' with the actual column name of the label

estim = ['RandomForestClassifier','XGBoost','OneVsRestClassifier']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=71)
weighted_pre=voting_ensemble(data=x_test,target=y_test,pipe_dict=pipelines_and_scores,
                             estimators=estim,weights=True,
                             weights_method=weight_method)#,nn_scale=nn_scal

20000
RandomForestClassifier
XGBoost
OneVsRestClassifier
    0         1         2
0 NaN  0.000000  1.634256
1 NaN  1.634256  0.776848
2 NaN  0.776848  0.000000
3 NaN  1.634256  0.776848
4 NaN  1.634256  0.776848
5 NaN  0.776848  0.000000
6 NaN  0.000000  1.634256
7 NaN  1.634256  0.776848
8 NaN  1.634256  0.776848
9 NaN  0.000000  1.634256
Weighted (by f1-score) prediction accuracy: 0.5263
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.53      0.69     20000
           2       0.00      0.00      0.00         0

    accuracy                           0.53     20000
   macro avg       0.33      0.18      0.23     20000
weighted avg       1.00      0.53      0.69     20000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
