In [12]:
import pandas as pd
import model_selection_pipeline as pipeline
from joblib import dump
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, QuantileDMatrix, Booster, train
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.model_selection import GridSearchCV
from collections import Counter
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

<h1>Implementing SMOTE on the Cleaned Dataset

In [13]:
df = pd.read_csv('./datasets/train_cleaned.csv')
df.shape

(100000, 28)

In [14]:
feature_list = ['Num_of_Delayed_Payment', 'Num_Bank_Accounts', 'Interest_Rate','Delay_from_due_date','Changed_Credit_Limit','Outstanding_Debt','Credit_History_Age', 'Credit_Score']

In [15]:
subset_df = df[feature_list]

subset_df['Credit_Score'] = subset_df['Credit_Score'].map({'Good': 0, 'Poor': 1, 'Standard': 2})

X = subset_df[feature_list[:-1]].values
y = subset_df[["Credit_Score"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y, test_size = 0.3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df['Credit_Score'] = subset_df['Credit_Score'].map({'Good': 0, 'Poor': 1, 'Standard': 2})


In [16]:
dict(Counter(y_train["Credit_Score"]))

{2: 37222, 1: 20299, 0: 12479}

In [17]:
over_sample_strategy = SMOTE(sampling_strategy={0: 20000})
under_sample_strategy = RandomUnderSampler(sampling_strategy={1: 20000, 2: 26000})

steps = [('o', over_sample_strategy), ('u', under_sample_strategy)]
pipeline = Pipeline(steps=steps)

X_train_new, y_train_new = pipeline.fit_resample(X_train, y_train)

In [18]:
y_train_new["Credit_Score"].value_counts()

Credit_Score
2    26000
0    20000
1    20000
Name: count, dtype: int64

In [19]:
params = {
 'base_score': 0.5,
 'colsample_bylevel': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 1e-1,
 'max_delta_step': 1,
 'max_depth': 10,
 'min_child_weight': 1,
 'n_estimators': 232,
 'objective': 'binary:logistic',
 'reg_alpha': 9e-1,
 'reg_lambda': 0.8,
 'scale_pos_weight': 1,
 'seed': 21,
 'silent': 1,
 'subsample': 1,
 'scale_pos_weight' : 10.0
}

XGBClassifier().get_xgb_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'eval_metric': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'monotone_constraints': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [20]:
over_sampled_model = XGBClassifier(**params)
over_sampled_model.fit(X_train_new, y_train_new)

Parameters: { "scale_pos_weight", "silent" } are not used.



In [10]:
y_pred= over_sampled_model.predict(X_test)

In [11]:
scores = {
    "F1:": "{:.4f}".format(f1_score(y_test, y_pred, average='weighted')),
    "Precision:": "{:.4f}".format(precision_score(y_test, y_pred, average='weighted')),
    "Recall:": "{:.4f}".format(recall_score(y_test, y_pred, average='weighted')),
    "Accuracy:": "{:.4f}".format(accuracy_score(y_test, y_pred))
}
print(scores)

{'F1:': '0.7643', 'Precision:': '0.7742', 'Recall:': '0.7626', 'Accuracy:': '0.7626'}
