In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
#from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.cluster import KMeans
from kneed import KneeLocator
import pickle


In [51]:
df = pd.read_csv("..\Data\cleanedData\CleanData.csv")
df.head(3)

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,DEFAULT
0,-1.067662,0.985999,0.34981,-0.889338,-1.298055,1.807971,1.889545,-0.530789,-0.468738,-0.430297,...,-0.664005,-0.65532,-0.647128,-0.339107,-0.218078,-0.284863,-0.299474,-0.304047,-0.282104,1
1,-0.270995,0.985999,0.34981,1.054345,-1.071492,-0.71233,1.889545,-0.530789,-0.468738,-0.430297,...,-0.613355,-0.598983,-0.592892,-0.339107,-0.202123,-0.220571,-0.22683,-0.304047,-0.153811,1
2,-0.509995,0.985999,0.34981,1.054345,-0.16524,-0.71233,-0.570495,-0.530789,-0.468738,-0.430297,...,-0.442166,-0.411579,-0.388524,-0.229833,-0.176471,-0.220571,-0.22683,-0.229725,0.038628,0


In [52]:
X_train, X_test, y_train, y_test = train_test_split(df.drop("DEFAULT",axis=1), df["DEFAULT"],random_state=42, test_size=0.25)

# Model Building by making clusters of training data and finding best model for each cluster

In [12]:
# Determine the number of clusters
inertias = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_train)
    inertias.append(kmeans.inertia_)
kl = KneeLocator(range(1, 10), inertias, curve="convex", direction="decreasing")
n_clusters = kl.elbow

# Cluster the training data
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans.fit(X_train)
labels_train = kmeans.labels_

# Train a separate model for each cluster
models = []
for cluster in range(n_clusters):
    X_cluster = X_train[labels_train == cluster]
    y_cluster = y_train[labels_train == cluster]
    if len(y_cluster.unique()) == 1:  # only one class in this cluster
        models.append(None)
        continue
    # Try different models
    clf1 = RandomForestClassifier(random_state=42)
    clf2 = XGBClassifier(random_state=42)
    #clf3 = SVC(random_state=42)
    param_grids = [
        {"n_estimators": [10, 50, 100],"criterion":["entropy","gini"], "max_depth": [5, 10, None], "ccp_alpha" : [0.0, 0.01, 0.1, 1.0]},  # random forest
        {"n_estimators": [10, 50, 100], "max_depth": [5, 10, None], "learning_rate": [0.1, 0.5, 1]},  # xgboost
        #{'C': [0.1, 1, 10, 100],'kernel': ['linear', 'rbf', 'sigmoid'],'gamma': ['scale', 'auto', 0.1, 1, 10]}, #SVC
    ]
    clfs = [clf1, clf2]
    best_score = 0
    best_model = None
    for clf, param_grid in zip(clfs, param_grids):
        if clf is None:
            continue
        grid_search = GridSearchCV(clf, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
        grid_search.fit(X_cluster, y_cluster)
        if grid_search.best_score_ > best_score:
            best_score = grid_search.best_score_
            best_model = grid_search.best_estimator_
    
    
    # Save the best model for the cluster as a pickle file
    with open(f"model_cluster_{cluster}.pkl", "wb") as f:
        pickle.dump(best_model, f)
    models.append(best_model)




# Predicting using X_test

In [24]:
# Load the models from disk
models = []
for cluster in range(n_clusters):
    with open(f"model_cluster_{cluster}.pkl", "rb") as f:
        model = pickle.load(f)
        models.append(model)

# Make predictions on the test data
y_pred = np.empty(len(X_test))
for i, x_test in enumerate(X_test.values):
    # Assign the test data to the nearest cluster
    cluster = kmeans.predict(x_test.reshape(1, -1))[0]
    # Use the corresponding model to make the prediction
    model = models[cluster]
    if model is None:
        # If there is only one class in the cluster, predict that class
        y_pred[i] = y_train[labels_train == cluster][0]
    else:
        y_pred[i] = model.predict(x_test.reshape(1, -1))[0]




In [26]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       0.90      0.83      0.87      6272
         1.0       0.82      0.89      0.85      5396

    accuracy                           0.86     11668
   macro avg       0.86      0.86      0.86     11668
weighted avg       0.86      0.86      0.86     11668



# Model building Using RandomForest and XGBoost

In [56]:
rfc = RandomForestClassifier(random_state=42)
xgc = XGBClassifier()

rfc.fit(X_train,y_train)
ypred_rfc= rfc.predict(X_test)
print("Report for RFC")
print(classification_report(ypred_rfc,y_test),"\n")

xgc.fit(X_train, y_train)
ypred_xgc = xgc.predict(X_test)
print("Report for XGC")
print(classification_report(ypred_xgc,y_test))

Report for RFC
              precision    recall  f1-score   support

           0       0.90      0.84      0.87      6249
           1       0.83      0.89      0.86      5419

    accuracy                           0.86     11668
   macro avg       0.86      0.86      0.86     11668
weighted avg       0.86      0.86      0.86     11668
 

Report for XGC
              precision    recall  f1-score   support

           0       0.89      0.81      0.85      6368
           1       0.79      0.88      0.83      5300

    accuracy                           0.84     11668
   macro avg       0.84      0.84      0.84     11668
weighted avg       0.85      0.84      0.84     11668



### RandomForest is performing better

In [63]:
param_grid= [
    {"n_estimators": [10, 50, 100],"criterion":["entropy","gini"], "max_depth": [5, 10, None], "ccp_alpha" : [0.0, 0.01, 0.1, 1.0]}
]

grid_search = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, scoring="accuracy",n_jobs=-1)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best parameters:", grid_search.best_params_)
print("Best score:", grid_search.best_score_)

Best parameters: {'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': None, 'n_estimators': 100}
Best score: 0.8521227130818045


In [60]:
rfc =RandomForestClassifier(n_estimators=100, ccp_alpha=0.0, criterion='entropy')
rfc.fit(X_train,y_train)
predicted = rfc.predict(X_test)

print(classification_report(predicted,y_test))

              precision    recall  f1-score   support

           0       0.90      0.83      0.87      6283
           1       0.82      0.89      0.86      5385

    accuracy                           0.86     11668
   macro avg       0.86      0.86      0.86     11668
weighted avg       0.86      0.86      0.86     11668



## Making clusters of train data and training the clusters with different models to find the best model is also giving same results as training on RandomForest. 

So, I'm going with RandomForest model.

## I've tried Gradient Boosting ,SVM, LogisticRegression, naive Bayes, DecisionTrees, XGBoost, but Random Forest Classifier is giving best results.

In [55]:
# save the model to a file
# save the model to a custom path
path = '../Models/finalmodel.pkl'
with open(path, 'wb') as f:
    pickle.dump(rfc, f)

# # load the model from file
# with open('rfc.pkl', 'rb') as f:
#     rfc = pickle.load(f)
