In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import shap
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from imblearn.over_sampling import RandomOverSampler 
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, ConfusionMatrixDisplay, classification_report

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('data.csv')

In [None]:
# Display the loaded frame for a first visual inspection.
df

In [None]:
df.drop([' Current Liability to Liability', ' Net Income Flag'], axis = 1, inplace= True)

In [None]:
df.shape

# Separating the dataset

In [None]:
# Target is the 'Bankrupt?' column; every remaining column is a feature.
y = df['Bankrupt?']
X = df.drop(columns=['Bankrupt?'])

# Selecting features with the K-best method

In [None]:
X1 = X.copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler 
transformer = MinMaxScaler().fit(X1)
x_normalized = transformer.transform(X1)
X2 = pd.DataFrame(x_normalized, columns= X1.columns)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Keep the 40 features with the highest chi-squared score against the target.
# NOTE(review): the selector is fitted on the full dataset (before the
# train/test split), so the held-out rows take part in feature selection.
kbest = SelectKBest(chi2, k=40)
kbest.fit(X2, y)
X_new = kbest.transform(X2)

# get_support() returns a boolean mask aligned with X2.columns; use it directly
# to recover the names of the retained columns.
selected_columns = X2.columns[kbest.get_support()].tolist()
selected = pd.DataFrame(X_new, columns=selected_columns)
selected.head()

In [None]:
# Pair each feature name with its chi-squared score and rank them, best first.
ml = sorted(zip(kbest.scores_, X2.columns.tolist()), reverse=True)
scores = pd.DataFrame(ml, columns=['score', 'Column'])
scores.head(25)

# Balancing the dataset: upsampling and downsampling

In [None]:
df_1 = pd.concat([selected, y], axis=1)


In [None]:
y = df_1['Bankrupt?'].astype('int')
X = df_1.drop(['Bankrupt?'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 17)

In [None]:
TRAIN = pd.concat([X_train, y_train], axis=1)
TRAIN.shape

In [None]:
(TRAIN[TRAIN["Bankrupt?"]==1])

## Upsampling the minority class

In [None]:
upsample_indexes = np.random.choice(TRAIN[TRAIN["Bankrupt?"]==1].index, size = 400, replace=True).tolist()

In [None]:
upsample = TRAIN.loc[upsample_indexes,:]
upsample

## Clustering to downsize the majority class

In [None]:
# Training rows with Bankrupt? == 0. An explicit copy is taken because a
# 'cluster' column is added to this frame later; without it pandas would be
# writing to a slice of TRAIN (SettingWithCopyWarning).
No_bankrupcies = TRAIN[TRAIN['Bankrupt?'] == 0].copy()

In [None]:
# Fit K-Means for k = 2..20 and record inertia and silhouette score for each k.
# The frame being clustered still contains the 'Bankrupt?' column, which is 0
# for every row here.
K = range(2, 21)

inertia = []
silhouette = []

for k in K:
    print(f"Training a K-Means model with {k} clusters! ")
    print()
    kmeans = KMeans(n_clusters=k, random_state=1234, verbose=1)
    kmeans.fit(No_bankrupcies)

    cluster_labels = kmeans.predict(No_bankrupcies)
    inertia.append(kmeans.inertia_)
    silhouette.append(silhouette_score(No_bankrupcies, cluster_labels))

In [None]:
# Side-by-side diagnostics for choosing k: inertia (elbow) on the left,
# silhouette score on the right. Both panels share one tick per tested k.
k_ticks = np.arange(min(K), max(K) + 1, 1.0)
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
elbow_ax, silhouette_ax = ax

elbow_ax.plot(K, inertia, 'bx-')
elbow_ax.set_xlabel('k')
elbow_ax.set_ylabel('inertia')
elbow_ax.set_xticks(k_ticks)
elbow_ax.set_title('Elbow Method showing the optimal k')

silhouette_ax.plot(K, silhouette, 'bx-')
silhouette_ax.set_xlabel('k')
silhouette_ax.set_ylabel('silhouette score')
silhouette_ax.set_xticks(k_ticks)
silhouette_ax.set_title('Silhouette Method showing the optimal k')

In [None]:
# Final clustering of the Bankrupt? == 0 training rows into 9 clusters.
# NOTE(review): 9 is presumably read off the elbow/silhouette plots above — confirm.
# This fit uses random_state=1, whereas the search loop above used 1234.
kmeans = KMeans(n_clusters=9, random_state=1)
kmeans.fit(No_bankrupcies)

In [None]:
clusters = kmeans.predict(No_bankrupcies)

elem_in_cluster = pd.Series(clusters).value_counts().sort_index() # Number of values in each cluster
elem_in_cluster

In [None]:
clusters_weight = []
for j in range(len(elem_in_cluster)):
    weight = elem_in_cluster[j]/len(No_bankrupcies)
    clusters_weight.append(weight)

In [None]:
# Tag every row with its cluster id. assign() builds a new frame instead of
# writing into a filtered view of TRAIN, which avoids SettingWithCopyWarning
# and makes the cell safe to re-run.
No_bankrupcies = No_bankrupcies.assign(cluster=clusters)
No_bankrupcies

In [None]:
# Downsample the majority class to roughly 400 rows, taking from each cluster a
# number of rows proportional to the cluster's weight. The per-cluster samples
# are collected in a list and concatenated once (no DataFrame grown in a loop),
# and sampling is seeded so the result is reproducible.
cluster_samples = [
    No_bankrupcies[No_bankrupcies["cluster"] == cluster].sample(
        n=int(round(400 * weight)), random_state=17
    )
    for cluster, weight in enumerate(clusters_weight)
]

# The helper 'cluster' column is not a model feature; drop it again.
new_No_bankrupcies = pd.concat(cluster_samples, axis=0).drop(columns=['cluster'])

In [None]:
final1_df = pd.concat([new_No_bankrupcies, upsample], axis=0)
final1_df = final1_df.sort_index()

In [None]:
final1_df.shape

In [None]:
y_train = final1_df['Bankrupt?'].astype('int')
X_train = final1_df.drop(['Bankrupt?'], axis=1)

# Predicting with KNN

In [None]:
# Train one KNN model per candidate value of k and report its accuracy and
# Cohen's kappa on the (balanced) training set.
K = range(2, 14, 3)
accuracies = []
models = []

for k in K:
    knn = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    knn.fit(X_train, y_train)
    models.append(knn)
    ypred_train = knn.predict(X_train)
    train_accuracy = accuracy_score(y_train, ypred_train)
    accuracies.append(train_accuracy)
    print("The accuracy of the model n_neighbors={} is: {:.2f}".format(k, train_accuracy))
    print("The kappa of the model n_neighbors={} is: {:.2f}".format(k, cohen_kappa_score(y_train, ypred_train)))

import pickle

# Persist the first model (n_neighbors=2) to disk. The original statement was a
# misspelled, unterminated `pickel.dump(models[0],` call that made the whole
# cell a SyntaxError.
# NOTE(review): the output file name is an assumption — adjust as needed.
with open('knn_model.pkl', 'wb') as model_file:
    pickle.dump(models[0], model_file)

In [None]:
# Score every previously trained KNN model on the held-out test set.
# Test accuracies go into their own list: appending them to `accuracies`
# mixed train and test numbers and made the list grow on every re-run.
K = range(2, 14, 3)
test_accuracies = []

for k, knn in zip(K, models):
    ypred_test = knn.predict(X_test)
    test_accuracy = accuracy_score(y_test, ypred_test)
    test_accuracies.append(test_accuracy)
    print("The accuracy of the model n_neighbors={} is: {:.2f}".format(k, test_accuracy))
    print("The kappa score of the model n_neighbors={} is: {:.2f}".format(k, cohen_kappa_score(y_test, ypred_test)))

In [None]:
# Refit the n_neighbors=2 model and keep its train/test predictions for the
# confusion matrices below.
knn = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)
ypred_train, ypred_test = knn.predict(X_train), knn.predict(X_test)

# Class counts of the full (unbalanced) target, for reference.
display(y.value_counts())


In [None]:
display(confusion_matrix(ypred_train, y_train))

In [None]:
display(confusion_matrix(ypred_test, y_test))

# Random Forest

In [None]:
# Baseline random forest with hand-picked hyper-parameters, trained on the
# balanced training set.
clf = RandomForestClassifier(max_depth=4,
                             min_samples_split=6,
                             min_samples_leaf=3,
                             max_samples=0.8, random_state=8)

clf.fit(X_train, y_train)

print("The accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf.score(X_train, y_train)))
print("The accuracy for the Random Forest in the TEST set is {:.2f}".format(clf.score(X_test, y_test)))

# For the train split and then the test split, show: predicted class counts,
# true class counts of that same split, the confusion matrix and Cohen's kappa.
# (Previously the true counts were taken from the full target `y`, which is not
# comparable with predictions made on a single split.)
for X_split, y_split in ((X_train, y_train), (X_test, y_test)):
    y_pred = clf.predict(X_split)
    display(pd.DataFrame(y_pred).value_counts())
    display(y_split.value_counts())
    display(confusion_matrix(y_split, y_pred))
    display(cohen_kappa_score(y_split, y_pred))


In [None]:
# Exhaustive 5-fold grid search over the random forest hyper-parameters.
max_depth_choices = [2, 3, 5, 7, 9]
min_samples_split_choices = [2, 4, 6, 8, 9]
min_samples_leaf_choices = [1, 3]
max_samples = [0.8, 0.5]

grid = {'max_depth': max_depth_choices,
        'min_samples_split': min_samples_split_choices,
        'min_samples_leaf': min_samples_leaf_choices,
        'max_samples': max_samples}

# Seed the estimator so the search (and therefore best_params_) is reproducible;
# the same seed is used when the tuned forest is refitted.
# NOTE(review): X_train contains rows drawn with replacement, so copies of one
# row can land in different folds — the CV score is likely optimistic.
model = RandomForestClassifier(random_state=8)
grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=5)
grid_search.fit(X_train, y_train)
grid_search.best_score_

In [None]:
grid_search.best_params_

In [None]:
clf = RandomForestClassifier(**grid_search.best_params_, random_state =8)
                          
clf.fit(X_train, y_train)

print("The accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf.score(X_train, y_train)))
print("The accuracy for the Random Forest in the TEST set is {:.2f}".format(clf.score(X_test, y_test)))

y_pred = clf.predict(X_train)
display(pd.DataFrame(y_pred).value_counts())
display(y.value_counts())
display(confusion_matrix(y_train, y_pred))
display(cohen_kappa_score(y_train, y_pred))

y_pred = clf.predict(X_test)
display(pd.DataFrame(y_pred).value_counts())
display(y.value_counts())
display(confusion_matrix(y_test, y_pred))
display(cohen_kappa_score(y_test, y_pred))


## Feature Extraction-SHAP

In [None]:
# Explain the tuned random forest with SHAP, computed over the full feature frame X
# (all rows, not just the training split).
explainer = shap.TreeExplainer(clf)
shap_values = explainer.shap_values(X)

In [None]:
# Dependence plot for a single feature.
# NOTE(review): shap_values[0] presumably selects the SHAP values for class 0
# (per-class list output of older SHAP releases) — verify against the installed
# version, and confirm class 0 rather than class 1 is the one intended.
shap.dependence_plot(' Persistent EPS in the Last Four Seasons', shap_values[0], X)

In [None]:
# Summary plot over all features.
shap.summary_plot(shap_values, X)

# XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Baseline XGBoost classifier with library-default hyper-parameters.
xgb_cl = xgb.XGBClassifier()

In [None]:
# Train on the balanced training set.
xgb_cl.fit(X_train, y_train)

In [None]:
# Accuracy of the baseline XGBoost model on both splits.
print("The accuracy for the XGB in the TRAIN set is {:.2f}".format(xgb_cl.score(X_train, y_train)))
print("The accuracy for the XGB in the TEST set is {:.2f}".format(xgb_cl.score(X_test, y_test)))

# Train split first, then test split: predicted class counts, confusion matrix
# and Cohen's kappa.
for X_split, y_split in ((X_train, y_train), (X_test, y_test)):
    y_pred = pd.Series(xgb_cl.predict(X_split))
    display(pd.DataFrame(y_pred).value_counts())
    display(confusion_matrix(y_split, y_pred))
    display(cohen_kappa_score(y_split, y_pred))

Learning_rate: also called eta, it specifies how quickly the model fits the residual errors by using additional base learners. Typical values: 0.01–0.2.

Gamma, reg_alpha, reg_lambda: these three parameters set the strength of the three types of regularization done by XGBoost — the minimum loss reduction required to create a new split, L1 regularization on leaf weights, and L2 regularization on leaf weights, respectively. Typical values for gamma: 0–0.5, but highly dependent on the data. Typical values for reg_alpha and reg_lambda: 0–1 is a good starting point but, again, it depends on the data.

Max_depth - how deep the tree's decision nodes can go. Must be a positive integer. Typical values: 1–10.

Subsample - fraction of the training set that can be used to train each tree. If this value is too low, it may lead to underfitting; if it is too high, it may lead to overfitting. Typical values: 0.5–0.9.

Colsample_bytree - fraction of the features that can be used to train each tree. A large value means almost all features can be used to build the decision tree. Typical values: 0.5–0.9.

The above are the main hyperparameters people often tune. It is perfectly OK if you don't understand them all completely (I don't either); you can refer to this post, which gives a thorough overview of how each of the above parameters works and how to tune them.



In [None]:
# Search space for the XGBoost grid search. Every combination is tried:
# 4 * 2 * 3 * 4 * 3 * 1 * 1 * 4 = 1152 candidates per CV fold.
param_grid = dict(
    max_depth=[3, 4, 5, 7],
    learning_rate=[0.01, 0.2],
    gamma=[0, 0.5, 1],
    reg_lambda=[0, 0.5, 1, 10],
    scale_pos_weight=[1, 3, 5],
    subsample=[0.8],
    colsample_bytree=[0.5],
    reg_alpha=[0, 0.5, 1, 10],
)

In [None]:
# Init classifier
xgb_cl = xgb.XGBClassifier(objective="binary:logistic")

# Init Grid Search
grid_cv = GridSearchCV(xgb_cl, param_grid, n_jobs=-1, cv=3, scoring="roc_auc")

# Fit
_ = grid_cv.fit(X_train, y_train)

In [None]:
grid_cv.best_score_


In [None]:
grid_cv.best_params_

In [None]:
# Refit XGBoost with the best grid-search parameters on the balanced training set.
final_cl = xgb.XGBClassifier(objective="binary:logistic", **grid_cv.best_params_)
_ = final_cl.fit(X_train, y_train)
preds = final_cl.predict(X_test)

print("The accuracy for the XGB in the TRAIN set is {:.2f}".format(final_cl.score(X_train, y_train)))
print("The accuracy for the XGB in the TEST set is {:.2f}".format(final_cl.score(X_test, y_test)))

# Train split first, then test split: predicted class counts, confusion matrix
# and Cohen's kappa.
for X_split, y_split in ((X_train, y_train), (X_test, y_test)):
    y_pred = pd.Series(final_cl.predict(X_split))
    display(pd.DataFrame(y_pred).value_counts())
    display(confusion_matrix(y_split, y_pred))
    display(cohen_kappa_score(y_split, y_pred))


## Feature extraction-SHAP

In [None]:
explainer = shap.TreeExplainer(final_cl)
shap_values = explainer.shap_values(X)

In [None]:
shap.force_plot(explainer.expected_value, shap_values[0, :], X.iloc[0, :])

In [None]:
shap.dependence_plot( ' Persistent EPS in the Last Four Seasons', shap_values, X)

In [None]:
shap.summary_plot(shap_values, X)

# K-fold Cross validation

In [None]:
# 4 x 12 matrix of zeros, shown as a DataFrame.
# NOTE(review): `a` is not used by any later cell visible here — looks like leftover scratch work.
a = np.zeros((4, 12))

pd.DataFrame(a)

In [None]:
# Apply 10-fold cross-validation to every model to get a better performance
# estimate with lower variance.
nfolds = 10

# Evaluate the tuned XGBoost model (final_cl), matching the tuned random forest
# `clf`; `xgb_cl` is only the untuned base estimator handed to the grid search.
models = [knn, clf, final_cl]
model_names = [type(model).__name__ for model in models]
cv_scores = np.zeros((len(models), nfolds))

for i, model in enumerate(models):
    # 'roc_auc' is the scikit-learn scorer name ('roc' is not a valid scorer
    # and raises ValueError); it also matches the XGBoost grid search above.
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=nfolds)
    cv_scores[i] = scores

# One row per model, one column per fold, labelled by estimator class name.
cv_df = pd.DataFrame(cv_scores, index=model_names)

# Both statistics are computed from the raw fold scores. Computing the std from
# cv_df after 'mean_score' was added would include the mean as an extra value.
cv_df['mean_score'] = cv_scores.mean(axis=1)
cv_df['std_score'] = cv_scores.std(axis=1, ddof=1)  # ddof=1 matches pandas' default
cv_df