In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from collections import Counter
from imblearn.over_sampling import RandomOverSampler 
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, ConfusionMatrixDisplay, classification_report

In [13]:
%matplotlib inline

In [14]:
pd.set_option('display.max_columns', None)

In [15]:
df = pd.read_csv('data.csv')

# Separating the dataset

In [5]:
X = df.drop(['Bankrupt?'], axis = 1)
y = df['Bankrupt?']

# Selecting features with the Variance Threshold Method

In [6]:
X1 = X.copy()

In [7]:
from sklearn.preprocessing import MinMaxScaler 
transformer = MinMaxScaler().fit(X1)
x_normalized = transformer.transform(X1)
X_norm = pd.DataFrame(x_normalized)


In [8]:
from sklearn.feature_selection import VarianceThreshold

var_threshold = 0.02
sel = VarianceThreshold(threshold=(var_threshold))
sel = sel.fit(X_norm)
temp = sel.transform(X_norm)
VTM_df = pd.DataFrame(temp)
print(X1.shape)
print(VTM_df.shape)

(6819, 95)
(6819, 12)


In [10]:
list(sel.get_feature_names_out())

array(['x10', 'x11', 'x28', 'x47', 'x48', 'x54', 'x55', 'x63', 'x70',
       'x71', 'x73', 'x76'], dtype=object)

In [None]:
VTM_df

# Singular Value Decomposition on the variables selected by the variance threshold

scaler = StandardScaler()
scaler.fit(VTM_df)
scaler.transform(VTM_df)
X_SVD = pd.DataFrame(scaler.transform(VTM_df), columns=VTM_df.columns)

pca = PCA(random_state=5)
pca.fit(VTM_df)

print(pca.explained_variance_ratio_)

X_pca = pd.DataFrame(pca.transform(VTM_df)).iloc[:,0:10]

plt.rcParams['figure.figsize'] = [15, 10]

PC_values = np.arange(pca.n_components_) +1
plt.plot(PC_values, pca.explained_variance_ratio_, 'o-', linewidth=1, color='blue')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()

variance_acc_temp = pd.Series(pca.explained_variance_ratio_.cumsum()) 
print (variance_acc_temp)

plt.bar(range(len(variance_acc_temp)), variance_acc_temp, color='royalblue' )
plt.grid(color='#95a5a6', linestyle='--', linewidth=0.2, axis='y' )
plt.xticks(range(len(variance_acc_temp)))
plt.xlabel('PCA')
plt.ylabel('Expl_var_ratio_accum')
plt.title('Accummulated explained variance ratio')
plt.rcParams['figure.figsize'] = [15, 10]


plt.show()

coefficients= pd.DataFrame(pca.components_)

coefficients

PCA0 = b00 * col0 + b01 * col1 + ....+ b085 * col95
eigen_vector1 = (b00, b01, b02,...)
eigen_value = vancianza


# Balancing the dataset: upsampling and downsampling

In [None]:
df_1 = pd.concat([VTM_df, y], axis=1)


In [None]:
y = df_1['Bankrupt?'].astype('int')
X = df_1.drop(['Bankrupt?'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state = 17)

In [None]:
TRAIN = pd.concat([X_train, y_train], axis=1)
TRAIN.shape

In [None]:
(TRAIN[TRAIN["Bankrupt?"]==1])

## Upsampling the minority class

In [None]:
upsample_indexes = np.random.choice(TRAIN[TRAIN["Bankrupt?"]==1].index, size = 400, replace=True).tolist()


In [None]:
upsample = TRAIN.loc[upsample_indexes,:]
upsample

## Clustering to downsize the majority class

In [None]:
No_bankrupcies =  TRAIN[(TRAIN['Bankrupt?'] == 0)]

In [None]:
kmeans = KMeans(n_clusters=7, random_state=1)
kmeans.fit(No_bankrupcies)

In [None]:
clusters = kmeans.predict(No_bankrupcies)

elem_in_cluster = pd.Series(clusters).value_counts().sort_index() # Number of values in each cluster
elem_in_cluster

In [None]:
clusters_weight = []
for j in range(len(elem_in_cluster)):
    weight = elem_in_cluster[j]/len(No_bankrupcies)
    clusters_weight.append(weight)

In [None]:
No_bankrupcies["cluster"] = clusters
No_bankrupcies

In [None]:
new_No_bankrupcies = pd.DataFrame(columns=No_bankrupcies.columns)
 
for cluster, weight in enumerate(clusters_weight):
    new_No_bankrupcies = pd.concat([new_No_bankrupcies, No_bankrupcies[No_bankrupcies["cluster"]==cluster].sample(round(400*weight))], axis=0)
    
new_No_bankrupcies.drop(columns=['cluster'], inplace=True)

In [None]:
final1_df = pd.concat([new_No_bankrupcies, upsample], axis=0)
final1_df = final1_df.sort_index()

In [None]:
final1_df.shape

In [None]:
y_train = final1_df['Bankrupt?'].astype('int')
X_train = final1_df.drop(['Bankrupt?'], axis=1)

# Predicting with KNN

In [None]:
# entrenar distintos modelos con distintos valores de k
K = range(2, 14)
accuracies = []
models = []

for k in K:
    knn = KNeighborsClassifier(n_neighbors=k) 
    knn.fit(X_train, y_train)
    models.append(knn)
    ypred_train = knn.predict(X_train)
    accuracies.append(accuracy_score(y_train, ypred_train))
    print("The accuracy of the model n_neighbors={} is: {:.2f}".format(k, accuracy_score(y_train, ypred_train)))   

In [None]:
import pickle

pickel.dump(models[0], 

In [None]:
K = range(2,14)
#accuracies = []

for i in range(len(models)):
    #knn = KNeighborsClassifier(n_neighbors=k) 
    knn = models[i]
    ypred_test = knn.predict(X_test)
    accuracies.append(accuracy_score(y_test, ypred_test))
    print("The accuracy of the model n_neighbors={} is: {:.2f}".format(k, accuracy_score(y_test, ypred_test)))

# Random Forest

In [None]:
clf = RandomForestClassifier(max_depth=4,
                             min_samples_split=6,
                             min_samples_leaf =3,
                             max_samples=0.8)
                             #random_state = 42)
clf.fit(X_train, y_train)

print("The accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf.score(X_train, y_train)))
print("The accuracy for the Random Forest in the TEST set is {:.2f}".format(clf.score(X_test, y_test)))

y_pred = clf.predict(X_train)
display(pd.DataFrame(y_pred).value_counts())
display(y.value_counts())
display(confusion_matrix(y_train, y_pred))

y_pred = clf.predict(X_test)
display(pd.DataFrame(y_pred).value_counts())
display(y.value_counts())
display(confusion_matrix(y_test, y_pred))


In [None]:
from sklearn.model_selection import GridSearchCV

max_depth_choices= [2, 3, 5,7,9] 
min_samples_split_choices = [2,4,6,8,9]  
min_samples_leaf_choices = [1,3] 
max_samples=[0.8,0.5]
#n_jobs = [-1]

grid = {'max_depth': max_depth_choices,
        'min_samples_split': min_samples_split_choices,
        'min_samples_leaf': min_samples_leaf_choices,
        'max_samples':max_samples}

model = RandomForestClassifier()
grid_search = GridSearchCV(estimator = model, param_grid = grid, cv = 5) 
grid_search.fit(X_train, y_train)
grid_search.best_score_

In [None]:
grid_search.best_params_

In [None]:
clf = RandomForestClassifier(max_depth= 5,
                             min_samples_split=9,
                             min_samples_leaf =3,
                             max_samples=0.8)
                             #random_state = 42)
clf.fit(X_train, y_train)

print("The accuracy for the Random Forest in the TRAIN set is {:.2f}".format(clf.score(X_train, y_train)))
print("The accuracy for the Random Forest in the TEST set is {:.2f}".format(clf.score(X_test, y_test)))

y_pred = clf.predict(X_train)
display(pd.DataFrame(y_pred).value_counts())
display(y.value_counts())
display(confusion_matrix(y_train, y_pred))

y_pred = clf.predict(X_test)
display(pd.DataFrame(y_pred).value_counts())
display(y.value_counts())
display(confusion_matrix(y_test, y_pred))
