In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df=pd.read_csv('data.csv')

In [3]:
df.shape

(6819, 96)

In [4]:
df.columns

Index(['Bankrupt?', ' ROA(C) before interest and depreciation before interest',
       ' ROA(A) before interest and % after tax',
       ' ROA(B) before interest and depreciation after tax',
       ' Operating Gross Margin', ' Realized Sales Gross Margin',
       ' Operating Profit Rate', ' Pre-tax net Interest Rate',
       ' After-tax net Interest Rate',
       ' Non-industry income and expenditure/revenue',
       ' Continuous interest rate (after tax)', ' Operating Expense Rate',
       ' Research and development expense rate', ' Cash flow rate',
       ' Interest-bearing debt interest rate', ' Tax rate (A)',
       ' Net Value Per Share (B)', ' Net Value Per Share (A)',
       ' Net Value Per Share (C)', ' Persistent EPS in the Last Four Seasons',
       ' Cash Flow Per Share', ' Revenue Per Share (Yuan ¥)',
       ' Operating Profit Per Share (Yuan ¥)',
       ' Per Share Net profit before tax (Yuan ¥)',
       ' Realized Sales Gross Profit Growth Rate',
       ' Operating Profit

In [5]:
#Removing space in columns
df.columns=df.columns.str.strip()
df.columns = df.columns.str.replace(" " ,"_")
df.rename(columns = {'Bankrupt?' :'Bankrupt' },inplace=True)

In [None]:
#Check data description

1.EDA(Data preprocessing and story telling)
2.Model building
3.Feature Transformation-(Feature selection (REF))
    Feature selection-If variance is zero we can remove the variable
4.Model Validation
5.Model Evaluation-(Cross validation -KFold)
6.Hyper parameter tuning (Grid Search)
    Can give L1_ratio, alpha as a parameter in Elastic net
7.Picking best model
8.Final submission

#To avoid overfitting in Linear regression, compare score of Train RMSE and Test RMSE should be similar

#EDA(Data preprocessing and story telling)
1.Univariate,Bivariate,Multivariate Analysis
2.check skewness and kurtosis
    skewness - Even after log transformation, can use reciprocal transformation
3.Data Anamolies
    1.Check for duplicate values
    2.Missing value treatment
    3.Handling Outliers
4.Transformation (Z-score,Minmax scaling)
5.Handling imbalance data set
6.Multicollinearity- VIF method -Threshold as 10 and remove variables above than 10

In [6]:
#Data Anamalies
print('Duplicate:',df.duplicated().sum())
print('Null values:',df.isnull().values.sum())
print('Nan values:',df.isna().values.sum())

Duplicate: 0
Null values: 0
Nan values: 0


In [7]:
#Removing Outliers
Q1 = df.quantile(0.01)
Q3 = df.quantile(0.99)
IQR = Q3 - Q1
df = df[~((df < (Q1-6*IQR)) | (df > (Q3 + 6*IQR))).any(axis = 1)]
df.shape

(6552, 96)

In [8]:
input_cols=df.drop('Bankrupt',axis=1)
target_cols=df['Bankrupt']

In [9]:
#Scaling Data
from sklearn.preprocessing import MinMaxScaler
minmax=MinMaxScaler()
input_sc=minmax.fit_transform(input_cols)
input_sc=pd.DataFrame(data=input_sc,columns=input_cols.columns)

In [9]:
#Handling Imbalanced dataset
from sklearn.feature_selection import mutual_info_classif 
from imblearn.over_sampling import SMOTE 
mutual_info = mutual_info_classif(X= input_sc,y= target_cols)  # get mutual info all predictors
pruned_features = input_sc.columns[np.where(mutual_info>0)]  # retain features only with mi >0
X_scaled_pruned = input_sc[pruned_features]
sm = SMOTE(random_state=123)
X_sm , y_sm = sm.fit_resample(X_scaled_pruned,target_cols)
X_sm.shape

(12738, 86)

In [10]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
x_train,x_test,y_train,y_test=train_test_split(X_sm,y_sm,
                                                       test_size=0.3,random_state=1)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((8916, 86), (3822, 86), (8916,), (3822,))

In [None]:
#Baseline model

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, roc_auc_score, f1_score, precision_score

In [12]:
log_model=LogisticRegression().fit(x_train,y_train)
y_pred=log_model.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.8950811093668236
[[1696  256]
 [ 145 1725]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
dt_model=DecisionTreeClassifier(max_depth=8,random_state=1).fit(x_train,y_train)
y_pred=dt_model.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.9387755102040817
[[1753  199]
 [  35 1835]]


In [14]:
dt_model=RandomForestClassifier(max_depth=8,random_state=1).fit(x_train,y_train)
y_pred=dt_model.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.9610151753008895
[[1815  137]
 [  12 1858]]


In [15]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 20)
new_X = pca.fit_transform(X_sm)
print(sum(pca.explained_variance_ratio_))
new_data = pd.DataFrame(data = new_X, columns = range(1, 21))
new_data.shape

0.9635306418852335


(12738, 20)

In [16]:
x_train,x_test,y_train,y_test=train_test_split(new_data,y_sm,test_size=0.3,random_state=1)
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((8916, 20), (3822, 20), (8916,), (3822,))

In [17]:
dt_model=DecisionTreeClassifier(max_depth=8,random_state=1).fit(x_train,y_train)
y_pred=dt_model.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.9215070643642073
[[1731  221]
 [  79 1791]]


In [18]:
dt_model=RandomForestClassifier(max_depth=8,random_state=1).fit(x_train,y_train)
y_pred=dt_model.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.945839874411303
[[1783  169]
 [  38 1832]]


In [20]:
naiveBayes=GaussianNB().fit(x_train,y_train)
y_pred=naiveBayes.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.7752485609628467
[[1230  722]
 [ 137 1733]]


In [27]:
knnclassifier=KNeighborsClassifier(n_neighbors=3).fit(x_train,y_train)
y_pred=knnclassifier.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.9552590266875981
[[1782  170]
 [   1 1869]]


In [None]:
#Hyper parameter Tuning
#params = [{"max_depth":[7,8,9,10,11], "max_features":[6,7,8,9], "n_estimators":[100,120,140,150]}]
#RF = RandomForestClassifier()
#RF_grid = GridSearchCV(estimator = RF, param_grid = params, cv = 5, scoring = "recall")
#RF_grid.fit(x_train,y_train)
#RF_grid.best_params_

In [19]:
#cross-validation
cross_val_score(estimator = dt_model, X = x_train, y = y_train, scoring = "accuracy", cv = 10)

array([0.95852018, 0.94506726, 0.93161435, 0.94506726, 0.94506726,
       0.94618834, 0.92817059, 0.94388328, 0.94612795, 0.9315376 ])

In [36]:
#Cross validation for KNN
cross_val_score(estimator = knnclassifier, X = x_train, y = y_train, scoring = "accuracy", cv = 10)

array([0.96860987, 0.96188341, 0.95964126, 0.96748879, 0.9529148 ,
       0.95403587, 0.95510662, 0.96296296, 0.96969697, 0.95847363])

In [38]:
#Hyper parameter tuning for KNN Algorithm
params = [{"n_neighbors":[1,2,3,4,5]}]
knnclassifier = KNeighborsClassifier()
RF_grid = GridSearchCV(estimator = knnclassifier, param_grid = params, cv = 5, scoring = "recall")
RF_grid.fit(x_train,y_train)
RF_grid.best_params_

{'n_neighbors': 1}

In [20]:
#XGBoost
XGB = XGBClassifier(learning_rate = 0.01, gamma = 2, use_label_encoder = False, eval_metric = "logloss", n_estimators = 200, max_depth = 6)
XGB.fit(x_train, y_train)
y_pred=dt_model.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
#print(classification_report(ytest, XGB.predict(xtest)))

0.9416535845107273
[[1770  182]
 [  41 1829]]


In [21]:
#Stacking classifier
base_learners = [('Decision Tree', DecisionTreeClassifier()),
                 ('Random Forest', RandomForestClassifier())]

stack_model_gdBoost = StackingClassifier(estimators = base_learners, final_estimator = GradientBoostingClassifier(random_state = 8))
stack_model_gdBoost.fit(x_train, y_train)
y_pred = stack_model_gdBoost.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.9811616954474097
[[1897   55]
 [  17 1853]]


In [35]:
base_learners = [('KNN', KNeighborsClassifier()),
                 ('KNN1', KNeighborsClassifier())]

stack_model_gdBoost = StackingClassifier(estimators = base_learners, final_estimator = KNeighborsClassifier())
stack_model_gdBoost.fit(x_train, y_train)
y_pred = stack_model_gdBoost.predict(x_test)
print(accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))

0.9774986917844061
[[1883   69]
 [  17 1853]]
