In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

import os

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GroupKFold

from sklearn import linear_model

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier,RadiusNeighborsClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

from sklearn.neural_network import MLPClassifier

from sklearn import model_selection


from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Load data

In [None]:
# Read the Stata file into a DataFrame; the path is relative to the
# notebook's working directory.
df=pd.read_stata('company_mlf.dta')
# Quick sanity check of the loaded size: (rows, columns).
print(df.shape)
# Preview the first rows as the cell output.
df.head()

# Preprocessing

In [None]:
from sklearn.model_selection import train_test_split

# Work on the raw value matrix of the frame: the column at position 1 is used
# as the target, every column from position 2 onward as a feature.
data = df.values
y = data[:, 1]
X = data[:, 2:].astype(str)   # features are handled as strings from here on
y = y.reshape(-1, 1)          # column vector, one row per sample
print(X.shape,y.shape)

# 90/10 hold-out split, stratified on the target and reproducible via the seed.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    train_size=0.9,
    test_size=0.1,
    random_state=1234,
    stratify=y,
)
print('Train: ', train_X.shape,train_y.shape)
print('Test:', test_X.shape,test_y.shape)
train_X[:2]

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

def prepare_inputs(X_train,X_test):
    """One-hot encode the feature matrices.

    The encoder is fitted on ``X_train`` only and then applied to both
    inputs. Categories that appear only in ``X_test`` are ignored
    (``handle_unknown='ignore'``), i.e. they encode as all-zero columns
    for that feature.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Categorical feature matrices.

    Returns
    -------
    (X_train_enc, X_test_enc) : tuple of dense arrays
        The one-hot encoded versions of the two inputs.
    """
    try:
        # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`;
        # the old `sparse` keyword was removed in 1.4.
        oe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    except TypeError:
        # Older scikit-learn releases only know the `sparse` keyword.
        oe = OneHotEncoder(sparse=False, handle_unknown='ignore')

    oe.fit(X_train)
    X_train_enc = oe.transform(X_train)
    X_test_enc = oe.transform(X_test)

    return X_train_enc, X_test_enc

def prepare_target(y_train,y_test):
    """Label-encode the target vectors as integers.

    The encoder is fitted on ``y_train`` only; ``LabelEncoder.transform``
    raises ``ValueError`` if ``y_test`` contains a label that was not seen
    during fitting.

    Parameters
    ----------
    y_train, y_test : array-like of shape (n_samples,) or (n_samples, 1)
        Target labels. Column vectors are flattened before encoding.

    Returns
    -------
    (y_train_enc, y_test_enc) : tuple of 1-D integer arrays
    """
    # LabelEncoder expects 1-D input; passing an (n, 1) column vector makes it
    # emit a DataConversionWarning, so flatten explicitly first.
    y_train_flat = np.ravel(y_train)
    y_test_flat = np.ravel(y_test)

    le = LabelEncoder()
    le.fit(y_train_flat)
    y_train_enc = le.transform(y_train_flat)
    y_test_enc = le.transform(y_test_flat)

    return y_train_enc, y_test_enc

# One-hot encode the features; the encoder is fitted on the training split only.
X_train, X_test = prepare_inputs(train_X,test_X)
# Label-encode the targets; the encoder is fitted on the training split only.
y_train, y_test = prepare_target(train_y,test_y)
# Show the encoded test features as the cell output.
X_test

In [None]:
import numpy as np 

# Class counts of the encoded target, first for the training split and then
# for the hold-out split.
for encoded_labels in (y_train, y_test):
    print(pd.DataFrame(encoded_labels)[0].value_counts())

# K-fold cross-validation

In [None]:
from sklearn.model_selection import KFold

# Candidate classifiers as (display name, estimator) pairs. Every estimator is
# scored on the same 5 folds of the training split.
models = []

models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('QDA', QuadraticDiscriminantAnalysis()))

models.append(('Linear-Ridge',linear_model.RidgeClassifier()))

models.append(('GausNB', GaussianNB()))
models.append(('BernNB', BernoulliNB()))

models.append(('KNN-k', KNeighborsClassifier()))

models.append(('SVM-SVC',svm.SVC()))
# Seeded so the solver's internal shuffling is reproducible between runs.
models.append(('SVM-LinearSVC',svm.LinearSVC(random_state=1234)))

# Seeded so the forest (and therefore its scores) is reproducible between runs.
models.append(('RForest', RandomForestClassifier(n_jobs=-1, random_state=1234)))
models.append(('GBST', xgb.XGBClassifier(nthread=-1)))


def _positive_class_scores(clf, X):
    """Return a continuous score per sample for the positive class.

    Uses ``decision_function`` when the fitted estimator provides it and
    otherwise the second column of ``predict_proba``. Assumes a binary
    target, which the default (binary) precision/recall calls below
    already require.
    """
    if hasattr(clf, 'decision_function'):
        return clf.decision_function(X)
    return clf.predict_proba(X)[:, 1]


# One entry per (model, fold) in each list; they are zipped into a frame below.
names = []
F1_score = []
roc_score = []
Precision = []
Recall = []

# Without shuffling KFold yields the same folds on every call, so a single
# splitter is shared by all models instead of being rebuilt per model.
kf = KFold(n_splits=5)

for name, clf in models:
    print(name)

    for train_i, test_i in kf.split(X_train,y_train):
        X_train_val,y_train_val=X_train[train_i],y_train[train_i]
        X_test_val, y_test_val = X_train[test_i], y_train[test_i]

        clf.fit(X_train_val,y_train_val)

        y_pred_test= clf.predict(X_test_val)

        # ROC AUC is a ranking metric: it must be computed from continuous
        # scores, not from the hard 0/1 predictions.
        roc = roc_auc_score(y_test_val, _positive_class_scores(clf, X_test_val))*100

        # NOTE(review): F1 is class-weighted while precision/recall use the
        # default binary averaging (positive class only) - confirm intended.
        F1 =  f1_score(y_test_val, y_pred_test,average='weighted')*100
        precision = precision_score(y_test_val.ravel(), y_pred_test)*100
        recall = recall_score(y_test_val.ravel(), y_pred_test)*100

        names.append(name)
        Precision.append(precision)
        Recall.append(recall)
        roc_score.append(roc)
        F1_score.append(F1)

# Stored under its own name so the raw data frame `df` is not overwritten and
# earlier cells stay re-runnable.
cv_results = pd.DataFrame({'classifier':names,'ROC':roc_score, 'F1':F1_score,
                           'precision':Precision,'recall':Recall})

cv_results = cv_results[['classifier','F1','ROC','precision','recall']]
# Spread of the per-fold F1 scores for each classifier.
cv_results.boxplot(column='F1',by='classifier',figsize=(15,10),fontsize=15)
# Mean of every metric over the folds, per classifier (cell output).
cv_results.groupby(['classifier']).mean()