# **Cross-Validation & Modeling**

In [2]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

In [3]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer

In [4]:
# Importing Excel file, calling the DataFrame comp_df
df = pd.read_csv('Final5', header=0, index_col=0)
# Checking import 
df.head(5)

Unnamed: 0,X2,X3,X4,X5_Bins,X6_BI,X7_BI,X8_BI,X9_BI,X10_BI,X11_BI,...,X22_BI,X23_BI,X1,X12_ABS,X13_ABS,X14_ABS,X15_ABS,X16_ABS,X17_ABS,Y
1,2,2,1,0,1,1,0,0,0,0,...,0,0,0.034263,-0.866772,-0.935035,-0.794573,-0.699341,-0.63668,-0.606347,1
2,2,2,2,0,0,1,0,0,0,1,...,0,1,-0.271418,0.116536,-0.233725,-0.216424,-0.139167,-0.067265,-0.007117,1
3,2,2,2,1,0,0,0,0,0,0,...,1,1,-0.89598,0.423406,0.476861,0.528327,0.227939,0.296254,0.345181,0
4,2,2,1,1,0,0,0,0,0,0,...,1,1,-0.89598,-0.482617,-0.597292,0.313576,0.054974,0.059507,0.098518,0
5,1,2,1,2,0,0,0,0,0,0,...,1,1,-0.89598,0.655807,0.598303,0.64196,0.013707,0.072582,0.12276,0


In [5]:
X = df.iloc[:,:-1] # Features
y = df.Y # Target variable

# **Balancing Data**

In [6]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=2019)
X_resample, y_resample = ros.fit_resample(X, y)

# **AdaBoostClassifier with Cross Validation**

In [7]:
from sklearn.ensemble import AdaBoostClassifier

seed = 2019
num_trees = 70
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(model, X_resample, y_resample, cv=10)

print(metrics.accuracy_score(y_resample, predicted))
print(cross_val_score(model, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.7002011641842151
0.7484285484122406
              precision    recall  f1-score   support

           0       0.66      0.82      0.73     23364
           1       0.76      0.58      0.66     23364

   micro avg       0.70      0.70      0.70     46728
   macro avg       0.71      0.70      0.70     46728
weighted avg       0.71      0.70      0.70     46728



# **Decision Tree with Cross Validation**

In [8]:
# Import Decision Tree Classifier  
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree Classifier Object 
clf = DecisionTreeClassifier()
clf = clf.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(clf, X_resample, y_resample, cv=kfold)

print(metrics.accuracy_score(y_resample, predicted))
#print(cross_val_score(clf, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.8877974661872967
              precision    recall  f1-score   support

           0       0.98      0.79      0.88     23364
           1       0.82      0.98      0.90     23364

   micro avg       0.89      0.89      0.89     46728
   macro avg       0.90      0.89      0.89     46728
weighted avg       0.90      0.89      0.89     46728



In [9]:
feature_importance = pd.Series(clf.feature_importances_, index=df.iloc[:,:-1].columns).sort_values(ascending=False)
feature_importance

X6_BI      0.158052
X13_ABS    0.102375
X12_ABS    0.099566
X14_ABS    0.095567
X15_ABS    0.093225
X16_ABS    0.087248
X17_ABS    0.084506
X1         0.080614
X3         0.022489
X5_Bins    0.022187
X4         0.017900
X12_BI     0.017059
X8_BI      0.014691
X23_BI     0.012463
X20_BI     0.011764
X2         0.011272
X11_BI     0.008257
X22_BI     0.007832
X21_BI     0.007103
X7_BI      0.006417
X18_BI     0.005637
X19_BI     0.005074
X10_BI     0.004846
X17_BI     0.004750
X13_BI     0.004450
X9_BI      0.004429
X16_BI     0.004023
X15_BI     0.003314
X14_BI     0.002891
X14_M      0.000000
X17_M      0.000000
X15_M      0.000000
X16_M      0.000000
X12_M      0.000000
X13_M      0.000000
dtype: float64

In [None]:
# Decision Tree Using Feauture Selection 

In [15]:
Xfeat10 = df.filter(['X6_BI', 'X13_ABS', 'X12_ABS', 'X14_ABS', 'X15_ABS', 'X16_ABS', 'X17_ABS', 'X1']) # Features
y = df.Y # Target variable

In [16]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=2019)
X_resample_feat10, y_resample = ros.fit_resample(Xfeat10, y)

In [17]:
# Import Decision Tree Classifier  
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree Classifier Object 
clf = DecisionTreeClassifier()
clf = clf.fit(X_resample_feat10, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(clf, X_resample_feat10, y_resample, cv=kfold)

print(metrics.accuracy_score(y_resample, predicted))
#print(cross_val_score(clf, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.8735661701763396
              precision    recall  f1-score   support

           0       0.97      0.78      0.86     23364
           1       0.81      0.97      0.88     23364

   micro avg       0.87      0.87      0.87     46728
   macro avg       0.89      0.87      0.87     46728
weighted avg       0.89      0.87      0.87     46728



In [10]:
# **Decision Tree using TPOT Suggestion**

In [18]:
# Import Decision Tree Classifier  
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree Classifier Object 
clf = DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=9,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=16, min_samples_split=6,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
clf = clf.fit(X_resample_feat10, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(clf, X_resample_feat10, y_resample, cv=kfold)

print(metrics.accuracy_score(y_resample, predicted))
#print(cross_val_score(clf, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.6835088169833933
              precision    recall  f1-score   support

           0       0.64      0.84      0.73     23364
           1       0.77      0.52      0.62     23364

   micro avg       0.68      0.68      0.68     46728
   macro avg       0.70      0.68      0.68     46728
weighted avg       0.70      0.68      0.68     46728



# **Logistic Regression with Cross Validation**

In [28]:
# import the class
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear')
logreg = logreg.fit(X_resample,y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(logreg, X_resample, y_resample, cv=kfold)
print(metrics.accuracy_score(y_resample, predicted))
print(cross_val_score(logreg, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.6802559493237459
0.7449374834029472
              precision    recall  f1-score   support

           0       0.65      0.79      0.71     23364
           1       0.73      0.57      0.64     23364

   micro avg       0.68      0.68      0.68     46728
   macro avg       0.69      0.68      0.68     46728
weighted avg       0.69      0.68      0.68     46728



# **SVC with Cross Validation**

In [None]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='rbf', gamma='auto')  #Kernel=rbf, non-linearly separable data
svclassifier = svclassifier.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

results = model_selection.cross_val_score(svclassifier, X_resample, y_resample, cv=kfold)
print(results)
print(results.mean())