# **Cross-Validation & Modeling**

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn import model_selection
from sklearn.model_selection import cross_val_score

In [3]:
# Importing Excel file, calling the DataFrame comp_df
df = pd.read_csv('Final9', header=0, index_col=0)
# Checking import 
df.head(5)

Unnamed: 0,X2,X3_Bins,X4_Bins,X5_Bins,X6_BI,X7_BI,X8_BI,X9_BI,X10_BI,X11_BI,...,X22_BI,X23_BI,X1,X12_ABS,X13_ABS,X14_ABS,X15_ABS,X16_ABS,X17_ABS,Y
2,2,1,0,0,0,1,0,0,0,1,...,0,1,-0.271418,0.116536,-0.233725,-0.216424,-0.139167,-0.067265,-0.007117,1
3,2,1,0,1,0,0,0,0,0,0,...,1,1,-0.89598,0.423406,0.476861,0.528327,0.227939,0.296254,0.345181,0
4,2,1,1,1,0,0,0,0,0,0,...,1,1,-0.89598,-0.482617,-0.597292,0.313576,0.054974,0.059507,0.098518,0
5,1,1,1,2,0,0,0,0,0,0,...,1,1,-0.89598,0.655807,0.598303,0.64196,0.013707,0.072582,0.12276,0
6,1,1,0,1,0,0,0,0,0,0,...,1,1,1.550666,2.491521,2.675594,2.833575,0.004727,0.030574,0.041317,0


In [4]:
X = df.iloc[:,:-1] # Features
y = df.Y # Target variable

# **Balancing Data**

In [5]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=2019)
X_resample, y_resample = ros.fit_resample(X, y)

# **AdaBoostClassifier with Cross Validation**

In [6]:
from sklearn.ensemble import AdaBoostClassifier
seed = 2019
num_trees = 70
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X_resample, y_resample, cv=kfold)
print(results)
print(results.mean())

[0.73057993 0.73806976 0.73721378 0.7453456  0.7774449  0.76567516
 0.65568158 0.54675797 0.54666096 0.56164384]
0.6805073465609783


# **Decision Tree with Cross Validation**

In [7]:
# Import Decision Tree Classifier  
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree Classifier Object 
clf = DecisionTreeClassifier()
clf = clf.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(clf, X_resample, y_resample, cv=kfold)

print(metrics.accuracy_score(y_resample, predicted))
#print(cross_val_score(clf, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.8889744906694059
              precision    recall  f1-score   support

           0       0.98      0.79      0.88     23364
           1       0.83      0.98      0.90     23364

   micro avg       0.89      0.89      0.89     46728
   macro avg       0.90      0.89      0.89     46728
weighted avg       0.90      0.89      0.89     46728



In [8]:
# **Decision Tree using TPOT Suggestion**

In [9]:
# Import Decision Tree Classifier  
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree Classifier Object 
clf = DecisionTreeClassifier(criterion="entropy", max_depth=6, min_samples_leaf=3, min_samples_split=4)
clf = clf.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(clf, X_resample, y_resample, cv=kfold)

print(metrics.accuracy_score(y_resample, predicted))
#print(cross_val_score(clf, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.6934814244136278
              precision    recall  f1-score   support

           0       0.66      0.78      0.72     23364
           1       0.74      0.60      0.66     23364

   micro avg       0.69      0.69      0.69     46728
   macro avg       0.70      0.69      0.69     46728
weighted avg       0.70      0.69      0.69     46728



# **Logistic Regression with Cross Validation**

In [8]:
# import the class
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear')
logreg = logreg.fit(X_resample,y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

results = model_selection.cross_val_score(logreg, X_resample, y_resample, cv=kfold)
print(results)
print(results.mean())

[0.73207789 0.73485983 0.72886796 0.74834154 0.77573293 0.76289322
 0.65610957 0.55253584 0.5505137  0.56014555]
0.6802078035361989


# **SVC with Cross**

In [None]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='rbf', gamma='auto')  #Kernel=rbf, non-linearly separable data
svclassifier = svclassifier.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

results = model_selection.cross_val_score(svclassifier, X_resample, y_resample, cv=kfold)
print(results)
print(results.mean())