# **Cross-Validation & Modeling**

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn import model_selection
from sklearn.model_selection import cross_val_score

In [3]:
# Importing Excel file, calling the DataFrame comp_df
df = pd.read_csv('Final10', header=0, index_col=0)
# Checking import 
df.head(5)

Unnamed: 0,X2,X3_Bins,X4_Bins,X5_Bins,X6_BI,X7_BI,X8_BI,X9_BI,X10_BI,X11_BI,...,X15_ABS,X16_ABS,X17_ABS,X18,X19,X20,X21,X22,X23,Y
1,2,1,1,0,1,1,0,0,0,0,...,0.290992,0.304196,0.299765,0.0,0.446302,0.45419,0.47078,0.0,0.588415,1
2,2,1,0,0,0,1,0,0,0,1,...,0.476104,0.495677,0.504543,0.516562,0.510888,0.45419,0.47078,0.468896,0.798602,1
3,2,1,0,1,0,0,0,0,0,0,...,0.597415,0.61792,0.624935,0.566293,0.56408,0.482649,0.485977,0.479441,0.467025,0
4,2,1,1,1,0,0,0,0,0,0,...,0.540258,0.538308,0.540642,0.566293,0.763165,0.978523,0.979261,0.414142,0.410485,0
5,1,1,1,2,0,0,0,0,0,0,...,0.526621,0.542705,0.548927,0.610021,0.544403,0.394845,0.47078,0.468896,0.433548,0


In [4]:
X = df.iloc[:,:-1] # Features
y = df.Y # Target variable

# **Balancing Data**

In [5]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=2019)
X_resample, y_resample = ros.fit_resample(X, y)

# **AdaBoostClassifier with Cross Validation**

In [6]:
from sklearn.ensemble import AdaBoostClassifier
seed = 2019
num_trees = 70
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X_resample, y_resample, cv=kfold)
print(results)
print(results.mean())

[0.74320565 0.7453456  0.7410657  0.75390541 0.78001284 0.76695913
 0.6588915  0.55296383 0.54623288 0.54901541]
0.6837597955978529


# **Decision Tree with Cross Validation**

In [7]:
# Import Decision Tree Classifier  
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree Classifier Object 
clf = DecisionTreeClassifier()
clf = clf.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(clf, X_resample, y_resample, cv=kfold)

print(metrics.accuracy_score(y_resample, predicted))
#print(cross_val_score(clf, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.8870270501626434
              precision    recall  f1-score   support

           0       0.98      0.79      0.88     23364
           1       0.82      0.98      0.90     23364

   micro avg       0.89      0.89      0.89     46728
   macro avg       0.90      0.89      0.89     46728
weighted avg       0.90      0.89      0.89     46728



In [8]:
# **Decision Tree using TPOT Suggestion**

In [9]:
# Import Decision Tree Classifier  
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree Classifier Object 
clf = DecisionTreeClassifier(criterion="entropy", max_depth=6, min_samples_leaf=3, min_samples_split=4)
clf = clf.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(clf, X_resample, y_resample, cv=kfold)

print(metrics.accuracy_score(y_resample, predicted))
#print(cross_val_score(clf, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.6886235233692861
              precision    recall  f1-score   support

           0       0.66      0.78      0.72     23364
           1       0.73      0.59      0.66     23364

   micro avg       0.69      0.69      0.69     46728
   macro avg       0.70      0.69      0.69     46728
weighted avg       0.70      0.69      0.69     46728



# **Logistic Regression with Cross Validation**

In [8]:
# import the class
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear')
logreg = logreg.fit(X_resample,y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

results = model_selection.cross_val_score(logreg, X_resample, y_resample, cv=kfold)
print(results)
print(results.mean())

[0.74277766 0.74513161 0.74170768 0.75197946 0.78001284 0.76738712
 0.64797774 0.54290606 0.53767123 0.55072774]
0.6808279135239161


# **SVC with Cross**

In [None]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='rbf', gamma='auto')  #Kernel=rbf, non-linearly separable data
svclassifier = svclassifier.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

results = model_selection.cross_val_score(svclassifier, X_resample, y_resample, cv=kfold)
print(results)
print(results.mean())