# **Cross-Validation & Modeling**

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn import model_selection
from sklearn.model_selection import cross_val_score

In [3]:
# Importing Excel file, calling the DataFrame comp_df
df = pd.read_csv('Final6', header=0, index_col=0)
# Checking import 
df.head(5)

Unnamed: 0,X2,X3,X4,X5_Bins,X6_BI,X7_BI,X8_BI,X9_BI,X10_BI,X11_BI,...,X22_BI,X23_BI,X1,X12_ABS,X13_ABS,X14_ABS,X15_ABS,X16_ABS,X17_ABS,Y
2,2,2,2,0,0,1,0,0,0,1,...,0,1,0.539312,0.564787,0.448541,0.452568,0.476104,0.495677,0.504543,1
3,2,2,2,1,0,0,0,0,0,0,...,1,1,0.428052,0.661555,0.677011,0.695874,0.597415,0.61792,0.624935,0
4,2,2,1,1,0,0,0,0,0,0,...,1,1,0.428052,0.375851,0.331646,0.625716,0.540258,0.538308,0.540642,0
5,1,2,1,2,0,0,0,0,0,0,...,1,1,0.428052,0.73484,0.716057,0.732997,0.526621,0.542705,0.548927,0
6,1,1,2,1,0,0,0,0,0,0,...,1,1,0.986753,0.745061,0.744032,0.743738,0.743413,0.742378,0.740974,0


In [4]:
X = df.iloc[:,:-1] # Features
y = df.Y # Target variable

# **Balancing Data**

In [5]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=2019)
X_resample, y_resample = ros.fit_resample(X, y)

# **AdaBoostClassifier with Cross Validation**

In [8]:
from sklearn.ensemble import AdaBoostClassifier
seed = 2019
num_trees = 70
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = model_selection.cross_val_score(model, X_resample, y_resample, cv=kfold)
print(results)
print(results.mean())

[0.72779799 0.73314787 0.7316499  0.74662957 0.77487695 0.76481917
 0.64733576 0.54868393 0.54794521 0.55693493]
0.6779821288280974


# **Decision Tree with Cross Validation**

In [10]:
# Import Decision Tree Classifier  
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree Classifier Object 
clf = DecisionTreeClassifier()
clf = clf.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(clf, X_resample, y_resample, cv=kfold)

print(metrics.accuracy_score(y_resample, predicted))
#print(cross_val_score(clf, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.8876904639616504
              precision    recall  f1-score   support

           0       0.98      0.79      0.88     23364
           1       0.83      0.98      0.90     23364

   micro avg       0.89      0.89      0.89     46728
   macro avg       0.90      0.89      0.89     46728
weighted avg       0.90      0.89      0.89     46728



In [11]:
# **Decision Tree using TPOT Suggestion**

In [14]:
# Import Decision Tree Classifier  
from sklearn.tree import DecisionTreeClassifier

# Create Decision Tree Classifier Object 
clf = DecisionTreeClassifier(criterion="entropy", max_depth=6, min_samples_leaf=3, min_samples_split=4)
clf = clf.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

predicted = model_selection.cross_val_predict(clf, X_resample, y_resample, cv=kfold)

print(metrics.accuracy_score(y_resample, predicted))
#print(cross_val_score(clf, X_resample, y_resample, cv=10, scoring='roc_auc').mean())
print(metrics.classification_report(y_resample, predicted))

0.6922829994863893
              precision    recall  f1-score   support

           0       0.66      0.79      0.72     23364
           1       0.74      0.60      0.66     23364

   micro avg       0.69      0.69      0.69     46728
   macro avg       0.70      0.69      0.69     46728
weighted avg       0.70      0.69      0.69     46728



# **Logistic Regression with Cross Validation**

In [8]:
# import the class
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(solver='liblinear')
logreg = logreg.fit(X_resample,y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

results = model_selection.cross_val_score(logreg, X_resample, y_resample, cv=kfold)
print(results)
print(results.mean())

[0.73271988 0.73443184 0.72843997 0.74684357 0.77765889 0.76160924
 0.65482559 0.55232185 0.5505137  0.56035959]
0.6799724132952638


# **SVC with Cross**

In [None]:
from sklearn.svm import SVC  
svclassifier = SVC(kernel='rbf', gamma='auto')  #Kernel=rbf, non-linearly separable data
svclassifier = svclassifier.fit(X_resample, y_resample)

kfold = model_selection.KFold(n_splits=10, random_state=seed)

results = model_selection.cross_val_score(svclassifier, X_resample, y_resample, cv=kfold)
print(results)
print(results.mean())