In [139]:

# Our standard packages for data science.
import os
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
from time import time

# Our main packages for standard machine learning
from sklearn.linear_model import *
from sklearn.tree import *
from sklearn.svm import *
from sklearn.naive_bayes import *
from sklearn.ensemble import *
from sklearn.multiclass import *
from sklearn.neighbors import KNeighborsClassifier
from numba import jit
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from scipy.stats import randint as sp_randint
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.model_selection import GroupKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
from scipy import stats
from scipy.stats import norm
from matplotlib import rcParams
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
import gc
gc.enable()

print('Packages are ready!')

Packages are ready!


In [100]:
# Lets import our data
target = pd.read_csv('target.csv')
data = pd.read_csv('data.csv')
test = pd.read_csv('test.csv')
vdata = pd.read_csv('vdata.csv')
vtarget = pd.read_csv('vtarget.csv')
ltarget = pd.read_csv('ltarget.csv', header=None)
lvtarget = pd.read_csv('lvtarget.csv', header=None)
wdata = pd.read_csv('whole_data.csv')
wtarget = pd.read_csv('whole_target.csv', header=None)

winedata = pd.read_csv('winedata.csv')

ccdata = pd.read_csv('creditcard.csv')

irisdata = pd.read_csv('irisdata.csv')
iristarget = pd.read_csv('iristarget.csv')

print('Data is ready!')

Data is ready!


In [101]:
# Lets go ahead and set up our data. First lets make our target and drop it from wine.
winetarget = winedata['quality']
winedata = winedata.drop('quality', axis=1)

# Lets trim off time and amount from ccdata as those are independent features we don't want the model to learn.
cctarget = ccdata['Class']
ccdata = ccdata.drop(['Time','Amount', 'Class'], axis=1)

iristarget = iristarget['target']

target = target['surface']
vtarget = vtarget['surface']
ltarget = ltarget[0]
lvtarget = lvtarget[0]
wdata = wdata.drop(['series_id', 'group_id', 'surface'], axis=1)
wtarget = wtarget[0]

irisd, id_test, irist, it_test = train_test_split(irisdata, iristarget, test_size=0.33, random_state=42)
wined, wd_test, winet, wt_test = train_test_split(winedata, winetarget, test_size=0.33, random_state=42)
ccd, ccd_test, cct, cct_test = train_test_split(ccdata, cctarget, test_size=0.33, random_state=42)

# data = data.values
# vdata = vdata.values
# wdata = wdata.values
# test = test.values
# irisdata = irisdata.values
# ccdata = ccdata.values
# winedata = winedata.values
# id_test = id_test.values
# ccd_test = ccd_test.values
# wd_test = wd_test.values

In [8]:
# Now lets make sure our train and target variables are even for every dataset
print(irisdata.shape)
print(iristarget.shape)

print(winedata.shape)
print(winetarget.shape)

print(ccdata.shape)
print(cctarget.shape)

print(target.shape)
print(vtarget.shape)
print(ltarget.shape)
print(lvtarget.shape)
print(wtarget.shape)
print(wdata.shape)
print(vdata.shape)
print(data.shape)
test.shape

(150, 4)
(150,)
(6497, 11)
(6497,)
(284807, 28)
(284807,)
(2804,)
(1006,)
(358912,)
(128768,)
(487680,)
(487680, 23)
(128768, 23)
(358912, 23)


(488448, 23)

In [156]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(sampling_strategy=0.5, random_state=2)

ccd_SMOTE, cct_SMOTE = sm.fit_resample(ccd,cct)

ccd_SMOTE = pd.DataFrame(ccd_SMOTE, columns=ccd.columns)
# cct_SMOTE = pd.DataFrame(cct_SMOTE, columns=['class'])

In [125]:
# Lets see how many new minority samples there are.
print(len(cct))
len(cct_SMOTE)

190820


285715

So in this notebook we are going to go over some of the most common basic machine learning models for classification. The list will be Lasso, Logistic Regression, SVM, Decision Trees, Random Forest, KNeighbor, Multinomial NB, and One vs Rest. This should be a good coverage of models for most use cases. We won't apply all of them for every dataset as that would take a long time and this would turn into a MASSIVE notebook.

We will use Lasso, Decision Trees, Logistic Regression for the Iris and Wine datasets. We will use SVM and KNearest Neighbor for ICU, Wine, and Iris. We will use Random Forest for Wine and CCD. Multinomial NB for ICU and CCD. One vs Rest for ICU and Iris.

I am going to set the hyperparameters by hand. You can automate this process and create a better model using any of the following methods below! These are super helpful and can get you out of tight spots if you can't figure out the proper model hyperparameters!

https://autonomio.github.io/docs_talos/#introduction

https://towardsdatascience.com/automated-machine-learning-hyperparameter-tuning-in-python-dfda59b72f8a

https://medium.com/@mikkokotila/a-comprehensive-list-of-hyperparameter-optimization-tuning-solutions-88e067f19d9

https://tsfresh.readthedocs.io/en/latest/

https://towardsdatascience.com/machine-learning-introduction-a-comprehensive-guide-af6712cf68a3

In [24]:
# First we start with The classic. Logistic Regression! Lets compare this model on Iris and Wine.

x = irisdata
y = iristarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [26]:
# This will be how we set up our models. This will be VERY repetitive.
# In SKLearn every model can be changed by just changing names.
# So that is what we will do!

for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    # This is where we define our model.
    model = LogisticRegression(n_jobs=-1)
    # This is what trains our model
    model.fit(x.iloc[trn_idx],y[trn_idx])
    # And this is what tells us how good the model is doing on each fold.
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()
    
    
# We will use this set up for everything going forward.

Fold: 0 score: 0.9666666666666667
Fold: 1 score: 0.9333333333333333
Fold: 2 score: 0.9666666666666667
Fold: 3 score: 0.9
Fold: 4 score: 1.0


In [78]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.9600000000000002


Here we see that the model is trained very easily and gets a perfect accuracy.

In [18]:
# Now again for the wine dataset.

x = winedata
y = winetarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [19]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = LogisticRegression(n_jobs=-1)
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.7246153846153847
Fold: 1 score: 0.7269230769230769
Fold: 2 score: 0.7346153846153847
Fold: 3 score: 0.7444187836797537
Fold: 4 score: 0.7473035439137135


In [77]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.9600000000000002


In [35]:
# Next lets do some Lasso! Lets compare this model on Iris and Wine.

x = irisdata
y = iristarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [50]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = Lasso(alpha=0.1)
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.9016287285670783
Fold: 1 score: 0.8944459740452599
Fold: 2 score: 0.9126333960814434
Fold: 3 score: 0.8644030348019343
Fold: 4 score: 0.9014665263170626


In [76]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.9600000000000002


In [51]:
# Now the wine

x = winedata
y = winetarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [65]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = Lasso(alpha=.001)
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.172838249189872
Fold: 1 score: 0.16494204023358838
Fold: 2 score: 0.18369638444916603
Fold: 3 score: 0.17810253697300749
Fold: 4 score: 0.180558028921446


In [75]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.9600000000000002


We see that Lasso works almost as well for iris but CONSIDERABLY worse for wine. It really needs L2 on top of L1 to make the model work well.

Onto the next model! We will do Decision Trees on both of them to see if we can finally crack that wine dataset!

In [72]:
# First Iris

x = irisdata
y = iristarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [73]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = DecisionTreeClassifier()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 1.0
Fold: 1 score: 0.9333333333333333
Fold: 2 score: 1.0
Fold: 3 score: 0.8666666666666667
Fold: 4 score: 1.0


In [74]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.9600000000000002


In [82]:
# Now lest see if wine does any better

x = winedata
y = winetarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [83]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = DecisionTreeClassifier()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.7715384615384615
Fold: 1 score: 0.7692307692307693
Fold: 2 score: 0.7707692307692308
Fold: 3 score: 0.7729022324865281
Fold: 4 score: 0.7727272727272727


In [84]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.7714335933504526


Nope! Hmmm Well maybe one of the other models will do better!

Lets try SVM!

In [85]:
# First iris because of my ocd

x = irisdata
y = iristarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [88]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = LinearSVC()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 1.0
Fold: 1 score: 0.9666666666666667
Fold: 2 score: 1.0
Fold: 3 score: 0.8666666666666667
Fold: 4 score: 1.0


In [None]:
print('Avg Accuracy RF', score / folds.n_splits)

same old same old.

In [89]:
# Moment of truth!

x = winedata
y = winetarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [90]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = LinearSVC()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.6776923076923077
Fold: 1 score: 0.6876923076923077
Fold: 2 score: 0.6892307692307692
Fold: 3 score: 0.41647421093148573
Fold: 4 score: 0.6879815100154083


In [None]:
print('Avg Accuracy RF', score / folds.n_splits)

NOPE! Well lets try another. But first lets see how our ICU data does with SVM.

In [102]:
# This will take a LONG time!

x = data
y = ltarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [103]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = LinearSVC()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.19039924219207088
Fold: 1 score: 0.16166555221219214
Fold: 2 score: 0.33030104620871237
Fold: 3 score: 0.26998787980106154
Fold: 4 score: 0.2780935662737886


print('Avg Accuracy RF', score / folds.n_splits)

Well that was awful. It was almost as bad as random guessing!

We will try the ICU data later on with two more models to see if we can get it better. Next lets try wine with a RandomForest and hope that we get higher then .75!

In [104]:
x = winedata
y = winetarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [105]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = RandomForestClassifier()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.8046153846153846
Fold: 1 score: 0.81
Fold: 2 score: 0.813076923076923
Fold: 3 score: 0.8013856812933026
Fold: 4 score: 0.8235747303543913


In [106]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.8105305438680002


AN IMPROVEMENT!!! Yay, Randomforest is our best bet so far for wine. This means XGBoost will probably be the best for it.

In [107]:
# Lets try it on the CCD for both regular and SMOTE data. 

x = ccdata
y = cctarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [108]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = RandomForestClassifier()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.999420666409185
Fold: 1 score: 0.9994557775359011
Fold: 2 score: 0.9996313266972139
Fold: 3 score: 0.9995084355962852
Fold: 4 score: 0.9995259914678464


In [109]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.9995084395412863


In [128]:
x = ccd_SMOTE
y = cct_SMOTE
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
measured= np.zeros((x.shape[0]))
score = 0

In [129]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y)):
    model = RandomForestClassifier()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

KeyError: "None of [Int64Index([     0,      1,      2,      6,      7,      8,      9,     10,\n                11,     12,\n            ...\n            285702, 285703, 285704, 285705, 285706, 285707, 285709, 285710,\n            285712, 285714],\n           dtype='int64', length=228571)] are in the [columns]"

In [None]:
print('Avg Accuracy RF', score / folds.n_splits)

Cool We see a slight difference.

Now lets try K Nearest Neighbors to see if that will work on wine!

In [142]:
# Again Iris first :)

x = irisdata
y = iristarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [144]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = KNeighborsClassifier()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 1.0
Fold: 1 score: 0.9666666666666667
Fold: 2 score: 1.0
Fold: 3 score: 0.8666666666666667
Fold: 4 score: 1.0


In [145]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.9666666666666668


In [146]:
# WORK FOR ME!

x = winedata
y = winetarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [148]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = KNeighborsClassifier()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.6684615384615384
Fold: 1 score: 0.6684615384615384
Fold: 2 score: 0.7061538461538461
Fold: 3 score: 0.6789838337182448
Fold: 4 score: 0.6810477657935285


In [149]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.6806217045177393


:( Even worse. Looks like RandomForest will be the best wiht only 80%

In [150]:
# We will also try it for our ccd and smote ccd

x = ccdata
y = cctarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [152]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = KNeighborsClassifier()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

KeyboardInterrupt: 

In [None]:
print('Avg Accuracy RF', score / folds.n_splits)

In [157]:
x = ccd_SMOTE
y = cct_SMOTE
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [159]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y)):
    model = KNeighborsClassifier()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

KeyboardInterrupt: 

In [None]:
print('Avg Accuracy RF', score / folds.n_splits)

In [162]:
# We will also try these methods for our big datasets to see if we get any improvement
# as they can deal with large data better then the others.

x = abs(ccdata)
y = cctarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [163]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = MultinomialNB()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.9988413328183702
Fold: 1 score: 0.9986131104947158
Fold: 2 score: 0.9989290918347641
Fold: 3 score: 0.9988764242200804
Fold: 4 score: 0.9989290918347641


In [187]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.9533333333333334


In [164]:
# Again for SMOTE

x = abs(ccd_SMOTE)
y = cct_SMOTE
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [165]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y)):
    model = MultinomialNB()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.9403786924261515
Fold: 1 score: 0.9403611927761445
Fold: 2 score: 0.9406926482683793
Fold: 3 score: 0.9400091001365021
Fold: 4 score: 0.9407091106366595


In [None]:
print('Avg Accuracy RF', score / folds.n_splits)

In [168]:
# Now our BIG big boy

x = abs(data)
y = ltarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [169]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = MultinomialNB()
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.2706237985122447
Fold: 1 score: 0.26834670678702777
Fold: 2 score: 0.2708440717161445
Fold: 3 score: 0.2681350218024268
Fold: 4 score: 0.26852238847557747


In [170]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.2692943974586842


In [184]:
# Now lets end this with a 1 v rest method.
# It creates a seperate classifier for each class. So lets run it on our multiclass problems

x = irisdata
y = iristarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [185]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = OneVsRestClassifier(estimator=RandomForestClassifier(), n_jobs=-1)
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 1.0
Fold: 1 score: 0.9333333333333333
Fold: 2 score: 0.9666666666666667
Fold: 3 score: 0.8666666666666667
Fold: 4 score: 1.0


In [186]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.9533333333333334


In [179]:
# LAST ONE!

x = data
y = ltarget
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=59)
predicted = np.zeros((test.shape[0],2))
measured= np.zeros((x.shape[0]))
score = 0

In [180]:
for times, (trn_idx, val_idx) in enumerate(folds.split(x.values,y.values)):
    model = OneVsRestClassifier(estimator=RandomForestClassifier(), n_jobs=-1)
    model.fit(x.iloc[trn_idx],y[trn_idx])
    measured[val_idx] = model.predict(x.iloc[val_idx])
    score += model.score(x.iloc[val_idx],y[val_idx])
    print("Fold: {} score: {}".format(times,model.score(x.iloc[val_idx],y[val_idx])))

    gc.collect()

Fold: 0 score: 0.9942746496531357
Fold: 1 score: 0.9935500947286303
Fold: 2 score: 0.9944973043756878
Fold: 3 score: 0.9941488694779955
Fold: 4 score: 0.9948870127337067


In [181]:
print('Avg Accuracy RF', score / folds.n_splits)

Avg Accuracy RF 0.9942715861938313


This last one is amazing results! Maaaaayyyybbbbeeee too amazing. We could use our validation data to test for overfitting but Its safe to assume that it is overfitting as our best model in the competition only got 85%!!!

That will be the last test we do for this notebook, To see the Deep learning or XGBoost/LightGBM notebook check those out!

Thanks for reading :)