## Feed Unprocessed Data into Classifiers, Score, and Measure Accuracy

In [80]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt, seaborn as sns
%matplotlib inline

In [62]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC 

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV


### Load the Data from Pickled DataFrames

In [63]:
cook_sample = pd.read_pickle('../assets/pickled_samples/cook_sample.p')
madelon_train10 = pd.read_pickle('../assets/pickled_samples/madelon_train_10.p')
madelon_train_label10 = pd.read_pickle('../assets/pickled_samples/madelon_train_label10.p')

**Madelon:** It's not necessary to load in the test set since that's the hold out data to test the classification model's accuracy. Train/test/split on the training data. 


### Run the Data through the Classifiers and obtain Train & Test scores

#### Madelon Dataset

In [64]:
madelon_train10.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
388,477,466,531,486,459,484,506,477,491,494,...,454,476,490,410,533,498,477,481,524,483
414,483,503,549,485,524,487,434,479,497,470,...,481,481,493,685,502,447,484,473,527,509
710,479,489,473,477,483,473,404,474,493,478,...,485,483,498,326,475,507,490,474,504,467
1165,483,491,471,485,494,474,490,477,470,479,...,493,476,464,281,527,487,491,468,467,485
1743,477,442,559,481,529,484,496,478,495,469,...,467,474,513,539,516,490,475,492,513,510


In [65]:
madelon_train10.shape

(200, 500)

In [66]:
madelon_train_label10.shape

(200,)

In [67]:
mad_X_train, mad_X_test, mad_y_train, mad_y_test = train_test_split(madelon_train10,\
                                                                    madelon_train_label10)

In [68]:
display(mad_X_train.shape)
display(mad_X_test.shape)
display(mad_y_train.shape)
display(mad_y_test.shape)

(150, 500)

(50, 500)

(150,)

(50,)

#### Madelon Dataset (Raw Benchmarking without any Preprocessing)
Uses the out of the box default parameters provided by `sklearn` for the selected classification models.

In [69]:
names_of_classifiers = ['LogisticRegression', 'KNeighbors', 'DecisionTree', 'SVClassifier']

classifiers = [
    LogisticRegression(n_jobs=-1, random_state=42),
    KNeighborsClassifier(n_jobs=-1),
    DecisionTreeClassifier(random_state=42),
    SVC(random_state=42)]

Store the results in a dictionary to subsequenty be able to throw the results to compare into a pandas DataFrame

In [76]:
raw_test_scores = {}
raw_train_scores = {}
raw_y_preds = {}

for name, clfr in zip(names_of_classifiers, classifiers):
    clfr.fit(mad_X_train, mad_y_train)
    
    train_score = clfr.score(mad_X_train, mad_y_train)
    test_score = clfr.score(mad_X_test, mad_y_test)
    y_pred = clfr.predict(mad_X_test)
    
    raw_train_scores[name] = train_score
    raw_test_scores[name] = test_score
    raw_y_preds[name] = y_pred
    

In [77]:
raw_test_scores

{'DecisionTree': 0.71999999999999997,
 'KNeighbors': 0.59999999999999998,
 'LogisticRegression': 0.68000000000000005,
 'SVClassifier': 0.40000000000000002}

In [78]:
raw_train_scores

{'DecisionTree': 1.0,
 'KNeighbors': 0.76000000000000001,
 'LogisticRegression': 1.0,
 'SVClassifier': 1.0}

In [79]:
raw_y_preds

{'DecisionTree': array([-1,  1, -1, -1,  1,  1, -1, -1, -1, -1, -1,  1, -1, -1,  1,  1,  1,
         1,  1, -1,  1,  1,  1,  1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,
         1,  1, -1,  1, -1,  1,  1, -1, -1,  1, -1,  1,  1,  1,  1, -1]),
 'KNeighbors': array([-1,  1, -1,  1,  1,  1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1, -1,
        -1,  1,  1,  1, -1,  1,  1, -1,  1, -1, -1,  1, -1, -1,  1,  1, -1,
         1,  1,  1,  1, -1, -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1]),
 'LogisticRegression': array([ 1,  1, -1,  1,  1,  1,  1,  1, -1, -1,  1,  1, -1,  1, -1,  1, -1,
         1, -1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,  1, -1,
        -1, -1, -1, -1,  1, -1,  1,  1, -1,  1,  1,  1, -1, -1,  1,  1]),
 'SVClassifier': array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])}

In [None]:
def calculate_log_loss (y_true, y_pred):
    