# Load Data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the two CSV splits from the local ./data directory.
train_df = pd.read_csv(filepath_or_buffer='./data/train.csv')
test_df = pd.read_csv(filepath_or_buffer='./data/test.csv')

# Stack the train rows above the test rows. Each frame keeps its own row
# labels, so labels can repeat in the combined index.
all_df = pd.concat(objs=[train_df, test_df])

In [None]:
# Column-name constant; no other visible cell references it yet.
# NOTE(review): presumably the label column of the CSV data - confirm.
target_column = 'TARGET'

# Data Preprocessing and Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder, LabelBinarizer

# Model Selection

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit, LeaveOneOut, StratifiedKFold
from sklearn.metrics import accuracy_score

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Candidate models to compare. Every estimator runs with the library
# defaults except for the keywords spelled out here.
classifiers = [
    LogisticRegression(),
    RandomForestClassifier(
        random_state=2,
        max_depth=2,
        n_estimators=100,
    ),
    KNeighborsClassifier(n_neighbors=3),
    SVC(),
]

In [None]:
# Empty results table with one column for the classifier name and one for
# its accuracy.
log_cols = [
    'Classifier',
    'Accuracy',
]
log = pd.DataFrame(data=None, columns=log_cols)

In [None]:
# Toy data set: a single feature holding 1..11 and a label that alternates
# 1, 2, 1, 2, ... starting (and ending) with 1.
X = pd.DataFrame(list(range(1, 12)))
y = pd.DataFrame([1 if position % 2 == 0 else 2 for position in range(11)])

In [None]:
# Number of stratified folds; also used later as the divisor when the
# per-fold accuracies are averaged.
n_splits = 2
splitter = StratifiedKFold(
    n_splits=n_splits,
)

# Running accuracy total per classifier name.
acc_dict = dict()

In [None]:
# Fit every classifier on each stratified fold and sum its per-fold accuracy
# under the classifier's class name.
#
# Start from an empty dict so that re-running this cell does not add fresh
# scores on top of totals (or averages) left over from a previous run.
acc_dict = {}

for train_index, test_index in splitter.split(X, y):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # y is a single-column DataFrame, but scikit-learn expects 1-D targets
    # and emits a DataConversionWarning for column vectors, so flatten once
    # per fold.
    y_train_1d = y_train.values.ravel()
    y_test_1d = y_test.values.ravel()

    for clf in classifiers:
        # NOTE: totals are keyed by class name, so two estimators of the
        # same class in `classifiers` would share a single entry.
        name = clf.__class__.__name__
        clf.fit(X_train, y_train_1d)

        predictions = clf.predict(X_test)
        acc_score = accuracy_score(y_test_1d, predictions)

        acc_dict[name] = acc_dict.get(name, 0.0) + acc_score

# Convert the summed per-fold accuracies into means over the folds
# (in place, so acc_dict holds the averages afterwards).
for clf in acc_dict:
    acc_dict[clf] = acc_dict[clf] / n_splits

# DataFrame.append was removed in pandas 2.0, so build all rows in one frame
# and combine with pd.concat instead of appending row by row.
log_entries = pd.DataFrame(list(acc_dict.items()), columns=log_cols)

# Skip the concat while `log` is still empty: mixing in an empty object-dtype
# frame can turn the Accuracy column into object dtype. ignore_index gives
# the table a unique 0..n-1 index instead of repeating label 0 on every row.
log = log_entries if log.empty else pd.concat([log, log_entries], ignore_index=True)
    
# Bar chart of the logged results: accuracy on the x-axis, one bar per
# classifier on the y-axis, drawn in the "muted" palette's blue.
sns.set_color_codes("muted")
sns.barplot(data=log, x='Accuracy', y='Classifier', color="b")

# Label the axes that the bar plot was drawn on.
plt.xlabel('Accuracy')
plt.title('Classifier Accuracy')

# Last expression of the cell: show the results table under the chart.
log