# 3. Modelling

## 3.1 Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score
# from sklearn.metrics import f1_score
# from sklearn.naive_bayes import GaussianNB

In [2]:
# Read the feature table and the label table from their CSV files.
x_train, y_train = (
    pd.read_csv(csv_path)
    for csv_path in ('3.x_train_data.csv', '3.y_train_data.csv')
)

In [3]:
# Sanity check: features and labels should have the same number of rows.
print(x_train.shape, y_train.shape, sep='\n')

(7524, 231)
(7524, 1)


## 3.2 Train and Validation Split

In [4]:
# Hold out 25% of the rows as a validation set; the 'score' column is the target.
# NOTE(review): this rebinds x_train / y_train, so the cell is not re-runnable
# without reloading the CSVs first (y_train becomes a Series and
# y_train['score'] then fails).
# NOTE(review): the split is not stratified although the CV folds are; consider
# passing stratify= if the class balance matters.
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train['score'],
    random_state=42,
    test_size=0.25,
)

In [5]:
# Shared cross-validation splitter for every grid search below:
# 5 stratified folds, shuffled with a fixed seed so the folds are repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [6]:
# Search space for the decision tree: leaf-size floor and depth cap.
params = {
    'min_samples_leaf': [1, 2, 3, 4, 5, 10, 15, 30, 50],
    'max_depth': [2, 4, 6, 7, 8, 9, 10],
}

In [7]:
# Base decision tree with constructor defaults; the seed is fixed for repeatable results.
dec_tree = DecisionTreeClassifier(random_state=42)

In [8]:
# Fit the tree, as constructed, on the training split.
# fit() returns the estimator itself, so both names refer to the same object.
# NOTE(review): GridSearchCV fits clones of the estimator it receives, so this
# fit does not feed into the search below; confirm whether it is still needed.
dec_tree.fit(x_train, y_train)
fitted_dec_tree = dec_tree

In [9]:
# Grid search over `params` for the decision tree, ranked by accuracy on the
# stratified folds in `skf`; n_jobs=-1 uses all available cores.
opt_model = GridSearchCV(
    estimator=fitted_dec_tree,
    param_grid=params,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
)

In [10]:
# Run the cross-validated search on the training split.
# fit() hands back the search object itself, so best_tree is the fitted
# GridSearchCV (best_estimator_ and score() are read from it), not a bare tree.
opt_model.fit(x_train, y_train)
best_tree = opt_model

In [11]:
# Show the winning tree configuration, then its score on the training split
# and on the held-out validation split (one value per line).
print(
    best_tree.best_estimator_,
    best_tree.score(x_train, y_train),
    best_tree.score(x_val, y_val),
    sep='\n',
)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=8, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best')
0.43930533404217614
0.366294524189261


In [12]:
# Search space for the random forest (same values as the decision tree grid).
params = {
    'min_samples_leaf': [1, 2, 3, 4, 5, 10, 15, 30, 50],
    'max_depth': [2, 4, 6, 7, 8, 9, 10],
}

In [13]:
# Base random forest with constructor defaults; the seed is fixed for repeatable results.
ran_for = RandomForestClassifier(random_state=42)

In [14]:
# Fit the forest, as constructed, on the training split.
# fit() returns the estimator itself, so both names refer to the same object.
# NOTE(review): GridSearchCV fits clones of the estimator it receives, so this
# fit does not feed into the search below; confirm whether it is still needed.
ran_for.fit(x_train, y_train)
fitted_ran_for = ran_for

In [15]:
# Grid search over `params` for the random forest, ranked by accuracy on the
# stratified folds in `skf`; n_jobs=-1 uses all available cores.
opt_model = GridSearchCV(
    estimator=fitted_ran_for,
    param_grid=params,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
)

In [16]:
# Run the cross-validated search on the training split.
# fit() hands back the search object itself, so best_forest is the fitted
# GridSearchCV (best_estimator_ and score() are read from it), not a bare forest.
opt_model.fit(x_train, y_train)
best_forest = opt_model

In [17]:
# Show the winning forest configuration, then its score on the training split
# and on the held-out validation split (one value per line).
print(
    best_forest.best_estimator_,
    best_forest.score(x_train, y_train),
    best_forest.score(x_val, y_val),
    sep='\n',
)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=10, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)
0.6673755094807726
0.47527910685805425


In [18]:
# Empty grid: no hyperparameters are tuned for logistic regression, so the
# search only cross-validates the estimator as constructed.
params = dict()

In [19]:
# Logistic regression with the solver capped at 1000 iterations and a fixed seed.
log_reg = LogisticRegression(random_state=42, max_iter=1000)

In [20]:
# Fit the logistic regression, as constructed, on the training split.
# fit() returns the estimator itself, so both names refer to the same object.
# NOTE(review): GridSearchCV fits clones of the estimator it receives, so this
# fit does not feed into the search below; confirm whether it is still needed.
log_reg.fit(x_train, y_train)
fitted_log_reg = log_reg

In [21]:
# Cross-validated evaluation of the logistic regression via GridSearchCV,
# scored by accuracy on the stratified folds in `skf`; n_jobs=-1 uses all cores.
opt_model = GridSearchCV(
    estimator=fitted_log_reg,
    param_grid=params,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
)

In [22]:
# Run the cross-validated search on the training split.
# fit() hands back the search object itself, so best_log_reg is the fitted
# GridSearchCV (best_estimator_ and score() are read from it), not a bare model.
opt_model.fit(x_train, y_train)
best_log_reg = opt_model

In [23]:
# Show the selected logistic regression, then its score on the training split
# and on the held-out validation split (one value per line).
print(
    best_log_reg.best_estimator_,
    best_log_reg.score(x_train, y_train),
    best_log_reg.score(x_val, y_val),
    sep='\n',
)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)
0.5996810207336523
0.5199362041467305


In [24]:
# Empty grid: no hyperparameters are tuned for the SVM, so the search only
# cross-validates the estimator as constructed.
params = dict()

In [25]:
# Support vector classifier with constructor defaults.
# NOTE(review): per the scikit-learn docs, SVC's random_state only matters when
# probability=True, so the seed presumably has no effect here; confirm.
svm = SVC(random_state=42)

In [26]:
# Fit the SVM, as constructed, on the training split.
# fit() returns the estimator itself, so both names refer to the same object.
# NOTE(review): GridSearchCV fits clones of the estimator it receives, so this
# fit does not feed into the search below; confirm whether it is still needed.
svm.fit(x_train, y_train)
fitted_svm = svm

In [27]:
# Cross-validated evaluation of the SVM via GridSearchCV, scored by accuracy
# on the stratified folds in `skf`; n_jobs=-1 uses all available cores.
opt_model = GridSearchCV(
    estimator=fitted_svm,
    param_grid=params,
    cv=skf,
    scoring='accuracy',
    n_jobs=-1,
)

In [28]:
# Run the cross-validated search on the training split.
# fit() hands back the search object itself, so best_svm is the fitted
# GridSearchCV (best_estimator_ and score() are read from it), not a bare SVC.
opt_model.fit(x_train, y_train)
best_svm = opt_model

In [29]:
# Show the selected SVM, then its score on the training split and on the
# held-out validation split (one value per line).
print(
    best_svm.best_estimator_,
    best_svm.score(x_train, y_train),
    best_svm.score(x_val, y_val),
    sep='\n',
)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)
0.8312954102427786
0.5178096757044125
