# Linear Modeling

In [1]:
# importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# importing automation libraries
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# importing dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# importing models
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Ignore futurewarnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# reading in the data
columns = pd.read_csv('data/features2.txt', header = None)

X_train = pd.read_csv('data/Train/X_train.txt', names = columns[0], sep=' ')
X_test = pd.read_csv('data/Test/X_test.txt', names = columns[0], sep=' ')

y_train = pd.read_csv('data/Train/y_train.txt', sep=' ', header = None)
y_test = pd.read_csv('data/Test/y_test.txt', sep=' ', header = None)

In [3]:
X = pd.concat([X_train,X_test])
y = pd.concat([y_train,y_test])

**PLAN**

try a quick pipeline with dim red and a linear model

makes use of what we learned about gridsearchCV to iterate through different models quickly

In [4]:
estimators = [
    ('dim_reduction', PCA()),
    ('model', svm.SVC())
]
pipe = Pipeline(estimators)

param_grid = [
            {
                'dim_reduction': [TSNE(),PCA()],
                'dim_reduction__n_components': [3, 5, 10],
                'model': [DecisionTreeClassifier()], 
                'model__max_depth': range(1, 10),
            },
            {
                'dim_reduction': [TSNE(),PCA()],
                'dim_reduction__n_components': [3, 5, 10],
                'model': [LogisticRegression()], 
                'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
            },
             {
                'dim_reduction': [None],
                'model': [KNeighborsClassifier()], 
                'model__n_neighbors': range(1, 40, 2),
            }
] 

grid = GridSearchCV(pipe, param_grid, cv=5, verbose=1)

fittedgrid = grid.fit(X_train, y_train)

Fitting 5 folds for each of 110 candidates, totalling 550 fits


In [5]:
fittedgrid.best_estimator_

Pipeline(steps=[('dim_reduction', None),
                ('model', KNeighborsClassifier(n_neighbors=9))])

In [6]:
# all the 110 mean CV scores for the different param combinations
fittedgrid.cv_results_['mean_test_score']

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan, 0.34067206, 0.59070558, 0.69383516,
       0.69177678, 0.71108926, 0.71752775, 0.7112192 , 0.71752634,
       0.71173185, 0.34067206, 0.59070558, 0.7069639 , 0.70400603,
       0.71714016, 0.73825775, 0.7485565 , 0.74559739, 0.73774477,
       0.34067206, 0.59598229, 0.71340338, 0.73310237, 0.75627287,
       0.76477619, 0.76619214, 0.76632134, 0.76619513,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan, 0.71031242, 0.7472661 , 0.76014

In [7]:
# final reported performance of the best estimator
fittedgrid.score(X_test, y_test)

0.8889943074003795