# Linear Modeling

In [41]:
# importing basic libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# importing automation libraries
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# importing dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# importing models
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Ignore futurewarnings
import warnings
warnings.filterwarnings('ignore')

In [29]:
# reading in the data
columns = pd.read_csv('data/features2.txt', header = None)

X_train = pd.read_csv('data/Train/X_train.txt', names = columns[0], sep=' ')
X_test = pd.read_csv('data/Test/X_test.txt', names = columns[0], sep=' ')

y_train = pd.read_csv('data/Train/y_train.txt', sep=' ', header = None)
y_test = pd.read_csv('data/Test/y_test.txt', sep=' ', header = None)

In [56]:
X = pd.concat([X_train,X_test])
y = pd.concat([y_train,y_test])

**PLAN**

try a quick pipeline with dim red and a linear model

makes use of what we learned about gridsearchCV to iterate through different models quickly

In [73]:
estimators = [
    ('dim_reduction', PCA()),
    ('model', svm.SVC())
]
pipe = Pipeline(estimators)

param_grid = [
            {
                'dim_reduction': [TSNE(),PCA()],
                'dim_reduction__n_components': [3, 5, 10],
                'model': [DecisionTreeClassifier()], 
                'model__max_depth': range(1, 10),
            },
            {
                'dim_reduction': [TSNE(),PCA()],
                'dim_reduction__n_components': [3, 5, 10],
                'model': [LogisticRegression()], 
                'model__C': [0.001, 0.01, 0.1, 1, 10, 100],
            },
             {
                'dim_reduction': [None],
                'model': [KNeighborsClassifier()], 
                'model__n_neighbors': range(1, 40, 2),
            }
] 

grid = GridSearchCV(pipe, param_grid, cv=5, verbose=1)

fittedgrid = grid.fit(X_train, y_train)

Fitting 5 folds for each of 110 candidates, totalling 550 fits


In [74]:
fittedgrid.best_estimator_

Pipeline(steps=[('dim_reduction', None),
                ('model', KNeighborsClassifier(n_neighbors=9))])

In [75]:
# all the 110 mean CV scores for the different param combinations
fittedgrid.cv_results_['mean_test_score']

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan, 0.34067206, 0.59070558, 0.69383516,
       0.69177678, 0.71108926, 0.71752775, 0.7112192 , 0.71829854,
       0.71121663, 0.34067206, 0.59070558, 0.7069639 , 0.70387716,
       0.71932806, 0.73825725, 0.74598216, 0.74521087, 0.7380006 ,
       0.34067206, 0.59585359, 0.71353216, 0.73297359, 0.75730313,
       0.76245934, 0.76529083, 0.76876697, 0.76246025,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan,        nan,
              nan,        nan, 0.71005494, 0.74791009, 0.76065

In [76]:
# final reported performance of the best estimator
fittedgrid.score(X_test, y_test)

0.8889943074003795

In [46]:
def PlotBoundaries(model, X, Y, plotsize=(6,4)) :
    '''
    Helper function that plots the decision boundaries of a model and data (X,Y)
    code modified from: https://scikit-learn.org/stable/auto_examples/neighbors/plot_classification.html
    '''
    
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1,X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.01), np.arange(y_min, y_max, 0.01))

    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    plt.figure(figsize=plotsize)
    plt.contourf(xx, yy, Z, alpha=0.4)

    #Plot
    plt.scatter(X[:, 0], X[:, 1], c=Y,s=20, edgecolor='k')
    plt.show()

In [None]:
KNN_model = KNeighborsClassifier(n_neighbors=3)
KNN_model.fit(X_train, y_train)

# Score the model on the test set
test_predictions = KNN_model.predict(X_test)
test_accuracy = accuracy_score(test_predictions, y_test)
print(f"Test set accuracy: {test_accuracy}")

PlotBoundaries(KNN_model, X, y, plotsize=(10,7))