# Grid Search to determine the best parameters and best model to predict posts

I will take my cleaned data frame and use a gridsearch to determine the best parameter/ model combinations to use for my data for logisitic regression and k nearest neighbors (some of the only classification models I knew at the time I started this notebook)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

In [2]:
posts_df = pd.read_csv('./data/cleaned_reddit_posts.csv')

In [3]:
posts_df.shape

(15137, 6)

In [4]:
posts_df.head()

Unnamed: 0.1,Unnamed: 0,author,subreddit,title,title_length,title_word_count
0,0,ManofTheNightsWatch,0,Severely Injured Woman Heroically Fights Off P...,93,14
1,2,Sanlear,0,Frat President Chews Out Brothers Infected Wit...,107,15
2,3,aresef,0,Jim Harbaugh Annoyed He Only Got $5.89 For Sel...,93,15
3,4,aresef,0,Congressional Republicans Grill Postmaster Gen...,89,9
4,5,aresef,0,Watchdog Groups Urge Bob Evans To Create Emerg...,93,16


### I will follow the 4.06 (hyperparameters) and 5.04 (NLP 2) lessons

### Split data into train and test

In [5]:
#I will just use the title with my grid search to find my best model. 
#Then I will scale and add my title length/ word counts to my most accurate model to see if I can improve it further
X = posts_df['title']
y = posts_df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=17, stratify=y)

In [6]:
#determine a baseline from our test data
y_test.value_counts(normalize=True)

1    0.569617
0    0.430383
Name: subreddit, dtype: float64

### Set up two pipelines - one for K Nearest Neighbors and one for Logistic Regression

In [7]:
#Create pipes 
pipe_knn = Pipeline([
    ('cvec', CountVectorizer()),
    ('knn', KNeighborsClassifier())
])

pipe_lr = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression())
])

In [8]:
#set parameters to search in each pipe
knn_params = {
    'cvec__max_features':[2000,3000,4000,5000], 
    'cvec__min_df':[2,3], 
    'cvec__max_df':[.90,.95], 
    'cvec__ngram_range':[(1,1), (1,2), (1,3)],
    'knn__n_neighbors': range(1,51,5),
    'knn__metric': ['euclidean', 'manhattan']
}

lr_params = {
    'cvec__max_features':[2000,3000,4000,5000], 
    'cvec__min_df':[2,3], 
    'cvec__max_df':[.90,.95], 
    'cvec__ngram_range':[(1,1), (1,2), (1,3)] 
}

In [9]:
#instantiate the gridsearches
gs_knn = GridSearchCV(pipe_knn,
                     param_grid = knn_params,
                     cv=5)
gs_lr = GridSearchCV(pipe_lr,
                    param_grid = lr_params,
                    cv=5)

In [11]:
#fit train data to each
gs_knn.fit(X_train, y_train)


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [12]:
gs_lr.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [13]:
#determine the best scores from each model
print(f'Best KNN Score: {gs_knn.best_score_} \n Best LR Score: {gs_lr.best_score_}')

Best KNN Score: 0.6764453160613519 
 Best LR Score: 0.8532416195780158


In [15]:
#Score both on train
print(f'KNN Train Score {gs_knn.score(X_train, y_train)}')
print(f'LR Train Score {gs_lr.score(X_train, y_train)}')

KNN Train Score 0.791138125440451
LR Train Score 0.9499647639182522


In [16]:
#Score both on train
print(f'KNN Test Score {gs_knn.score(X_test, y_test)}')
print(f'LR Test Score {gs_lr.score(X_test, y_test)}')

KNN Test Score 0.6792602377807133
LR Test Score 0.8557463672391017


In [17]:
#Get the best parameters for each model
gs_knn.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.9,
                                 max_features=2000, min_df=2,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='euclidean', metric_params=None,
                                      n_jobs=None, n_neighbors=6, p=2,
                                      weights='uniform'))],
         verbose=False)

In [18]:
gs_lr.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.9,
                                 max_features=5000, min_df=2,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                           

I've now got the best parameters for both knn and lr models for this data. The variance is too high in both, but I have not removed stop words. I will try that next, using one notebook for each model! 
I will start with logistic regression as that has the higher accuracy scores.