# Model Iterations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time
import nltk

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression

from sklearn.neighbors import KNeighborsClassifier
from mlxtend.preprocessing import DenseTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.naive_bayes import MultinomialNB

from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier

from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

import pickle
import gzip

In [2]:
# Load the cleaned Reddit posts: the post title is the feature, the subreddit label is the target
posts = pd.read_csv("../data/cleaned_reddit_posts.csv")
X, y = posts["title"], posts["subreddit"]

# Hold out 25% of the posts for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=2020,
)

# Class proportions in the test set -- the majority share is the baseline accuracy to beat
y_test.value_counts(normalize=True)

0    0.506923
1    0.493077
Name: subreddit, dtype: float64

In [3]:
# Start from NLTK's English stopword list
stopwords = nltk.corpus.stopwords.words('english')
# Extend it with eli5, aita, wibta and friend so the vectorizers drop those tokens too
stopwords += ['eli5', 'aita', 'wibta', 'friend']

### Logistic Regression

In [4]:
# Running log of the logistic regression configurations tried so far
logreg_model_df = pd.read_csv("../data/logreg_model_params.csv")

# Index label of the last row in logreg_model_df; used to number the next model run
logreg_count = logreg_model_df.index.values[-1]

In [5]:
# Bag-of-words counts feeding an L2-penalised logistic regression
logreg_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords)),
    ('logreg', LogisticRegression(penalty="l2",solver="liblinear"))
])

# One candidate per hyperparameter: this cell fits and logs a single configuration
logreg_pipe_params = {
    "cvec__max_features" : [2500],
    "cvec__min_df" : [2],
    "cvec__max_df" : [.80],
    "logreg__C" : [.5]
}

t0 = time.time()

logreg_gs = GridSearchCV(logreg_pipe,param_grid=logreg_pipe_params,cv=5,verbose=1)
logreg_gs.fit(X_train,y_train)

best_logreg = logreg_gs.best_estimator_

# NOTE: every run of this cell bumps the counter and appends one more row to the log
logreg_count += 1

# Build the log row from a copy of the winning params. Writing the scores into
# logreg_gs.best_params_ itself would leave the fitted search holding keys that
# are not pipeline parameters.
logreg_row = {
    **logreg_gs.best_params_,
    "train score": best_logreg.score(X_train,y_train),
    "test score": best_logreg.score(X_test,y_test),
}
temp_dict = {f'model_{logreg_count}': logreg_row}

temp_df = pd.DataFrame.from_dict(temp_dict, orient='index')
logreg_model_df = pd.concat([logreg_model_df,temp_df])

t1 = time.time()
print(f'This took {t1-t0} seconds!')

logreg_model_df

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.8s finished


This took 1.2991981506347656 seconds!


Unnamed: 0,cvec__max_df,cvec__max_features,cvec__min_df,train score,test score,logreg__C
0,0.7,3000,2,0.97,0.923462,
1,0.7,2500,3,0.966667,0.922308,
2,0.7,2000,4,0.961026,0.919615,
3,0.6,2000,4,0.961026,0.919615,
4,0.8,1500,3,0.951154,0.913077,
5,0.8,2000,4,0.961026,0.919615,1.0
6,0.8,2000,3,0.950769,0.915385,0.5
7,0.8,2000,3,0.926538,0.901154,0.1
8,0.8,2500,3,0.929359,0.904231,0.1
9,0.8,3000,3,0.932821,0.905,0.1


In [6]:
# Write the updated logistic regression run log back to disk; index=False keeps the row labels out of the CSV
logreg_model_df.to_csv("../data/logreg_model_params.csv",index=False)

### KNN

In [7]:
# KNN pipeline: count features -> dense array -> standardised columns -> classifier
knn_steps = [
    ('cvec', CountVectorizer(stop_words=stopwords)),
    ('to_dense', DenseTransformer()),
    ('ss', StandardScaler(with_mean=True)),
    ('knn', KNeighborsClassifier()),
]
knn_pipe = Pipeline(knn_steps)

# Full grid kept for reference only (commented out):
# knn_pipe_params = {
#     "cvec__max_features" : [1000,2000,3000],
#     "cvec__min_df" : [2,3,4],
#     "cvec__max_df" : [.7,.8,.9],
#     "knn__n_neighbors" : [3,11,25],
#     'knn__weights': ["uniform","distance"]
# }

This took 4418.813382148743 seconds!
 Best parameters {'cvec__max_df': 0.7, 'cvec__max_features': 1000, 'cvec__min_df': 4, 'knn__n_neighbors': 11, 'knn__weights': 'distance'}
 Training score 0.9876190476190476
 Testing score 0.8447619047619047

In [8]:
#ran a gridsearch to search hyperparameters for KNN
# knn_gs = GridSearchCV(knn_pipe,param_grid=knn_pipe_params,cv=5,verbose=2)
# knn_gs.fit(X_train,y_train)

In [9]:
# best_knn = knn_gs.best_estimator_

# t1 = time.time()
# print(f'This took {t1-t0} seconds!')
# print(f' Best parameters {knn_gs.best_params_}')
# print(f' Training score {best_knn.score(X_train,y_train)}')
# print(f' Testing score {best_knn.score(X_test,y_test)}')

In [10]:
t0 = time.time()

# Single-candidate grid for the KNN pipeline
knn_pipe_params = {
    "cvec__max_features": [1000],
    "cvec__min_df": [4],
    "cvec__max_df": [.7],
    "knn__n_neighbors": [11],
    "knn__weights": ["distance"],
}

knn_gs = GridSearchCV(
    knn_pipe,
    param_grid=knn_pipe_params,
    cv=5,
    verbose=1,
)
knn_gs.fit(X_train, y_train)

best_knn = knn_gs.best_estimator_

# The timer stops before scoring, so the reported duration covers the search only
t1 = time.time()
print(f'This took {t1-t0} seconds!')
print(f' Best parameters {knn_gs.best_params_}')

knn_train_score = best_knn.score(X_train, y_train)
print(f' Training score {knn_train_score}')

knn_test_score = best_knn.score(X_test, y_test)
print(f' Testing score {knn_test_score}')

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.7min finished


This took 107.95248818397522 seconds!
 Best parameters {'cvec__max_df': 0.7, 'cvec__max_features': 1000, 'cvec__min_df': 4, 'knn__n_neighbors': 11, 'knn__weights': 'distance'}
 Training score 0.9793589743589743
 Testing score 0.8511538461538461


Overall, KNN has too much variance: it overfits the training data (about 0.98 training accuracy versus 0.85 testing accuracy).

### Naive Bayes

In [11]:
# Load the running log of Naive Bayes configurations; new runs are appended to it further down
nb_model_df = pd.read_csv("../data/nb_model_para.csv")

In [12]:
# Index label of the last row in nb_model_df; used to number the next model run
nb_count = nb_model_df.index.values[-1]

In [13]:
t0 = time.time()

# Bag-of-words counts feeding a multinomial Naive Bayes classifier
nb_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords)),
    ('nb', MultinomialNB())
])

# One candidate per hyperparameter: this cell fits and logs a single configuration
nb_pipe_params = {
    "cvec__max_features" : [2500],
    "cvec__min_df" : [3],
    "cvec__max_df" : [.8],
}

nb_gs = GridSearchCV(nb_pipe,param_grid=nb_pipe_params,cv=5,verbose=1)
nb_gs.fit(X_train,y_train)

best_nb = nb_gs.best_estimator_

# NOTE: every run of this cell bumps the counter and appends one more row to the log
nb_count += 1

# Build the log row from a copy of the winning params. Writing the scores into
# nb_gs.best_params_ itself would leave the fitted search holding keys that
# are not pipeline parameters.
nb_row = {
    **nb_gs.best_params_,
    "train score": best_nb.score(X_train,y_train),
    "test score": best_nb.score(X_test,y_test),
}
temp_dict = {f'model_{nb_count}': nb_row}

temp_df = pd.DataFrame.from_dict(temp_dict, orient='index')
nb_model_df = pd.concat([nb_model_df,temp_df])

t1 = time.time()
print(f'This took {t1-t0} seconds!')

nb_model_df

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    1.0s finished


This took 1.5051231384277344 seconds!


Unnamed: 0,cvec__max_df,cvec__max_features,cvec__min_df,train score,test score
0,0.7,1500,3,0.959365,0.918095
1,0.6,2000,2,0.964444,0.914286
2,0.6,1500,3,0.959365,0.918095
3,0.7,1500,4,0.948889,0.909524
4,0.8,2000,3,0.949359,0.933846
5,0.8,2000,3,0.944872,0.929231
6,0.8,2500,3,0.952179,0.932308
7,0.8,2500,2,0.951538,0.931538
8,0.8,2500,3,0.952179,0.932308
model_9,0.8,2500,3,0.952179,0.932308


In [14]:
# Write the updated Naive Bayes run log back to disk; index=False keeps the row labels out of the CSV
nb_model_df.to_csv("../data/nb_model_para.csv",index=False)

Naive Bayes predicts subreddits about as well as logistic regression.

### Decision Tree

In [15]:
# Running log of the decision tree configurations tried so far
dt_model_df = pd.read_csv("../data/dt_model_params.csv")

# Index label of the last row in dt_model_df; used to number the next model run
dt_count = dt_model_df.index.values[-1]

In [16]:
# Bag-of-words counts feeding a single decision tree (seeded for reproducibility)
dt_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords)),
    ('dt', DecisionTreeClassifier(random_state=2020))
])

# One candidate per hyperparameter: this cell fits and logs a single configuration
dt_pipe_params = {
    "cvec__max_features" : [2500],
    "cvec__min_df" : [3],
    "cvec__max_df" : [.80],
    "dt__max_depth" : [250],
    "dt__min_samples_leaf" : [3],
    "dt__min_samples_split" : [20],
    "dt__ccp_alpha" : [0]
}

t0 = time.time()

dt_gs = GridSearchCV(dt_pipe,param_grid=dt_pipe_params,cv=5,verbose=1)
dt_gs.fit(X_train,y_train)

best_dt = dt_gs.best_estimator_

# NOTE: every run of this cell bumps the counter and appends one more row to the log
dt_count += 1

# Build the log row from a copy of the winning params. Writing the scores into
# dt_gs.best_params_ itself would leave the fitted search holding keys that
# are not pipeline parameters.
dt_row = {
    **dt_gs.best_params_,
    "train score": best_dt.score(X_train,y_train),
    "test score": best_dt.score(X_test,y_test),
}
temp_dict = {f'model_{dt_count}': dt_row}

temp_df = pd.DataFrame.from_dict(temp_dict, orient='index')
dt_model_df = pd.concat([dt_model_df,temp_df])

t1 = time.time()
print(f'This took {t1-t0} seconds!')

dt_model_df

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    2.1s finished


This took 2.9272680282592773 seconds!


Unnamed: 0,cvec__max_df,cvec__max_features,cvec__min_df,dt__ccp_alpha,dt__max_depth,dt__min_samples_leaf,dt__min_samples_split,train score,test score
0,0.8,2000,2,0,9,3,5,0.731282,0.727308
1,0.8,2000,2,0,20,3,5,0.813077,0.805385
2,0.8,2000,2,0,40,3,5,0.870641,0.859231
3,0.8,2000,2,0,60,3,5,0.891026,0.875
4,0.8,2000,2,0,100,3,5,0.911795,0.888462
5,0.8,2000,2,0,150,3,15,0.921795,0.895385
6,0.8,2000,2,0,200,3,20,0.921923,0.892308
7,0.8,3000,3,0,200,3,20,0.924231,0.891538
8,0.8,2500,3,0,250,3,20,0.92359,0.891538
9,0.8,2500,3,0,250,3,20,0.92359,0.891538


In [17]:
# Write the updated decision tree run log back to disk; index=False keeps the row labels out of the CSV
dt_model_df.to_csv("../data/dt_model_params.csv",index=False)

The decision tree provides a good (but not great) prediction of subreddits.

### Random Forest

In [18]:
# Running log of the random forest configurations tried so far
rf_model_df = pd.read_csv("../data/rf_model_params.csv")

# Index label of the last row in rf_model_df; used to number the next model run
rf_count = rf_model_df.index.values[-1]

In [19]:
# Bag-of-words counts feeding a random forest (seeded for reproducibility)
rf_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords)),
    ('rf', RandomForestClassifier(random_state=2020))
])

# One candidate per hyperparameter: this cell fits and logs a single configuration
rf_pipe_params = {
    "cvec__max_features" : [2500],
    "cvec__min_df" : [3],
    "cvec__max_df" : [.80],
    "rf__n_estimators" : [50],
    "rf__max_depth" : [225],
    "rf__min_samples_leaf" : [2],
    "rf__min_samples_split" : [25],
    "rf__ccp_alpha" : [0]
}

t0 = time.time()

rf_gs = GridSearchCV(rf_pipe,param_grid=rf_pipe_params,cv=5,verbose=1)
rf_gs.fit(X_train,y_train)

best_rf = rf_gs.best_estimator_

# NOTE: every run of this cell bumps the counter and appends one more row to the log
rf_count += 1

# Build the log row from a copy of the winning params. Writing the scores into
# rf_gs.best_params_ itself would leave the fitted search holding keys that
# are not pipeline parameters.
rf_row = {
    **rf_gs.best_params_,
    "train score": best_rf.score(X_train,y_train),
    "test score": best_rf.score(X_test,y_test),
}
temp_dict = {f'model_{rf_count}': rf_row}

temp_df = pd.DataFrame.from_dict(temp_dict, orient='index')
rf_model_df = pd.concat([rf_model_df,temp_df])

t1 = time.time()
print(f'This took {t1-t0} seconds!')

rf_model_df

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    3.7s finished


This took 5.077329397201538 seconds!


Unnamed: 0,cvec__max_df,cvec__max_features,cvec__min_df,rf__ccp_alpha,rf__max_depth,rf__min_samples_leaf,rf__min_samples_split,rf__n_estimators,train score,test score
0,0.8,2500,3,0,225,3,15,50,0.925513,0.902692
1,0.8,3000,3,0,200,3,20,50,0.922692,0.902308
2,0.8,2500,3,0,200,2,20,50,0.930641,0.904615
3,0.8,2500,3,0,225,2,25,50,0.934231,0.905769
4,0.8,2500,3,0,225,2,25,50,0.934231,0.905769
5,0.8,2500,3,0,225,2,25,50,0.923974,0.889615
model_6,0.8,2500,3,0,225,2,25,50,0.923974,0.889615


In [20]:
# Write the updated random forest run log back to disk; index=False keeps the row labels out of the CSV
rf_model_df.to_csv("../data/rf_model_params.csv",index=False)

**TODO:** write up the random forest model conclusions.

### AdaBoost Classifier

In [21]:
# Running log of the AdaBoost configurations tried so far
ada_model_df = pd.read_csv("../data/ada_model_params.csv")

# Index label of the last row in ada_model_df; used to number the next model run
ada_count = ada_model_df.index.values[-1]

In [22]:
# Bag-of-words counts feeding an AdaBoost ensemble (seeded for reproducibility)
ada_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords)),
    ('ada', AdaBoostClassifier(random_state=2020))
])

# One candidate per hyperparameter: this cell fits and logs a single configuration
ada_pipe_params = {
    "cvec__max_features" : [1500],
    "cvec__min_df" : [2],
    "cvec__max_df" : [.80],
    "ada__n_estimators" : [250],
}

t0 = time.time()

ada_gs = GridSearchCV(ada_pipe,param_grid=ada_pipe_params,cv=5,verbose=1)
ada_gs.fit(X_train,y_train)

best_ada = ada_gs.best_estimator_

# NOTE: every run of this cell bumps the counter and appends one more row to the log
ada_count += 1

# Build the log row from a copy of the winning params. Writing the scores into
# ada_gs.best_params_ itself would leave the fitted search holding keys that
# are not pipeline parameters.
ada_row = {
    **ada_gs.best_params_,
    "train score": best_ada.score(X_train,y_train),
    "test score": best_ada.score(X_test,y_test),
}
temp_dict = {f'model_{ada_count}': ada_row}

temp_df = pd.DataFrame.from_dict(temp_dict, orient='index')
ada_model_df = pd.concat([ada_model_df,temp_df])

t1 = time.time()
print(f'This took {t1-t0} seconds!')

ada_model_df

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    9.4s finished


This took 12.34099006652832 seconds!


Unnamed: 0,ada__n_estimators,cvec__max_df,cvec__max_features,cvec__min_df,train score,test score
0,100,0.8,2000,2,0.892436,0.876538
1,200,0.8,2000,3,0.923333,0.895769
2,250,0.8,1500,2,0.932436,0.898462
3,250,0.8,1500,2,0.932436,0.898462
4,250,0.8,1500,2,0.921026,0.883077
5,250,0.8,2000,2,0.921026,0.882308
6,250,0.8,1500,2,0.921026,0.883077
model_7,250,0.8,1500,2,0.921026,0.883077


In [23]:
# Write the updated AdaBoost run log back to disk; index=False keeps the row labels out of the CSV
ada_model_df.to_csv("../data/ada_model_params.csv",index=False)

### Support Vector Machine

In [24]:
# Running log of the SVM configurations tried so far
svm_model_df = pd.read_csv("../data/svm_model_params.csv")

# Index label of the last row in svm_model_df; used to number the next model run
svm_count = svm_model_df.index.values[-1]

In [26]:
# SVM pipeline: count features -> dense array -> standardised columns -> SVC.
# probability=True asks SVC to also fit probability estimates (predict_proba).
svm_pipe = Pipeline([
    ('cvec', CountVectorizer(stop_words=stopwords)),
    ('to_dense' , DenseTransformer()),
    ('ss', StandardScaler()),
    ('svm', SVC(probability=True))
])

# One candidate per hyperparameter: this cell fits and logs a single configuration.
# NOTE(review): no kernel is set, so SVC uses its default RBF kernel, for which
# `degree` is ignored (it only applies to kernel="poly"). The entry is kept so
# the logged columns stay the same -- set svm__kernel if a polynomial kernel
# was intended.
svm_pipe_params = {
    "cvec__max_features" : [2500],
    "cvec__min_df" : [2],
    "cvec__max_df" : [.80],
    "svm__C" : [1],
    "svm__degree" : [2],
}

t0 = time.time()

# Only 2 folds here (the other models use 5)
svm_gs = GridSearchCV(svm_pipe,param_grid=svm_pipe_params,cv=2,verbose=2)
svm_gs.fit(X_train,y_train)

best_svm = svm_gs.best_estimator_

# NOTE: every run of this cell bumps the counter and appends one more row to the log
svm_count += 1

# Build the log row from a copy of the winning params. Writing the scores into
# svm_gs.best_params_ itself would leave the fitted search holding keys that
# are not pipeline parameters.
svm_row = {
    **svm_gs.best_params_,
    "train score": best_svm.score(X_train,y_train),
    "test score": best_svm.score(X_test,y_test),
}
temp_dict = {f'model_{svm_count}': svm_row}

temp_df = pd.DataFrame.from_dict(temp_dict, orient='index')
svm_model_df = pd.concat([svm_model_df,temp_df])

t1 = time.time()
print(f'This took {t1-t0} seconds!')

svm_model_df

Fitting 2 folds for each of 1 candidates, totalling 2 fits
[CV] cvec__max_df=0.8, cvec__max_features=2500, cvec__min_df=2, svm__C=1, svm__degree=2 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  cvec__max_df=0.8, cvec__max_features=2500, cvec__min_df=2, svm__C=1, svm__degree=2, total= 4.0min
[CV] cvec__max_df=0.8, cvec__max_features=2500, cvec__min_df=2, svm__C=1, svm__degree=2 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  4.0min remaining:    0.0s


[CV]  cvec__max_df=0.8, cvec__max_features=2500, cvec__min_df=2, svm__C=1, svm__degree=2, total= 4.0min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  8.0min finished


This took 1340.445824623108 seconds!


Unnamed: 0,cvec__max_df,cvec__max_features,cvec__min_df,svm__C,train score,test score,svm__degree
0,0.8,2500,2,1,0.980128,0.917692,
1,0.8,2500,2,1,0.980128,0.917692,2.0
2,0.8,2500,2,1,0.980128,0.917692,2.0
3,0.8,2500,2,1,0.978205,0.913077,2.0
4,0.8,2500,2,1,0.978205,0.913077,2.0
model_5,0.8,2500,2,1,0.978205,0.913077,2.0


In [27]:
# Write the updated SVM run log back to disk; index=False keeps the row labels out of the CSV
svm_model_df.to_csv("../data/svm_model_params.csv",index=False)

## Aggregating using VotingClassifier

In [28]:
# Ensemble of the best logistic regression, Naive Bayes and SVM pipelines.
# The knn, dt, rf and ada pipelines are left out (commented) of the vote.
# No `voting` argument is passed, so scikit-learn's default ('hard', i.e.
# majority vote on predicted labels) applies.
vote = VotingClassifier(
    [
        ("logreg", best_logreg),
        # ("knn", best_knn),
        ("nb", best_nb),
        # ("dt", best_dt),
        # ("rf", best_rf),
        # ("ada", best_ada),
        ("svm", best_svm),
    ]
)

vote.fit(X_train, y_train)

vote_train_score = vote.score(X_train, y_train)
print(f'Training {vote_train_score}')

vote_test_score = vote.score(X_test, y_test)
print(f'Testing {vote_test_score}')

Training 0.9662820512820512
Testing 0.933076923076923


In [29]:
# References on pickling to a specific directory and on shrinking pickle files:
#https://stackoverflow.com/questions/17750422/how-to-pickle-an-object-to-a-certain-directory#:~:text=If%20you%20wish%20to%20save,added%20to%20a%20different%20machine.&text=Set%20root%20equal%20to%20your,%3D%20Path(%22.%22)
#https://stackoverflow.com/questions/18474791/decreasing-the-size-of-cpickle-objects

# The Naive Bayes and logistic regression pipelines are pickled uncompressed
plain_pickles = [
    (best_nb, '../data/saved_models/nb_model.pkl'),
    (best_logreg, '../data/saved_models/logreg_model.pkl'),
]
for fitted_model, model_path in plain_pickles:
    with open(model_path, 'wb') as model_file:
        pickle.dump(fitted_model, model_file)

# The SVM pickle is gzip-compressed to keep it under GitHub's 100MB file limit
with gzip.GzipFile('../data/saved_models/svm_model.pgz', 'w') as svm_file:
    pickle.dump(best_svm, svm_file)