For each of the four models I ran, I tried both the CountVectorizer and the TfidfVectorizer. The better-performing vectorizer for each model is included in the main modeling notebook; this notebook contains the other version of each model.

In [2]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier

In [3]:
# Load the dataset from 'data.csv' (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='data.csv')

In [4]:
# Features come from the 'selftext' column; the target is the 'subreddit' column.
X, y = df['selftext'], df['subreddit']

In [5]:
# Split into train/test sets, stratified on y; random_state=5 makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=5, stratify=y
)

These are the same steps that were taken in the modeling notebook to prepare the models to run.

# AdaBoost with TF-IDF Vectorizer

In [91]:
# Two-step pipeline: TfidfVectorizer ('tvec') feeding AdaBoostClassifier ('model').
# NOTE(review): no random_state is set on the classifier, so results may vary
# slightly between runs.
pipe1 = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('model', AdaBoostClassifier()),
])

In [93]:
# Hyperparameter grid for pipe1; each key is '<step name>__<parameter>'.
pipe_params1 = dict(
    model__n_estimators=[50, 100],
    tvec__max_features=[2000, 3000, 4000, 5000],
    tvec__ngram_range=[(1, 1), (1, 2)],
    tvec__stop_words=[None, 'english'],
)

In [94]:
# Exhaustive search over pipe_params1 for pipe1, using 3-fold cross-validation.
gs1 = GridSearchCV(estimator=pipe1, param_grid=pipe_params1, cv=3)

In [95]:
# Run the grid search on the training split only.
gs1.fit(X=X_train, y=y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('model', AdaBoostClassifier())]),
             param_grid={'model__n_estimators': [50, 100],
                         'tvec__max_features': [2000, 3000, 4000, 5000],
                         'tvec__ngram_range': [(1, 1), (1, 2)],
                         'tvec__stop_words': [None, 'english']})

In [96]:
# Show the parameter combination that gave the best cross-validated score.
print(gs1.best_params_)

{'model__n_estimators': 100, 'tvec__max_features': 5000, 'tvec__ngram_range': (1, 1), 'tvec__stop_words': None}


In [97]:
# Show the mean cross-validated score of the best parameter combination.
print(gs1.best_score_)

0.8675572595834814


In [99]:
# (train score, test score) for the best model; a large gap suggests overfitting.
tuple(
    gs1.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

(0.9443485763589301, 0.8796895213454075)

# Random Forest with TF-IDF Vectorizer

In [120]:
# Two-step pipeline: TfidfVectorizer ('tvec') feeding RandomForestClassifier ('model').
# NOTE(review): no random_state is set on the forest, so scores will differ
# between runs.
pipe2 = Pipeline(steps=[
    ('tvec', TfidfVectorizer()),
    ('model', RandomForestClassifier()),
])

In [121]:
# Hyperparameter grid for pipe2; each key is '<step name>__<parameter>'.
pipe_params2 = dict(
    model__max_depth=[None, 1, 2, 3],
    model__n_estimators=[50, 100],
    tvec__max_features=[2000, 3000, 4000, 5000],
    tvec__ngram_range=[(1, 1), (1, 2)],
    tvec__stop_words=[None, 'english'],
)

In [122]:
# Exhaustive search over pipe_params2 for pipe2, using 3-fold cross-validation.
gs2 = GridSearchCV(estimator=pipe2, param_grid=pipe_params2, cv=3)

In [123]:
# Run the grid search on the training split only.
gs2.fit(X=X_train, y=y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('model', RandomForestClassifier())]),
             param_grid={'model__max_depth': [None, 1, 2, 3],
                         'model__n_estimators': [50, 100],
                         'tvec__max_features': [2000, 3000, 4000, 5000],
                         'tvec__ngram_range': [(1, 1), (1, 2)],
                         'tvec__stop_words': [None, 'english']})

In [124]:
# Show the parameter combination that gave the best cross-validated score.
print(gs2.best_params_)

{'model__max_depth': None, 'model__n_estimators': 100, 'tvec__max_features': 5000, 'tvec__ngram_range': (1, 2), 'tvec__stop_words': 'english'}


In [125]:
# Show the mean cross-validated score of the best parameter combination.
print(gs2.best_score_)

0.8895506147682917


In [126]:
# (train score, test score) for the best model; a large gap suggests overfitting.
tuple(
    gs2.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

(0.9995685936151855, 0.8783958602846055)

# Logistic Regression with Count Vectorizer

In [10]:
# Two-step pipeline: CountVectorizer ('cvec') feeding LogisticRegression ('model').
pipe3 = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('model', LogisticRegression()),
])

In [11]:
# Hyperparameter grid for pipe3; each key is '<step name>__<parameter>'.
# Single-element lists pin a setting while keeping it visible in the grid.
pipe_params3 = dict(
    cvec__max_df=[0.9, 0.95, 0.98],
    cvec__max_features=[500, 1000, 2000],
    cvec__min_df=[2, 3],
    cvec__ngram_range=[(1, 1), (1, 2)],
    cvec__stop_words=['english'],
    model__C=[1, 0.1, 0.01],
    model__solver=['liblinear'],
)

In [12]:
# Grid search for the logistic-regression pipeline, using 5-fold cross-validation.
# pipe3 is passed explicitly: the 'cvec__' / 'model__' keys in pipe_params3
# refer to pipe3's step names, and no pipeline named `pipe` is defined in this
# notebook, so the search must not rely on leftover kernel state.
gs3 = GridSearchCV(pipe3, pipe_params3, cv=5)

In [13]:
# Run the grid search on the training split only.
gs3.fit(X=X_train, y=y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('model', LogisticRegression())]),
             param_grid={'cvec__max_df': [0.9, 0.95, 0.98],
                         'cvec__max_features': [500, 1000, 2000],
                         'cvec__min_df': [2, 3],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'cvec__stop_words': ['english'],
                         'model__C': [1, 0.1, 0.01],
                         'model__solver': ['liblinear']})

In [14]:
# Show the parameter combination that gave the best cross-validated score.
print(gs3.best_params_)

{'cvec__max_df': 0.9, 'cvec__max_features': 2000, 'cvec__min_df': 3, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english', 'model__C': 0.1, 'model__solver': 'liblinear'}


In [15]:
# Show the mean cross-validated score of the best parameter combination.
print(gs3.best_score_)

0.9024865941759141


In [16]:
# (train score, test score) for the best model; a large gap suggests overfitting.
tuple(
    gs3.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

(0.9508196721311475, 0.9003880983182406)

# Naive Bayes with Count Vectorizer

In [None]:
# Two-step pipeline: CountVectorizer ('cvec') feeding MultinomialNB ('mnb').
pipe4 = Pipeline(steps=[
    ('cvec', CountVectorizer()),
    ('mnb', MultinomialNB()),
])

In [None]:
# Hyperparameter grid for pipe4; only the vectorizer ('cvec') step is tuned.
pipe_params4 = dict(
    cvec__max_df=[0.9, 0.95, 0.98],
    cvec__max_features=[500, 1000, 2000],
    cvec__min_df=[2, 3],
    cvec__ngram_range=[(1, 1), (1, 2)],
    cvec__stop_words=['english'],
)

In [39]:
# Grid search for the Naive Bayes pipeline, using 5-fold cross-validation.
# The estimator must be pipe4 (CountVectorizer + MultinomialNB); passing pipe3
# would tune the logistic-regression pipeline instead, because pipe_params4
# only contains 'cvec__' keys that both pipelines accept.
gs4 = GridSearchCV(pipe4, pipe_params4, cv=5)

In [40]:
# Run the grid search on the training split only.
gs4.fit(X=X_train, y=y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cvec', CountVectorizer()),
                                       ('mnb', MultinomialNB())]),
             param_grid={'cvec__max_df': [0.9, 0.95, 0.98],
                         'cvec__max_features': [500, 1000, 2000],
                         'cvec__min_df': [2, 3],
                         'cvec__ngram_range': [(1, 1), (1, 2)],
                         'cvec__stop_words': ['english']})

In [41]:
# Show the parameter combination that gave the best cross-validated score.
print(gs4.best_params_)

{'cvec__max_df': 0.9, 'cvec__max_features': 2000, 'cvec__min_df': 3, 'cvec__ngram_range': (1, 1), 'cvec__stop_words': 'english'}


In [42]:
# Show the mean cross-validated score of the best parameter combination.
print(gs4.best_score_)

0.8951552841289938


In [43]:
# (train score, test score) for the best model; a large gap suggests overfitting.
tuple(
    gs4.score(features, labels)
    for features, labels in ((X_train, y_train), (X_test, y_test))
)

(0.9257981018119068, 0.9094437257438551)

This model had decent accuracy and showed little overfitting. However, the other Naive Bayes model was more accurate.