In [1]:
import os
import pickle
import re

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
# NOTE(review): `SCORERS` was removed from sklearn.metrics in scikit-learn 1.3; this import fails on newer versions.
from sklearn.metrics import precision_score, recall_score, accuracy_score, SCORERS, multilabel_confusion_matrix, make_scorer, roc_curve, roc_auc_score, f1_score
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split, cross_validate, cross_val_predict
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [2]:
# Held-out test set; print_test_score() later scores each tuned model against it.
test_df = pd.read_csv("./data/Processed/increment_train_size/5_to_20_words_preprocessed_TEST.csv")

# Preview the first rows as this cell's output.
test_df.head()

Unnamed: 0,created_utc,all_text_data,subreddit
0,1532973281,tron justin sun stop short reveal secret project,1
1,1550248670,reuters hsbc forex trading cost cut sharply bl...,1
2,1612224578,put expire 5th think clown brain read,0
3,1543091767,toilet french start name trone concidence,1
4,1571098829,apollo 11 jpeg yes roomba ltolgt microsoft wor...,1


In [3]:
# ===========================================================================================================================
# Helper function used to convert a gridsearch output into a pandas dataframe with the columns formatted the way I like them. 
# ===========================================================================================================================
def gs_to_clean_df(search_results, keep_split=False, keep_std=False, keep_time=False, keep_params=False, sort_by=None, make_RMSE=False):
    """Turn a ``cv_results_``-style mapping into a compact, readable DataFrame.

    Parameters
    ----------
    search_results : dict-like
        Anything ``pd.DataFrame`` accepts. In this notebook it is the
        ``cv_results_`` attribute of a fitted ``GridSearchCV``.
    keep_split : bool
        Keep the per-fold columns (names starting with ``split``).
    keep_std : bool
        Keep the standard-deviation columns (names starting with ``std``).
    keep_time : bool
        Keep the timing columns (names ending with ``_time``).
    keep_params : bool
        Keep the ``params`` column holding the full parameter dicts.
    sort_by : str or None
        Name of a *renamed* column starting with ``rank`` or ``mean`` to sort
        by (ascending). Any other value is ignored and the rows are left in
        their original order.
    make_RMSE : bool
        When true, add ``mean_RMSE = sqrt(|mean_MSE|)``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``search_results`` itself is not modified.

    Raises
    ------
    KeyError
        If ``make_RMSE`` is true but there is no ``mean_MSE`` column.

    Notes
    -----
    Renaming rules:
    * ``param_<step>__<a>__<b>`` -> ``<a>_<b>`` (the pipeline step is dropped);
    * ``param_<name>`` (no pipeline step) -> ``<name>``;
    * every other column has the substring ``_test`` removed and is otherwise
      left alone (e.g. ``mean_test_score`` -> ``mean_score``,
      ``mean_fit_time`` stays ``mean_fit_time``).
    Two pipeline steps that expose the same parameter name will therefore end
    up with the same column label.
    """
    results_df = pd.DataFrame(search_results)

    dropped_columns = []
    renaming = {}

    for column_name in results_df.columns:
        is_hyperparameter = column_name.startswith('param') and column_name != 'params'

        # Decide whether the column survives. Hyper-parameter columns are always
        # kept, and are tested first so that a parameter whose name happens to
        # contain "time" or start with "split"/"std" is not discarded.
        if is_hyperparameter:
            keep = True
        elif column_name.startswith('split'):
            keep = keep_split
        elif column_name.endswith('_time'):
            keep = keep_time
        elif column_name.startswith('std'):
            keep = keep_std
        elif column_name == 'params':
            keep = keep_params
        else:
            keep = True

        if not keep:
            dropped_columns.append(column_name)
            continue

        # Work out the display name. The fallback is always the original name,
        # so a column can never be renamed to the empty string.
        if is_hyperparameter:
            if '__' in column_name:
                new_name = '_'.join(column_name.split('__')[1:]).lstrip('_')
            elif column_name.startswith('param_'):
                new_name = column_name[len('param_'):]
            else:
                new_name = column_name
        else:
            new_name = column_name.replace('_test', '')

        renaming[column_name] = new_name or column_name

    results_df = results_df.drop(columns=dropped_columns).rename(columns=renaming)

    # Only rank/mean columns are accepted as sort keys; anything else is ignored.
    sortable = [name for name in renaming.values() if name.startswith(('rank', 'mean'))]
    if sort_by in sortable:
        results_df = results_df.sort_values(by=sort_by, ignore_index=True)

    if make_RMSE:
        if 'mean_MSE' not in results_df.columns:
            raise KeyError(
                "make_RMSE=True requires a 'mean_MSE' column (a scorer named 'MSE'); "
                f"available columns: {list(results_df.columns)}"
            )
        # abs() because sklearn reports error metrics negated ("neg_*" scorers).
        results_df['mean_RMSE'] = results_df['mean_MSE'].abs() ** 0.5

    return results_df


In [4]:
def print_test_score(test_df, model, params):
    """Print a framed report of `params` and the model's score on the test set.

    test_df : frame with an 'all_text_data' column (features) and a
              'subreddit' column (labels).
    model   : any object exposing ``score(X, y)``.
    params  : whatever should be shown under "Model params" (printed as-is).

    Returns None; the report goes to stdout.
    """
    rule = "=================================================================================================="
    features, labels = test_df['all_text_data'], test_df['subreddit']

    print(rule)
    print(f"Model params:\n {params}\n")

    # Scoring happens only after the header is out, as in the report layout.
    achieved = model.score(features, labels)
    print(f"Test set score: {achieved}")
    print(rule, end="\n\n")

In [5]:
# ===============================================================================================
# Print the number of words in the shortest and longest post in the dataset.
# ===============================================================================================
def print_word_counts(df):
    """Print the longest and shortest post length (in words) and return an annotated copy.

    The caller's frame is left untouched. The returned deep copy has its
    'all_text_data' column cast to str and gains a 'word_count' column with
    the number of whitespace-separated tokens in each post.
    """
    as_text = df['all_text_data'].astype(str)

    annotated = df.copy(deep=True)
    annotated['all_text_data'] = as_text
    annotated['word_count'] = as_text.apply(lambda post: len(post.split()))

    longest, shortest = annotated['word_count'].max(), annotated['word_count'].min()

    print(f"Maximum length post: {longest} words")
    print(f"Minimum length post: {shortest} words")

    return annotated

In [6]:
# ===============================================================================================
# This function takes as input a dataframe containing reddit posts and returns the text data and
# target split apart.
#
# This function also performs some final checks to make sure the data is correct prior to
# attempting to build any models with it.
# ===============================================================================================
def data_check(train_df, pattern=r'(?u)\b\w+\b', verify_regex=False):
    """Print sanity checks for a frame of posts, then return its text and labels.

    Printed, in order: the normalized 'subreddit' value counts, the number of
    missing 'all_text_data' values, the number of fully duplicated rows, the
    row count, and the min/max word counts (via print_word_counts).

    `pattern` and `verify_regex` are accepted but not used by this function.

    Returns
    -------
    (X, y) : the 'all_text_data' column and the 'subreddit' column, each as a
             NumPy array.
    """
    class_balance = train_df['subreddit'].value_counts(normalize=True)
    print(f"Distribution:\n {class_balance}\n")

    n_missing = train_df['all_text_data'].isna().sum()
    print(f"Missing values: {n_missing}\n")

    n_duplicates = train_df.duplicated().sum()
    print(f"Number of duplicates in training set: {n_duplicates}\n")
    print(f"Number of samples in training set: {train_df.shape[0]}\n")

    print_word_counts(train_df)

    features = train_df['all_text_data'].to_numpy()
    target = train_df['subreddit'].to_numpy().ravel()
    return features, target

## 5000 sample grid searches 

The data only includes posts that are 5 to 20 words long.


The models in this section are trained on only 5,000 samples.

In [7]:
# Read in the training dataset that contains 5000 samples.
train_df = pd.read_csv("./data/Processed/increment_train_size/train5000/train_5000.csv")

# Split into X (the 'all_text_data' column) and y (the 'subreddit' labels), both NumPy arrays.
# data_check also prints the class balance, missing-value, duplicate, sample-count and
# word-count checks shown in this cell's output.
X, y = data_check(train_df)

Distribution:
 0    0.5026
1    0.4974
Name: subreddit, dtype: float64

Missing values: 0

Number of duplicates in training set: 0

Number of samples in training set: 5000

Maximum length post: 20 words
Minimum length post: 5 words


In [25]:
# Grid search #1 (5000 samples): TF-IDF word features -> Multinomial Naive Bayes.
# Coarse sweep over unigrams vs. uni+bigrams, four alpha values and both fit_prior settings.
#
# The fitted search is cached on disk and only computed when the pickle is missing, so
# "Restart & Run All" works on a fresh checkout without retraining on every run.
# Delete the pickle to force a re-fit.
PATH = './data/Processed/increment_train_size/pickle/nb_5000_gs1.pkl'

if not os.path.exists(PATH):
    model_pipeline = Pipeline([("Tfidf_Vect", TfidfVectorizer()),
                               ("nb", MultinomialNB())])

    # NOTE(review): max_features=8761 is unexplained here — presumably a vocabulary
    # size chosen elsewhere; confirm and consider naming it as a constant.
    parameter_grid = [{'Tfidf_Vect__analyzer' : ['word'],
                       'Tfidf_Vect__ngram_range' : [(1,1), (1,2)],
                       'Tfidf_Vect__max_features':[8761],
                       'nb__alpha' : [0.1, 0.5, 1, 1.5],
                       'nb__fit_prior' : [True, False]}]

    gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring='accuracy', refit='accuracy', n_jobs=1)

    gs.fit(X, y)

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(PATH), exist_ok=True)

    with open(PATH, 'wb') as file:
        pickle.dump(gs, file)

In [26]:
# Load the fitted grid search cached at PATH (the object the preceding cell's code pickles
# to this path) instead of re-running the search.
# NOTE(review): pickle.load can execute arbitrary code — only load pickles produced by this project.
PATH = './data/Processed/increment_train_size/pickle/nb_5000_gs1.pkl'

with open(PATH, 'rb') as file:
    gs_results = pickle.load(file)

# Keep the pieces used later: the per-candidate CV results, the best estimator, and its parameters.
search_results = gs_results.cv_results_
top_estimator = gs_results.best_estimator_
top_parameters = gs_results.best_params_

# Tidy the CV results and order the candidates by rank (ascending, so rank 1 comes first).
gs1_df = gs_to_clean_df(search_results, sort_by='rank_score')
gs1_df.head()

Unnamed: 0,analyzer,max_features,ngram_range,alpha,fit_prior,mean_score,rank_score
0,word,8761,"(1, 2)",1.5,True,0.9084,1
1,word,8761,"(1, 1)",1.0,False,0.9082,2
2,word,8761,"(1, 1)",1.0,True,0.908,3
3,word,8761,"(1, 2)",1.5,False,0.9078,4
4,word,8761,"(1, 1)",1.5,False,0.907,5


In [28]:
# Grid search #2 (5000 samples): same TF-IDF -> Multinomial Naive Bayes pipeline, with a
# finer alpha sweep (0.7 to 2.0 in steps of 0.1) and uni+bigrams vs. uni+bi+trigrams.
#
# The fitted search is cached on disk and only computed when the pickle is missing, so
# "Restart & Run All" works on a fresh checkout without retraining on every run.
# Delete the pickle to force a re-fit.
PATH = './data/Processed/increment_train_size/pickle/nb_5000_gs2.pkl'

if not os.path.exists(PATH):
    model_pipeline = Pipeline([("Tfidf_Vect", TfidfVectorizer()),
                               ("nb", MultinomialNB())])

    # NOTE(review): max_features=8761 is unexplained here — presumably a vocabulary
    # size chosen elsewhere; confirm and consider naming it as a constant.
    parameter_grid = [{'Tfidf_Vect__analyzer' : ['word'],
                       'Tfidf_Vect__ngram_range' : [(1,2), (1,3)],
                       'Tfidf_Vect__max_features':[8761],
                       'nb__alpha' : [0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
                       'nb__fit_prior' : [True, False]}]

    gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring='accuracy', refit='accuracy', n_jobs=1)

    gs.fit(X, y)

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(PATH), exist_ok=True)

    with open(PATH, 'wb') as file:
        pickle.dump(gs, file)

In [29]:
# Load the second fitted grid search cached at PATH (the object the preceding cell's code
# pickles to this path) instead of re-running the search.
# NOTE(review): pickle.load can execute arbitrary code — only load pickles produced by this project.
PATH = './data/Processed/increment_train_size/pickle/nb_5000_gs2.pkl'

with open(PATH, 'rb') as file:
    gs_results2 = pickle.load(file)
    
# Keep the pieces used later: the per-candidate CV results, the best estimator, and its parameters.
search_results2 = gs_results2.cv_results_
top_estimator2 = gs_results2.best_estimator_
top_parameters2 = gs_results2.best_params_

# Tidy the CV results and order the candidates by rank (ascending, so rank 1 comes first).
gs2_df = gs_to_clean_df(search_results2, sort_by='rank_score')
gs2_df.head()

Unnamed: 0,analyzer,max_features,ngram_range,alpha,fit_prior,mean_score,rank_score
0,word,8761,"(1, 2)",1.6,True,0.9088,1
1,word,8761,"(1, 2)",1.7,True,0.9086,2
2,word,8761,"(1, 2)",1.7,False,0.9086,2
3,word,8761,"(1, 2)",1.5,True,0.9084,4
4,word,8761,"(1, 2)",1.8,True,0.9084,5


In [30]:
# Grid search #3 (5000 samples): same TF-IDF -> Multinomial Naive Bayes pipeline with
# fit_prior fixed to False, sweeping alpha from 0.1 to 2.0 over three n-gram ranges.
#
# The fitted search is cached on disk and only computed when the pickle is missing, so
# "Restart & Run All" works on a fresh checkout without retraining on every run.
# Delete the pickle to force a re-fit.
PATH = './data/Processed/increment_train_size/pickle/nb_5000_noFitPrior_gs1.pkl'

if not os.path.exists(PATH):
    model_pipeline = Pipeline([("Tfidf_Vect", TfidfVectorizer()),
                               ("nb", MultinomialNB())])

    # NOTE(review): max_features=8761 is unexplained here — presumably a vocabulary
    # size chosen elsewhere; confirm and consider naming it as a constant.
    parameter_grid = [{'Tfidf_Vect__analyzer' : ['word'],
                       'Tfidf_Vect__ngram_range' : [(1,1), (1,2), (1,3)],
                       'Tfidf_Vect__max_features':[8761],
                       'nb__alpha' : [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
                       'nb__fit_prior' : [False]}]

    gs = GridSearchCV(estimator=model_pipeline, param_grid=parameter_grid, scoring='accuracy', refit='accuracy', n_jobs=1)

    gs.fit(X, y)

    # Make sure the cache directory exists before writing into it.
    os.makedirs(os.path.dirname(PATH), exist_ok=True)

    with open(PATH, 'wb') as file:
        pickle.dump(gs, file)

In [32]:
# Load the third fitted grid search (fit_prior=False only) cached at PATH — the object the
# preceding cell's code pickles to this path — instead of re-running the search.
# NOTE(review): pickle.load can execute arbitrary code — only load pickles produced by this project.
PATH = './data/Processed/increment_train_size/pickle/nb_5000_noFitPrior_gs1.pkl'

with open(PATH, 'rb') as file:
    gs_results3 = pickle.load(file)

# Keep the pieces used later: the best estimator, the per-candidate CV results, and the best parameters.
top_estimator3 = gs_results3.best_estimator_
search_results3 = gs_results3.cv_results_
top_parameters3 = gs_results3.best_params_

# Tidy the CV results and order the candidates by rank (ascending, so rank 1 comes first).
gs3_df = gs_to_clean_df(search_results3, sort_by='rank_score')
gs3_df.head()

Unnamed: 0,analyzer,max_features,ngram_range,alpha,fit_prior,mean_score,rank_score
0,word,8761,"(1, 2)",1.7,False,0.9086,1
1,word,8761,"(1, 2)",1.6,False,0.9082,2
2,word,8761,"(1, 2)",1.9,False,0.9082,2
3,word,8761,"(1, 1)",1.0,False,0.9082,2
4,word,8761,"(1, 1)",1.1,False,0.908,5


In [33]:
# Report the winning parameters and the held-out test-set score for each of the
# three Multinomial Naive Bayes grid searches.
for fitted_model, winning_params in (
    (top_estimator, top_parameters),
    (top_estimator2, top_parameters2),
    (top_estimator3, top_parameters3),
):
    print_test_score(test_df, fitted_model, winning_params)

Model params:
 {'Tfidf_Vect__analyzer': 'word', 'Tfidf_Vect__max_features': 8761, 'Tfidf_Vect__ngram_range': (1, 2), 'nb__alpha': 1.5, 'nb__fit_prior': True}

Test set score: 0.91264

Model params:
 {'Tfidf_Vect__analyzer': 'word', 'Tfidf_Vect__max_features': 8761, 'Tfidf_Vect__ngram_range': (1, 2), 'nb__alpha': 1.6, 'nb__fit_prior': True}

Test set score: 0.9128

Model params:
 {'Tfidf_Vect__analyzer': 'word', 'Tfidf_Vect__max_features': 8761, 'Tfidf_Vect__ngram_range': (1, 2), 'nb__alpha': 1.7, 'nb__fit_prior': False}

Test set score: 0.91288

