In [21]:
import pandas as pd
import sqlite3
conn = sqlite3.connect("db.sqlite3")
df = pd.read_sql_query("select * from submissions limit 5000;", conn)

In [22]:
df.shape

(5000, 4)

In [31]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
import itertools, string, operator, re, unicodedata, nltk
import nltk
from operator import itemgetter
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer, RegexpTokenizer
from bs4 import BeautifulSoup
import numpy as np
from itertools import combinations
from gensim.models import Phrases
from collections import Counter
import spacy
from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

'''Features'''
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize

'''Classifiers'''
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier

'''Metrics/Evaluation'''
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
df.head(10)

Unnamed: 0,subreddit,subscribers,title,text
0,AskReddit,26241971,"People who haven't pooped in 2019 yet, why are...",
1,AskReddit,26241971,Stan Lee has passed away at 95 years old,As many of you know today is day that many of ...
2,Home,10022,Beautiful Home :),
3,Home,10022,My roommate is kicking me out because having a...,"So, I am not asking for advice, really...mores..."
4,videos,22077202,This is what happens when one company owns doz...,
5,videos,22077202,Youtube is Facilitating the Sexual Exploitatio...,
6,news,19777808,Blizzard Employees Staged a Walkout After the ...,
7,news,19777808,Kobe Bryant killed in helicopter crash in Cali...,
8,worldnews,23055673,Trump Impeached for Abuse of Power,
9,worldnews,23055673,"Two weeks before his inauguration, Donald J. T...",


In [24]:
df['clean_text'] = df['text'].str.replace('[^\w\s]',' ')
df['clean_title'] = df['title'].str.replace('[^\w\s]',' ')

In [25]:
df['text'] = df['clean_text'] + df['clean_title']
df['text'].shape

(5000,)

In [26]:
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_md")

tokenizer = Tokenizer(nlp.vocab)
tokens = []

""" Make them tokens """

#stop words
STOP_WORDS = nlp.Defaults.stop_words.union(['', ' ', '-', 'reddit', 'post'])
    
tokens = []

for doc in tokenizer.pipe(df['text'], batch_size=500):
    
    doc_tokens = []
    
    for token in doc: 
        if token.text.lower() not in STOP_WORDS:
            doc_tokens.append(token.text.lower())
   
    tokens.append(doc_tokens)
    
df['tokens'] = tokens

In [27]:
def get_lemmas(text):
        """Return the Lemmas"""
        lemmas = []
        doc = nlp(text)
    
        for token in doc: 
            if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
                lemmas.append(token.lemma_)
    
        return lemmas

In [28]:
tfidf_vectorizer = TfidfVectorizer(tokenizer=get_lemmas, min_df=0.025, max_df=.98, ngram_range=(1,2))

In [37]:
#Creating the features (tf-idf weights) for the processed text


#texts = df['text'].astype('str')


#tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), 
                                 #  min_df = 1, 
                                  # max_df = .95)

X = tfidf_vectorizer.fit_transform(df.text) #features
y = df['subreddit'].values #target

print(X)
print(X.shape)

  (0, 209)	0.6323333777801103
  (0, 257)	0.457409810351627
  (0, 5)	0.2226138083134785
  (0, 221)	0.3713266142454971
  (0, 177)	0.4510991051204054
  (1, 131)	0.12950290676325846
  (1, 22)	0.14038666849782563
  (1, 147)	0.1695834056692183
  (1, 37)	0.15215144985933243
  (1, 172)	0.14872061636910663
  (1, 2)	0.15689893562254523
  (1, 6)	0.1246666611279185
  (1, 162)	0.1546311721018806
  (1, 95)	0.17127526686218691
  (1, 211)	0.16744262294017062
  (1, 74)	0.12895313918848877
  (1, 130)	0.12941070423807302
  (1, 8)	0.2644404719205803
  (1, 251)	0.12941070423807302
  (1, 237)	0.14572707696501988
  (1, 161)	0.15593692542143042
  (1, 197)	0.17041957466149782
  (1, 241)	0.12171439122127248
  (1, 146)	0.2835588557760236
  (1, 200)	0.09509881808828502
  :	:
  (4997, 26)	0.750564444452597
  (4997, 155)	0.5996310305927084
  (4997, 5)	0.27766101972349483
  (4998, 47)	0.17100308555262075
  (4998, 180)	0.1519913552779669
  (4998, 141)	0.16406304158526636
  (4998, 49)	0.14187843476421208
  (4998, 220)

In [44]:
#dimensionality reduction
lsa = TruncatedSVD(n_components=258, 
                   n_iter=10, 
                   random_state=3)

X_text = lsa.fit_transform(X)
print(X_text.shape)
print(y.shape)

(5000, 258)
(5000,)


In [43]:
#Preliminary model evaluation using default parameters

import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format


#Creating a dict of the models
model_dict = {'Dummy' : DummyClassifier(random_state=3),
              'Stochastic Gradient Descent' : SGDClassifier(random_state=3, loss='log'),
              'Random Forest': RandomForestClassifier(random_state=3),
              'AdaBoost': AdaBoostClassifier(random_state=3),
              'Gaussian Naive Bayes': GaussianNB(),
              'K Nearest Neighbor': KNeighborsClassifier()}

#Train test split with stratified sampling for evaluation
X_train, X_test, y_train, y_test = train_test_split(X_text, 
                                                    y, 
                                                    test_size = .5, 
                                                    shuffle = True, 
                                                    stratify = y, 
                                                    random_state = 3)

#Function to get the scores for each model in a df
def model_score_df(model_dict):   
    model_name, ac_score_list, p_score_list, r_score_list, f1_score_list = [], [], [], [], []
    for k,v in model_dict.items():   
        model_name.append(k)
        v.fit(X_train, y_train)
        y_pred = v.predict(X_test)
        ac_score_list.append(accuracy_score(y_test, y_pred))
        p_score_list.append(precision_score(y_test, y_pred, average='macro'))
        r_score_list.append(recall_score(y_test, y_pred, average='macro'))
        f1_score_list.append(f1_score(y_test, y_pred, average='macro'))
        model_comparison_df = pd.DataFrame([model_name, ac_score_list, p_score_list, r_score_list, f1_score_list]).T
        model_comparison_df.columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']
        model_comparison_df = model_comparison_df.sort_values(by='f1_score', ascending=False)
    return model_comparison_df

model_score_df(model_dict)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.