I orignally ran everything with jupyter lab. My first model was a KNN model with only 10k rows of data and 5000 max features in the vectorizer which was too small for a dataset of 1 million rows. Also tried a random forest classification pipeline with random search for hyper tuning. Both models had their drawbacks in terms of either memory use and accuracy score. I then used paperspace to run a model on their cloud notebook. My final model was a naive bayes classifier which I was about to train with 400k rows of data with 5000 max features on the tf-idfd vectorizer.  

## Tokenizing Data

In [0]:
import pandas as pd
df = pd.read_csv('kaggle_RC_2019-05.csv', engine='python', error_bad_lines=False)

In [0]:
df.head()

Unnamed: 0,subreddit,body,controversiality,score
0,gameofthrones,Your submission has been automatically removed...,0,1
1,aww,"Dont squeeze her with you massive hand, you me...",0,19
2,gaming,It's pretty well known and it was a paid produ...,0,3
3,news,You know we have laws against that currently c...,0,10
4,politics,"Yes, there is a difference between gentle supp...",0,1


In [0]:
#using spacy for processing our text data
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_lg")

# Tokenizer
tokenizer = Tokenizer(nlp.vocab)

In [0]:
# Tokenizer Pipe

tokens = []

""" Make them tokens """
for doc in tokenizer.pipe(df['body'], batch_size=500):
    doc_tokens = [token.text for token in doc]
    tokens.append(doc_tokens)

df['tokens'] = tokens

## Using 500,000 rows of data

In [0]:
#Creating my subset data for training and testing
subset = df.sample(400000)
text = df.sample(100000)

In [0]:
# Viewing all subreddits
subset.subreddit.value_counts()

RoastMe                10152
unpopularopinion       10141
ChapoTrapHouse         10117
dankmemes              10095
movies                 10095
relationship_advice    10085
todayilearned          10062
FortNiteBR             10058
apexlegends            10052
teenagers              10051
SquaredCircle          10035
aww                    10032
AmItheAsshole          10027
Market76               10023
AskReddit              10021
hockey                 10020
nfl                    10009
trashy                 10006
gaming                 10005
freefolk               10005
Pikabu                 10002
marvelstudios          10002
videos                  9990
gonewild                9988
worldnews               9987
leagueoflegends         9984
soccer                  9977
gameofthrones           9975
nba                     9974
MortalKombat            9964
wallstreetbets          9960
news                    9957
asoiaf                  9952
funny                   9943
Showerthoughts

In [0]:
# Importing our vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english', lowercase=False, max_features=5000)

# Create a vocabulary and get word counts per document
# Similiar to fit_predict
dtm = tfidf.fit_transform(subset.body)

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.head()

Unnamed: 0,00,000,01,02,03,04,05,06,07,08,...,тут,ты,уже,чем,что,чтобы,это,этого,этом,этот
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [0]:
#fit test data to tf-idf
dtm_test = tfidf.fit_transform(text.body)

# Get feature names to use as dataframe column headers
dtm_test = pd.DataFrame(dtm_test.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm_test.head()

Unnamed: 0,00,000,01,02,03,039,04,05,06,07,...,тут,ты,уже,чем,что,чтобы,это,этого,этом,этот
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Naive Bayes Classification

In [0]:
# Importing naive bayes from sklearn
from sklearn.naive_bayes import MultinomialNB

# Instantiating our model
naive_bayes = MultinomialNB()

# fitting our model with our trained data
naive_bayes.fit(dtm, subset.subreddit)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [0]:
# Predict
predictions = naive_bayes.predict(dtm_test)

In [0]:
# Getting our accuracy score
from sklearn.metrics import accuracy_score
print('Accuracy score: ', accuracy_score(text.subreddit, predictions))

Accuracy score:  0.06768


In [0]:
import pickle
filename = "tfidf.pkl"
pickle.dump(tfidf, open(filename, 'wb'))

In [0]:
filename2 = "naive_bayes.pkl"
pickle.dump(naive_bayes, open(filename2, 'wb'))

In [0]:
subset.to_csv("data.csv")

In [0]:
subset.to_pickle("data.pkl")

In [0]:
dtm.shape

(500000, 1000)

In [0]:
dtm = pd.DataFrame(dtm.todense(), columns=vect.get_feature_names())

In [0]:
dtm.head()

Unnamed: 0,000,10,100,11,12,15,20,2019,2FMarket76,2Fr,...,young,your,yourself,youtube,как,на,не,то,что,это
0,0,0,0,0,0,0,0,0,0,0,...,0,3,1,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# K Nearest Neighbors

In [0]:
from sklearn.neighbors import NearestNeighbors

# Fit on DTM
nn = NearestNeighbors(n_neighbors=10, algorithm='kd_tree')
nn.fit(dtm)

NearestNeighbors(algorithm='kd_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=10, p=2,
                 radius=1.0)

In [0]:
# Query Using kneighbors 
nn.kneighbors([dtm.iloc[25000]])

(array([[0., 0., 0., 0., 1., 1., 1., 1., 1., 1.]]),
 array([[ 45651, 249566,  25000, 347143,   1140,   3245,   2785,   6893,
           3694,   5784]]))

In [0]:
subset.iloc[45651]

subreddit                                               SquaredCircle
body                    Yaayyyyy!!! Wrestling revolution is amazing!!
controversiality                                                    0
score                                                              11
tokens              [Yaayyyyy!!!, Wrestling, revolution, is, amazi...
Name: 784105, dtype: object

In [0]:
subset.iloc[249566]

subreddit                             FortNiteBR
body                     tbh faithful is amazing
controversiality                               0
score                                         11
tokens              [tbh, faithful, is, amazing]
Name: 719852, dtype: object

In [0]:
subset.head()

Unnamed: 0,subreddit,body,controversiality,score,tokens
595340,The_Donald,&gt;It was a desperate hail Mary.\n\nAnd Mary ...,0,112,"[&gt;It, was, a, desperate, hail, Mary., \n\n,..."
309314,memes,Happy cake day and true story btw,0,-1,"[Happy, cake, day, and, true, story, btw]"
588045,hockey,&gt;This was so dangerous\n\nWhat?,0,3,"[&gt;This, was, so, dangerous, \n\n, What?]"
320145,trashy,Had a good scoot on these in Lisbon! But... Po...,0,11,"[Had, a, good, scoot, on, these, in, Lisbon!, ..."
875311,RoastMe,ðŸ¦€ ðŸ¦€ contest winners were never sent thei...,0,39,"[ðŸ¦€, ðŸ¦€, contest, winners, were, never, se..."


In [0]:
post = """
In 1991 I won a contest for a pre-release copy of Dragon Warrior III. It came with a letter hand-signed by everyone at Enix America (now SquareEnix). 15-year-old me was blown away!
"""

In [0]:
# Query
new = vect.transform([post])

In [0]:
nn.kneighbors(new.todense())

(array([[13.3041347 , 13.52774926, 13.52774926, 13.74772708, 13.78404875,
         13.85640646, 13.85640646, 13.89244399, 13.92838828, 13.96424004]]),
 array([[4742, 6136, 9837, 9912, 1627, 1403, 5149, 8837, 1605,  910]],
       dtype=int64))

'\nIn 1991 I won a contest for a pre-release copy of Dragon Warrior III. It came with a letter hand-signed by everyone at Enix America (now SquareEnix). 15-year-old me was blown away!\n'

# Random Forest Classification Pipeline / GridSearchCV

In [0]:
# Import Statements
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [0]:
# Create Pipeline Components

vect = CountVectorizer(stop_words='english', ngram_range=(1,2))
rfc = RandomForestClassifier()

In [0]:
# Define the Pipeline
pipe = Pipeline([
                 #Vectorizer
                 ('vect', vect),
                 # Classifier
                 ('clf', rfc)
                ])

# The pipeline puts together a bunch fit then transform,fit then predict. 

In [0]:
parameters = {
    'vect__max_df': ( 0.75, 1.0),
    'vect__min_df': (.02, .05),
    'vect__max_features': (500,1000),
    'clf__n_estimators':(5, 10,),
    'clf__max_depth':(15,20)
}

grid_search = GridSearchCV(pipe,parameters, cv=2, verbose=1)
grid_search.fit(subset.body, subset.subreddit)

Fitting 2 folds for each of 32 candidates, totalling 64 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  64 out of  64 | elapsed:  5.5min finished


GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None, stop_words='english',
        ...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'vect__max_df': (0.75, 1.0), 'vect__min_df': (0.02, 0.05), 'vect__max_features': (500, 1000), 'clf__n_estimators': (5, 10), 'clf__max_depth': (15, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=1)

In [0]:
grid_search.best_score_

0.17976

In [0]:
grid_search.predict(['In 1991 I won a contest for a pre-release copy of Dragon Warrior III. It came with a letter hand-signed by everyone at Enix America (now SquareEnix). 15-year-old me was blown away!'])

array(['AmItheAsshole'], dtype=object)