In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import pandas as pd
import numpy as np
import string
import re
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Colab-only convenience: prompt the user to upload trimmed_model.csv.
# Outside Colab the google.colab package is unavailable, so fall back to an
# empty mapping and rely on the CSV already being in the working directory.
try:
  from google.colab import files
except ImportError:
  uploaded = {}
else:
  uploaded = files.upload()

Saving trimmed_model.csv to trimmed_model.csv


In [None]:
# Load the uploaded dataset into the working frame.
df = pd.read_csv(filepath_or_buffer='trimmed_model.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,title,post_paragraph
0,0,Home,Contract signed and waiting,\nNJ homeowner. \n\nI signed a contract to hav...
1,1,Home,How to repair/re-plaster old ceiling? It had s...,
2,2,Home,Can I make my matte black countertops shiny?!,
3,3,Home,How do I replace my air filter on this thing???,
4,4,Home,"Got new concrete for my driveway yesterday, an...",


In [None]:
# Shuffle all rows (frac=1) and renumber the index. The fixed seed keeps the
# row order identical across Restart & Run All.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Report the frame's (rows, columns) after shuffling.
df.shape

(12443, 4)

In [None]:
# fillna returns a new frame rather than modifying df, so assign the result
# back; otherwise the missing values are still present in later cells.
df = df.fillna('0')
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,title,post_paragraph
0,2927,todayilearned,"TIL of Truett's Luau, a special Hawaiian Luau ...",0
1,7572,WTF,The rope wasn’t even tied off...,0
2,11043,Unexpected,Nice gun,0
3,10689,Piracy,Topaz Labs all apps,"Hey all, I found a way of getting all of the t..."
4,2779,todayilearned,TIL about the Fenn Treasure. Allegedly hidden ...,0
...,...,...,...,...
12438,9827,AmItheAsshole,AITA for calling my sister in law a snobby bitch?,Sorry for any mistakes! This is my first post....
12439,12112,soccer,Bayer Leverkusen 0-1 Wolfsburg - Marin Pongrač...,0
12440,939,worldnews,Japan eyes fresh $1.1 trillion stimulus to com...,0
12441,187,Home,Do trees raise property value?,My dad says that having trees on our property ...


In [None]:
# Build one text field per post from its title and body.
# - Missing values are replaced with an empty string first, so astype(str)
#   cannot turn NaN into the literal text "nan".
# - A space separates the two parts so the last word of the title is not
#   fused with the first word of the body into a single token.
df["content"] = (
    df["title"].fillna("").astype(str)
    + " "
    + df["post_paragraph"].fillna("").astype(str)
)

In [None]:
# The two source text columns are no longer needed once "content" exists.
df = df.drop(columns=['title', 'post_paragraph'])

In [None]:
# Features are the combined post text; the target is the subreddit label.
X = df['content']
y = df['subreddit']

# Hold out 20% of the rows for evaluation. The fixed seed makes the split,
# and therefore the reported metrics, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Sanity-check the sizes of the four partitions.
for split in (X_train, y_train, X_test, y_test):
  print(split.shape)

(9954,)
(9954,)
(2489,)
(2489,)


In [None]:
# Text is vectorised with TF-IDF (English stop words removed) and then
# classified by a 50-tree random forest with a fixed seed.
tfidf_step = TfidfVectorizer(stop_words='english')
forest_step = RandomForestClassifier(n_estimators=50, random_state=42)

pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('classifier', forest_step),
])

In [None]:
# Fit the vectoriser and the classifier on the training partition.
pipeline.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [None]:
# Score the held-out partition and print per-class precision/recall/F1.
predictions = pipeline.predict(X=X_test)
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

                     precision    recall  f1-score   support

      AmItheAsshole       0.68      0.89      0.77        54
                Amd       0.70      0.65      0.67        49
             AskMen       0.19      0.09      0.12        55
          AskReddit       0.42      0.48      0.45        56
           AskWomen       0.24      0.36      0.29        39
    ChoosingBeggars       0.32      0.30      0.31        53
        Coronavirus       0.43      0.62      0.51        50
                DnD       0.79      0.71      0.75        48
               Home       0.47      0.35      0.40        57
               IAmA       0.67      0.98      0.80        44
       IdiotsInCars       0.40      0.30      0.34        60
  NoStupidQuestions       0.25      0.10      0.15        48
      OldSchoolCool       0.43      0.35      0.39        51
       OutOfTheLoop       0.55      0.93      0.69        56
             Piracy       0.71      0.45      0.55        56
     Showerthoughts    

In [None]:
def get_predictions(post, num_answers=5):
  """Rank subreddits by predicted probability for one post.

  Args:
    post: The post text, given either as a bare string or as a list of
      strings. Only the probabilities of the first document are used.
    num_answers: Number of highest-probability subreddits to return.

  Returns:
    A pandas Series indexed by class label (from ``pipeline.classes_``),
    holding the predicted probabilities sorted from highest to lowest and
    limited to the first ``num_answers`` entries.
  """
  # predict_proba is given a list of documents; wrap a bare string so it is
  # handled as a single document.
  docs = [post] if isinstance(post, str) else post

  # Row 0 is the probability vector of the first document; label every
  # probability with the class it belongs to.
  probabilities = pd.Series(
      pipeline.predict_proba(docs)[0],
      index=pipeline.classes_,
  )

  # Honour num_answers rather than always returning five rows.
  return probabilities.sort_values(ascending=False).head(num_answers)

In [None]:
from pickle import dump

# Persist the fitted pipeline. The context manager closes the file handle
# once the dump finishes (or fails), instead of leaving it open.
with open('reddit_model_nc.pkl', 'wb') as model_file:
  dump(pipeline, model_file)

In [None]:
# Sample input for get_predictions: a one-element list holding the post text.
nba_post = [ """
               LeBron James and Kobe Bryant and both great NBA players and we should stop comparing them.
                """]

In [None]:
# Show the top-ranked subreddits for the NBA sample post.
get_predictions(post=nba_post)

nba                  0.80
DnD                  0.08
OldSchoolCool        0.04
Wellthatsucks        0.02
explainlikeimfive    0.02
dtype: float64

In [None]:
# Sample input for get_predictions: a one-element list holding the post text.
politics_post = [ """
               Joe Biden will be the next president.
                """]

In [None]:
# Show the top-ranked subreddits for the politics sample post.
get_predictions(post=politics_post)

politics             0.98
IdiotsInCars         0.02
NoStupidQuestions    0.00
anime                0.00
Wellthatsucks        0.00
dtype: float64

In [None]:
# Sample input for get_predictions: a one-element list holding the post text.
personal_finance_post = [
    'I have $50,000 in student loan debt. Should I buy a home or rent an apartment',
]

In [None]:
# Show the top-ranked subreddits for the personal-finance sample post.
get_predictions(post=personal_finance_post)

news                 0.28
personalfinance      0.24
Home                 0.20
NoStupidQuestions    0.08
worldnews            0.04
dtype: float64