In [0]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer 
from sklearn.model_selection import RandomizedSearchCV

In [3]:
from google.colab import files
uploaded = files.upload()

Saving trimmed_model.csv to trimmed_model.csv


In [0]:
df = pd.read_csv('trimmed_model.csv')

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,subreddit,title,post_paragraph
0,0,Home,Contract signed and waiting,\nNJ homeowner. \n\nI signed a contract to hav...
1,1,Home,How to repair/re-plaster old ceiling? It had s...,
2,2,Home,Can I make my matte black countertops shiny?!,
3,3,Home,How do I replace my air filter on this thing???,
4,4,Home,"Got new concrete for my driveway yesterday, an...",


In [0]:
df = df.sample(frac=1).reset_index(drop=True)

In [7]:
df.shape

(12443, 4)

In [8]:
df.fillna('0')

Unnamed: 0.1,Unnamed: 0,subreddit,title,post_paragraph
0,1509,leagueoflegends,Shout out to the game designers for Warwick,This is not a sarcastic post. My girlfriend st...
1,10307,unpopularopinion,People are uneducated on what it is that makes...,People like to believe that being happy or fee...
2,7236,IAmA,Hello my name is Kaitlyn Aurelia Smith and I a...,Link -&gt; [https://www.reddit.com/r/Music/com...
3,3089,teenagers,I swear everytime.,0
4,9601,OutOfTheLoop,The official Twitter account posts some things...,So I noticed that the official Twitter account...
...,...,...,...,...
12438,7491,WTF,Laying on a couch on the highway,0
12439,9637,OutOfTheLoop,What’s up with @adorrree on TikTok?,Saw some comments saying she needed to be canc...
12440,3255,teenagers,"So long, good grades",0
12441,11238,television,"The Safdie Brothers ('Uncut Gems', 'Good Time'...",0


In [0]:
df["content"] = df["title"].astype(str) + df["post_paragraph"].astype(str)

In [0]:
df=df.drop(['title', 'post_paragraph'], axis=1)

In [11]:
X = df['content']
y = df['subreddit']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9954,)
(9954,)
(2489,)
(2489,)


In [0]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', RandomForestClassifier(n_estimators=50, random_state=42)),
])

In [13]:
pipeline.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None

In [14]:
predictions = pipeline.predict(X_test)
print(classification_report(y_test,predictions))

                     precision    recall  f1-score   support

      AmItheAsshole       0.59      0.98      0.74        45
                Amd       0.69      0.66      0.67        53
             AskMen       0.33      0.14      0.20        57
          AskReddit       0.28      0.47      0.35        40
           AskWomen       0.38      0.23      0.29        61
    ChoosingBeggars       0.54      0.43      0.48        46
        Coronavirus       0.44      0.54      0.49        48
                DnD       0.79      0.67      0.72        45
               Home       0.32      0.37      0.34        41
               IAmA       0.73      0.92      0.82        51
       IdiotsInCars       0.24      0.29      0.26        59
  NoStupidQuestions       0.25      0.13      0.18        52
      OldSchoolCool       0.53      0.43      0.48        46
       OutOfTheLoop       0.56      0.96      0.70        52
             Piracy       0.55      0.48      0.51        44
     Showerthoughts    

In [0]:
def get_predictions(post, num_answers=5):
  """ takes a potential post and returns the top options """

  preds = pd.Series(pipeline.predict_proba(post)[0])

  preds.index = pipeline.classes_

  preds = preds.sort_values(ascending=False)

  return preds[:5]

In [0]:
from pickle import dump
dump(pipeline, open('reddit_model_nc.pkl', 'wb'))

In [0]:
nba_post = [ """
               LeBron James and Kobe Bryant and both great NBA players and we should stop comparing them.
                """]

In [18]:
get_predictions(nba_post)

nba          0.90
soccer       0.06
AskReddit    0.02
DnD          0.02
worldnews    0.00
dtype: float64

In [0]:
politics_post = [ """
               Joe Biden will be the next president.
                """]

In [20]:
get_predictions(politics_post)

politics     0.84
AskMen       0.08
copypasta    0.04
worldnews    0.02
Tinder       0.02
dtype: float64

In [0]:
personal_finance_post= ['I have $50,000 in student loan debt. Should I buy a home or rent an apartment']

In [22]:
get_predictions(personal_finance_post)

personalfinance    0.20
news               0.18
Home               0.18
insaneparents      0.08
soccer             0.06
dtype: float64