In [174]:
# Imports
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import pandas as pd
import string
import re
import nltk

In [175]:
# Load in data, mined by Jonathan
url = 'https://raw.githubusercontent.com/BW-Post-Here-01/DS/master/Data/reddit_data_slimmed.csv'
df = pd.read_csv(url)

In [176]:
df.head()

Unnamed: 0,content,subreddit
0,COMMUNITY ANNOUNCEMENT In solidarity with the ...,tattoos
1,Weekly r/Tattoos Question/FreeTalk Thread! - A...,tattoos
2,Enter Shikari and Architects album artwork and...,tattoos
3,"David Bowie Portrait - Healed, Done in April 2...",tattoos
4,Photo realism artist chicago As the title sugg...,tattoos


In [177]:
df.subreddit.unique()

array(['tattoos', 'technology', 'gadgets', 'europe', 'GetMotivated',
       'philosophy', 'listentothis', 'politics', 'soccer', 'Tinder',
       'Futurology', 'space', 'Showerthoughts', 'DIY', 'Games',
       'WritingPrompts', 'lifehacks', 'gaming', 'Android', 'trees',
       'Music', 'malefashionadvice', 'television', 'gameofthrones',
       'movies', 'pokemon', 'Overwatch', 'pcmasterrace',
       'explainlikeimfive', 'atheism', 'PS4', 'books', 'nba', 'webdev',
       'travel', 'LifeProTips', 'MachineLearning', 'leagueoflegends',
       'Fitness', 'askscience', 'IAmA', 'TwoXChromosomes',
       'relationships', 'history', 'tifu', 'dadjokes', 'nosleep',
       'personalfinance', 'Jokes', 'buildapc'], dtype=object)

In [178]:
df.drop(df.loc[df['subreddit']=='tattoos'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='trees'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='space'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='pcmasterrace'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='lifehacks'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='Overwatch'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='gaming'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='gadgets'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='Showerthoughts'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='Futurology'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='Tinder'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='DIY'].index, inplace=True)
df.drop(df.loc[df['subreddit']=='television'].index, inplace=True)

In [179]:
df.subreddit.unique()

array(['technology', 'europe', 'GetMotivated', 'philosophy',
       'listentothis', 'politics', 'soccer', 'Games', 'WritingPrompts',
       'Android', 'Music', 'malefashionadvice', 'gameofthrones', 'movies',
       'pokemon', 'explainlikeimfive', 'atheism', 'PS4', 'books', 'nba',
       'webdev', 'travel', 'LifeProTips', 'MachineLearning',
       'leagueoflegends', 'Fitness', 'askscience', 'IAmA',
       'TwoXChromosomes', 'relationships', 'history', 'tifu', 'dadjokes',
       'nosleep', 'personalfinance', 'Jokes', 'buildapc'], dtype=object)

In [180]:
# Process data with this function
def cleaning_fn(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    4. Returns in lowercase.
    """
    # Check characters to see if they are in punctuation
    clean = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    clean = ''.join(clean)

    # Now just remove any stopwords
    return [word for word in clean.split() if word.lower() not in stopwords.words('english')]

In [181]:
# Randomize the rows of the df so we don't have the iloc 1-100 all classified
# as one class, the next 200 as another, etc., so we don't have issues with
# a train/test split
df = df.sample(frac=1).reset_index(drop=True)

In [182]:
df.head(25)

Unnamed: 0,content,subreddit
0,[D] Video Analysis - Backpropagation and the b...,MachineLearning
1,Couples who are newly living together: how do ...,TwoXChromosomes
2,[D] Intel removed MKL_DEBUG_CPU_TYPE workaroun...,MachineLearning
3,Idea for Gen 5 remake. So I've had this idea f...,pokemon
4,Feedback? PCPartPicker Part List: https://pcpa...,buildapc
5,I am Victoria from reddit. AMAA! [proof](http:...,IAmA
6,Lump Some or Pension? Hi if there was a opport...,personalfinance
7,Haitian slave rebellion As opposed to being a ...,history
8,Weird shit I've seen as a Marine 2b Weird shit...,nosleep
9,Schengen Visa question? Hello! I am hoping to ...,travel


In [183]:
# Apply train/test split
X = df['content']
y = df['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(23350,)
(5838,)
(23350,)
(5838,)


In [184]:
# Create pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', RandomForestClassifier()),  # Originally trained with MulinomialNB() but had low accuracy
])

In [185]:
# Fit X_train and y_train on the pipe
pipeline.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                               

In [186]:
# Create a pipeline prediction object with X_test
predictions = pipeline.predict(X_test)

# Score the model with X_test and y_test
print(classification_report(y_test,predictions))

                   precision    recall  f1-score   support

          Android       0.55      0.52      0.54        63
          Fitness       0.62      0.75      0.68       210
            Games       0.33      0.08      0.13        37
     GetMotivated       0.50      0.15      0.24        13
             IAmA       0.64      0.91      0.75       247
            Jokes       0.29      0.51      0.37       366
      LifeProTips       0.68      0.63      0.65       202
  MachineLearning       0.71      0.71      0.71       185
            Music       0.29      0.23      0.25        53
              PS4       0.66      0.66      0.66       150
  TwoXChromosomes       0.37      0.32      0.34       245
   WritingPrompts       0.86      0.39      0.53        31
       askscience       0.45      0.39      0.42       228
          atheism       0.72      0.46      0.56       137
            books       0.66      0.59      0.63       133
         buildapc       0.79      0.87      0.83       

  'precision', 'predicted', average, warn_for)


In [187]:
# Looking at these categories, try four fake reviews and see how the model does:
df.subreddit.unique()

array(['MachineLearning', 'TwoXChromosomes', 'pokemon', 'buildapc',
       'IAmA', 'personalfinance', 'history', 'nosleep', 'travel',
       'dadjokes', 'PS4', 'Jokes', 'relationships', 'nba', 'Fitness',
       'leagueoflegends', 'tifu', 'explainlikeimfive', 'books', 'webdev',
       'soccer', 'Music', 'LifeProTips', 'WritingPrompts', 'atheism',
       'movies', 'askscience', 'malefashionadvice', 'gameofthrones',
       'Android', 'politics', 'europe', 'philosophy', 'technology',
       'listentothis', 'GetMotivated', 'Games'], dtype=object)

In [188]:
# Create fn that takes in a reddit post and returns the top five most likely categories:
def get_predictions(post, num_answers=5):
  """ takes a post and returns the top categories it fits in """

  # get the predicted probabilities for each class
  preds = pd.Series(pipeline.predict_proba(post)[0])

  # save each class to the Series index
  preds.index = pipeline.classes_

  # sort to get the most likely classes
  preds = preds.sort_values(ascending=False)

  # return the top num_answers results in dict format
  return preds[:5]

# Test one with a fake review about history

In [189]:
# Test with a fake review
history_post = [ """
                History if my favorite subject.  I love to read historical accounts about ancient Rome and Greece.
                I'm also a big World War 2 buff and I collect objects with historical significance.
                """]

In [190]:
get_predictions(history_post)

history            0.6
dadjokes           0.2
books              0.1
MachineLearning    0.1
PS4                0.0
dtype: float64

# Test two with a fake review about pokemon

In [191]:
# Try again and mention pokemon to see if the model correctly guesses pokemon:
pokemon_post = [ """
                My favorite pokemon are pikachu and charizard. I love pokemon.  Pokemon is great.
                """]

In [192]:
get_predictions(pokemon_post)

dadjokes    0.4
Jokes       0.2
pokemon     0.2
webdev      0.1
history     0.1
dtype: float64

# Test three with a fake post about android

In [193]:
# Try a fake prediction to see if android gets predicted:
android_post = [ """
                I use a galaxy note 5.  My favorite opperating system version was oreo.
                Android phones are better than iphones. I like to create apps for the app store.
                """]

In [194]:
get_predictions(android_post)

Android              0.4
explainlikeimfive    0.2
webdev               0.1
LifeProTips          0.1
dadjokes             0.1
dtype: float64

# Test four with a fake post about music


In [195]:
# Try a fake prediction to see if music gets predicted:
music_post = [ """
                I love to listen to music.  My favorite singer/songwriter is Foy Vance.  Every so often
                I like to listen to Bob Marley.  I have a large vinyl music collection but more recently I've
                been listening to everything on Spotify.
                """]

In [196]:
get_predictions(music_post)

Music              0.2
askscience         0.2
dadjokes           0.2
TwoXChromosomes    0.1
Fitness            0.1
dtype: float64

# Test five with fake post about politics

In [197]:
politics_post = ["""
                    Donald Trump and Bill Clinton.  Democrats, republicans and the tea party.
                """]

In [198]:
get_predictions(politics_post)

Jokes       0.7
dadjokes    0.3
PS4         0.0
buildapc    0.0
books       0.0
dtype: float64