# Here is an NLP model that cleans the data and predicts the subreddit each reddit post was originally posted in.  Step by step, this notebook will:
1) clean and process data
2) vectorize data
3) fit a model with scikit-learn
4) score the model with a classification report
5) test the model
6) pickle the model

In [1]:
# Imports
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import pandas as pd
import string
import re
import nltk

In [3]:
# Load in data, mined by Jonathan.
# The CSV is read directly from the raw GitHub URL, so this cell needs
# network access; the result is stored in the module-level DataFrame `df`.
url = 'https://raw.githubusercontent.com/BW-Post-Here-01/DS/master/Data/reddit_data_slimmed.csv'
df = pd.read_csv(url)

In [4]:
# Visualize df
df.head(25)

Unnamed: 0,content,subreddit
0,COMMUNITY ANNOUNCEMENT In solidarity with the ...,tattoos
1,Weekly r/Tattoos Question/FreeTalk Thread! - A...,tattoos
2,Enter Shikari and Architects album artwork and...,tattoos
3,"David Bowie Portrait - Healed, Done in April 2...",tattoos
4,Photo realism artist chicago As the title sugg...,tattoos
5,Can you do colorful Japanese/Yakuza tattoos on...,tattoos
6,Tattoo Commission Question. Seperate Artist an...,tattoos
7,Weekly r/Tattoos Question/FreeTalk Thread! - A...,tattoos
8,Partial cover up / adding to a design with a d...,tattoos
9,"Lately I realized, that very famous people, ha...",tattoos


In [5]:
# Process data with this function
def cleaning_fn(mess):
    """
    Strip punctuation and English stopwords from a string of text.

    Takes in a string of text, then performs the following:
    1. Removes every character found in ``string.punctuation``.
    2. Splits the remaining text on whitespace.
    3. Drops every word whose lowercase form is an NLTK English stopword.

    Returns a list of the surviving words.  Words keep their original
    casing; lowercasing is only applied for the stopword comparison.
    """
    # Build the stopword collection once per call instead of once per word,
    # and use a set so each membership test is O(1) rather than a list scan.
    english_stopwords = set(stopwords.words('english'))

    # Drop punctuation characters and join what is left back into a string.
    clean = ''.join(char for char in mess if char not in string.punctuation)

    # Now just remove any stopwords
    return [word for word in clean.split() if word.lower() not in english_stopwords]

In [6]:
# Randomize the rows of the df so we don't have the iloc 1-100 all classified
# as one class, the next 200 as another, etc., so we don't have issues with
# a train/test split.  reset_index(drop=True) then replaces the shuffled
# index with a fresh 0..n-1 index.
# NOTE: no random_state is passed to sample(), so the row order is different
# on every run.
df = df.sample(frac=1).reset_index(drop=True)

In [8]:
# Show that df['subreddit'] is no longer grouped by class but is sorted at random.
df.head(25)

Unnamed: 0,content,subreddit
0,"I am Dr. Buzz Aldrin, back again on reddit. I ...",IAmA
1,In pottery class I made a container to hold fl...,Jokes
2,What happens if one star in a binary pair goes...,askscience
3,My girlfriend told me to take the spider out i...,Jokes
4,"Randomly wanting to break up with bf? Hey, so ...",TwoXChromosomes
5,Imagine having a job at a Candy Store That wou...,dadjokes
6,I was reading the history of the French Revolu...,dadjokes
7,ELI5: why is it that when I'm boiling pasta wi...,explainlikeimfive
8,Did you know shower heads are bisexual Every n...,Jokes
9,"PSA: The ""Fab Wallpapers"" app which is ranked ...",Android


# The actual predictive model with four examples:

In [9]:
# Separate the post text (features) from the subreddit label (target)
X, y = df['content'], df['subreddit']

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Print the shape of each partition, one per line
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)

(25652,)
(25652,)
(6413,)
(6413,)


In [10]:
# Create pipeline: raw text -> TF-IDF features (English stop words removed)
# -> random forest classifier.  Both steps use their default settings apart
# from stop_words.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('classifier', RandomForestClassifier()),  # Originally trained with MultinomialNB() but had low accuracy
])

In [11]:
# Fit the pipeline on X_train and y_train
pipeline.fit(X_train,y_train)



Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words='english', strip_accents=None,
                                 sublinear_tf=False,
                                 token_patt...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                               

In [12]:
# Predict a subreddit label for every post in X_test
predictions = pipeline.predict(X_test)

# Score the model: print the per-class precision, recall, f1-score and
# support, comparing the true labels (y_test) with the predictions
print(classification_report(y_test,predictions))

                   precision    recall  f1-score   support

          Android       0.39      0.39      0.39        54
              DIY       0.09      0.05      0.06        41
          Fitness       0.62      0.74      0.67       222
       Futurology       0.00      0.00      0.00        29
            Games       0.58      0.16      0.25        44
     GetMotivated       0.67      0.31      0.42        13
             IAmA       0.61      0.88      0.72       270
            Jokes       0.27      0.50      0.35       344
      LifeProTips       0.75      0.73      0.74       177
  MachineLearning       0.67      0.72      0.69       192
            Music       0.29      0.25      0.27        65
        Overwatch       0.44      0.33      0.38        82
              PS4       0.47      0.60      0.53       135
   Showerthoughts       0.20      0.05      0.08        39
           Tinder       0.45      0.26      0.33        19
  TwoXChromosomes       0.38      0.36      0.37       

  'precision', 'predicted', average, warn_for)


In [13]:
# Looking at these categories, try four fake posts and see how the model does:
df.subreddit.unique()

array(['IAmA', 'Jokes', 'askscience', 'TwoXChromosomes', 'dadjokes',
       'explainlikeimfive', 'Android', 'personalfinance', 'nosleep',
       'buildapc', 'relationships', 'television', 'LifeProTips', 'webdev',
       'nba', 'MachineLearning', 'atheism', 'history', 'tifu',
       'WritingPrompts', 'space', 'movies', 'pokemon', 'leagueoflegends',
       'travel', 'pcmasterrace', 'philosophy', 'malefashionadvice',
       'Music', 'Fitness', 'GetMotivated', 'DIY', 'gameofthrones',
       'Games', 'books', 'trees', 'lifehacks', 'Showerthoughts', 'PS4',
       'politics', 'Tinder', 'Futurology', 'Overwatch', 'soccer',
       'gaming', 'listentothis', 'tattoos', 'gadgets', 'europe',
       'technology'], dtype=object)

In [14]:
# Create fn that takes in a reddit post and returns the top five most likely categories:
def get_predictions(post, num_answers=5):
    """Return the most likely subreddits for a post.

    Args:
        post: A sequence of documents as accepted by
            ``pipeline.predict_proba`` (the examples in this notebook pass
            a one-element list holding the post text).  Only the first
            document's probabilities are used.
        num_answers: How many of the top classes to return (default 5).

    Returns:
        A pandas Series indexed by subreddit name, holding the predicted
        probabilities sorted from most to least likely and truncated to
        ``num_answers`` entries.
    """
    # get the predicted probabilities for each class, labelled by class name
    preds = pd.Series(pipeline.predict_proba(post)[0], index=pipeline.classes_)

    # sort to get the most likely classes first
    preds = preds.sort_values(ascending=False)

    # honour num_answers (the slice used to be hard-coded to 5, so the
    # parameter had no effect); iloc makes the positional slice explicit
    return preds.iloc[:num_answers]

# Test one with a fake review about history

In [15]:
# Test with a fake review
history_post = [ """
                History if my favorite subject.  I love to read historical accounts about ancient Rome and Greece.
                I'm also a big World War 2 buff and I collect objects with historical significance.
                """]

In [16]:
get_predictions(history_post)

history        0.8
PS4            0.1
LifeProTips    0.1
webdev         0.0
dadjokes       0.0
dtype: float64

# Test two with a fake review about pokemon

In [17]:
# Try again and mention pokemon to see if the model correctly guesses pokemon:
pokemon_post = [ """
                My favorite pokemon are pikachu and charizard.
                """]

In [18]:
get_predictions(pokemon_post)

pokemon     0.6
dadjokes    0.3
Jokes       0.1
webdev      0.0
PS4         0.0
dtype: float64

# Test three with a fake post about android

In [19]:
# Try a fake prediction to see if android gets predicted:
android_post = [ """
                I use a galaxy note 5.  My favorite opperating system version was oreo.
                Android phones are better than iphones. I like to create apps for the app store.
                """]

In [20]:
get_predictions(android_post)

Android           0.3
askscience        0.2
IAmA              0.1
dadjokes          0.1
Showerthoughts    0.1
dtype: float64

# Test four with a fake post about music

In [21]:
# Try a fake prediction to see if music gets predicted:
music_post = [ """
                I love to listen to music.  My favorite singer/songwriter is Foy Vance.  Every so often
                I like to listen to Bob Marley.  I have a large vinyl music collection but more recently I've
                been listening to everything on Spotify.
                """]

In [22]:
get_predictions(music_post)

Music         0.3
dadjokes      0.2
askscience    0.2
trees         0.1
gaming        0.1
dtype: float64

# Pickle the model:

In [177]:
from pickle import dump
# save the model; the with-block guarantees the file is flushed and closed
# even if pickling fails part-way through
with open('reddit_model_nc.pkl', 'wb') as model_file:
    dump(pipeline, model_file)

# How to load in the model again:

In [138]:
from pickle import load
# load the model
# NOTE: unpickling can execute arbitrary code -- only load model files you
# created yourself or otherwise trust.
# NOTE(review): the save cell in this notebook writes 'reddit_model_nc.pkl',
# while this cell reads 'reddit_model.pkl' -- confirm which file is intended.
with open('reddit_model.pkl', 'rb') as model_file:
    loaded_model = load(model_file)

# For the Flask app API

In [None]:
# Example code meant to be in a Flask app.  Won't run on colab

from pickle import load
# load the model once at import time so every request reuses it
# NOTE: unpickling can execute arbitrary code -- only load model files you
# created yourself or otherwise trust.
with open('reddit_model.pkl', 'rb') as model_file:
    loaded_model = load(model_file)


from flask import jsonify, request

# NOTE: `app` (the Flask application), `pd` and `loaded_model` are expected
# to be defined by the surrounding Flask module.
@app.route("/predict.json", methods=["POST"])
def predict():
    """Predict the five most likely subreddits for a submitted post.

    Reads the ``title`` and ``text`` fields from the POSTed form data,
    joins them into a single document and runs it through ``loaded_model``.

    Returns:
        A JSON response with two parallel lists, ordered from most to
        least likely: ``subreddits`` (class names) and ``probabilities``
        (the matching predicted probabilities).

    A request missing either form field raises on the ``request.form[...]``
    lookup (Flask normally answers such lookups with an HTTP 400).
    """
    print("PREDICT ROUTE...")
    print("FORM DATA:", dict(request.form))
    #> {'title': 'example title', 'text': 'Example reddit post text here'}

    # concatenate title and text, passed in as one variable to the model
    post = request.form["title"] + ' ' + request.form["text"]

    # get predictions, store as a Pandas Series; predict_proba takes a
    # sequence of documents, so the single post is wrapped in a list
    preds = pd.Series(loaded_model.predict_proba([post])[0])

    # assign the subreddit classes to the index
    preds.index = loaded_model.classes_

    # sort by values to get the top results
    top = preds.sort_values(ascending=False).iloc[:5]

    # return the top 5 results as JSON; convert to plain lists first because
    # pandas Index/Series objects are not JSON-serializable by jsonify
    return jsonify(subreddits=top.index.tolist(),
                   probabilities=top.tolist())

# sklearn Version:

In [139]:
import sklearn
sklearn.__version__

'0.21.3'