## Imports

In [12]:
import spacy
#!python -m spacy download en_core_web_lg
#nlp = spacy.load("en_core_web_lg")
#nlp = spacy.load("en_core_web_md")
#nlp = spacy.load("en_core_web_sm")

import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


## Load data

In [13]:
# load data

!pip install PyDrive



In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
downloaded = drive.CreateFile({'id':"1C4bfP-e2Uoiwc0xZYprS7jar3B9Iy6dB"})  
downloaded.GetContentFile('reddit.tsv')       

In [17]:
data = pd.read_csv('reddit.tsv', sep="\t")
sample = data.sample(frac=0.015, replace=True, random_state=42)
 
print(data.shape)
print(sample.shape)

(1013000, 4)
(15195, 4)


In [18]:
train, test = train_test_split(sample, test_size=0.2, stratify=sample["subreddit"])

train.shape, test.shape

((12156, 4), (3039, 4))

In [0]:
# :: Cleaning :: #

import re
def clean(X):

    # Prevent SettingWithCopyWarning
    X = X.copy()
    
    # remove '\\n'
    X['selftext'] = X['selftext'].map(lambda x: re.sub('\\n',' ',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub('\\n',' ',str(x)))
    
    # remove '' 
    X['selftext'] = X['selftext'].map(lambda x: re.sub('<lb>',' ',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub('<lb>',' ',str(x)))
    
    # remove any text starting with User... 
    X['selftext'] = X['selftext'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
    # remove IP addresses or user IDs
    X['selftext'] =X['selftext'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
    #remove http links in the text
    X['selftext'] = X['selftext'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))
    
    return X

train = clean(train)    
test = clean(test)

In [0]:
train.head()

Unnamed: 0,id,subreddit,title,selftext
421280,4orqa3,opiates,It's just not fair,"For the second time, I have found Floor Heroin..."
167647,7ewlv6,Costco,Holiday Shipping,Just wanted to ask how shipping is around this...
535426,6w9jvx,bladerunner,Does the Final Cut feel more suspenseful than ...,I recently re-watched the Directors Cut and th...
79973,7o6wsa,amazonecho,Is there any skills that make alexa more talka...,"So yeah, i've been looking after skills that t..."
684647,5vy3fs,cocktails,Honey ginger syrup for penicillins?,I want to make penicillins at a bonfire with s...


In [20]:
X_train = train["selftext"]
X_test = test["selftext"]

y_train = train["subreddit"]
y_test = test["subreddit"]

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(12156,) (3039,)
(12156,) (3039,)


In [0]:
X_train.head()

627122    Been using this thing since December '15, look...
917458    i saw a bunch of unfamiliar Costco staff in th...
312727    We have an older toddler and an almost-1-year ...
12100     I've read many lyrical interpretations on TØP,...
825992    The learning curb on these bosses and patterns...
Name: selftext, dtype: object

## Transform data

In [0]:
# spacy tokenizer function to put into tfidf
def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip().lower() for token in doc 
            if (token.is_stop is False) and (token.is_punct is False)
            and (token.is_space is False)]

In [0]:
# Tunning Parameters

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english',
                        sublinear_tf=True,
                        strip_accents='unicode', 
                        ngram_range=(1,2),
                        max_df=.97,
                        min_df=3,
                        max_features=1000,
                        analyzer='word',
                         tokenizer=tokenize)


# Create a vocabulary and get word counts per document
vect = tfidf.fit(X_train)

# these cells are for knn model
# # Get feature names to use as dataframe column headers
# dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# # View Feature Matrix as DataFrame
# dtm.shape
# dtm.head()

In [0]:
# Pickle the vectorizer 
import pickle
with open('vectorizer2.pkl', 'wb') as fin:
  pickle.dump(vect2, fin)

# LSTM model

In [22]:
# Combine for a big vocab
docs1 = X_train + y_train
docs1

709163    If you listen to the album from start to finis...
988539    I posted a personal story of mine here once be...
530992    Hi there! I am currently deciding between a HE...
948729    When I'm playing a movie that I want to close,...
980018    I found an old golden teacher syringe from abo...
                                ...                        
255095    So I want to start off this post by saying tha...
476513    &gt; Due to copyright restrictions, the Kindle...
411236    Yes, I know of Ken Tamplin but he charges like...
355992    I remember reading some samples were from a Br...
184087    I have seen multiple English covers/websites t...
Length: 12156, dtype: object

In [23]:
# get the dictionary that will allow encoding and decoding to sequence
from keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(train['selftext'])

print(t.word_index)

Using TensorFlow backend.




In [24]:
len(t.word_index.values())

62469

In [30]:
# Use keras tokenizer to get sequences
X_sequences = t.texts_to_sequences(X_train)
X_sequences[:3]

[[25,
  20,
  1114,
  3,
  1,
  674,
  36,
  205,
  3,
  1158,
  1,
  674,
  9,
  36,
  80,
  7092,
  4,
  186,
  36,
  656,
  32,
  887,
  22694,
  45,
  60,
  3,
  5382,
  4,
  3,
  32,
  80,
  2747,
  30,
  26,
  45,
  505,
  18262,
  1043,
  45,
  6482,
  4,
  10218,
  4,
  7093,
  5,
  2921,
  1459,
  6,
  186,
  262,
  693,
  80,
  1495,
  3,
  13,
  674,
  9,
  820,
  6783,
  6,
  80,
  738,
  4,
  32,
  45,
  54,
  520,
  22,
  45,
  6784,
  13,
  207,
  2382,
  1079,
  36,
  1043,
  4,
  3039,
  17,
  80,
  854,
  1,
  769,
  2413,
  9,
  32,
  1,
  82,
  5787,
  9,
  36,
  52,
  4,
  111,
  45,
  184,
  13,
  1101,
  2,
  384,
  13,
  36,
  199,
  18263,
  4,
  2,
  6241,
  201,
  3,
  609,
  13,
  9,
  1,
  191,
  32,
  25,
  80,
  1188,
  1270,
  40,
  7,
  5,
  1771,
  657,
  25,
  20,
  220,
  1,
  674,
  4492,
  71,
  324,
  22695,
  82,
  8,
  3167,
  46,
  1484,
  198,
  1706,
  8,
  2459,
  5,
  143,
  34,
  4369,
  4039,
  4,
  198,
  4757,
  7,
  77,
  81,
  16,
  3

In [0]:
# Use the spacy tokenizer to make X_train into list of tokens
# docs = []
# for row in X_train:
#   docs.append(tokenize(row))

# docs[:5]

In [0]:
# replacing each token in X_train with a sequence #
# X_train_seq = []
# for doc in docs:
#   X_train_seq_intermediate = []
  
#   for token in doc:
#     X_train_seq_intermediate.append(t.word_index[token])
  
#   X_train_seq.append(X_train_seq_intermediate)


In [0]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

In [0]:
# Set the params of the LSTM model
# Do not change this line. You need the +1 for some reason. 
max_features = len(t.word_index.values()) + 1

maxlen = 75
batch_size = 64

In [39]:
# transform the target
labeler = LabelEncoder()
y_train_seq = labeler.fit_transform(y_train)
y_train_seq

array([196, 298, 175, ..., 864, 713, 272])

In [31]:
# Make the input an array
X_train_seq_arr = np.array(X_sequences)
X_train_seq_arr

array([list([25, 20, 1114, 3, 1, 674, 36, 205, 3, 1158, 1, 674, 9, 36, 80, 7092, 4, 186, 36, 656, 32, 887, 22694, 45, 60, 3, 5382, 4, 3, 32, 80, 2747, 30, 26, 45, 505, 18262, 1043, 45, 6482, 4, 10218, 4, 7093, 5, 2921, 1459, 6, 186, 262, 693, 80, 1495, 3, 13, 674, 9, 820, 6783, 6, 80, 738, 4, 32, 45, 54, 520, 22, 45, 6784, 13, 207, 2382, 1079, 36, 1043, 4, 3039, 17, 80, 854, 1, 769, 2413, 9, 32, 1, 82, 5787, 9, 36, 52, 4, 111, 45, 184, 13, 1101, 2, 384, 13, 36, 199, 18263, 4, 2, 6241, 201, 3, 609, 13, 9, 1, 191, 32, 25, 80, 1188, 1270, 40, 7, 5, 1771, 657, 25, 20, 220, 1, 674, 4492, 71, 324, 22695, 82, 8, 3167, 46, 1484, 198, 1706, 8, 2459, 5, 143, 34, 4369, 4039, 4, 198, 4757, 7, 77, 81, 16, 397, 133, 9516, 17, 48, 674, 25, 20, 50, 5, 181, 34, 31475, 343, 80, 333, 728, 3, 18264, 4, 80, 333, 18, 354, 108, 7878, 36, 80, 18265, 75, 537, 3, 5, 31476, 186, 3, 5, 1341, 3, 5, 488, 10, 196, 693, 156, 80, 1341, 4, 26, 45, 2794, 45, 801, 186, 26, 20, 35, 103, 119, 20, 15, 80, 674, 25, 20, 220, 

In [0]:
# Pad the sequence to make uniform entries
x_train1 = sequence.pad_sequences(X_train_seq_arr, maxlen=maxlen)

In [38]:
x_train1

array([[  354,   108,  7878, ...,    45,   177,    13],
       [ 1001,    24,   351, ...,     2,  1196,    98],
       [ 2506,     3,  1088, ...,     8,   274,    20],
       ...,
       [    0,     0,     0, ...,   117,     7,   367],
       [   92, 62465,   601, ...,   112,   782,   339],
       [    0,     0,     0, ...,    31,  2869,   139]], dtype=int32)

In [0]:
# Build the model
model = Sequential()
# Need this to flatten it to the apt shape
model.add(Embedding(max_features, 128))
# 128 specified by papers/industry. Dropout and recurrent_dropout set our forget params
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# multi class prediction of target y_train
model.add(Dense(len(y_train), activation='softmax'))


model.compile(optimizer='nadam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [0]:
# x_train1 is the padded sequence
model.fit(x_train1, y_train_seq, batch_size=batch_size, epochs=15,
          validation_split=0.2)

Train on 9724 samples, validate on 2432 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15

#Logistic Regression


In [0]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(n_jobs=10, solver='lbfgs', multi_class='multinomial')

# vectorize features
X_train_vect = vect.transform(X_train)
logreg = logreg.fit(X_train_vect, y_train)

In [0]:
X_test_vect = vect.transform(X_test)
y_pred = logreg.predict(X_test_vect)


In [0]:
from sklearn.metrics import accuracy_score

print('accuracy', accuracy_score(y_pred, y_test))

accuracy 0.09921026653504442


In [0]:
train['selftext']

84082     maybe really become blood brothers in the Germ...
547571    Good morning,  I am an undergrad research stud...
781624    To provide a little bit of background: I've be...
760278    I know this has been asked before, but most of...
538314    Hey everyone! I've started a new story recentl...
                                ...                        
773815    Apart from the obvious - why does everyone cal...
39499     So I haven't seen a doctor yet (though I need ...
370191    I just got the Wacom Intuos Draw and I have tr...
519777    Would be fun to know your preferred Arc/Episod...
879381    I don't intend to turn this into a thread bash...
Name: selftext, Length: 8104, dtype: object

In [0]:
# Test it on sample text
sample_text = """
Hello all, throw away for obvious reasons.

So, I'm a big pet lover. We have several dogs, reptiles, frogs and small animals.

I work in the pet care industry and have recently lost my job due to the shutdown.

That being the case, I've been home a lot more often, and I noticed the animals aren't attached the him like me. I thought nothing of it, as I care for them mainly and had most of them before we got together.

Our pets have their own room, and yesterday I went to let out dogs out and our 5lb Chihuahua mix was limping and putting no weight on his leg. My husband jokingly said a few minutes before I checked on him, "I got Rufus to stop scratching the door."

I thought nothing of it until my poor dog was unable to move.

I took him to the vet and they did x-rays and let me know Rufus's hip was broken. They said being so small it could be because he jumped off the furniture and hurt himself, as he's rather old as well.

My husband later admitted that he lost his temper, picked Rufus up the the scruff, and beat him.

He feels incredibly guilty and wants nothing to do with any of the pets now.

I found out yesterday I'm pregnant, and I'm worried he may loose his temper with a newborn much worse then Rufus's constant scratching.

What do I do? I love this man and we've been married half a decade now.
              """

In [0]:
# sample text must be iterable to vectorize
logreg.predict(vect.transform([sample_text])

array(['gaymers'], dtype=object)

# Stochastic Gradient Descent Classifier

In [0]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier()
# use the previously vectorized features
sgd = sgd.fit(X_train_vect, y_train)
y_pred = sgd.predict(X_test_vect)

print('SGDClassifier accuracy', accuracy_score(y_pred, y_test))

SGDClassifier accuracy 0.13968410661401776


In [0]:
sgd.predict(vect.transform([sample_text]))

array(['Petscop'], dtype='<U21')

In [0]:
# Pickle the SGD Classifier
with open('SGDClassifier.pkl', 'wb') as fin:
  pickle.dump(sgd, fin)

# New hypothesis
#### The reason all of our  models are doing poorly is because there's too many subreddits with small number of instances. Let's do it all again.

In [0]:
sample = data.sample(frac=0.015, replace=True, random_state=42)
train, test = train_test_split(sample, test_size=0.15, stratify=sample["subreddit"])

train.shape, test.shape

((12915, 4), (2280, 4))

In [0]:
train = clean(train)    
test = clean(test)
X_train = train["selftext"]
X_test = test["selftext"]

y_train = train["subreddit"]
y_test = test["subreddit"]

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(12915,) (2280,)
(12915,) (2280,)


In [0]:
# pd.set_option('display.max_rows', 1000)
# y_train.value_counts().head(1000)

# Ok the problem is actually there are too many subreddits just in general, 
# and we don't even have the main ones in here. Like r/aww, r/investing

bigboobproblems          64
sailing                  62
MoviePassClub            60
DimensionalJumping       59
SleepApnea               59
secretsanta              59
osx                      58
Corsair                  58
baldursgate              58
Conservative             57
stopsmoking              57
korea                    57
blackmirror              57
HouseOfCards             57
watercooling             57
walkingwarrobots         56
Narcolepsy               56
hiphopheads              55
photography              55
gorillaz                 54
amiibo                   54
malaysia                 54
HomeDepot                54
Volkswagen               54
bettafish                54
danganronpa              54
arrow                    54
RedditLaqueristas        54
peacecorps               54
personalfinance          54
Rowing                   54
dbz                      54
Cubers                   54
workflow                 54
baduk                    54
audible             

In [0]:
tfidf1 = TfidfVectorizer(stop_words='english',
                        sublinear_tf=True,
                        strip_accents='unicode', 
                        ngram_range=(1,2),
                        max_df=.97,
                        min_df=3,
                        max_features=1000,
                        analyzer='word',
                        token_pattern=r'\w{2,}',
                        tokenizer=tokenize)

vect = tfidf1.fit(X_train)

In [0]:
# Vectorize the Xs
X_train_vect = vect.transform(X_train)
X_test_vect = vect.transform(X_test)


In [0]:
# Logistic Regression

logreg = LogisticRegression(n_jobs=10, solver='lbfgs', multi_class='multinomial')

logreg = logreg.fit(X_train_vect, y_train)

y_pred = logreg.predict(X_test_vect)
print('Logistic accuracy:', accuracy_score(y_pred, y_test))

In [0]:
# Stochastic Gradient Descent
sgd = SGDClassifier()
sgd = sgd.fit(X_train_vect, y_train)
y_pred = sgd.predict(X_test_vect)

print('SGDClassifier accuracy', accuracy_score(y_pred, y_test))

# Try it with pickles


In [0]:
# load the pickle models
import pickle

with open('vectorizer2.pkl', 'rb') as file:
    vec_pickle = pickle.load(file)

with open('SGDClassifier_pickle.pkl', 'rb') as file1:
    clf_pickle = pickle.load(file1)

In [6]:
# Test with sample input

sample_text = """
If you have Final Fantasy 7 Remake preordered and really want it, go pick it up ASAP. Our district was told we have to send them out for online orders otherwise we would be closed again for not fulfilling these orders. They don't care, they just want the sales. Can't wait to deal with the aftermath.
              """
text_vec = vec_pickle.transform([sample_text])
output = clf_pickle.predict(text_vec)
    
type(output)

numpy.ndarray

In [7]:
output[0]

'Ghosts'

In [0]:
# Notes: it will come in JSON so use jsonify

## Model

Knn


In [0]:
# algorithm{‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}
nn = NearestNeighbors(n_neighbors=10, algorithm='brute', n_jobs=-1)

# Fit the model on TFidf Vectors
nn.fit(dtm)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [0]:
nn.kneighbors([dtm.iloc[0].values])

(array([[0.        , 1.13874581, 1.20226557, 1.21065594, 1.23113375,
         1.2352659 , 1.24365648, 1.24431962, 1.24488678, 1.24901069]]),
 array([[   0,  294, 3480, 6215,  390, 3725, 2117, 2825, 5672, 3819]]))

In [0]:
# Query Using kneighbors 
nn.kneighbors([dtm.iloc[42]])

(array([[2.10734243e-08, 1.04315344e+00, 1.04397239e+00, 1.07311766e+00,
         1.07580780e+00, 1.07972784e+00, 1.08015754e+00, 1.09048932e+00,
         1.09634997e+00, 1.09756603e+00]]),
 array([[  42, 1835, 3886, 5621, 1095, 3094, 7645, 3426, 7345, 5920]]))

## Test

In [0]:
# https://www.reddit.com/r/learnprogramming/comments/g99at4/i_printed_hello_world_in_cobol/

test_input = """
I’m not much of a programmer, but when I saw that the world needs COBOL programmers right now, 
I thought I would do my best to help out, even though I knew nothing about the language. I’ve 
spent way too many hours over the past two weeks trying to get my system configured just to 
compile and run COBOL code. It might not seem like a big deal, but seeing those two words on 
the system output makes me feel like I can do anything!
"""

test_sparse = tfidf.transform([test_input])

In [0]:
test_array = nn.kneighbors(test_sparse.todense(), n_neighbors=10)
test_array

(array([[1.18033871, 1.18590088, 1.20771511, 1.20865218, 1.20988773,
         1.21075963, 1.21339007, 1.21622075, 1.22106848, 1.22322952]]),
 array([[4575, 2399, 1868, 4506, 2257,  465, 2391, 3063, 2330,  954]]))

In [0]:
# Extract the second item in the outer array
# This is the list of the review indices that are 'closest' to input

rec_id_list = test_array[1][0]
rec_id_list

array([4575, 2399, 1868, 4506, 2257,  465, 2391, 3063, 2330,  954])

In [0]:
data.iloc[rec_id_list]["subreddit"]

4575    twentyonepilots
2399               GMAT
1868             weezer
4506              wacom
2257            osugame
465              xxketo
2391       transformers
3063            Stellar
2330               UFOs
954         foxholegame
Name: subreddit, dtype: object

In [0]:
input_two = """

Michael Jordan on Isiah Thomas: "Whatever he says now, you know it wasn't his true actions then. 
He's had time to think about it. Or, the reaction of the public, that's kind of changed his 
perspective of it. You can show me anything you want. There's no way you can convince me he wasn't an asshole.
"""

In [0]:
def recommend(req, n=10):
    """Function to recommend top n subreddits given a request."""
    # Create vector from request
    req_vec = tfidf.transform([req])

    # Get indexes for n nearest neighbors
    top_id = nn.kneighbors(req_vec.todense(), n_neighbors=n)[1][0]

    # Index-locate the neighbors in original dataframe
    top_array = data.iloc[top_id]["subreddit"]

    return top_array

In [0]:
result = recommend(input_two)
result

5579                 cigars
7646    lawschooladmissions
7344                    wls
4025               funkopop
6748           LongDistance
4890                parrots
4948         suggestmeabook
2550               Warframe
3923               ArcherFX
311                  Tinder
Name: subreddit, dtype: object

In [0]:
# Results are...lol

# Use current top trending subreddits + post and retrain entrire model = PRAW api. 