## Imports

In [0]:
import spacy
#!python -m spacy download en_core_web_lg
#nlp = spacy.load("en_core_web_lg")
#nlp = spacy.load("en_core_web_md")
#nlp = spacy.load("en_core_web_sm")

import spacy.cli
spacy.cli.download("en_core_web_lg")
import en_core_web_lg
nlp = en_core_web_lg.load()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


## Load data

In [0]:
# load data

!pip install PyDrive



In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [0]:
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
downloaded = drive.CreateFile({'id':"1C4bfP-e2Uoiwc0xZYprS7jar3B9Iy6dB"})  
downloaded.GetContentFile('reddit.tsv')       

In [0]:
data = pd.read_csv('reddit.tsv', sep="\t")
sample = data.sample(frac=0.01, replace=True, random_state=42)
 
print(data.shape)
print(sample.shape)

(1013000, 4)
(10130, 4)


In [0]:
train, test = train_test_split(sample, test_size=0.2, stratify=sample["subreddit"])

train.shape, test.shape

((8104, 4), (2026, 4))

In [0]:
# :: Cleaning :: #

import re
def clean(X):

    # Prevent SettingWithCopyWarning
    X = X.copy()
    
    # remove '\\n'
    X['selftext'] = X['selftext'].map(lambda x: re.sub('\\n',' ',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub('\\n',' ',str(x)))
    
    # remove '' 
    X['selftext'] = X['selftext'].map(lambda x: re.sub('<lb>',' ',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub('<lb>',' ',str(x)))
    
    # remove any text starting with User... 
    X['selftext'] = X['selftext'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
    # remove IP addresses or user IDs
    X['selftext'] =X['selftext'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
    #remove http links in the text
    X['selftext'] = X['selftext'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))
    
    return X

train = clean(train)    
test = clean(test)

In [0]:
train.head()

Unnamed: 0,id,subreddit,title,selftext
627122,6glnxw,SteamController,Best ways to train Steam Controller FPS Aim?,"Been using this thing since December '15, look..."
917458,55apq4,Costco,"Was anyone at the Gaithersburg, MD Costco today?",i saw a bunch of unfamiliar Costco staff in th...
312727,6bcl84,japanlife,Family-friendly restaurants in Tokyo?,We have an older toddler and an almost-1-year ...
12100,5or5m6,twentyonepilots,The Christian Perspective,"I've read many lyrical interpretations on TØP,..."
825992,6mjggx,HollowKnight,Just Beat The Fake Knight- ohhhh does it feel ...,The learning curb on these bosses and patterns...


In [0]:
X_train = train["selftext"]
X_test = test["selftext"]

y_train = train["subreddit"]
y_test = test["subreddit"]

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(8104,) (2026,)
(8104,) (2026,)


In [0]:
X_train.head()

627122    Been using this thing since December '15, look...
917458    i saw a bunch of unfamiliar Costco staff in th...
312727    We have an older toddler and an almost-1-year ...
12100     I've read many lyrical interpretations on TØP,...
825992    The learning curb on these bosses and patterns...
Name: selftext, dtype: object

## Transform data

In [0]:
def tokenize(document):
    
    doc = nlp(document)
    
    return [token.lemma_.strip().lower() for token in doc if (token.is_stop != True) and (token.is_punct != True)]

In [0]:
# Tunning Parameters

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english',
                        sublinear_tf=True,
                        strip_accents='unicode', 
                        ngram_range=(1,2),
                        max_df=.97,
                        min_df=3,
                        max_features=1000,
                        analyzer='word',
                        token_pattern=r'\w{2,}',
                        tokenizer=tokenize)


# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(X_train)

# Get feature names to use as dataframe column headers
dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

# View Feature Matrix as DataFrame
dtm.shape
dtm.head()

Unnamed: 0,Unnamed: 1,Unnamed: 2,1,2,3,4,amp;nbsp,currently,day,edit,feel,good,gt,guy,help,hope,idea,know,like,look,love,need,new,question,read,say,sorry,start,sure,thank,thing,think,thought,time,try,want,wonder,work,$,+,...,weapon,wear,web,website,week,week ago,weekend,weight,weird,welcome,white,wide,wife,willing,win,window,wish,woman,wonder.1,word,work.1,work.2,world,worried,worry,worth,write,wrong,x,yeah,year,year.1,year ago,year old,yes,yesterday,young,youtube,zero,|
0,0.145189,0.196442,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.182503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.258734,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.120653,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.181796,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.190928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.241605,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# LSTM model

In [0]:
# Combine for a big vocab
docs1 = X_train + y_train
docs1

627122    Been using this thing since December '15, look...
917458    i saw a bunch of unfamiliar Costco staff in th...
312727    We have an older toddler and an almost-1-year ...
12100     I've read many lyrical interpretations on TØP,...
825992    The learning curb on these bosses and patterns...
                                ...                        
551120    This "tech" has been brought up a few times, b...
184078    I'm writing a paper in my psychosocial class. ...
757634    My roommate, in an act of charity, let two str...
48320     I'm not talking about the sterotypical loud on...
602566    If I were to go guaranteed a school into the E...
Length: 8104, dtype: object

In [0]:
# get it into sequences
from keras.preprocessing.text import Tokenizer
t = Tokenizer()
t.fit_on_texts(docs1)

print(t.word_index)



In [0]:
len(t.word_index.values())

50347

In [0]:
word_list = []
for row in X_train:
  word_list.append(tokenize(row))

word_list

[['thing',
  'december',
  '15',
  'look',
  'way',
  'improve',
  'aim',
  'currently',
  'focus',
  'mainly',
  'improve',
  'overwatch',
  'quake',
  'live',
  'champions',
  '',
  'tend',
  'hold',
  'thing',
  'sit',
  '',
  'mouse',
  'setting',
  'use',
  '',
  'practice',
  'drill',
  'technique',
  '',
  '',
  'gyro',
  'right',
  'pad',
  'touch',
  'slightly',
  'high',
  'smoothing',
  'minimum',
  'movement',
  'threshold',
  'minimum',
  'alongside',
  'trackpad',
  'similar',
  'setting',
  'high',
  'sensitivity'],
 ['see',
  'bunch',
  'unfamiliar',
  'costco',
  'staff',
  'cashier',
  'line',
  'clipboard',
  '',
  'cashier',
  'look',
  'stressed',
  'cashier',
  'helper',
  'line',
  '',
  'store',
  'audit',
  '',
  'busy',
  'costcos',
  'wash',
  'dc',
  'area',
  '',
  'not',
  'help',
  'usual',
  'friday',
  'afternoon',
  '4',
  'customer',
  'deep',
  'line',
  'day',
  'coupon',
  'book',
  'day',
  '',
  'cashier',
  'pos',
  'machine',
  'reboot',
  'tra

In [0]:
X_train_seq = []
for list in word_list:
  X_train_seq_intermediate = []
  
  for word in list:
    X_train_seq_intermediate.append(t.word_index[word])
  
  X_train_seq.append(X_train_seq_intermediate)


In [0]:
# transform the target
labeler = LabelEncoder()
y_train_seq = labeler.fit_transform(y_train)
y_train_seq

array([341,  80, 673, ..., 453, 178, 962])

In [0]:
X_train_seq

In [0]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

Using TensorFlow backend.


In [0]:
# Do not change this line. You need the +1 for some reason. 
max_features = len(t.word_index.values()) + 1

maxlen = 75
batch_size = 64

In [0]:
X_train_seq_arr = np.array(X_train_seq)
X_train_seq_arr

array([list([78, 1671, 459, 64, 62, 844, 2079, 216, 648, 1054, 844, 6452, 24825, 195, 7927, 5, 1605, 485, 78, 591, 5, 1273, 747, 81, 5, 730, 5163, 1900, 5, 5, 6953, 98, 1922, 842, 946, 191, 22496, 1923, 1728, 7295, 1923, 5065, 14033, 300, 747, 191, 5066]),
       list([86, 899, 6453, 3701, 2080, 5660, 341, 17458, 5, 5660, 64, 4229, 5660, 10490, 341, 5, 447, 4226, 5, 1800, 24826, 1924, 3757, 398, 5, 27, 66, 1437, 1395, 3407, 113, 1070, 953, 341, 70, 8704, 308, 70, 5, 5660, 5861, 1116, 4455, 1740, 22343, 5]),
       list([160, 5235, 52106, 160, 250, 548, 2165, 130, 14034, 401, 290, 5, 4050, 169, 33, 1884, 300, 511, 595, 1297, 5, 92, 333]),
       ...,
       list([1725, 847, 3233, 164, 2936, 8868, 791, 312, 81, 2388, 1117, 7278, 1066, 2260, 618, 11024, 2694, 3482, 3633, 6454, 1165, 1779, 6245, 326, 2260, 93, 1528, 427, 76, 9254, 225, 999, 618, 154, 2694, 3482, 355, 1725, 3633, 1083, 48, 70, 195, 999, 1037, 7278, 1804, 4940, 2694, 3482, 2388, 15443, 5696, 4044, 275, 697, 1725, 366, 1289, 

In [0]:
x_train1 = sequence.pad_sequences(X_train_seq_arr, maxlen=maxlen)

In [0]:
x_train1

array([[    0,     0,     0, ...,   747,   191,  5066],
       [    0,     0,     0, ...,  1740, 22343,     5],
       [    0,     0,     0, ...,     5,    92,   333],
       ...,
       [ 1165,  1779,  6245, ..., 16376,    98,   855],
       [20449,  6404,   418, ...,    14,   737,   418],
       [   29,  1200,  7253, ...,  2564,   601,  3774]], dtype=int32)

In [0]:
# Build the model
model = Sequential()
# Need this to flatten it to the apt shape
model.add(Embedding(max_features, 128))
# 128 specified by papers/industry. Dropout and recurrent_dropout set our forget params
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(y_train_seq), activation='softmax'))


model.compile(optimizer='nadam', loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [0]:
model.fit(x_train1, y_train_seq, batch_size=batch_size, epochs=15,
          validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 6483 samples, validate on 1621 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x7fef8fea80f0>

## Model

Knn


In [0]:
# algorithm{‘auto’, ‘ball_tree’, ‘kd_tree’, ‘brute’}
nn = NearestNeighbors(n_neighbors=10, algorithm='brute', n_jobs=-1)

# Fit the model on TFidf Vectors
nn.fit(dtm)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
                 radius=1.0)

In [0]:
nn.kneighbors([dtm.iloc[0].values])

(array([[0.        , 1.13874581, 1.20226557, 1.21065594, 1.23113375,
         1.2352659 , 1.24365648, 1.24431962, 1.24488678, 1.24901069]]),
 array([[   0,  294, 3480, 6215,  390, 3725, 2117, 2825, 5672, 3819]]))

In [0]:
# Query Using kneighbors 
nn.kneighbors([dtm.iloc[42]])

(array([[2.10734243e-08, 1.04315344e+00, 1.04397239e+00, 1.07311766e+00,
         1.07580780e+00, 1.07972784e+00, 1.08015754e+00, 1.09048932e+00,
         1.09634997e+00, 1.09756603e+00]]),
 array([[  42, 1835, 3886, 5621, 1095, 3094, 7645, 3426, 7345, 5920]]))

## Test

In [0]:
# https://www.reddit.com/r/learnprogramming/comments/g99at4/i_printed_hello_world_in_cobol/

test_input = """
I’m not much of a programmer, but when I saw that the world needs COBOL programmers right now, 
I thought I would do my best to help out, even though I knew nothing about the language. I’ve 
spent way too many hours over the past two weeks trying to get my system configured just to 
compile and run COBOL code. It might not seem like a big deal, but seeing those two words on 
the system output makes me feel like I can do anything!
"""

test_sparse = tfidf.transform([test_input])

In [0]:
test_array = nn.kneighbors(test_sparse.todense(), n_neighbors=10)
test_array

(array([[1.18033871, 1.18590088, 1.20771511, 1.20865218, 1.20988773,
         1.21075963, 1.21339007, 1.21622075, 1.22106848, 1.22322952]]),
 array([[4575, 2399, 1868, 4506, 2257,  465, 2391, 3063, 2330,  954]]))

In [0]:
# Extract the second item in the outer array
# This is the list of the review indices that are 'closest' to input

rec_id_list = test_array[1][0]
rec_id_list

array([4575, 2399, 1868, 4506, 2257,  465, 2391, 3063, 2330,  954])

In [0]:
data.iloc[rec_id_list]["subreddit"]

4575    twentyonepilots
2399               GMAT
1868             weezer
4506              wacom
2257            osugame
465              xxketo
2391       transformers
3063            Stellar
2330               UFOs
954         foxholegame
Name: subreddit, dtype: object

In [0]:
input_two = """

Michael Jordan on Isiah Thomas: "Whatever he says now, you know it wasn't his true actions then. 
He's had time to think about it. Or, the reaction of the public, that's kind of changed his 
perspective of it. You can show me anything you want. There's no way you can convince me he wasn't an asshole.
"""

In [0]:
def recommend(req, n=10):
    """Function to recommend top n subreddits given a request."""
    # Create vector from request
    req_vec = tfidf.transform([req])

    # Get indexes for n nearest neighbors
    top_id = nn.kneighbors(req_vec.todense(), n_neighbors=n)[1][0]

    # Index-locate the neighbors in original dataframe
    top_array = data.iloc[top_id]["subreddit"]

    return top_array

In [0]:
result = recommend(input_two)
result

5579                 cigars
7646    lawschooladmissions
7344                    wls
4025               funkopop
6748           LongDistance
4890                parrots
4948         suggestmeabook
2550               Warframe
3923               ArcherFX
311                  Tinder
Name: subreddit, dtype: object

In [0]:
# Results are...lol

# Use current top trending subreddits + post and retrain entrire model = PRAW api. 

## Pickle