In [1]:
import os
import pickle

import numpy as np
import spacy

from spacy.tokenizer import Tokenizer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
import pandas as pd
import sqlite3
conn = sqlite3.connect("api/db.sqlite3")
df = pd.read_sql_query("select * from submissions limit 5000;", conn)


In [3]:
df.shape

(5000, 4)

In [64]:
df.head(10)

Unnamed: 0,subreddit,subreddit_subs,title,text
0,Home,10102,Beautiful Home :),
1,Home,10102,This was finished yesterday..,
2,Home,10102,My roommate is kicking me out because having a...,"So, I am not asking for advice, really...mores..."
3,Home,10102,This was this kitchen I grew up with in London...,
4,Home,10102,Anyone know this style of home?,
5,Home,10102,My view from home after a long day,
6,AskReddit,26268612,"People who haven't pooped in 2019 yet, why are...",
7,AskReddit,26268612,Stan Lee has passed away at 95 years old,As many of you know today is day that many of ...
8,AskReddit,26268612,How would you feel about a feature where if so...,
9,AskReddit,26268612,"Reddit, how would you feel about a law that ba...",


In [65]:
df['text'].head(5)

0                                                     
1                                                     
2    So, I am not asking for advice, really...mores...
3                                                     
4                                                     
Name: text, dtype: object

In [66]:
df['clean_text'] = df['text'].str.replace('[^\w\s]',' ')
df['clean_title'] = df['title'].str.replace('[^\w\s]',' ')

In [67]:
df.head(10)

Unnamed: 0,subreddit,subreddit_subs,title,text,clean_text,clean_title
0,Home,10102,Beautiful Home :),,,Beautiful Home
1,Home,10102,This was finished yesterday..,,,This was finished yesterday
2,Home,10102,My roommate is kicking me out because having a...,"So, I am not asking for advice, really...mores...",So I am not asking for advice really mores...,My roommate is kicking me out because having a...
3,Home,10102,This was this kitchen I grew up with in London...,,,This was this kitchen I grew up with in London...
4,Home,10102,Anyone know this style of home?,,,Anyone know this style of home
5,Home,10102,My view from home after a long day,,,My view from home after a long day
6,AskReddit,26268612,"People who haven't pooped in 2019 yet, why are...",,,People who haven t pooped in 2019 yet why are...
7,AskReddit,26268612,Stan Lee has passed away at 95 years old,As many of you know today is day that many of ...,As many of you know today is day that many of ...,Stan Lee has passed away at 95 years old
8,AskReddit,26268612,How would you feel about a feature where if so...,,,How would you feel about a feature where if so...
9,AskReddit,26268612,"Reddit, how would you feel about a law that ba...",,,Reddit how would you feel about a law that ba...


In [68]:
# We set our features as description, and target as subreddit.  
# Create a mass text.

features = ['clean_text', 'clean_title', 'subreddit_subs'] 
target = 'subreddit'

X = df[features]
y = df[[target]]

In [69]:
#Create the nlp object
nlp = spacy.load("en_core_web_md")

# create tokenizer object
tokenizer = Tokenizer(nlp.vocab)

In [70]:
def tokenize(doc):
        """Return the tokens"""
        return [token.text for token in tokenizer(doc)]

In [71]:
def get_lemmas(text):
        """Return the Lemmas"""
        lemmas = []
        doc = nlp(text)
    
        for token in doc: 
            if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
                lemmas.append(token.lemma_)
    
        return lemmas

In [72]:
text = df["clean_text"]

# Instantiate vectorizer object
tfidf = TfidfVectorizer(tokenizer=get_lemmas, min_df=0.025, max_df=.98, ngram_range=(1,2))
#tfidf = TfidfVectorizer(stop_words="english")
# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(text) # Similiar to fit_predict

# Get feature names to use as dataframe column headers
#dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [73]:
# Fit on TF-IDF Vectors
size = 5
model  = NearestNeighbors(n_neighbors=size, algorithm='ball_tree')
model.fit(dtm)



NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [74]:
user_input = ["""Capes are not a great idea of you are a super hero trying to save the world."""]

#vec_user_input = tfidf.transform(user_input)
#dist, subreddit_index = model.kneighbors(vec_user_input.todense())
dist, subreddit_index = model.kneighbors(tfidf.transform(user_input).todense())

In [75]:
recommended_reddits = [df[['subreddit','title','text','subreddit_subs']].iloc[n] for n in subreddit_index]

print(*recommended_reddits, sep = "\n")

              subreddit                                              title  \
2596  AskScienceFiction  [LOTR] Gandalf says "the world is gnawed by na...   
1994           baseball  THE HOUSTON ASTROS ARE THE 2017 WORLD SERIES C...   
3131     MechanicAdvice    Don't Ever Tell Anyone Your Birthday in a shop.   
3134      YouShouldKnow  YSK: Your last chance to save the freedom of f...   
3132     MechanicAdvice  I spent $1,000 on this 08' Malibu LS with a bl...   

                                                   text  subreddit_subs  
2596  Wouldn't it be a good idea to inform the world...          176850  
1994  THIS IS THEIR FIRST WORLD SERIES TITLE IN FRAN...         1106726  
3131                                                             309626  
3134                                                            1739419  
3132                                                             309626  


In [76]:
import pickle

knnPickle = 'model_pkl.sav'

index_df = df[['subreddit']]

pickle.dump((model, index_df), open(knnPickle, 'wb'))


In [77]:
# # load the model from disk
loaded_model, loaded_index_df = pickle.load(open(knnPickle, 'rb'))
dist, indices = loaded_model.kneighbors(tfidf.transform(user_input).todense())
recommended_reddits = [loaded_index_df.iloc[n]['subreddit'] for n in indices[0]]
print(recommended_reddits)

['AskScienceFiction', 'baseball', 'MechanicAdvice', 'YouShouldKnow', 'MechanicAdvice']
