In [84]:
import os
import pickle

import numpy as np
import spacy

from spacy.tokenizer import Tokenizer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [88]:
import pandas as pd
import sqlite3
conn = sqlite3.connect("api/db.sqlite3")
df = pd.read_sql_query("select * from submissions limit 7000;", conn)


In [89]:
df.shape

(7000, 4)

In [90]:
df.head(10)

Unnamed: 0,subreddit,subreddit_subs,title,text
0,Home,10119,Beautiful Home :),
1,Home,10119,This was finished yesterday..,
2,Home,10119,My roommate is kicking me out because having a...,"So, I am not asking for advice, really...mores..."
3,Home,10119,This was this kitchen I grew up with in London...,
4,Home,10119,Anyone know this style of home?,
5,Home,10119,My view from home after a long day,
6,Home,10119,That is how we make our home even cozier place 🥰😎,
7,Home,10119,Should I continue the backslash around the cor...,
8,Home,10119,My first attempt at wall design. Purposely wen...,
9,Home,10119,Best way to hide wires?,


In [91]:
df['text'].head(5)

0                                                     
1                                                     
2    So, I am not asking for advice, really...mores...
3                                                     
4                                                     
Name: text, dtype: object

In [92]:
df['clean_text'] = df['text'].str.replace('[^\w\s]',' ')
df['clean_title'] = df['title'].str.replace('[^\w\s]',' ')

In [93]:
df.head(10)

Unnamed: 0,subreddit,subreddit_subs,title,text,clean_text,clean_title
0,Home,10119,Beautiful Home :),,,Beautiful Home
1,Home,10119,This was finished yesterday..,,,This was finished yesterday
2,Home,10119,My roommate is kicking me out because having a...,"So, I am not asking for advice, really...mores...",So I am not asking for advice really mores...,My roommate is kicking me out because having a...
3,Home,10119,This was this kitchen I grew up with in London...,,,This was this kitchen I grew up with in London...
4,Home,10119,Anyone know this style of home?,,,Anyone know this style of home
5,Home,10119,My view from home after a long day,,,My view from home after a long day
6,Home,10119,That is how we make our home even cozier place 🥰😎,,,That is how we make our home even cozier place
7,Home,10119,Should I continue the backslash around the cor...,,,Should I continue the backslash around the cor...
8,Home,10119,My first attempt at wall design. Purposely wen...,,,My first attempt at wall design Purposely wen...
9,Home,10119,Best way to hide wires?,,,Best way to hide wires


In [94]:
# We set our features as description, and target as subreddit.  
# Create a mass text.

features = ['clean_text', 'clean_title', 'subreddit_subs'] 
target = 'subreddit'

X = df[features]
y = df[[target]]

In [95]:
#Create the nlp object
nlp = spacy.load("en_core_web_md")

# create tokenizer object
tokenizer = Tokenizer(nlp.vocab)

In [96]:
def tokenize(doc):
        """Return the tokens"""
        return [token.text for token in tokenizer(doc)]

In [97]:
def get_lemmas(text):
        """Return the Lemmas"""
        lemmas = []
        doc = nlp(text)
    
        for token in doc: 
            if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
                lemmas.append(token.lemma_)
    
        return lemmas

In [98]:
text = df["clean_text"]

# Instantiate vectorizer object
tfidf = TfidfVectorizer(tokenizer=get_lemmas, min_df=0.025, max_df=.98, ngram_range=(1,2))
#tfidf = TfidfVectorizer(stop_words="english")
# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(text) # Similiar to fit_predict

# Get feature names to use as dataframe column headers
#dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [99]:
# Fit on TF-IDF Vectors
size = 501
model  = NearestNeighbors(n_neighbors=size, algorithm='ball_tree')
model.fit(dtm)



NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=501, p=2, radius=1.0)

In [100]:
user_input = ["""Capes are not a great idea of you are a super hero trying to save the world."""]

#vec_user_input = tfidf.transform(user_input)
#dist, subreddit_index = model.kneighbors(vec_user_input.todense())
dist, subreddit_index = model.kneighbors(tfidf.transform(user_input).todense())

In [101]:
recommended_reddits = [df[['subreddit','title','text','subreddit_subs']].iloc[n] for n in subreddit_index]

print(*recommended_reddits, sep = "\n")

      subreddit                                              title text  \
4107        aww                      This dad did not want a puppy        
4076        aww  This is "Frida", she has saved 52 people so fa...        
4075        aww  My autistic son hates his photo taken, so I le...        
4074        aww                        Her reaction at the end :’)        
4073        aww                 Puppy wants to nap with his friend        
4072        aww                                          Baby duck        
4071        aww  Osaka Aquarium just stepped up their gift shop...        
4070        aww  Service pitbull training to protect his owner'...        
4069        aww  Gorgeous grey wolf becomes a good boy when vis...        
4068        aww  Mister Weez was always my big chonk. After a y...        
4067        aww             Mittens! I told you, no more fighting.        
4066        aww               I take it my mailman is a dog lover.        
4065        aww  When you

In [77]:
import pickle

knnPickle = 'model_pkl.sav'

index_df = df[['subreddit']]

pickle.dump((model, index_df, tfidf), open(knnPickle, 'wb'))


In [79]:
# # load the model from disk
loaded_model, loaded_index_df, loaded_tfidf = pickle.load(open(knnPickle, 'rb'))
dist, indices = loaded_model.kneighbors(loaded_tfidf.transform(user_input).todense())
recommended_reddits = [loaded_index_df.iloc[n]['subreddit'] for n in indices[0]]
print(set(recommended_reddits))
print(recommended_reddits)

def uniq(input):
  output = []
  for x in input:
    if x not in output:
      output.append(x)
  return output

{'Tinder', 'leagueoflegends', 'AskMen'}
['AskMen', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends', 'leagueoflegends',