In [119]:
import os
import pickle

import numpy as np
import spacy

from spacy.tokenizer import Tokenizer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [120]:
import sqlite3
import pandas as pd

# Draw a random sample of 5000 rows from the `submissions` table of the local
# SQLite file. `order by random()` means every run sees a different sample.
conn = sqlite3.connect("api/db-mega.sqlite3")
df = pd.read_sql_query(
    sql="select * from submissions order by random() limit 5000;",
    con=conn,
)


In [121]:
# Sanity check: (rows, columns) of the sampled frame.
df.shape

(5000, 4)

In [122]:
# Preview the first ten sampled rows.
df.head(10)

Unnamed: 0,subreddit,subreddit_subs,title,text
0,Frugal,1195264,It's often more frugal to spend more money and...,$60 may seem like a lot of money for a cast ir...
1,conspiracy,1088292,Nestle bottles millions of litres of Canadian ...,
2,grandorder,141343,I will wait for you good friend o7,
3,tifu,15186112,TIFU by ruining a happy family.,"Like all good TIFUs, this didn’t happen yester..."
4,askgaybros,195264,"Yesterday my sister called me a virgin, and to...",Apparently she doesn't consider gay sex to be ...
5,netflixwitcher,73985,#CountingDown,
6,halo,402511,All of our friends who have sleeping avatars o...,
7,mildlyinfuriating,2216136,"Table at my school designed to fit, but doesn't",
8,elderscrollsonline,246975,A little late to Elsweyr but happy to be here!,
9,ShitPostCrusaders,456327,AN IL VENTO D'ORO FOR YOU!,


In [123]:
# Preview the first five values of the `text` column.
df['text'].head(5)

0    $60 may seem like a lot of money for a cast ir...
1                                                     
2                                                     
3    Like all good TIFUs, this didn’t happen yester...
4    Apparently she doesn't consider gay sex to be ...
Name: text, dtype: object

In [124]:
# Replace every character that is neither a word character nor whitespace with
# a single space, for both the `text` and `title` columns.
# `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the text unchanged.
# The raw-string prefix avoids invalid-escape warnings for `\w` and `\s`.
df['clean_text'] = df['text'].str.replace(r'[^\w\s]', ' ', regex=True)
df['clean_title'] = df['title'].str.replace(r'[^\w\s]', ' ', regex=True)

In [125]:
# Preview the frame again, now including the `clean_text` / `clean_title` columns.
df.head(10)

Unnamed: 0,subreddit,subreddit_subs,title,text,clean_text,clean_title
0,Frugal,1195264,It's often more frugal to spend more money and...,$60 may seem like a lot of money for a cast ir...,60 may seem like a lot of money for a cast ir...,It s often more frugal to spend more money and...
1,conspiracy,1088292,Nestle bottles millions of litres of Canadian ...,,,Nestle bottles millions of litres of Canadian ...
2,grandorder,141343,I will wait for you good friend o7,,,I will wait for you good friend o7
3,tifu,15186112,TIFU by ruining a happy family.,"Like all good TIFUs, this didn’t happen yester...",Like all good TIFUs this didn t happen yester...,TIFU by ruining a happy family
4,askgaybros,195264,"Yesterday my sister called me a virgin, and to...",Apparently she doesn't consider gay sex to be ...,Apparently she doesn t consider gay sex to be ...,Yesterday my sister called me a virgin and to...
5,netflixwitcher,73985,#CountingDown,,,CountingDown
6,halo,402511,All of our friends who have sleeping avatars o...,,,All of our friends who have sleeping avatars o...
7,mildlyinfuriating,2216136,"Table at my school designed to fit, but doesn't",,,Table at my school designed to fit but doesn t
8,elderscrollsonline,246975,A little late to Elsweyr but happy to be here!,,,A little late to Elsweyr but happy to be here
9,ShitPostCrusaders,456327,AN IL VENTO D'ORO FOR YOU!,,,AN IL VENTO D ORO FOR YOU


In [126]:
# Label column: the subreddit each submission belongs to.
target = 'subreddit'

# Input columns: the two cleaned text columns plus `subreddit_subs`.
features = ['clean_text', 'clean_title', 'subreddit_subs']

# `y` uses a one-element list so it stays a single-column DataFrame.
y = df[[target]]
X = df[features]

In [127]:
# Load the `en_core_web_md` spaCy pipeline; `get_lemmas` runs documents through it.
nlp = spacy.load(name="en_core_web_md")

# Stand-alone tokenizer built on the pipeline's vocabulary; used by `tokenize`.
tokenizer = Tokenizer(nlp.vocab)

In [128]:
def tokenize(doc):
    """Tokenize ``doc`` with the module-level ``tokenizer``.

    Returns a list holding the ``.text`` of every token, in document order.
    """
    pieces = tokenizer(doc)
    return list(map(lambda piece: piece.text, pieces))

In [129]:
def get_lemmas(text):
    """Run ``text`` through the module-level ``nlp`` pipeline and return lemmas.

    Tokens flagged as stop words, tokens flagged as punctuation, and tokens
    whose part-of-speech tag is ``'PRON'`` are left out. The remaining tokens
    contribute their ``lemma_`` string, in document order.
    """
    return [
        tok.lemma_
        for tok in nlp(text)
        if not (tok.is_stop or tok.is_punct or tok.pos_ == 'PRON')
    ]

In [130]:
# Corpus to vectorise: the cleaned submission bodies.
text = df["clean_text"]

# TF-IDF over unigrams and bigrams of the lemmas produced by `get_lemmas`.
# Terms found in fewer than 2.5% or in more than 98% of documents are dropped.
# (A simpler alternative tried earlier: TfidfVectorizer(stop_words="english").)
tfidf = TfidfVectorizer(
    tokenizer=get_lemmas,
    ngram_range=(1, 2),
    min_df=0.025,
    max_df=0.98,
)

# Learn the vocabulary and build the sparse document-term matrix in one step.
dtm = tfidf.fit_transform(text)

In [131]:
# Number of neighbours returned per query.
size = 501

# Index the TF-IDF document-term matrix for nearest-neighbour lookup.
# NOTE(review): scikit-learn documents that fitting on sparse input overrides
# `algorithm` and uses brute force, so 'ball_tree' may not take effect here.
model = NearestNeighbors(algorithm='ball_tree', n_neighbors=size)
model.fit(dtm)



NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=501, p=2, radius=1.0)

In [132]:
# Free-text query, wrapped in a list because the vectorizer expects an iterable
# of documents.
user_input = ["""Capes are not a great idea of you are a super hero trying to save the world."""]

# Vectorise the query with the fitted TF-IDF vocabulary and fetch its nearest
# neighbours. `.toarray()` is used instead of `.todense()`: the latter returns
# an `np.matrix`, which current scikit-learn releases reject with a TypeError.
# `dist` and `subreddit_index` each hold one row per query.
dist, subreddit_index = model.kneighbors(tfidf.transform(user_input).toarray())

In [133]:
# `subreddit_index` has one row of neighbour positions per query, so each list
# element is the slice of `df` (four display columns) for one query.
recommended_reddits = [
    df[['subreddit','title','text','subreddit_subs']].iloc[neighbour_positions]
    for neighbour_positions in subreddit_index
]

# Print the per-query frames one after another.
print(*recommended_reddits, sep = "\n")

                  subreddit  \
1025             NHLStreams   
3493         BoJackHorseman   
3495               CasualUK   
3496             Cigarettes   
3497            woodworking   
3499                diablo3   
3500           rickandmorty   
3501                  razer   
3502              gtaonline   
3503          StreetFighter   
3504           MortalKombat   
3505               GalaxyS8   
3506        weddingplanning   
3507                  korea   
3492              tryonhaul   
3237                 spacex   
3513             3Dprinting   
3514         deadbydaylight   
3515           TheSilphRoad   
3516              offlineTV   
3517            BlackClover   
3518              Instagram   
3520                   juul   
3521             PlayTemtem   
3522             Cigarettes   
3523                  poker   
3525              EarthPorn   
3526                    DnD   
3527               software   
3510             PowerShell   
...                     ...   
3132    

In [134]:
import pickle

# Destination file for the serialised artefacts.
knnPickle = 'modelnew_pkl.sav'

# Only the `subreddit` column is stored; it maps neighbour positions back to names.
index_df = df[['subreddit']]

# Persist the fitted neighbour model, the lookup frame and the fitted vectorizer
# as one tuple. The `with` block guarantees the file handle is flushed and
# closed, which the bare `open(...)` call did not.
# NOTE: pickle stores functions by reference, so `get_lemmas` (the vectorizer's
# tokenizer) must be importable/defined wherever this file is loaded.
with open(knnPickle, 'wb') as pickle_file:
    pickle.dump((model, index_df, tfidf), pickle_file)


In [135]:
# Reload the (model, lookup frame, vectorizer) tuple from disk. The `with`
# block closes the file handle once loading finishes.
# NOTE: `pickle.load` can execute arbitrary code; only load files you created.
with open(knnPickle, 'rb') as pickle_file:
    loaded_model, loaded_index_df, loaded_tfidf = pickle.load(pickle_file)

# Query the reloaded model. `.toarray()` replaces `.todense()` because the
# latter returns an `np.matrix`, which current scikit-learn releases reject.
dist, indices = loaded_model.kneighbors(loaded_tfidf.transform(user_input).toarray())

# `indices[0]` is the neighbour row for the single query; map each position to
# its subreddit name.
recommended_reddits = [loaded_index_df.iloc[n]['subreddit'] for n in indices[0]]



def uniq(input):
  """Return the items of ``input`` as a list with repeats removed.

  The first occurrence of each item is kept and the original order is
  preserved. Membership is tested with ``in`` against the result list, so
  unhashable items are supported.
  """
  kept = []
  for item in input:
    if item in kept:
      continue
    kept.append(item)
  return kept

# Show the first three distinct subreddit names, in neighbour order.
print(uniq(recommended_reddits)[:3])

['NHLStreams', 'BoJackHorseman', 'CasualUK']
