In [1]:
import os
import pickle

import numpy as np
import spacy

from spacy.tokenizer import Tokenizer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
import pandas as pd
import sqlite3
from contextlib import closing

# Draw a random sample of 5,000 rows from the `submissions` table.
# NOTE: `order by random()` yields a different sample on every execution, so
# the outputs recorded further down will not reproduce exactly on a re-run.
#
# A sqlite3 connection used directly as a context manager only scopes a
# transaction; it does not close the connection. `closing` guarantees the
# database file handle is released as soon as the frame has been read.
with closing(sqlite3.connect("api/db-mega.sqlite3")) as conn:
    df = pd.read_sql_query("select * from submissions order by random() limit 5000;", conn)


In [3]:
# Sanity check on the sample just loaded: (row count, column count).
df.shape

(5000, 4)

In [4]:
# Preview the first ten sampled rows to see which columns the query returned.
df.head(10)

Unnamed: 0,subreddit,subreddit_subs,title,text
0,london,220287,Oxford Street. Christmas morning,
1,darksouls3,293853,Dear Bob Ross,"Bob Ross, you invaded me in the catacombs. I a..."
2,podcasts,1028017,What are your favourite daily podcasts?,
3,Bumble,77503,Thoughts about the new conversation starters?,
4,gaming,24990738,"At Pax South, there were amazing Fallout Mario...",
5,NoMansSkyTheGame,303002,Managed to get a snap of the entrance to a bla...,
6,RPClipsGTA,27361,Yung Dab learns to dab.,
7,stocks,484990,"Tesla ships 63,000 vehicles in Q1, produced 77...",[https://www.cnbc.com/2019/04/04/tesla-q1-2019...
8,pics,23741818,My girlfriend wanted to do a Quidditch costume...,
9,StarWarsBattlefront,266620,I love this community,


In [5]:
# Look at the raw `text` column on its own before any cleaning is applied.
df['text'].head(5)

0                                                     
1    Bob Ross, you invaded me in the catacombs. I a...
2                                                     
3                                                     
4                                                     
Name: text, dtype: object

In [6]:
# Replace every character that is neither a word character nor whitespace
# with a single space, producing punctuation-free copies of both text columns.
#
# regex=True is passed explicitly because pandas 2.0 switched the default of
# Series.str.replace to literal matching; without it the pattern would be
# searched for verbatim and nothing would be cleaned. The raw-string prefix
# keeps `\w` / `\s` from being treated as (invalid) Python string escapes.
df['clean_text'] = df['text'].str.replace(r'[^\w\s]', ' ', regex=True)
df['clean_title'] = df['title'].str.replace(r'[^\w\s]', ' ', regex=True)

In [7]:
# Preview again to compare `clean_text` / `clean_title` with their raw sources.
df.head(10)

Unnamed: 0,subreddit,subreddit_subs,title,text,clean_text,clean_title
0,london,220287,Oxford Street. Christmas morning,,,Oxford Street Christmas morning
1,darksouls3,293853,Dear Bob Ross,"Bob Ross, you invaded me in the catacombs. I a...",Bob Ross you invaded me in the catacombs I a...,Dear Bob Ross
2,podcasts,1028017,What are your favourite daily podcasts?,,,What are your favourite daily podcasts
3,Bumble,77503,Thoughts about the new conversation starters?,,,Thoughts about the new conversation starters
4,gaming,24990738,"At Pax South, there were amazing Fallout Mario...",,,At Pax South there were amazing Fallout Mario...
5,NoMansSkyTheGame,303002,Managed to get a snap of the entrance to a bla...,,,Managed to get a snap of the entrance to a bla...
6,RPClipsGTA,27361,Yung Dab learns to dab.,,,Yung Dab learns to dab
7,stocks,484990,"Tesla ships 63,000 vehicles in Q1, produced 77...",[https://www.cnbc.com/2019/04/04/tesla-q1-2019...,https www cnbc com 2019 04 04 tesla q1 2019...,Tesla ships 63 000 vehicles in Q1 produced 77...
8,pics,23741818,My girlfriend wanted to do a Quidditch costume...,,,My girlfriend wanted to do a Quidditch costume...
9,StarWarsBattlefront,266620,I love this community,,,I love this community


In [8]:
# Split the frame into model inputs and the label.
# Inputs: the two cleaned text columns plus the `subreddit_subs` column.
features = ['clean_text', 'clean_title', 'subreddit_subs']

# Label: the subreddit each submission belongs to.
target = 'subreddit'

# The double brackets keep `y` as a one-column DataFrame rather than a Series.
X, y = df[features], df[[target]]

In [10]:
# #Create the nlp object
# nlp = spacy.load("en_core_web_md")

In [None]:
# def get_lemmas(text):
#         """Return the Lemmas"""
#         lemmas = []
#         doc = nlp(text)
    
#         for token in doc: 
#             if ((token.is_stop == False) and (token.is_punct == False)) and (token.pos_ != 'PRON'):
#                 lemmas.append(token.lemma_)
    
#         return lemmas

In [11]:
import vectorizer_helpers

# Only the cleaned post body is vectorized here; titles are not included.
text = df["clean_text"]

# Configure the TF-IDF vectorizer:
#   * tokenization is delegated to the project's `get_lemmas` helper,
#   * float min_df / max_df are document-frequency proportions, so terms
#     appearing in fewer than 2.5% or more than 98% of documents are dropped,
#   * both unigrams and bigrams are generated.
tfidf = TfidfVectorizer(
    tokenizer=vectorizer_helpers.get_lemmas,
    min_df=0.025,
    max_df=.98,
    ngram_range=(1, 2),
)
#tfidf = TfidfVectorizer(stop_words="english")

# Learn the vocabulary and build the document-term matrix in a single call.
dtm = tfidf.fit_transform(text)

# Get feature names to use as dataframe column headers
#dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())

In [12]:
# Fit on TF-IDF Vectors
# `size` is the default number of neighbors returned for each query.
size = 501
# NOTE(review): `dtm` is the direct output of TfidfVectorizer.fit_transform.
# If that is a scipy sparse matrix, scikit-learn is expected to be unable to
# build a ball tree on it and to fall back to brute-force search with a
# warning — confirm, and consider requesting algorithm='brute' explicitly.
model  = NearestNeighbors(n_neighbors=size, algorithm='ball_tree')
# Left as the cell's final bare expression so the fitted estimator is displayed.
model.fit(dtm)



NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=501, p=2, radius=1.0)

In [13]:
# A one-element list: the vectorizer expects an iterable of documents.
user_input = ["""Capes are not a great idea of you are a super hero trying to save the world."""]

#vec_user_input = tfidf.transform(user_input)
#dist, subreddit_index = model.kneighbors(vec_user_input.todense())

# Vectorize the query with the fitted vocabulary and look up its neighbors.
# `.toarray()` yields a plain ndarray; `.todense()` would return a
# numpy.matrix, which current scikit-learn releases refuse as input.
dist, subreddit_index = model.kneighbors(tfidf.transform(user_input).toarray())

In [14]:
# `subreddit_index` comes from a query with a single document, and iterating it
# yields one array of neighbor positions per query document. The result is
# therefore a list with one item: a DataFrame holding every neighbor's row.
recommended_reddits = [df[['subreddit','title','text','subreddit_subs']].iloc[n] for n in subreddit_index]

#print(*recommended_reddits, sep = "\n")

In [15]:
import pickle

knnPickle = 'api/modelknn_pkl.sav'

# Only the subreddit column is needed to map neighbor positions back to names.
index_df = df[['subreddit']]

# Persist the fitted model, the position->subreddit lookup and the vectorizer
# together as one tuple. The `with` block guarantees the file is flushed and
# closed before anything tries to read it back.
# Note: `tfidf` was built with vectorizer_helpers.get_lemmas as its tokenizer,
# so that module must be importable wherever this file is unpickled.
with open(knnPickle, 'wb') as pkl_out:
    pickle.dump((model, index_df, tfidf), pkl_out)


In [16]:
# Load the model, lookup frame and vectorizer back from disk.
# SECURITY: pickle.load can execute arbitrary code embedded in the file; only
# load artifacts that were produced by a trusted run of this notebook.
with open(knnPickle, 'rb') as pkl_in:
    loaded_model, loaded_index_df, loaded_tfidf = pickle.load(pkl_in)

# `.toarray()` gives a plain ndarray; `.todense()` would produce a
# numpy.matrix, which current scikit-learn releases refuse as input.
dist, indices = loaded_model.kneighbors(loaded_tfidf.transform(user_input).toarray())

# `indices` holds one row per query document; there is a single query, so row 0
# is used. All neighbor positions are resolved in one positional lookup
# instead of one `iloc` call per neighbor.
recommended_reddits = loaded_index_df['subreddit'].iloc[indices[0]].tolist()



def uniq(input):
  """Return the distinct items of ``input`` in first-seen order.

  Args:
    input: Any iterable. It is consumed exactly once.

  Returns:
    A new list holding each distinct item once, at the position of its first
    occurrence.
  """
  # Materialize first so a one-shot iterator is not left half-consumed if the
  # fast path below has to be abandoned.
  items = list(input)
  try:
    # dicts preserve insertion order, so this de-duplicates in linear time
    # instead of scanning the output list for every item.
    return list(dict.fromkeys(items))
  except TypeError:
    # Unhashable items (e.g. lists) cannot be dict keys; fall back to the
    # equality-based scan so they keep working.
    output = []
    for x in items:
      if x not in output:
        output.append(x)
    return output

# Show the first three distinct subreddits, in the order kneighbors returned them.
print(uniq(recommended_reddits)[:3])

['excel', 'ValveIndex', 'RPClipsGTA']


In [None]:
# Pkl_Model = "Pickle_Model.pkl"  
# with open(Pkl_Model, 'wb') as file:  
#     pickle.dump(LR_Model, file)