In [1]:
import os
import pickle
import sqlite3

import numpy as np
import pandas as pd
import spacy

from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load up to 5,000 rows of the `submissions` table from the local SQLite file.
# There is no ORDER BY, so which rows are returned is left to SQLite.
conn = sqlite3.connect("db.sqlite3")
try:
    df = pd.read_sql_query("select * from submissions limit 5000;", conn)
finally:
    # Release the database file handle as soon as the frame is in memory;
    # nothing later in the notebook queries the database again.
    conn.close()


In [3]:
# Preview the first 10 loaded rows to check the columns that came back.
df.head(10)

Unnamed: 0,subreddit,subscribers,title,text
0,AskReddit,26241971,"People who haven't pooped in 2019 yet, why are...",
1,AskReddit,26241971,Stan Lee has passed away at 95 years old,As many of you know today is day that many of ...
2,Home,10022,Beautiful Home :),
3,Home,10022,My roommate is kicking me out because having a...,"So, I am not asking for advice, really...mores..."
4,videos,22077202,This is what happens when one company owns doz...,
5,videos,22077202,Youtube is Facilitating the Sexual Exploitatio...,
6,news,19777808,Blizzard Employees Staged a Walkout After the ...,
7,news,19777808,Kobe Bryant killed in helicopter crash in Cali...,
8,worldnews,23055673,Trump Impeached for Abuse of Power,
9,worldnews,23055673,"Two weeks before his inauguration, Donald J. T...",


In [4]:
# Look at the raw post bodies on their own; the output below shows that
# several of the first rows have a blank `text`.
df['text'].head(5)

0                                                     
1    As many of you know today is day that many of ...
2                                                     
3    So, I am not asking for advice, really...mores...
4                                                     
Name: text, dtype: object

In [5]:
# Replace every character that is neither a word character nor whitespace
# with a single space, for both the post body and the title.
# - The pattern is a raw string so `\w` / `\s` reach the regex engine intact
#   instead of being treated as (invalid) Python string escapes.
# - regex=True is explicit: without it, pandas 2.0+ matches the pattern as a
#   literal substring and no punctuation would be removed.
df['clean_text'] = df['text'].str.replace(r'[^\w\s]', ' ', regex=True)
df['clean_title'] = df['title'].str.replace(r'[^\w\s]', ' ', regex=True)

In [6]:
# Show the frame again to inspect the new `clean_text` / `clean_title` columns.
df.head(10)

Unnamed: 0,subreddit,subscribers,title,text,clean_text,clean_title
0,AskReddit,26241971,"People who haven't pooped in 2019 yet, why are...",,,People who haven t pooped in 2019 yet why are...
1,AskReddit,26241971,Stan Lee has passed away at 95 years old,As many of you know today is day that many of ...,As many of you know today is day that many of ...,Stan Lee has passed away at 95 years old
2,Home,10022,Beautiful Home :),,,Beautiful Home
3,Home,10022,My roommate is kicking me out because having a...,"So, I am not asking for advice, really...mores...",So I am not asking for advice really mores...,My roommate is kicking me out because having a...
4,videos,22077202,This is what happens when one company owns doz...,,,This is what happens when one company owns doz...
5,videos,22077202,Youtube is Facilitating the Sexual Exploitatio...,,,Youtube is Facilitating the Sexual Exploitatio...
6,news,19777808,Blizzard Employees Staged a Walkout After the ...,,,Blizzard Employees Staged a Walkout After the ...
7,news,19777808,Kobe Bryant killed in helicopter crash in Cali...,,,Kobe Bryant killed in helicopter crash in Cali...
8,worldnews,23055673,Trump Impeached for Abuse of Power,,,Trump Impeached for Abuse of Power
9,worldnews,23055673,"Two weeks before his inauguration, Donald J. T...",,,Two weeks before his inauguration Donald J T...


In [7]:
# Inputs are the two cleaned text columns plus the subscriber count;
# the label to predict is the subreddit name.
target = 'subreddit'
features = ['clean_text', 'clean_title', 'subscribers']

# y keeps its DataFrame shape (double brackets); X is the feature sub-frame.
y = df[[target]]
X = df[features]

In [8]:
# Load the `en_core_web_md` spaCy pipeline; `nlp` is what `get_lemmas` calls
# to parse text.
nlp = spacy.load("en_core_web_md")

# Build a standalone Tokenizer that shares the pipeline's vocabulary.
# Only the vocab is passed here (no extra tokenization rules); `tokenize`
# uses this object.
tokenizer = Tokenizer(nlp.vocab)

In [9]:
def tokenize(doc):
    """Run the notebook-level ``tokenizer`` over ``doc`` and return each token's text as a list of strings."""
    token_texts = []
    for token in tokenizer(doc):
        token_texts.append(token.text)
    return token_texts

In [10]:
def get_lemmas(text):
    """Parse ``text`` with the notebook-level ``nlp`` pipeline and return its lemmas.

    Tokens flagged as stop words or punctuation, and tokens whose
    part-of-speech tag is ``'PRON'``, are left out. The result is a list of
    lemma strings in document order.
    """
    return [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct) and token.pos_ != 'PRON'
    ]

In [11]:
text = df["clean_text"]

# Instantiate vectorizer object.
# Alternative configuration kept for reference (lemma tokenizer + bigrams):
#tfidf = TfidfVectorizer(tokenizer=get_lemmas, min_df=0.025, max_df=.98, ngram_range=(1,2))
tfidf = TfidfVectorizer(stop_words="english")

# Learn the vocabulary and IDF weights, and return the sparse
# document-term matrix (one row per post, one column per term).
dtm = tfidf.fit_transform(text)

# Get feature names to use as dataframe column headers.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever this installation provides.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

# toarray() gives a plain ndarray (todense() returns the legacy np.matrix type).
# NOTE: this materialises the full dense matrix in memory.
dtm = pd.DataFrame(dtm.toarray(), columns=feature_names)

In [12]:
# Fit on TF-IDF Vectors
# `size` is passed as n_neighbors, i.e. how many nearest posts are returned
# for each query.
size = 5
nn  = NearestNeighbors(n_neighbors=size, algorithm='ball_tree')
# fit() is left as the cell's last expression, so the estimator's settings
# are echoed as the cell output.
nn.fit(dtm)

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [13]:
# Free-text query to find similar posts for (a one-element list, because the
# vectorizer expects an iterable of documents).
user_input = ["""Capes are not a great idea of you are a super hero trying to save the world."""]

# Project the query into the TF-IDF space learned from the corpus.
vec_user_input = tfidf.transform(user_input)

# Densify with toarray() rather than todense(): todense() yields np.matrix,
# which recent scikit-learn releases refuse as estimator input. Reusing the
# training frame's columns keeps the query labelled with the same feature
# names `nn` was fitted on.
dist, subreddit_index = nn.kneighbors(
    pd.DataFrame(vec_user_input.toarray(), columns=dtm.columns)
)

In [14]:
# `subreddit_index` holds one row of neighbour positions per query, so this
# builds one DataFrame of matching posts per query (a single query here).
recommended_reddits = [
    df.iloc[neighbor_rows][['subreddit', 'title', 'text', 'subscribers']]
    for neighbor_rows in subreddit_index
]

print(recommended_reddits)

[          subreddit                                              title text  \
871         Gunners  Invincibles. If you upvote this, it literally ...        
878  netflixwitcher                                  To all the Morons        
872      IncelTears                                    Just a reminder        
873      IncelTears  The final boss all Incels must face after all ...        
877   tipofmytongue  [TOMT] What is the name of ankles, but above y...        

     subscribers  
871       148637  
878        73447  
872       338966  
873       338966  
877      1045210  ]


In [None]:
# Save the fitted nearest-neighbours estimator to disk.
# The estimator in this notebook is `nn` (there is no `model` variable).
# NOTE: serving new queries from the saved file also requires the fitted
# `tfidf` vectorizer, which is not persisted here.
filename = 'test_model.sav'
with open(filename, 'wb') as model_file:  # closes the handle even on error
    pickle.dump(nn, model_file)

# some time later...

# # load the model from disk
# # (only unpickle files you created yourself: pickle.load can run arbitrary code)
# with open(filename, 'rb') as model_file:
#     loaded_model = pickle.load(model_file)
# query = pd.DataFrame(tfidf.transform(user_input).toarray(), columns=dtm.columns)
# dist, subreddit_index = loaded_model.kneighbors(query)
# print(subreddit_index)