In [119]:
import pandas as pd
import sqlite3
import numpy as np

import spacy
from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow import keras

## Get & Process Data

In [120]:
conn = sqlite3.connect("db.sqlite3")
df = pd.read_sql_query("select * from submissions;", conn)
df.shape

(50000, 4)

In [121]:
#df = pd.·%&·$%&·$%&read_sql_query('db.sqlite3')
df.head(20)

Unnamed: 0,subreddit,subreddit_subs,title,text
0,Home,10106,Beautiful Home :),
1,Home,10106,This was finished yesterday..,
2,Home,10106,My roommate is kicking me out because having a...,"So, I am not asking for advice, really...mores..."
3,Home,10106,This was this kitchen I grew up with in London...,
4,Home,10106,Anyone know this style of home?,
5,Home,10106,My view from home after a long day,
6,Home,10106,That is how we make our home even cozier place 🥰😎,
7,Home,10106,Should I continue the backslash around the cor...,
8,Home,10106,My first attempt at wall design. Purposely wen...,
9,Home,10106,Best way to hide wires?,


In [122]:
df['clean_text'] = df['text'].str.replace('[^\w\s]',' ')
df['clean_title'] = df['title'].str.replace('[^\w\s]',' ')
df['combo'] = df['clean_text'] + df['clean_title']
df.head()

Unnamed: 0,subreddit,subreddit_subs,title,text,clean_text,clean_title,combo
0,Home,10106,Beautiful Home :),,,Beautiful Home,Beautiful Home
1,Home,10106,This was finished yesterday..,,,This was finished yesterday,This was finished yesterday
2,Home,10106,My roommate is kicking me out because having a...,"So, I am not asking for advice, really...mores...",So I am not asking for advice really mores...,My roommate is kicking me out because having a...,So I am not asking for advice really mores...
3,Home,10106,This was this kitchen I grew up with in London...,,,This was this kitchen I grew up with in London...,This was this kitchen I grew up with in London...
4,Home,10106,Anyone know this style of home?,,,Anyone know this style of home,Anyone know this style of home


## Preprocessing

In [123]:
nlp = spacy.load("en_core_web_md")

tokenizer = Tokenizer(nlp.vocab)
tokens = []

""" Make them tokens """

#stop words
STOP_WORDS = nlp.Defaults.stop_words.union(['', ' ', '-', 'reddit', 'post'])
    
tokens = []

for doc in tokenizer.pipe(df['combo'], batch_size=500):
    
    doc_tokens = []
    
    for token in doc: 
        if ((token.text.lower() not in STOP_WORDS) and 
            (token.is_stop == False) and 
            (token.is_punct == False) and 
            (token.pos_ != 'PRON')):
                 doc_tokens.append(token.lemma_.lower())
    tokens.append(' '.join(doc_tokens))
    
df['tokens'] = tokens

In [124]:
tfidf_vectorizer = TfidfVectorizer(min_df=0.02, max_df=.98,
                                   ngram_range=(1,2))

In [125]:
X = tfidf_vectorizer.fit_transform(df['tokens'])
y = pd.get_dummies(df['subreddit']).values #target

print(X.shape)
print(y.shape)

(50000, 215)
(50000, 100)


## TF Model

In [150]:
model = keras.Sequential()
model.add(keras.layers.Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(y.shape[1], activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [151]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [152]:
history = model.fit(X_train, y_train, epochs=10, batch_size=64)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [137]:
(np.sum(np.argmax(model.predict(X_test), axis=1) == np.argmax(y_test, axis=1)))/ X.shape[0]

0.02844

In [131]:
model.predict_classes(X_test[:1])

array([80])

In [153]:
df.to_pickle("./dummy.pkl")