In [12]:
import pandas as pd
import sqlite3
import numpy as np

import spacy
from spacy.tokenizer import Tokenizer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer

from tensorflow import keras

## Get & Process Data

In [61]:
conn = sqlite3.connect("db.sqlite3")
df = pd.read_sql_query("select * from submissions;", conn)
df.shape

(7846, 4)

In [65]:
df = pd.read_csv('Kaggle Datasets.csv')
df.head(20)

Unnamed: 0,ref,title,subtitle,creator,description,version,keywords,last_updated,license_name,size,size(bytes),downloads,discussions,views,likes,kernels
0,mlg-ulb/creditcardfraud,Credit Card Fraud Detection,Anonymized credit card transactions labeled as...,Machine Learning Group - ULB,Context\n---------\n\nIt is important that cre...,3,"finance, machine learning, crime",2018-03-23 01:17:27,"Database: Open Database, Contents: Database Co...",66MB,69155632,131731,40,3052086,3323,2121
1,tmdb/tmdb-movie-metadata,TMDB 5000 Movie Dataset,"Metadata on ~5,000 movies from TMDb",The Movie Database (TMDb),### Background\nWhat can we say about the succ...,2,film,2017-09-28 01:09:12,Other (specified in description),9MB,9747156,92202,53,630515,1601,1462
2,hugomathien/soccer,European Soccer Database,"25k+ matches, players & teams attributes for E...",Hugo Mathien,The ultimate Soccer database for data analysis...,10,"europe, association football",2016-10-23 22:31:38,"Database: Open Database, Contents: © Original ...",34MB,36121187,80922,95,638231,1875,1458
3,lava18/google-play-store-apps,Google Play Store Apps,Web scraped data of 10k Play Store apps for an...,Lavanya Gupta,### Context\n\nWhile many public datasets (on ...,6,"internet, video games, computer science, mobil...",2019-02-03 13:55:47,Unknown,2MB,2013348,72020,37,440983,1861,313
4,zynicide/wine-reviews,Wine Reviews,"130k wine reviews with variety, location, wine...",zackthoutt,### Context\n\nAfter watching [Somm](http://ww...,4,"food and drink, critical theory",2017-11-27 17:08:04,CC BY-NC-SA 4.0,51MB,53336173,68002,24,342243,1609,1948
5,ronitf/heart-disease-uci,Heart Disease UCI,https://archive.ics.uci.edu/ml/datasets/Heart+...,ronit,### Context \n\nThis database contains 76 attr...,1,"classification, binary classification, health,...",2018-06-25 11:33:56,Reddit API Terms,3KB,3438,67832,41,404892,2161,625
6,unsdsn/world-happiness,World Happiness Report,Happiness scored according to economic product...,Sustainable Development Solutions Network,### Context \n\nThe World Happiness Report is ...,2,"economics, social sciences, emotion",2017-06-14 20:41:45,CC0: Public Domain,29KB,29425,67658,6,273284,976,461
7,uciml/iris,Iris Species,Classify iris plants into three species in thi...,UCI Machine Learning,The Iris dataset was used in R.A. Fisher's cla...,2,botany,2016-09-27 07:38:05,CC0: Public Domain,4KB,3718,65859,18,292620,984,3810
8,wendykan/lending-club-loan-data,Lending Club Loan Data,Analyze Lending Club's issued loans,Wendy Kan,These files contain complete loan data for all...,1,finance,2019-03-18 18:43:12,Unknown,702MB,736483000,52096,33,293239,953,582
9,uciml/breast-cancer-wisconsin-data,Breast Cancer Wisconsin (Diagnostic) Data Set,Predict whether the cancer is benign or malignant,UCI Machine Learning,Features are computed from a digitized image o...,2,healthcare,2016-09-25 10:49:04,CC BY-NC-SA 4.0,48KB,49196,51767,22,321503,837,862


In [15]:
df['clean_text'] = df['text'].str.replace('[^\w\s]',' ')
df['clean_title'] = df['title'].str.replace('[^\w\s]',' ')
df['combo'] = df['clean_text'] + df['clean_title']
df.head()

Unnamed: 0,subreddit,subreddit_subs,title,text,clean_text,clean_title,combo
0,AskReddit,26256285,"People who haven't pooped in 2019 yet, why are...",,,People who haven t pooped in 2019 yet why are...,People who haven t pooped in 2019 yet why are...
1,AskReddit,26256285,Stan Lee has passed away at 95 years old,As many of you know today is day that many of ...,As many of you know today is day that many of ...,Stan Lee has passed away at 95 years old,As many of you know today is day that many of ...
2,Home,10068,Beautiful Home :),,,Beautiful Home,Beautiful Home
3,Home,10068,This was finished yesterday..,,,This was finished yesterday,This was finished yesterday
4,videos,22083867,This is what happens when one company owns doz...,,,This is what happens when one company owns doz...,This is what happens when one company owns doz...


## Preprocessing

In [16]:
nlp = spacy.load("en_core_web_md")

tokenizer = Tokenizer(nlp.vocab)
tokens = []

""" Make them tokens """

#stop words
STOP_WORDS = nlp.Defaults.stop_words.union(['', ' ', '-', 'reddit', 'post'])
    
tokens = []

for doc in tokenizer.pipe(df['combo'], batch_size=500):
    
    doc_tokens = []
    
    for token in doc: 
        if ((token.text.lower() not in STOP_WORDS) and 
            (token.is_stop == False) and 
            (token.is_punct == False) and 
            (token.pos_ != 'PRON')):
                 doc_tokens.append(token.lemma_.lower())
    tokens.append(' '.join(doc_tokens))
    
df['tokens'] = tokens

In [17]:
tfidf_vectorizer = TfidfVectorizer(min_df=0.02, max_df=.98,
                                   ngram_range=(1,2))

In [39]:
X = tfidf_vectorizer.fit_transform(df['tokens'])
y = pd.get_dummies(df['subreddit']).values #target

print(X.shape)
print(y.shape)

(5000, 257)
(5000, 2501)


## TF Model

In [40]:
model = keras.Sequential()
model.add(keras.layers.Dense(16, input_dim=X.shape[1], activation='relu'))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(12, activation='relu'))
model.add(keras.layers.Dense(y.shape[1], activation='softmax'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [41]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)

In [43]:
history = model.fit(X_train, y_train, epochs=1, batch_size=64)



## Test

In [74]:
np.sum(np.argmax(model.predict(X0), axis=1) == np.argmax(y0, axis=1))

0

In [69]:
model.predict_classes(X_test[:1])

array([921])

In [None]:
º