In [3]:
import numpy as np
import pandas as pd
import os

print(os.listdir("./input"))

# running our benchmark code in this kernel lead to memory errors, so 
# we do a slightly less memory intensive procedure if this is True, 
# set this as False if you are running on a computer with a lot of RAM
# it should be possible to use less memory in this kernel using generators
# rather than storing everything in RAM, but we won't explore that here

['rspct.tsv', 'subreddit_info.csv']


In [4]:
rspct_df = pd.read_csv('./input/rspct.tsv', sep='\t')
info_df  = pd.read_csv('./input/subreddit_info.csv')

## Basic data analysis

In [5]:
rspct_df.head(5)

Unnamed: 0,id,subreddit,title,selftext
0,6d8knd,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi..."
1,58mbft,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...
2,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,..."


In [7]:
# note that info_df has information on subreddits that are not in data, 
# we filter them out here

info_df = info_df[info_df.in_data].reset_index()
info_df.head(5)

Unnamed: 0,level_0,index,subreddit,category_1,category_2,category_3,in_data,reason_for_exclusion
0,0,0,whatsthatbook,advice/question,book,,True,
1,1,25,theydidthemath,advice/question,calculations,,True,
2,2,26,datarecovery,advice/question,data recovery,,True,
3,3,27,declutter,advice/question,declutter,,True,
4,4,30,productivity,advice/question,discipline,,True,


## Naive Bayes benchmark

In [8]:
# we join the title and selftext into one field

def join_text(row):
        return row['title'] + " " + row['selftext']

rspct_df['text'] = rspct_df[['title', 'selftext']].apply(join_text, axis=1)

In [9]:
# take the last 20% as a test set - N.B data is already randomly shuffled,
# and last 20% is a stratified split (equal proportions of subreddits)

train_split_index = int(len(rspct_df) * 0.8)

train_df, test_df = rspct_df[:train_split_index], rspct_df[train_split_index:]
X_train , X_test  = train_df.text, test_df.text
y_train, y_test   = train_df.subreddit, test_df.subreddit

In [16]:
from sklearn.preprocessing import LabelEncoder

# label encode y

le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test  = le.transform(y_test)

y_train[:5]

array([920, 931, 161, 827, 669])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# extract features from text using bag-of-words (single words + bigrams)
# use tfidf weighting (helps a little for Naive Bayes in general)
# note : you can do better than this by extracting more features, then 
# doing feature selection, but not enough memory on this kernel!

print('this cell will take about 10 minutes to run')

NUM_FEATURES = 30000 

tf_idf_vectorizer = TfidfVectorizer(max_features = NUM_FEATURES,
                                min_df=5,
                                ngram_range=(1,2),
                                stop_words=None,
                                token_pattern='(?u)\\b\\w+\\b',
                            )

X_train = tf_idf_vectorizer.fit_transform(X_train)
X_test  = tf_idf_vectorizer.transform(X_test)

from sklearn.feature_selection import chi2, SelectKBest

# if we have more memory, select top 100000 features and select good features

chi2_selector = SelectKBest(chi2, 30000)

chi2_selector.fit(X_train, y_train) 

X_train = chi2_selector.transform(X_train)
X_test  = chi2_selector.transform(X_test)

X_train.shape, X_test.shape

this cell will take about 10 minutes to run


In [None]:
from sklearn.naive_bayes import MultinomialNB

# train a naive bayes model, get predictions

nb_model = MultinomialNB(alpha=0.1)
nb_model.fit(X_train, y_train)

y_pred_proba = nb_model.predict_proba(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

In [None]:
# we use precision-at-k metrics to evaluate performance
# (https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Precision_at_K)

def precision_at_k(y_true, y_pred, k=5):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_pred = np.argsort(y_pred, axis=1)
    y_pred = y_pred[:, ::-1][:, :k]
    arr = [y in s for y, s in zip(y_true, y_pred)]
    return np.mean(arr)

print('precision@1 =', np.mean(y_test == y_pred))
print('precision@3 =', precision_at_k(y_test, y_pred_proba, 3))
print('precision@5 =', precision_at_k(y_test, y_pred_proba, 5))


In [None]:
import pickle

filename = 'model.sav'
pickle.dump(nb_model, open(filename, 'wb'))


## Using Pre-Trained Model (Gnews)
---

In [17]:
X_train

0         Remember your command line switches... Hi ther...
1         So what was Matt "addicted" to? Did he ever sa...
2         No Club Colors Funny story. I went to college ...
3         Not door bell, but floodlight mount height. I ...
4         Worried about my 8700k small fft/data stress r...
                                ...                        
810395    Best workflow for app integration Hi all!<lb><...
810396    4K editing machine problems i upgraded my gpu ...
810397    Advice on mixing and editing I recently attend...
810398    How to properly control hue lights? When I say...
810399    First Yak? Alright so I've never owned a kayak...
Name: text, Length: 810400, dtype: object

In [18]:
X_test

810400     100+ Classrooms..how do you engage the student...
810401     [MANGA SPOILERS] Someone I read **Chapter 48: ...
810402     [21 y.o][Beard tips/first time] I really need ...
810403     Gluing the tip back on and durability of the b...
810404     Your examples of makeup that's not "standard" ...
                                 ...                        
1012995    Is this months rebirth and dungeon astro's wor...
1012996    I might need a Medical leave from grad school ...
1012997    Police harassing ethnic minorities in Hong Kon...
1012998    SU EECS 2030 and EECS 2021 - need advice Hi, I...
1012999    What is the worse wine you ever had? My worst ...
Name: text, Length: 202600, dtype: object

In [19]:
y_train

0         talesfromtechsupport
1                      teenmom
2                       Harley
3                 ringdoorbell
4                        intel
                  ...         
810395                     git
810396                premiere
810397             VoiceActing
810398              amazonecho
810399                Kayaking
Name: subreddit, Length: 810400, dtype: object

In [20]:

import csv
import re
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords

from numpy import argmax
from sklearn.preprocessing import LabelEncoder

nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

print(tf.__version__)

2.2.0


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/popkdodge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [23]:
X_train

0         Remember your command line switches... Hi ther...
1         So what was Matt "addicted" to? Did he ever sa...
2         No Club Colors Funny story. I went to college ...
3         Not door bell, but floodlight mount height. I ...
4         Worried about my 8700k small fft/data stress r...
                                ...                        
810395    Best workflow for app integration Hi all!<lb><...
810396    4K editing machine problems i upgraded my gpu ...
810397    Advice on mixing and editing I recently attend...
810398    How to properly control hue lights? When I say...
810399    First Yak? Alright so I've never owned a kayak...
Name: text, Length: 810400, dtype: object

In [24]:
stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)

In [25]:
import tensorflow_hub as hub

In [33]:
model = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(model, output_shape=[20], input_shape=[], 
                           dtype=tf.string, trainable=True)
hub_layer(X_train[:3])

<tf.Tensor: shape=(3, 20), dtype=float32, numpy=
array([[ 1.7077502 , -4.0037446 ,  3.7489946 ,  2.948087  , -5.1763754 ,
        -4.1330338 , -2.4770722 ,  2.1744409 ,  1.1640458 ,  0.9517227 ,
        -3.0390337 ,  1.6589148 ,  0.33850574,  0.599539  , -3.4987762 ,
         1.1391282 ,  4.9868565 , -0.91237336, -1.6852969 , -2.0769358 ],
       [ 0.922072  , -4.147851  ,  1.5294727 ,  0.8775935 , -3.389209  ,
        -3.3807397 , -2.2445643 ,  3.1804943 ,  4.2478952 ,  1.2665141 ,
        -1.9726633 ,  0.9127798 ,  0.61721325, -0.15477253, -3.6096973 ,
         2.3084846 ,  3.961409  , -1.7416806 , -1.7581972 , -1.2604722 ],
       [ 1.6637378 , -2.7635317 ,  2.0233257 ,  1.113759  , -5.7743273 ,
        -3.2148895 , -3.460592  ,  1.6691866 ,  1.884025  ,  1.714093  ,
        -4.758285  ,  1.7424115 , -0.30933595, -0.2347627 , -5.368443  ,
         1.3150398 ,  3.7524164 , -1.8488756 , -3.5905008 , -0.19346792]],
      dtype=float32)>

In [None]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1013, activation='softmax'))

In [None]:
model.summary()

In [None]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(X_train,
                    y_train,
                    epochs=1000,
                    batch_size=512,
                    validation_data=(X_test, y_test),
                    verbose=1,
                    callbacks=stop )

In [18]:
y_test

array([ 290,  323,  463, ...,  166, 1011, 1000])