In [3]:
import numpy as np
import pandas as pd
import os

print(os.listdir("./input"))

# running our benchmark code in this kernel lead to memory errors, so 
# we do a slightly less memory intensive procedure if this is True, 
# set this as False if you are running on a computer with a lot of RAM
# it should be possible to use less memory in this kernel using generators
# rather than storing everything in RAM, but we won't explore that here

['rspct.tsv', 'subreddit_info.csv']


In [4]:
rspct_df = pd.read_csv('./input/rspct.tsv', sep='\t')
info_df  = pd.read_csv('./input/subreddit_info.csv')

In [5]:
reddit_train = rspct_df.sample(frac=.3)
reddit_info = info_df.sample(frac=.3)

In [6]:
reddit_info = reddit_info[reddit_info.in_data].reset_index()

In [7]:
def join_text(row):
        return row['title'] + " " + row['selftext']

reddit_train['text'] = reddit_train[['title', 'selftext']].apply(join_text, axis=1)

In [8]:
train_split_index = int(len(reddit_train) * 0.8)

train_df, test_df = reddit_train[:train_split_index], reddit_train[train_split_index:]
X_train , X_test  = train_df.text, test_df.text
y_train, y_test   = train_df.subreddit, test_df.subreddit

In [25]:
# label encode y
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test  = le.transform(y_test)

np.save('y_test', y_test)
np.save('y_train', y_train)

In [26]:
le.inverse_transform(y_train)

array(['BackYardChickens', 'ConanExiles', 'devops', ..., 'xxfitness',
       'INeedAName', 'benzodiazepines'], dtype=object)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest


NUM_FEATURES = 30000 

tf_idf_vectorizer = TfidfVectorizer(max_features = NUM_FEATURES,
                                min_df=5,
                                ngram_range=(1,2),
                                stop_words=None,
                                token_pattern='(?u)\\b\\w+\\b',
                            )
chi2_selector = SelectKBest(chi2, 30000)
nb_model = MultinomialNB(alpha=0.1)


pipe = Pipeline([
    ('tfidf', tf_idf_vectorizer),
    ('chi', chi2_selector),
    ('Bayes', nb_model)
])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)



0.5831523527476143

In [59]:
text = "Porsche 911 is the hottest car on the market."

In [63]:
example_text = X_test[:1].copy()

In [66]:
example_text[:1] = text

In [67]:
test_predict = pipe.predict(example_text)

In [68]:
le.inverse_transform(test_predict)

array(['Porsche'], dtype=object)

In [70]:
import joblib
filename = 'pipedBayed.joblib'
joblib.dump(pipe, filename, 6)

['pipedBayed.joblib']

## Basic data analysis

In [5]:
rspct_df.head(5)

Unnamed: 0,id,subreddit,title,selftext
0,6d8knd,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi..."
1,58mbft,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...
2,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,..."


In [6]:
# note that info_df has information on subreddits that are not in data, 
# we filter them out here

info_df = info_df[info_df.in_data].reset_index()
info_df.head(5)

Unnamed: 0,index,subreddit,category_1,category_2,category_3,in_data,reason_for_exclusion
0,0,whatsthatbook,advice/question,book,,True,
1,25,theydidthemath,advice/question,calculations,,True,
2,26,datarecovery,advice/question,data recovery,,True,
3,27,declutter,advice/question,declutter,,True,
4,30,productivity,advice/question,discipline,,True,


## Naive Bayes benchmark

In [7]:
# we join the title and selftext into one field

def join_text(row):
        return row['title'] + " " + row['selftext']

rspct_df['text'] = rspct_df[['title', 'selftext']].apply(join_text, axis=1)

In [8]:
# take the last 20% as a test set - N.B data is already randomly shuffled,
# and last 20% is a stratified split (equal proportions of subreddits)

train_split_index = int(len(rspct_df) * 0.8)

train_df, test_df = rspct_df[:train_split_index], rspct_df[train_split_index:]
X_train , X_test  = train_df.text, test_df.text
y_train, y_test   = train_df.subreddit, test_df.subreddit

In [53]:
from sklearn.preprocessing import LabelEncoder

# label encode y

le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test  = le.transform(y_test)

np.save('y_test', y_test)
np.save('y_train', y_train)

y_train[:5]

array([223,  81, 631, 602, 760])

In [54]:
le.inverse_transform(y_train)

array(['MPSelectMiniOwners', 'CreditCards', 'greysanatomy', ...,
       'Spanish', 'survivor', 'Snus'], dtype=object)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# extract features from text using bag-of-words (single words + bigrams)
# use tfidf weighting (helps a little for Naive Bayes in general)
# note : you can do better than this by extracting more features, then 
# doing feature selection, but not enough memory on this kernel!

print('this cell will take about 10 minutes to run')

NUM_FEATURES = 30000 

tf_idf_vectorizer = TfidfVectorizer(max_features = NUM_FEATURES,
                                min_df=5,
                                ngram_range=(1,2),
                                stop_words=None,
                                token_pattern='(?u)\\b\\w+\\b',
                            )

X_train = tf_idf_vectorizer.fit_transform(X_train)
X_test  = tf_idf_vectorizer.transform(X_test)

from sklearn.feature_selection import chi2, SelectKBest

# if we have more memory, select top 100000 features and select good features

chi2_selector = SelectKBest(chi2, 30000)

chi2_selector.fit(X_train, y_train) 

X_train = chi2_selector.transform(X_train)
X_test  = chi2_selector.transform(X_test)

X_train.shape, X_test.shape

this cell will take about 10 minutes to run


In [None]:
from sklearn.naive_bayes import MultinomialNB

# train a naive bayes model, get predictions

nb_model = MultinomialNB(alpha=0.1)
nb_model.fit(X_train, y_train)

y_pred_proba = nb_model.predict_proba(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)

In [45]:
# label encode y

le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test  = le.transform(y_test)

np.save('y_test', y_test)
np.save('y_train', y_train)

In [55]:
le.inverse_transform(y_train)

array(['MPSelectMiniOwners', 'CreditCards', 'greysanatomy', ...,
       'Spanish', 'survivor', 'Snus'], dtype=object)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest


NUM_FEATURES = 30000 

tf_idf_vectorizer = TfidfVectorizer(max_features = NUM_FEATURES,
                                min_df=5,
                                ngram_range=(1,2),
                                stop_words=None,
                                token_pattern='(?u)\\b\\w+\\b',
                            )
chi2_selector = SelectKBest(chi2, 30000)
nb_model = MultinomialNB(alpha=0.1)


pipe = Pipeline([
    ('tfidf', tf_idf_vectorizer),
    ('chi', chi2_selector),
    ('Bayes', nb_model)
])
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)



0.5831523527476143

In [59]:
text = "Porsche 911 is the hottest car on the market."

In [60]:
X_test[:1] = text

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_with(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [61]:
test_predict = pipe.predict(X_test[:1])

In [62]:
le.inverse_transform(test_predict)

array(['Porsche'], dtype=object)

In [None]:
# we use precision-at-k metrics to evaluate performance
# (https://en.wikipedia.org/wiki/Evaluation_measures_(information_retrieval)#Precision_at_K)

def precision_at_k(y_true, y_pred, k=5):
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_pred = np.argsort(y_pred, axis=1)
    y_pred = y_pred[:, ::-1][:, :k]
    arr = [y in s for y, s in zip(y_true, y_pred)]
    return np.mean(arr)

print('precision@1 =', np.mean(y_test == y_pred))
print('precision@3 =', precision_at_k(y_test, y_pred_proba, 3))
print('precision@5 =', precision_at_k(y_test, y_pred_proba, 5))


In [None]:
import pickle

filename = 'model.sav'
pickle.dump(nb_model, open(filename, 'wb'))


## Using Pre-Trained Model (Gnews)
---

In [17]:
X_train

0         Remember your command line switches... Hi ther...
1         So what was Matt "addicted" to? Did he ever sa...
2         No Club Colors Funny story. I went to college ...
3         Not door bell, but floodlight mount height. I ...
4         Worried about my 8700k small fft/data stress r...
                                ...                        
810395    Best workflow for app integration Hi all!<lb><...
810396    4K editing machine problems i upgraded my gpu ...
810397    Advice on mixing and editing I recently attend...
810398    How to properly control hue lights? When I say...
810399    First Yak? Alright so I've never owned a kayak...
Name: text, Length: 810400, dtype: object

In [18]:
X_test

810400     100+ Classrooms..how do you engage the student...
810401     [MANGA SPOILERS] Someone I read **Chapter 48: ...
810402     [21 y.o][Beard tips/first time] I really need ...
810403     Gluing the tip back on and durability of the b...
810404     Your examples of makeup that's not "standard" ...
                                 ...                        
1012995    Is this months rebirth and dungeon astro's wor...
1012996    I might need a Medical leave from grad school ...
1012997    Police harassing ethnic minorities in Hong Kon...
1012998    SU EECS 2030 and EECS 2021 - need advice Hi, I...
1012999    What is the worse wine you ever had? My worst ...
Name: text, Length: 202600, dtype: object

In [19]:
y_train

0         talesfromtechsupport
1                      teenmom
2                       Harley
3                 ringdoorbell
4                        intel
                  ...         
810395                     git
810396                premiere
810397             VoiceActing
810398              amazonecho
810399                Kayaking
Name: subreddit, Length: 810400, dtype: object

In [10]:

import csv
import re
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords

from numpy import argmax
from sklearn.preprocessing import LabelEncoder

nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

print(tf.__version__)

2.2.0


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/popkdodge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

In [12]:
X_train

890843    Blood in the coop! Need advice please. Went in...
255027    building planning questions. So, picking this ...
876882    Anyone else experiencing fatigue and anger ove...
358340    Please help in conversion of FSN to android UP...
312967    Advice on Sizing and fit (Levis 501 STF) Absol...
                                ...                        
378758    Dog has sporadic "episodes" of weakness and co...
591737    Beta not eating, lethargic, and now floating v...
799158    Heavy lifting without equipment Hey ladies, I ...
959988    I need a name for my app concept. Okay so here...
929993    My Experience Between Xanax and Clonazolam in ...
Name: text, Length: 243120, dtype: object

In [13]:
stop = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=10)

In [14]:
import tensorflow_hub as hub

In [15]:
model = "https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim/1"
hub_layer = hub.KerasLayer(model, output_shape=[20], input_shape=[], 
                           dtype=tf.string, trainable=True)
hub_layer(X_train[:3])

<tf.Tensor: shape=(3, 20), dtype=float32, numpy=
array([[ 1.4710238 , -2.3969805 ,  1.8748744 ,  1.6224763 , -3.3003945 ,
        -3.2372596 , -2.1706448 ,  1.5361576 ,  2.2106836 , -0.27289772,
        -2.1587198 ,  1.7079719 ,  0.5790621 ,  0.30224222, -3.8508694 ,
         1.4188178 ,  3.6417365 , -1.3552854 , -2.134932  , -1.3678887 ],
       [ 2.037355  , -1.8519831 ,  2.107686  ,  3.1011903 , -2.8023505 ,
        -4.159095  , -2.0328512 ,  1.809213  , -0.1158124 ,  0.8989647 ,
        -2.7574916 ,  2.621982  , -0.43547934, -0.02817489, -4.741898  ,
         2.5105217 ,  4.4980564 , -1.5572418 , -2.9888098 , -0.34141865],
       [ 4.4281325 , -3.0655746 ,  2.967873  ,  1.0363597 , -3.4089818 ,
        -3.5374804 , -0.5658244 ,  1.3100419 ,  1.6979674 , -1.1891607 ,
        -0.3206614 ,  2.191861  , -1.1746538 ,  1.0465789 , -5.000568  ,
        -0.26257285,  5.5111012 , -1.3371335 , -2.2992864 , -2.4718976 ]],
      dtype=float32)>

In [30]:
model = tf.keras.Sequential()
model.add(hub_layer)
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dense(64, activation='relu'))
model.add(tf.keras.layers.Dense(1013, activation='softmax'))

In [31]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_3 (KerasLayer)   (None, 50)                48190600  
_________________________________________________________________
dense_6 (Dense)              (None, 16)                816       
_________________________________________________________________
dense_7 (Dense)              (None, 32)                544       
_________________________________________________________________
dense_8 (Dense)              (None, 64)                2112      
_________________________________________________________________
dense_9 (Dense)              (None, 1013)              65845     
Total params: 48,259,917
Trainable params: 69,317
Non-trainable params: 48,190,600
_________________________________________________________________


In [32]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(X_train,
                    y_train,
                    epochs=1000,
                    batch_size=512,
                    validation_data=(X_test, y_test),
                    verbose=1,
                    callbacks=stop )

In [18]:
y_test

array([ 290,  323,  463, ...,  166, 1011, 1000])

### BIGGER GNEWS
---

In [34]:
from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential

In [35]:
hub_layer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1", output_shape=[50],
                           input_shape=[], dtype=tf.string)

model = Sequential()
model.add(hub_layer)
model.add(Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1013, activation='softmax'))

model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_4 (KerasLayer)   (None, 50)                48190600  
_________________________________________________________________
dense_10 (Dense)             (None, 16)                816       
_________________________________________________________________
dense_11 (Dense)             (None, 1013)              17221     
Total params: 48,208,637
Trainable params: 18,037
Non-trainable params: 48,190,600
_________________________________________________________________


In [36]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(X_train,
                    y_train,
                    epochs=1000,
                    batch_size=512,
                    validation_data=(X_test, y_test),
                    verbose=1,
                    callbacks=stop )

Wiki words 250
---

In [38]:
hub_layer = hub.KerasLayer("https://tfhub.dev/google/Wiki-words-250/2",
                           input_shape=[], dtype=tf.string)

In [43]:
model = Sequential()
model.add(hub_layer)
model.add(Dense(245, activation='relu'))
model.add(Dense(245, activation='relu'))
model.add(Dense(245, activation='relu'))
model.add(Dense(1013, activation='softmax'))

model.summary()

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer_5 (KerasLayer)   (None, 250)               252343750 
_________________________________________________________________
dense_16 (Dense)             (None, 245)               61495     
_________________________________________________________________
dense_17 (Dense)             (None, 245)               60270     
_________________________________________________________________
dense_18 (Dense)             (None, 245)               60270     
_________________________________________________________________
dense_19 (Dense)             (None, 1013)              249198    
Total params: 252,774,983
Trainable params: 431,233
Non-trainable params: 252,343,750
_________________________________________________________________


In [44]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history = model.fit(X_train,
                    y_train,
                    epochs=1000,
                    batch_size=512,
                    validation_data=(X_test, y_test),
                    verbose=1,
                    callbacks=stop )

Impplementing BERT
---

In [46]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [49]:
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import tokenization

In [48]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.91-cp37-cp37m-manylinux1_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 1.8 MB/s eta 0:00:01
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.91


In [50]:
# Thanks to https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub
def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)

In [51]:
# Thanks to https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    
    if Dropout_num == 0:
        # Without Dropout
        out = Dense(1, activation='sigmoid')(clf_output)
    else:
        # With Dropout(Dropout_num), Dropout_num > 0
        x = Dropout(Dropout_num)(clf_output)
        out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [57]:
# Load BERT from the Tensorflow Hub
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [53]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [None]:
# Thanks to https://www.kaggle.com/xhlulu/disaster-nlp-keras-bert-using-tfhub
# Encode the text into tokens, masks, and segment flags
train_input = bert_encode(X_train.values, tokenizer, max_len=160)
test_input = bert_encode(X_test.values, tokenizer, max_len=160)
train_labels = train.target.values

In [None]:
# Build BERT model with my tuning
model_BERT = build_model(bert_layer, max_len=160)
model_BERT.summary()

In [None]:
checkpoint = ModelCheckpoint('model_BERT.h5', monitor='val_loss', save_best_only=True)

train_history = model_BERT.fit(
    train_input, train_labels,
    validation_split = valid,
    epochs = epochs_num, # recomended 3-5 epochs
    callbacks=[checkpoint],
    batch_size = batch_size_num
)