In [0]:
import pandas as pd
import numpy as np
import re
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM
from sklearn.preprocessing import LabelEncoder
from keras import metrics

Using TensorFlow backend.


In [0]:
df = pd.read_csv('top200subs.csv')
df

Unnamed: 0.1,Unnamed: 0,title,subreddit,selftext,fulltext
0,0,Beautiful Home :),Home,,Beautiful Home :)
1,1,Pretty proud of this clean up project I found ...,Home,,Pretty proud of this clean up project I found ...
2,2,Twins,Home,,Twins
3,3,This was finished yesterday..,Home,,This was finished yesterday..
4,4,My roommate is kicking me out because having a...,Home,"So, I am not asking for advice, really...mores...",My roommate is kicking me out because having a...
...,...,...,...,...,...
9995,9995,"Crysis Remastered leaked for Xbox One, PS4, PC...",GamingLeaksAndRumours,"""Crysis Remastered brings new graphic features...","Crysis Remastered leaked for Xbox One, PS4, PC..."
9996,9996,TLOU 2 delayed indefinitely,GamingLeaksAndRumours,https://twitter.com/jasonschreier/status/12457...,TLOU 2 delayed indefinitelyhttps://twitter.com...
9997,9997,Test Drive Unlimited 3 (Project Sunrise) - TDU...,GamingLeaksAndRumours,(Sorry if my english isn't that good)\n\nI'm a...,Test Drive Unlimited 3 (Project Sunrise) - TDU...
9998,9998,RE8 is called RESIDENT EVIL VILLAGE,GamingLeaksAndRumours,https://twitter.com/Nibellion/status/124746090...,RE8 is called RESIDENT EVIL VILLAGEhttps://twi...


In [0]:
def clean(X):

    # Prevent SettingWithCopyWarning
    X = X.copy()
    
    # remove '\\n'
    X['fulltext'] = X['fulltext'].map(lambda x: re.sub('\\n',' ',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub('\\n',' ',str(x)))
    
    # remove '' 
    X['fulltext'] = X['fulltext'].map(lambda x: re.sub('<lb>',' ',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub('<lb>',' ',str(x)))
    
    # remove any text starting with User... 
    X['fulltext'] = X['fulltext'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
    # remove IP addresses or user IDs
    X['fulltext'] =X['fulltext'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
    #remove http links in the text
    X['fulltext'] = X['fulltext'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))
    X['subreddit'] = X['subreddit'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))
    
    return X

In [0]:
df = clean(df)
X_train = df['fulltext']
y_train = df['subreddit']

##LSTM Model

In [0]:
# keras tokenizer to assign a dictionary for sequences
t = Tokenizer()
t.fit_on_texts(X_train)


In [0]:
# For some reason y_train has to be numeric in this model as well
labeler = LabelEncoder()
y_train_enc = labeler.fit_transform(y_train)

In [0]:
# make sequences
X_sequences = t.texts_to_sequences(X_train)

# Set the params of the LSTM model
# Do not change this line. You need the +1 for some reason. 
max_features = len(t.word_index.values()) + 1

# 111 was the average so we will limit it to 100
maxlen = 100
batch_size = 128

# Pad the sequence to make uniform entries
X_seq_pad = sequence.pad_sequences(X_sequences, maxlen=maxlen)

In [0]:
# Find average number of words in each post
counter = 0
for seq in X_sequences:
  counter += len(seq)

counter/10000

111.0333

In [0]:
# Build the model
model = Sequential()
# Need this to flatten it to the apt shape
model.add(Embedding(max_features, 128))
# 128 specified by papers/industry. Dropout and recurrent_dropout set our forget params
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# multi class prediction of target y_train
model.add(Dense(len(y_train), activation='softmax'))


model.compile(optimizer='nadam', loss='sparse_categorical_crossentropy',
              metrics=['top_k_categorical_accuracy'])


In [0]:
model.fit(X_seq_pad, y_train_enc, batch_size=batch_size, epochs=15,
          validation_split=0.2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.callbacks.History at 0x7fc45c2865f8>

##BERT

 (Bidirectional Encoder Representations from Transformers) provides dense vector representations for natural language by using a deep, pre-trained neural network with the Transformer architecture. Used in google for search. Understands the context. Order and stop words accounted for situationally.

In [0]:
!pip install bert-serving-server  # server
!pip install bert-serving-client  # client, independent of `bert-serving-server`



In [0]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip && unzip uncased_L-12_H-768_A-12.zip

--2020-05-01 03:48:10--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.11.176, 2607:f8b0:4007:804::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.11.176|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip.1’


2020-05-01 03:48:12 (163 MB/s) - ‘uncased_L-12_H-768_A-12.zip.1’ saved [407727028/407727028]

Archive:  uncased_L-12_H-768_A-12.zip
replace uncased_L-12_H-768_A-12/bert_model.ckpt.meta? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
replace uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  Y A
Y

  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-1

In [0]:
!bert-serving-start -model_dir uncased_L-12_H-768_A-12/ -num_worker=1 -max_seq_len 50

usage: /usr/local/bin/bert-serving-start -model_dir uncased_L-12_H-768_A-12/ -num_worker=1 -max_seq_len 50
                 ARG   VALUE
__________________________________________________
           ckpt_name = bert_model.ckpt
         config_name = bert_config.json
                cors = *
                 cpu = False
          device_map = []
       do_lower_case = True
  fixed_embed_length = False
                fp16 = False
 gpu_memory_fraction = 0.5
       graph_tmp_dir = None
    http_max_connect = 10
           http_port = None
        mask_cls_sep = False
      max_batch_size = 256
         max_seq_len = 50
           model_dir = uncased_L-12_H-768_A-12/
no_position_embeddings = False
    no_special_token = False
          num_worker = 1
       pooling_layer = [-2]
    pooling_strategy = REDUCE_MEAN
                port = 5555
            port_out = 5556
       prefetch_size = 10
 priority_batch_size = 16
show_tokens_to_client = False
     tuned_model_dir = None
             ve

In [0]:
# Needs specific tensorflow version
!pip install tensorflow==1.10



In [0]:

from bert_serving.client import BertClient

# make a connection with the BERT server using it's ip address; do not give any ip if same computer
bc = BertClient()
# get the embedding
embedding = bc.encode(["I love data science and analytics vidhya."])
# check the shape of embedding, it should be 1x768
print(embedding.shape)

KeyboardInterrupt: ignored