In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so later cells can read files
# stored under this directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Process for text classification with a sequential LSTM model
1. Clean the entire corpus: remove stopwords, HTML, and anything else specific to the dataset.
2. Split the cleaned data into train / test (and validation) sets.
3. Tokenize X: fit the transformers on the training set only, then transform the test and validation sets.
4. Encode the class labels y as integers for training, then also convert them to a binary (one-hot) MATRIX.
5. Fit and evaluate the model.
6. Use the fitted transformers in a pipeline so that predict can consume raw input.



In [None]:
from psutil import virtual_memory
# Report the runtime's memory size and tell the user whether this counts as a
# high-RAM runtime. The figure is psutil's `total` field, in decimal gigabytes.
HIGH_RAM_THRESHOLD_GB = 20

ram_gb = virtual_memory().total / 1e9
print(f'Your runtime has {ram_gb:.1f} gigabytes of available RAM\n')

if ram_gb >= HIGH_RAM_THRESHOLD_GB:
    print('You are using a high-RAM runtime!')
else:
    # Same three lines as before, emitted through one print call.
    print(
        'To enable a high-RAM runtime, select the Runtime > "Change runtime type"',
        'menu, and then select High-RAM in the Runtime shape dropdown. Then, ',
        're-execute this cell.',
        sep='\n',
    )

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


In [None]:

import re
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords

from numpy import argmax
from sklearn.preprocessing import LabelEncoder

# Make sure the NLTK stop-word corpus is present locally, then keep the
# English stop words as a set for fast membership tests.
nltk.download('stopwords')
STOPWORDS = {*stopwords.words('english')}

# Record the TensorFlow version this notebook was run with.
print(tf.__version__)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
2.3.0


In [None]:
from sklearn.preprocessing import FunctionTransformer


# Compiled once at definition time instead of on every call. Raw strings avoid
# the invalid-escape warnings the previous non-raw literals ('\[', '\]', '\|')
# trigger on recent Python versions; the patterns themselves are unchanged.
_REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
_BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')


def _clean_document(text, stop_words):
    """Normalise a single raw document and return the cleaned string.

    Non-string entries (for example NaN produced by a missing field) are
    treated as empty documents instead of crashing inside BeautifulSoup.
    """
    if not isinstance(text, str):
        return ''
    text = BeautifulSoup(text, "lxml").text        # strip html tags
    text = text.lower()                            # lowercase text
    text = _REPLACE_BY_SPACE_RE.sub(' ', text)     # listed symbols become spaces
    text = _BAD_SYMBOLS_RE.sub('', text)           # drop every other disallowed character
    return ' '.join(word for word in text.split() if word not in stop_words)


def clean_text(X):
    """Clean a collection of raw text documents.

    Each document is stripped of HTML markup, lower-cased, has the characters
    ``/(){}[]|@,;`` replaced by spaces, has every character outside
    ``0-9 a-z space # + _`` removed, and finally has English stop words
    dropped.

    X: series (any iterable of documents is accepted, e.g. list or array)

    return: np.array of dtype object holding one cleaned string per input
            document, in input order
    """
    # Loaded per call so the function does not depend on another cell's globals.
    stop_words = set(stopwords.words('english'))
    # One pass over the data instead of five separate Series.map traversals.
    cleaned = [_clean_document(text, stop_words) for text in X]
    # dtype=object matches what np.array() returned for a Series of strings.
    return np.array(cleaned, dtype=object)


# scikit-learn wrapper so the cleaning step can be used as a pipeline stage.
text_transformer = FunctionTransformer(clean_text)


In [None]:
# Load the tab-separated dataset from the mounted Drive folder.
DATA_PATH = '/content/drive/My Drive/reduced30k.tsv'
df = pd.read_csv(DATA_PATH, sep='\t')



In [None]:
# Keep a random half of the rows. The fixed seed makes the subsample (and the
# positional train/test split made from it later) identical on every run.
# NOTE: `df` is overwritten, so re-running this cell without re-running the
# load cell halves the data again.
df = df.sample(frac=.5, random_state=42)
df.shape

(151950, 5)

In [None]:

from numpy import argmax
from sklearn.preprocessing import LabelEncoder


# Build the modelling frame with two columns:
#   'Class Name' - the subreddit (target label)
#   'Text'       - title and selftext joined by a single space
# na_rep='' makes a missing title/selftext count as an empty string; without
# it str.cat returns NaN for the whole row, which later breaks tokenization.
newdf = (
    df.assign(Text=df['title'].str.cat(df['selftext'], sep=' ', na_rep=''))
      .loc[:, ['subreddit', 'Text']]
      .rename(columns={'subreddit': 'Class Name'})
)



In [None]:
# The class labels must be numeric: learn the label -> integer mapping from
# every class name in the data set, then encode the whole column.
class_names = newdf['Class Name']
label_encoder = LabelEncoder().fit(class_names)
integer_encoded = label_encoder.transform(class_names)

# Show the encoded labels and the number of distinct classes.
print(integer_encoded, len(label_encoder.classes_))
       


[871 580 911 ... 379 561 473] 1013


In [None]:

# Positional split: the first 80% of rows form the training set and the
# remainder the test set.
training_portion = .8
train_size = int(len(newdf) * training_portion)

train = newdf.iloc[:train_size]
test = newdf.iloc[train_size:]
print(train.shape, test.shape)

labels = newdf['Class Name'].tolist()   # use to train label encoder
test.head()

(121560, 2) (30390, 2)


Unnamed: 0,Class Name,Text
76179,callofcthulhu,Representing Cosmicism in an RPG I am an RPG g...
39852,neopets,"Daily Dare Tip! Just saw this on Jellyneo ""Bef..."
103825,juul,Just tried a V3 for the first time It is FAR b...
141753,lego,Where to pick up Lego Batman Movie Polybags? D...
81537,PHPhelp,Problems with a table in Cake PHP I am using C...


In [None]:
# --- Vectorisation settings -------------------------------------------------
vocab_size = 10000      # tokenizer vocabulary cap == width of the TF-IDF matrix
oov_tok = "<OOV>"       # placeholder token for out-of-vocabulary words

# Not referenced by any visible cell; kept so cells outside this view that
# may rely on them still run.
embedding_dim = 16
max_length = 200
trunc_type = 'post'

labels = newdf['Class Name'].tolist()   # every class label, in row order

train_texts = train['Text'].tolist()
test_texts = test['Text'].tolist()

# Fit the vocabulary on the training texts only, then apply the same fitted
# tokenizer to both splits.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_texts)

word_index = tokenizer.word_index
X_train = tokenizer.texts_to_matrix(train_texts, mode='tfidf')
X_test = tokenizer.texts_to_matrix(test_texts, mode='tfidf')

train_y = train['Class Name'].tolist()
test_y = test['Class Name'].tolist()

# Transform labels into one-hot matrices. num_classes is passed explicitly:
# otherwise to_categorical sizes each matrix from the largest label present in
# that split, so train and test could end up with different widths.
num_classes = len(label_encoder.classes_)
y_train = tf.keras.utils.to_categorical(
    label_encoder.transform(train_y), num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(
    label_encoder.transform(test_y), num_classes=num_classes)


In [None]:
X_train.shape, y_train.shape

((121560, 10000), (121560, 1013))

In [None]:

# Feed-forward classifier over the TF-IDF features:
# Dense(350) -> Dense(5000, tanh) -> Dropout(0.5) -> softmax over the classes.
NUM_EPOCHS = 50
batch_size = 32
num_classes = y_train.shape[1]   # one output unit per one-hot label column

model = tf.keras.Sequential()
# No activation is passed here, so this first layer is a linear projection.
model.add(tf.keras.layers.Dense(350, input_shape=(vocab_size,)))
model.add(tf.keras.layers.Dense(5000, activation='tanh',
                                kernel_initializer='random_uniform'))
model.add(tf.keras.layers.Dropout(.5))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adamax', metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and roll back to the best
# epoch's weights, so evaluation does not use the later, worse weights.
stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3,
                                        restore_best_weights=True)
history = model.fit(X_train, y_train, batch_size=batch_size,
                    epochs=NUM_EPOCHS,
                    verbose=1,
                    validation_split=0.1,   # fraction of the training data held out for validation
                    callbacks=[stop]        # Keras documents this argument as a list
                    )

# Printed after fit: before training this list is still empty.
print(model.metrics_names)

score = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 350)               3500350   
_________________________________________________________________
dense_4 (Dense)              (None, 5000)              1755000   
_________________________________________________________________
dropout (Dropout)            (None, 5000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 1013)              5066013   
Total params: 10,321,363
Trainable params: 10,321,363
Non-trainable params: 0
_________________________________________________________________
[]
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Test loss: 2.085965871810913
Test accuracy: 0.6253702044487


In [None]:
# Persist the trained model (architecture, weights and optimizer state) to a
# single HDF5 file. The dropped arguments only restated the defaults.
MODEL_PATH = 'cnn_model1.h5'
model.save(MODEL_PATH)