CNN with dropout 


Process for text classification with a sequential LSTM model:
1. Clean the entire corpus: remove stopwords, HTML, and anything else specific to the dataset.
2. Split the cleaned data into train / test (validation) sets.
3. Tokenize X: fit the transformers on the training set only, then use them to transform the test and validation sets.
4. Encode/tokenize y (the class labels) for training. *Also transform the labels to a binary (one-hot) MATRIX.*
5. Fit and evaluate the model.
6. Use the transformers in a pipeline so that predict can consume 'raw input'.



In [3]:
from psutil import virtual_memory

# Total memory reported for this runtime, expressed in decimal gigabytes (1e9 bytes).
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this cell uses to tell a high-RAM runtime from a standard one.
if ram_gb >= 20:
  print('You are using a high-RAM runtime!')
else:
  print('To enable a high-RAM runtime, select the Runtime > "Change runtime type"')
  print('menu, and then select High-RAM in the Runtime shape dropdown. Then, ')
  print('re-execute this cell.')

Your runtime has 27.4 gigabytes of available RAM

You are using a high-RAM runtime!


In [4]:

import csv
import re
import tensorflow as tf
import pandas as pd
import numpy as np
import nltk
from bs4 import BeautifulSoup

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords

from numpy import argmax
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK stopword corpus so stopwords.words() can be used below.
nltk.download('stopwords')
# Set of English stopwords (a set gives fast membership tests).
STOPWORDS = {*stopwords.words('english')}

# Show the TensorFlow version this notebook is running against.
print(tf.__version__)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
2.3.0


In [5]:
from sklearn.preprocessing import FunctionTransformer


def clean_text(X):
    """Normalise raw post text ahead of tokenisation.

    Each document goes through these steps, in this order:
      1. HTML markup is stripped with BeautifulSoup (lxml parser).
      2. The text is lower-cased.
      3. The characters ``/ ( ) { } [ ] | @ , ;`` are replaced by a space.
      4. Every character other than ``0-9``, ``a-z``, space, ``#``, ``+``
         and ``_`` is deleted.
      5. English NLTK stopwords are dropped; the remaining words are joined
         with single spaces.

    X: pandas Series of str. Any other iterable of str (list, 1-D array) is
       wrapped in a Series first, so the function can also be fed raw input
       when used as a pipeline step.

    return: np.array holding one cleaned string per input document.
    """
    # Raw strings: the regex escapes (\[ \] \|) are not valid Python string
    # escapes, so a plain literal triggers an invalid-escape warning.
    replace_by_space_re = re.compile(r'[/(){}\[\]\|@,;]')
    bad_symbols_re = re.compile(r'[^0-9a-z #+_]')
    # Deliberately a local name so the notebook-level STOPWORDS is not shadowed.
    stop_words = set(stopwords.words('english'))

    def _clean_one(text):
        # Clean one document. Order matters: lower-casing must happen before
        # the a-z filter, and stopwords are matched on the symbol-free text.
        text = BeautifulSoup(text, "lxml").text
        text = text.lower()
        text = replace_by_space_re.sub(' ', text)
        text = bad_symbols_re.sub('', text)
        return ' '.join(word for word in text.split() if word not in stop_words)

    if not isinstance(X, pd.Series):
        X = pd.Series(X)
    # Single pass over the data instead of five separate Series.map calls.
    return np.array(X.map(_clean_one))

# Wrap clean_text in a scikit-learn FunctionTransformer so it can be used as a
# transformer / pipeline step.
# NOTE(review): none of the cells visible here apply this transformer; the
# tokenizer further down is fitted on the uncleaned 'Text' column. Confirm
# whether the cleaning step was meant to run before tokenisation.
text_transformer = FunctionTransformer(clean_text)


In [6]:
# Location of the tab-separated dataset (a Drive folder under /content;
# presumably the drive has to be mounted before this cell runs).
DATA_PATH = '/content/drive/My Drive/reduced30k.tsv'
df = pd.read_csv(DATA_PATH, sep='\t')


In [7]:

from numpy import argmax
from sklearn.preprocessing import LabelEncoder


# Build the modelling frame: 'Text' is the post title and body joined by a
# space, 'Class Name' is the subreddit the post belongs to.
# na_rep='' keeps posts whose title or selftext is missing; without it
# str.cat returns NaN for the whole row, leaving a non-string value in the
# text column. `df` itself is not modified.
newdf = (
    df.assign(Text=df['title'].str.cat(df['selftext'], sep=' ', na_rep=''))
      .loc[:, ['subreddit', 'Text']]
      .rename(columns={'subreddit': 'Class Name'})
)



In [8]:
# The class labels are strings; map each distinct class name to an integer id.
label_encoder = LabelEncoder()
label_encoder.fit(newdf['Class Name'])
integer_encoded = label_encoder.transform(newdf['Class Name'])

# Show the encoded labels together with the number of distinct classes.
print(integer_encoded, len(label_encoder.classes_))
       


[164 107 657 ... 802 339 421] 1013


In [9]:

training_portion = .8
train_size = int(len(newdf) * training_portion)

# Positional split in file order (no shuffling): the first `train_size` rows
# form the training set, the remaining rows form the test set.
train = newdf.iloc[:train_size]
test = newdf.iloc[train_size:]
print(train.shape, test.shape)

labels = newdf['Class Name'].tolist()   # every label in the corpus, intended for fitting the label encoder
test.head()

(243120, 2) (60780, 2)


Unnamed: 0,Class Name,Text
243120,learnmachinelearning,Starting an image recognition network? Hey all...
243121,GenderCritical,Women = infants I’m sure someone has covered t...
243122,PlasticSurgery,I am getting a rhinoplasty to fix breathing af...
243123,DebateAltRight,"""Open borders, but only for white countries"" S..."
243124,pihole,Web Interface (Lighttpd) stops working Some mo...


In [10]:
# --- Hyper-parameters for text vectorisation --------------------------------
vocab_size = 10000      # keep only the most frequent words (Tokenizer num_words)
embedding_dim = 16      # NOTE(review): defined here, but the model cell hard-codes its own embedding width
max_length = 200        # every sequence is padded / truncated to this many tokens
trunc_type = 'post'     # when a text is too long, drop tokens from its end
oov_tok = "<OOV>"       # placeholder token for words outside the vocabulary

labels = newdf['Class Name'].tolist()   # every label in the corpus (train + test)

train_texts = train['Text'].tolist()
test_texts = test['Text'].tolist()

# --- X: fit the tokenizer on the training texts only, then apply it to both splits
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_texts)

word_index = tokenizer.word_index

X_train_sequences = tokenizer.texts_to_sequences(train_texts)   # list of integer sequences
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, truncating=trunc_type)

# The test split must be truncated the same way as the training split;
# pad_sequences defaults to truncating='pre', which would keep the *end* of
# long test texts while the model was trained on the *start* of long texts.
test_sequences = tokenizer.texts_to_sequences(test_texts)
test_padded = pad_sequences(test_sequences, maxlen=max_length, truncating=trunc_type)

# --- y: integer-encode the class names, then one-hot encode them -------------
# The encoder is fitted on all labels so that classes appearing only in the
# test split still receive an id.
le = LabelEncoder()
le.fit(newdf['Class Name'])

# Passing num_classes pins the one-hot width to the full number of classes.
# Without it, to_categorical sizes each matrix from the largest label present
# in that split, so train_y and test_y could end up with different widths.
train_y = tf.keras.utils.to_categorical(
    le.transform(train['Class Name'].tolist()), num_classes=len(le.classes_)
)   # https://www.tensorflow.org/api_docs/python/tf/keras/utils/to_categorical
test_y = tf.keras.utils.to_categorical(
    le.transform(test['Class Name'].tolist()), num_classes=len(le.classes_)
)

In [11]:

# Quick sanity checks. Only the value of the last expression is displayed as
# the cell output; the two expressions before it are evaluated but not shown.
len(train_y) , len(X_train_sequences)  # label rows vs. tokenised training sequences
len(word_index)  # size of the tokenizer's word index
len(train_texts[0].split())  # whitespace-separated word count of the first training text

267

In [13]:
%%time 

# 1-D convolutional text classifier:
# embedding -> conv -> global average pool -> dense -> dropout -> softmax.
# NOTE(review): the embedding width is hard-coded to 13 (this matches the
# recorded summary below) while the preprocessing cell defines
# embedding_dim = 16; confirm which value is intended.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 13),
    tf.keras.layers.Conv1D(128, 5, activation='relu'),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # One output unit per class: take the width from the one-hot label matrix
    # instead of hard-coding the class count.
    tf.keras.layers.Dense(train_y.shape[1], activation='softmax'),
])

model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
NUM_EPOCHS = 100

# The original cell passed an undefined name as `callbacks` and had an
# unbalanced ')', so it could not run. With an upper bound of 100 epochs, stop
# once the validation loss has not improved for 3 epochs and keep the best
# weights seen.
# NOTE(review): the validation data is the test split, so early stopping
# selects the model on test data; use a separate validation split if the test
# score must stay unbiased.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True
)

history = model.fit(X_train_padded, train_y,
                    epochs=NUM_EPOCHS,
                    validation_data=(test_padded, test_y),
                    callbacks=[early_stopping])


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 13)          130000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, None, 128)         8448      
_________________________________________________________________
global_average_pooling1d_1 ( (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8256      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1013)              65845     
Total params: 212,549
Trainable params: 212,549
Non-trainable params: 0
__________________________________________________

In [None]:
# Persist the trained model, optimizer state included, to 'cnn_model1.h5',
# replacing any existing file of that name.
model.save('cnn_model1.h5', overwrite=True, include_optimizer=True)