### Text Classfication using CNN

In [1]:
import numpy as np
import pandas as pd
import re
import os
import datetime
import string
import  nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.chunk import tree2conlltags
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Dropout,Input, Flatten, Embedding
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Model
from sklearn.metrics import roc_auc_score, f1_score
from tensorflow.keras.callbacks import LearningRateScheduler, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau


#### MODEL TRAINING

In [3]:
df = pd.read_csv('CNN_DOC_CLASSIFICATION_stacked_data.csv')

In [2]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
!ls
!pwd

--2022-06-13 16:30:23--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2022-06-13 16:30:23--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2022-06-13 16:30:24--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2022-0

In [4]:
X = df['column_data_stacked']
y = df['label']
num_classes = len(set(list(y)))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

####  Word-Embeddings

In [5]:
tk =  Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n',)
tk.fit_on_texts(X_train)
vocab_size = len(tk.word_index) + 1
encoded_train = tk.texts_to_sequences(X_train)
encoded_test = tk.texts_to_sequences(X_test)

max_len_train = max(([len(s.split()) for s in X_train]))
max_len_test = max(([len(s.split()) for s in X_test]))

X_tr = pad_sequences(encoded_train, maxlen=max_len_train, padding='post')
X_te = pad_sequences(encoded_test, maxlen=max_len_test, padding='post')

print(X_tr.shape,  X_te.shape)

(42363, 3865) (14121, 3865)


In [6]:
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()

In [7]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tk.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

####  Char-Embeddings

In [8]:
tk_char =  Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^`{|}~\t\n', char_level=True, oov_token=True)
tk_char.fit_on_texts(X_train)
vocab_size_char= len(tk_char.word_index) + 1
encoded_train = tk_char.texts_to_sequences(X_train)
encoded_test = tk_char.texts_to_sequences(X_test)

max_len_train = max(([len(s.split()) for s in X_train]))
max_len_test = max(([len(s.split()) for s in X_test]))

X_tr_char = pad_sequences(encoded_train, maxlen=max_len_train, padding='post')
X_te_char = pad_sequences(encoded_test, maxlen=max_len_test, padding='post')

# print(len(encoded_train),  encoded_test)
print(X_tr_char.shape,  X_te_char.shape)

(42363, 3865) (14121, 3865)


In [9]:
embeddings_index_char = dict()
f = open('glove.840B.300d-char.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index_char[word] = coefs
f.close()

In [10]:
embedding_matrix_char = np.zeros((vocab_size_char, 300))
for word, i in tk_char.word_index.items():
	embedding_vector_char = embeddings_index_char.get(word)
	if embedding_vector_char is not None:
		embedding_matrix_char[i] = embedding_vector_char

#### Callbacks

In [11]:
# Implementing ModelCheckpoint class which comes with keras library to save the learned weights if validation accuracy improves from previous epoch
filepath = "model_save/best_model-{epoch:02d}-{val_accuracy:.4f}.hdfs"
checkpoint_callback = ModelCheckpoint(
    filepath = filepath,
    monitor="val_accuracy",
    verbose=1,
    save_best_only=True,
    mode="auto"
)

In [12]:
early_stopping_callback = EarlyStopping(monitor='val_accuracy', min_delta=0.35, patience=2, verbose=1)

In [13]:
tensorboard_log_dir = os.path.join("logs", "fits", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=tensorboard_log_dir,
    histogram_freq=1,
    write_graph=True
)

#### Model -1

In [14]:
embedding_layer = Embedding(vocab_size,
                            100,
                            weights=[embedding_matrix],
                            input_length=max_len_train,
                            trainable=False)

In [15]:
def create_convnet(img_path='network_image.png'):
    input_shape = Input(shape=(max_len_train))
    embedded_sequences = embedding_layer(input_shape) 
    conv11 = Conv1D(filters=3, kernel_size=3, activation='relu', padding ='same', kernel_initializer=tf.keras.initializers.he_uniform())(embedded_sequences)
    conv12 = Conv1D(filters=5, kernel_size=5, activation='relu', padding ='same', kernel_initializer=tf.keras.initializers.he_uniform())(embedded_sequences)
    conv13 = Conv1D(filters=8, kernel_size=8, activation='relu', padding ='same', kernel_initializer=tf.keras.initializers.he_uniform())(embedded_sequences)

    merged1 = tf.keras.layers.concatenate([conv11, conv12, conv13])

    pooling1 = MaxPooling1D(pool_size=3, padding='same')(merged1)

    conv21 = Conv1D(filters=3, kernel_size=3, activation='relu', padding ='same', kernel_initializer=tf.keras.initializers.he_uniform())(pooling1)
    conv22 = Conv1D(filters=6, kernel_size=6, activation='relu', padding ='same', kernel_initializer=tf.keras.initializers.he_uniform())(pooling1)
    conv23 = Conv1D(filters=8, kernel_size=8, activation='relu', padding ='same', kernel_initializer=tf.keras.initializers.he_uniform())(pooling1)

    merged2 = tf.keras.layers.concatenate([conv21, conv22, conv23])
    
    pooling2 = MaxPooling1D(pool_size=3, padding='same')(merged2)

    conv3 = Conv1D(filters=12, kernel_size=12, activation='relu', kernel_initializer=tf.keras.initializers.he_uniform())(pooling2)

    Flatten1 = Flatten()(conv3)

    dropout1 = Dropout(0.5)(Flatten1)

    out_intm = Dense(20, activation='relu')(dropout1)
    out = Dense(num_classes, activation='softmax')(out_intm)

    model = Model(input_shape, out)
    tf.keras.utils.plot_model(model, to_file=img_path)
    return model

In [16]:
# callback_list = [metric_callbacks_obj, early_stopping_callback, tensorboard_callback, checkpoint_callback]
callback_list =[early_stopping_callback, tensorboard_callback, checkpoint_callback]

model_1 = create_convnet()

print(model_1.summary())

optimizer_adam = tf.keras.optimizers.Adam(
    learning_rate=0.01,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07
)

model_1.compile(optimizer=optimizer_adam, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
model_1.fit(X_tr, y_train,  validation_data=(X_te, y_test),  epochs=10, batch_size=2000, callbacks=callback_list)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 3865)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 3865, 100)    228400      ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 3865, 3)      903         ['embedding[0][0]']              
                                                                                                  
 conv1d_1 (Conv1D)              (None, 3865, 5)      2505        ['embedding[0][0]']              
                                                                                              

  return dispatch_target(*args, **kwargs)


Epoch 1: val_accuracy improved from -inf to 0.65661, saving model to model_save/best_model-01-0.6566.hdfs
INFO:tensorflow:Assets written to: model_save/best_model-01-0.6566.hdfs/assets
Epoch 2/10
Epoch 2: val_accuracy improved from 0.65661 to 0.78231, saving model to model_save/best_model-02-0.7823.hdfs
INFO:tensorflow:Assets written to: model_save/best_model-02-0.7823.hdfs/assets
Epoch 3/10
Epoch 3: val_accuracy improved from 0.78231 to 0.80929, saving model to model_save/best_model-03-0.8093.hdfs
INFO:tensorflow:Assets written to: model_save/best_model-03-0.8093.hdfs/assets
Epoch 3: early stopping


<keras.callbacks.History at 0x7f6d4fc14910>

#### MODEL-2

In [17]:
embedding_layer_model2 = Embedding(vocab_size_char,
                            300,
                            weights=[embedding_matrix_char],
                            input_length=max_len_train,
                            trainable=False)

In [18]:
def create_convnet2(img_path='network_image_model2.png'):
    input_shape = Input(shape=(max_len_train))
    embedded_sequences = embedding_layer_model2(input_shape) 
    conv11 = Conv1D(filters=3, kernel_size=3, activation='relu', padding ='same', kernel_initializer=tf.keras.initializers.he_uniform())(embedded_sequences)
    conv12 = Conv1D(filters=5, kernel_size=5, activation='relu', padding ='same', kernel_initializer=tf.keras.initializers.he_uniform())(conv11)

    pooling1 = MaxPooling1D(pool_size=3, padding='same')(conv12)

    conv21 = Conv1D(filters=6, kernel_size=6, activation='relu', padding ='same', kernel_initializer=tf.keras.initializers.he_uniform())(pooling1)
    conv22 = Conv1D(filters=8, kernel_size=8, activation='relu', padding ='same', kernel_initializer=tf.keras.initializers.he_uniform())(conv21)
    
    pooling2 = MaxPooling1D(pool_size=3, padding='same')(conv22)

    Flatten1 = Flatten()(pooling2)

    dropout1 = Dropout(0.5)(Flatten1)

    out_intm = Dense(20, activation='relu')(dropout1)
    out = Dense(num_classes, activation='softmax')(out_intm)

    model = Model(input_shape, out)
    tf.keras.utils.plot_model(model, to_file=img_path)
    return model

In [19]:
callback_list = [early_stopping_callback, tensorboard_callback, checkpoint_callback]

model_2 = create_convnet2()
print(model_2.summary())

optimizer_adam = tf.keras.optimizers.Adam(
    learning_rate=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07
)

model_2.compile(optimizer=optimizer_adam, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
model_2.fit(X_tr_char, y_train, validation_data=(X_te_char, y_test),  epochs=5, batch_size=1000, callbacks=callback_list)

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 3865)]            0         
                                                                 
 embedding_1 (Embedding)     (None, 3865, 300)         9900      
                                                                 
 conv1d_7 (Conv1D)           (None, 3865, 3)           2703      
                                                                 
 conv1d_8 (Conv1D)           (None, 3865, 5)           80        
                                                                 
 max_pooling1d_2 (MaxPooling  (None, 1289, 5)          0         
 1D)                                                             
                                                                 
 conv1d_9 (Conv1D)           (None, 1289, 6)           186       
                                                           

  return dispatch_target(*args, **kwargs)


Epoch 1: val_accuracy did not improve from 0.80929
Epoch 2/5
Epoch 2: val_accuracy did not improve from 0.80929
Epoch 3/5
Epoch 3: val_accuracy did not improve from 0.80929
Epoch 3: early stopping


<keras.callbacks.History at 0x7f6d43a5fb90>


#### REFERENCE
https://towardsdatascience.com/how-i-preprocessed-text-data-using-regular-expressions-for-my-text-classification-task-cnn-cb206e7274ed

https://stackoverflow.com/questions/43151775/how-to-have-parallel-convolutional-layers-in-keras/

http://ai.intelligentonlinetools.com/ml/document-classification-using-convolutional-neural-network/

https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html

https://stackoverflow.com/questions/71357014/running-a-fine-tune-model-for-my-cnn-value-error