In [60]:
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Input,Embedding, Lambda,Dropout,Conv1D,Activation, Dense, Bidirectional,GlobalMaxPooling1D, LSTM, SpatialDropout1D, TimeDistributed,Masking,Layer
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import keras.backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import tensorflow as tf
tf.compat.v1.get_default_graph
from keras.layers.merge import add
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics
from sklearn.utils import shuffle
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from spacy.lang.en import English
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import numpy as np
import string
import spacy
spacy.load('en')
parser = English()

In [44]:
# Combined stop-word vocabulary: NLTK's English list merged with scikit-learn's.
STOPWORDS = set(stopwords.words('english')) | set(ENGLISH_STOP_WORDS)
# Tokens treated as pure symbols and dropped during tokenisation: every ASCII
# punctuation character plus a few multi-character / typographic extras.
# The original list repeated the closing curly quote and omitted the opening one.
SYMBOLCHARS = list(string.punctuation) + ["-", "...", "“", "”", "''"]

In [45]:
# Load the three cleaned corpora from disk (paths are relative to the notebook's
# working directory).
OffensiveLangDF = pd.read_csv('../Data/Offensive Language Dataset/Cleaned_labeled_data.csv')
spamSmsDF = pd.read_csv('../Data/SMS Spam Dataset/Cleaned_SMSSpamCollection.csv')
politicalDF = pd.read_csv('../Data/Indian Political Tweets Dataset/cleaned-tweets.csv')

# Dataset used by the rest of the notebook. This is an alias, not a copy, so any
# later in-place change to currentDF also changes politicalDF.
currentDF = politicalDF

In [46]:
def tokenizeText(textData):
    """Normalise raw text into a space-separated string of unique stemmed tokens.

    The text is stripped, newlines/carriage returns become spaces, and everything
    is lower-cased before being tokenised with the module-level ``parser``. Each
    token is lemmatised, then dropped if it is a stop word (``STOPWORDS``), a
    symbol (``SYMBOLCHARS``), shorter than 3 characters, or not purely
    alphabetic. Survivors are Porter-stemmed and de-duplicated.

    Duplicates are removed in first-occurrence order. The previous
    ``list(set(...))`` produced an arbitrary order that can change between
    interpreter runs, which made the downstream token sequences (and therefore
    the order-sensitive Conv1D model) non-reproducible.

    Args:
        textData: the raw document as a ``str``.

    Returns:
        A single string of unique stems joined by spaces (``""`` if no token
        survives filtering).
    """
    textData = textData.strip().replace("\n", " ").replace("\r", " ").lower()

    porter = PorterStemmer()
    seen = set()
    unique_stems = []

    for tok in parser(textData):
        # "-PRON-" is presumably the parser's placeholder lemma for pronouns;
        # fall back to the lower-cased surface form for those tokens.
        if tok.lemma_ != "-PRON-":
            word = tok.lemma_.lower().strip()
        else:
            word = tok.lower_

        # Same filters as before, applied per token: stop words, symbols,
        # very short words, and anything containing non-letters.
        if word.lower() in STOPWORDS or word in SYMBOLCHARS:
            continue
        if len(word) < 3 or not word.isalpha():
            continue

        stem = porter.stem(word)
        # De-duplicate on the stem while keeping first-seen order.
        if stem not in seen:
            seen.add(stem)
            unique_stems.append(stem)

    return ' '.join(unique_stems)

In [47]:
# Replace each raw document with its normalised token string (mutates currentDF
# in place), then pull features and labels out as plain Python lists.
currentDF['text'] = currentDF['text'].apply(tokenizeText)
x = list(currentDF['text'])
y = list(currentDF['category'])

In [62]:
# Fixed number of token ids per sample. Defined here because the Embedding layer
# built in get_cnn_model_v2 reads `max_len` as its input_length; previously the
# name was never assigned anywhere and the padding length was a separate literal.
max_len = 100

token = Tokenizer(num_words=1000)
token.fit_on_texts(x)
# NOTE: word_index contains every word seen while fitting, not only the
# num_words most frequent ones, so vocab_size (and the Embedding table sized
# from it) is larger than the num_words cap passed to the Tokenizer above.
vocab_size = len(token.word_index) + 1
print(vocab_size)

# CNN preprocessing: map each text to a list of word ids, then pad to max_len columns.
cnn_texts_seq = token.texts_to_sequences(x)
cnn_texts_mat = sequence.pad_sequences(cnn_texts_seq, maxlen=max_len)

10205


In [49]:
# Fit the label encoder on every label so it knows the full class set.
le = preprocessing.LabelEncoder().fit(y)

def encode(le, labels):
    """One-hot encode ``labels`` using an already fitted label encoder.

    Args:
        le: fitted encoder exposing ``transform`` and ``classes_``.
        labels: iterable of raw class labels known to ``le``.

    Returns:
        A 2-D one-hot array with one row per label and one column per class
        known to ``le``.
    """
    enc = le.transform(labels)
    # Pin the width to the encoder's class count. Left to its default,
    # to_categorical sizes the output from the largest index present, so a
    # subset that lacks the last class would come out one column short.
    return keras.utils.to_categorical(enc, num_classes=len(le.classes_))

def decode(le, one_hot):
    """Map one-hot (or probability) rows back to their original class labels.

    The column with the largest value in each row is taken as the class index
    and translated with ``le.inverse_transform``.
    """
    return le.inverse_transform(np.argmax(one_hot, axis=1))
# One-hot encode the full label list with the fitted encoder.
y_enc = encode(le, y)

In [50]:
# First split: hold out 20% of all samples as the validation set.
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    np.asarray(cnn_texts_mat),
    np.asarray(y_enc),
    test_size=0.2,
    random_state=42,
)
# Second split: hold out 20% of the remaining samples as the test set.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

In [51]:
tuple(split.shape for split in (x_train, x_val, x_test))

((3878, 100), (1212, 100), (970, 100))

## CNN Model

In [55]:
def train_model(model, x_train, y_train, x_val, y_val, filepath, epochs=10, batch_size=2):
    """Fit ``model`` and checkpoint the best weights seen during training.

    After each epoch the weights are written to ``filepath`` only when
    ``val_accuracy`` improves on the best value so far.

    Args:
        model: compiled Keras model to train.
        x_train, y_train: training inputs and targets.
        x_val, y_val: validation inputs and targets, evaluated every epoch.
        filepath: where the best weights are saved.
        epochs: number of training epochs (defaults to the previous fixed 10).
        batch_size: mini-batch size (defaults to the previous fixed 2).

    Returns:
        The object returned by ``model.fit`` (the training history), which was
        previously computed and then thrown away.
    """
    # Keep only the best-scoring weights on the validation set.
    checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1,
                                 save_best_only=True, mode='max')
    history = model.fit(
        x_train,
        y_train,
        validation_data=(x_val, y_val),
        callbacks=[checkpoint],
        epochs=epochs,
        batch_size=batch_size,
    )
    return history

In [56]:
def test_model(model,x_test,file_path):
    model.load_weights(file_path)  
    predicts = model.predict(x_test, batch_size=2)
    return predicts

In [57]:
def get_cnn_model_v2(num_classes=2):
    """Build, summarise and compile the 1-D convolutional text classifier.

    Layer stack: Embedding (50-d vectors) -> Dropout(0.2) -> Conv1D (64 filters,
    kernel size 3, ReLU) -> GlobalMaxPooling1D -> Dense(256) -> Dropout(0.2) ->
    ReLU -> Dense(num_classes) -> softmax.

    Reads the notebook-level globals ``vocab_size`` (embedding table size) and
    ``max_len`` (input sequence length).

    Args:
        num_classes: width of the softmax output layer. Defaults to 2, the
            value that used to be hard-coded; it must match the number of
            columns in the one-hot targets the model is trained on.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss, Adam
        optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(Embedding(vocab_size,
                        50,
                        input_length=max_len))
    model.add(Dropout(0.2))
    model.add(Conv1D(64,
                     3,
                     padding='valid',
                     activation='relu',
                     strides=1))
    # Collapse the time axis: keep each filter's strongest response.
    model.add(GlobalMaxPooling1D())
    model.add(Dense(256))
    model.add(Dropout(0.2))
    model.add(Activation('relu'))
    # One output unit per class instead of a fixed two.
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.summary()
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    return model

# Build the CNN, then train it; the best weights are checkpointed to disk.
model_v2 = get_cnn_model_v2()
train_model(
    model_v2,
    x_train,
    y_train,
    x_val,
    y_val,
    filepath='./cnnModel.hdf5',
)

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 100, 50)           510250    
_________________________________________________________________
dropout_10 (Dropout)         (None, 100, 50)           0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 98, 64)            9664      
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 256)               16640     
_________________________________________________________________
dropout_11 (Dropout)         (None, 256)               0         
_________________________________________________________________
activation_10 (Activation)   (None, 256)              

In [58]:
# Predict the test split from the checkpointed weights, then turn both the
# predicted and the true one-hot rows back into class labels.
predicts = test_model(model_v2, x_test, './cnnModel.hdf5')
y_preds = decode(le, predicts)
y_test_dec = decode(le, y_test)

## Accuracy

In [61]:
# Test-set evaluation: confusion matrix, per-class report, overall accuracy.
confusion = metrics.confusion_matrix(y_test_dec, y_preds)
report = metrics.classification_report(y_test_dec, y_preds)
print(confusion)
print(report)
print("Accuracy:", accuracy_score(y_test_dec, y_preds))

[[294  19]
 [ 44 613]]
              precision    recall  f1-score   support

      NOTPOL       0.87      0.94      0.90       313
         POL       0.97      0.93      0.95       657

    accuracy                           0.94       970
   macro avg       0.92      0.94      0.93       970
weighted avg       0.94      0.94      0.94       970

Accuracy: 0.9350515463917526
