In [18]:
import argparse
import gensim.downloader as api
import numpy as np
import os
import shutil
import tensorflow as tf
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd
import re
import matplotlib.pyplot as plt

In [19]:
data_frame = pd.read_csv('C:/Users/Asad/Downloads/clean_data.csv')

In [20]:
def returnSpecialCharacters(texts,y):
    seq = []
    for i, text in enumerate(texts):
        try:
            final = [re.sub(r"[^a-zA-Z]+", ' ', k) for k in text.split(" ")]
            seq.append(' '.join(final))
        except:
            del y[i]
#             seq.append('  ')
            pass
    return seq, y

In [21]:
texts = data_frame['text']
# texts = data_frame['text'].values
y = data_frame['is_offensive']
texts,y = returnSpecialCharacters(texts,y)
y = data_frame['is_offensive']

In [22]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(texts)
text_sequences = tokenizer.texts_to_sequences(texts)

In [28]:
text_sequences = tf.keras.preprocessing.sequence.pad_sequences(text_sequences)
num_records = len(text_sequences)
max_seqlen = len(text_sequences[0])

print('{:d} sentences, max length: {:d}'.format(num_records, max_seqlen))

184350 sentences, max length: 1403


In [29]:
NUM_CLASSES = 2
labels = tf.keras.utils.to_categorical(y, num_classes = NUM_CLASSES)

In [30]:
word2idx = tokenizer.word_index
idx2word = {v:k for k,v in word2idx.items()}
word2idx["PAD"] = 0
idx2word[0] = "PAD"
vocab_size = len(word2idx)
print('vocab size {:d}'.format(vocab_size))

vocab size 174765


In [32]:
dataset = tf.data.Dataset.from_tensor_slices((text_sequences, labels))
dataset = dataset.shuffle(10000)
test_size = num_records//4
val_size = (num_records - test_size)//10
test_dataset = dataset.take(test_size)
val_dataset = dataset.skip(test_size).take(val_size)
train_dataset = dataset.skip(test_size + val_size)

In [33]:
BATCH_SIZE = 128
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder = True)
train_dataset = train_dataset.batch(BATCH_SIZE, drop_remainder = True)
val_dataset = val_dataset.batch(BATCH_SIZE, drop_remainder = True)

In [36]:
EMBEDDING_MODEL = api.load('glove-wiki-gigaword-300')



In [38]:
def build_embedding_matrix(EMBEDDING_MODEL, word2idx, EMBEDDING_DIM):
    E = np.zeros((vocab_size, EMBEDDING_DIM))
    for word, idx in word2idx.items():
        try:
            E[idx] = EMBEDDING_MODEL.word_vec(word)
        except KeyError:
            pass
    return E

In [39]:
EMBEDDING_DIM = 300
E = build_embedding_matrix(EMBEDDING_MODEL, word2idx, EMBEDDING_DIM)

In [45]:
E.shape

(174765, 300)

In [53]:
class SpamClassifierModel(tf.keras.Model):
    def __init__(self, vocab_sz, embed_sz, input_length, num_filters, kernel_sz, output_sz, embedding_weights, **kwargs):
        super(SpamClassifierModel, self).__init__(**kwargs)
        self.embedding = tf.keras.layers.Embedding(vocab_sz,embed_sz, input_length = input_length, weights = [embedding_weights],
                                                  trainable = False)
        self.conv = tf.keras.layers.Conv1D(filters = num_filters, kernel_size = kernel_sz, activation = 'relu')
        self.dropout = tf.keras.layers.SpatialDropout1D(0.2)
        self.pool = tf.keras.layers.GlobalAveragePooling1D()
        self.dense = tf.keras.layers.Dense(output_sz,  activation = 'softmax')
        
    def call(self, x):
        x = self.embedding(x)
        x = self.conv(x)
        x = self.dropout(x)
        x = self.pool(x)
        x = self.dense(x)
        return x

In [54]:
filters = 256
kernel_sz = 3
model = SpamClassifierModel(vocab_size, EMBEDDING_DIM, max_seqlen, filters, kernel_sz, NUM_CLASSES, E)
model.build(input_shape = (None, max_seqlen))

In [55]:
model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [58]:
data_frame['is_offensive'].value_counts()

0    147505
1     36845
Name: is_offensive, dtype: int64

In [61]:
NUM_EPOCHS = 3
CLASS_WEIGHTS = {0:1, 1:4}

model.fit(train_dataset, epochs = NUM_EPOCHS, validation_data = val_dataset, class_weight = CLASS_WEIGHTS)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1d789e877c0>

In [63]:
model.save_weights('model.h5', save_format= 'h5')

In [66]:
labels, predictions = [], []
for Xtest, Ytest in test_dataset:
    y_pred = model.predict_on_batch(Xtest)
    y_pred = np.argmax(y_pred, axis = 1)
    y_test = np.argmax(Ytest, axis = 1)
    labels.extend(y_test.tolist())
    predictions.extend(y_pred.tolist())

print("test accuracy: {:.3f}".format(accuracy_score(labels, predictions)))
print("confusion matrix")
print(confusion_matrix(labels, predictions))

test accuracy: 0.884
confusion matrix
[[32323  4569]
 [  783  8405]]
