In [22]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
import keras
import re

In [23]:
# Load Dataset
comments = pd.read_csv('attack_annotated_comments.tsv', sep = '\t', index_col = 0)
annotations = pd.read_csv('attack_annotations.tsv',  sep = '\t')

In [24]:
# Labelling Comment
labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

In [25]:
# Label and Comment
comments['attack'] = labels

In [26]:
# Clean Text
comments['comment'] = comments['comment'].apply(lambda x: x.replace("NEWLINE_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.replace("TAB_TOKEN", " "))
comments['comment'] = comments['comment'].apply(lambda x: x.lower())
comments['comment'] = comments['comment'].apply((lambda x: re.sub('.,[^a-zA-z0-9\s]','',x)))

In [27]:
# Training and test sets
train_comments = comments.query("split=='train'")
valid_comments = comments.query("split=='test'")

In [28]:
# Split the dataset into training and validation datasets 
train_x, valid_x = train_comments['comment'], valid_comments['comment'], 
train_y, valid_y = train_comments['attack'], valid_comments['attack']

In [29]:
# Label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

In [30]:
# Create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(comments['comment'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
                vocabulary=None)

In [31]:
# Transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [32]:
# Word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(comments['comment'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [33]:
# Ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(comments['comment'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

In [34]:
# Characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(comments['comment'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x)

In [35]:
pwd

'C:\\Users\\islam\\Desktop\\Anaconda'

In [37]:
# Load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('C:\\Users\\islam\\Desktop\\Anaconda\\wiki-news-300d-1M.vec', encoding="utf8")):
    values = line.split()
    embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')

In [38]:
# Create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(comments['comment'])
word_index = token.word_index

In [39]:
# Convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x), maxlen=70)

In [40]:
# Create token-embedding mapping
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [41]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False, epochs=None):
    
    if is_neural_net:
        classifier.fit(feature_vector_train, label ,epochs=epochs)
        predictions = classifier.predict(feature_vector_valid)
        predictions = predictions.argmax(axis=-1)
    else:
        # Fit the training dataset on the classifier
        classifier.fit(feature_vector_train, label)
        # Predict the labels on validation dataset 
        predictions = classifier.predict(feature_vector_valid)
        
    accuracy = metrics.accuracy_score(predictions, valid_y)
    f1score = metrics.f1_score(valid_y, predictions)
    return accuracy, f1score

In [42]:
# Naive Bayes on Count Vectors
accuracy, f1score = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors:   accuracy: %s      f1 score: %s"% (accuracy,f1score))

NB, Count Vectors:   accuracy: 0.9291569591854345      f1 score: 0.6573455759599333


In [43]:
# Naive Bayes on Word Level TF IDF Vectors
accuracy, f1score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF:   accuracy: %s     f1 score: %s"% (accuracy,f1score))

NB, WordLevel TF-IDF:   accuracy: 0.933859694537924     f1 score: 0.631755945231804


In [44]:
# Naive Bayes on Ngram Level TF IDF Vectors
accuracy, f1score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors:   accuracy: %s     f1 score: %s"% (accuracy,f1score))

NB, N-Gram Vectors:   accuracy: 0.9119855034946932     f1 score: 0.4785276073619632


In [45]:
# Naive Bayes on Character Level TF IDF Vectors
accuracy, f1score = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors:   accuracy: %s   f1 score: %s"% (accuracy,f1score))
print("===============================================================================")

NB, CharLevel Vectors:   accuracy: 0.9230304599188885   f1 score: 0.5806299952985425


In [46]:
# Linear Classifier on Count Vectors
accuracy, f1score = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors:   accuracy: %s   f1 score: %s"% (accuracy,f1score))



LR, Count Vectors:   accuracy: 0.9294589697126585   f1 score: 0.6104360257326662




In [47]:
# Linear Classifier on Word Level TF IDF Vectors
accuracy, f1score = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF:   accuracy: %s   f1 score: %s"% (accuracy,f1score))

LR, WordLevel TF-IDF:   accuracy: 0.9418845456898783   f1 score: 0.6987251174233953


In [48]:
# Linear Classifier on Ngram Level TF IDF Vectors
accuracy, f1score = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors:   accuracy: %s   f1 score: %s"% (accuracy,f1score))

LR, N-Gram Vectors:   accuracy: 0.9138838553801019   f1 score: 0.48263348885432866


In [49]:
# Linear Classifier on Character Level TF IDF Vectors
accuracy, f1score = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors:   accuracy: %s   f1 score: %s"% (accuracy,f1score))
print("===============================================================================")

LR, CharLevel Vectors:   accuracy: 0.940676503580982   f1 score: 0.6917731450347454


In [50]:
# SVM Classifier on Count Vectors
accuracy, f1score = train_model(svm.SVC(gamma='scale'), xtrain_count, train_y, xvalid_count)
print("SVM, Count Vectors:   accuracy: %s   f1 score: %s"% (accuracy,f1score))

SVM, Count Vectors:   accuracy: 0.9056432824229873   f1 score: 0.34891336707353376


In [51]:
# SVM Classifier on Word Level TF IDF Vectors
accuracy, f1score = train_model(svm.SVC(gamma='scale'), xtrain_tfidf, train_y, xvalid_tfidf)
print("SVM, WordLevel TF-IDF:   accuracy: %s   f1 score: %s"% (accuracy,f1score))

SVM, WordLevel TF-IDF:   accuracy: 0.9433083096039347   f1 score: 0.7064343163538874


In [52]:
# SVM on Ngram Level TF IDF Vectors
accuracy, f1score = train_model(svm.SVC(gamma='scale'), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors TF-IDF:   accuracy: %s   f1 score: %s"% (accuracy,f1score))

SVM, N-Gram Vectors TF-IDF:   accuracy: 0.9142721546293899   f1 score: 0.46999199786609763


In [54]:
# SVM Classifier on Character Level TF IDF Vectors
accuracy, f1score = train_model(svm.SVC(gamma='scale'), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("SVM, CharLevel Vectors:   accuracy: %s   f1 score: %s"% (accuracy,f1score))
print("===============================================================================")

SVM, CharLevel Vectors:   accuracy: 0.9430925877987747   f1 score: 0.7007034263671432


In [61]:
# CNN model
def cnn(train_x, train_y, valid_x, batch_size=128, epochs = 1):
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(train_x, train_y,
              batch_size=batch_size,
              epochs=epochs)
    
    predictions = model.predict(valid_x)
    predictions = predictions.argmax(axis=-1)
    
    accuracy = metrics.accuracy_score(predictions, valid_y)
    f1score = metrics.f1_score(valid_y, predictions) 
    return accuracy, f1score

In [62]:
accuracy, f1score = cnn(train_seq_x, train_y, valid_seq_x, 10)
print("CNN, Word Embeddings:   acuuracy: %s   f1 score: %s"% (accuracy,f1score))
print("===============================================================================")

Epoch 1/1
CNN, Word Embeddings:   acuuracy: 0.8810941409957719   f1 score: 0.0


  'precision', 'predicted', average, warn_for)


In [63]:
# LSTM model
def lstm(train_x, train_y, valid_x, batch_size=1024, epochs = 10):
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the LSTM Layer
    lstm_layer = layers.Bidirectional(layers.LSTM(100))(embedding_layer)
    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.fit(train_x, train_y,
              batch_size=batch_size,
              epochs=epochs)
    
    predictions = model.predict(valid_x)
    predictions = predictions.argmax(axis=-1)
    
    accuracy = metrics.accuracy_score(predictions, valid_y)
    f1score = metrics.f1_score(valid_y, predictions) 
    return accuracy, f1score

In [64]:
accuracy, f1score = lstm(train_seq_x, train_y, valid_seq_x)
print("LSTM, Word Embeddings:  accuracy: %s   f1 score: %s"% (accuracy,f1score))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
LSTM, Word Embeddings:  accuracy: 0.8810941409957719   f1 score: 0.0
