# Individual experimentation belonging to:

#### Mohamed Hassan 
#### URN: 6 

## Imports

In [None]:
# Imports
import pickle
import keras
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Dropout, Conv1D, GlobalMaxPooling1D, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Constants
# NOTE(review): EPOCHS is assigned again (to 10) in the constants cell below,
# and build_model() defines its own local EPOCHS / INIT_LR with these values.
EPOCHS = 30
INIT_LR = 1e-3

### Constants

In [None]:
# CSV files loaded with pd.read_csv in the data-preparation cell
TRAIN_DATASET = "train.csv"
TEST_DATA = "test.csv"
TEST_LABELS = "test_labels.csv"
# Columns dropped from both the train and test frames after loading
REDUNDANT_FIELDS = ["id"]
DATA_FIELD = ["comment_text"]
# The six label columns
LABEL_FIELDS = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]

# NOTE(review): these three are not referenced by the visible cells; the
# tokenizer cells hard-code num_words = 20000 and max_len = 80, and
# build_model() uses an embedding size of 128.
NUM_WORDS = 20000
EMBEDDING_DIM = 100
MAX_WORD = 200

# NOTE(review): model.fit() is called with batch_size=60, epochs=30 rather
# than these values, and EPOCHS overrides the earlier EPOCHS = 30 - confirm
# which settings are intended.
BATCH_SIZE = 32
EPOCHS = 10

# NOTE(review): not referenced by the visible cells - presumably settings
# for repeated k-fold cross-validation; confirm.
N_SPLITS = 10
N_REPEATS = 3
RANDOM_STATE = None

# Pre-Processing

ddd

## Preparing the data

fff

In [None]:
# Load the training set and discard the redundant id column straight away
train = pd.read_csv(TRAIN_DATASET).drop(columns=REDUNDANT_FIELDS)

# The test comments and their labels are stored in separate files
test_data = pd.read_csv(TEST_DATA)
test_labels = pd.read_csv(TEST_LABELS)

# Join them side by side, then discard the id column(s)
test = pd.concat([test_data, test_labels], axis=1).drop(columns=REDUNDANT_FIELDS)

# A label of -1 is a placeholder for samples that were never assigned
# labels; remove every row that has -1 in any label column.
unlabelled = test[LABEL_FIELDS].eq(-1).any(axis=1)
test = test.drop(index=test.loc[unlabelled].index)

#### Here is how the training dataset looks

In [None]:
# Bare expression: shown as the cell output when run in a notebook
train

#### Here is how the test dataset looks

In [None]:
# Bare expression: shown as the cell output when run in a notebook
test

### Class breakdown visualisation

In [None]:
# Count the samples per class for each of the six label columns.
# NOTE(review): `x` is not defined in the visible cells - presumably the data
# frame whose columns 1-6 are the label columns; confirm.
dataset = {}

for index in range(1, 7):
    column = x.iloc[:, index]
    # Two bins over a column holding both 0 and 1 give [count of 0s, count of 1s].
    # (np.dataset does not exist in NumPy; np.histogram is the binning function.)
    dataset[column.name] = np.histogram(column, bins=2)[0]

In [None]:
# Bare expression: shown as the cell output when run in a notebook
dataset

#### Visualising data spread

In [None]:
# Visualisation of label spread
# NOTE(review): `df` (expected to have 'labels' and 'Is' columns) is not
# defined in the visible cells, and matplotlib.pyplot is only imported as
# `plt` in the evaluation cell much further down - confirm both.
df.plot(x ='labels', y='Is', kind = 'bar')

plt.title('Number of messages associated with labels')
plt.ylabel('Number of messages')
plt.xlabel('Labels')

plt.show()

#### Average word length

In [None]:
# Finds average word length for each label
def calculate_average_word_length(doc):
    """Return the mean length of the distinct items in ``doc``.

    Args:
        doc: Iterable of hashable, sized items (e.g. word strings).
            Duplicates are counted once.

    Returns:
        The average ``len()`` over the distinct items, or ``0.0`` when
        ``doc`` is empty.
    """
    distinct_lengths = [len(item) for item in set(doc)]
    if not distinct_lengths:
        # Nothing to average; avoid a ZeroDivisionError on empty input.
        return 0.0
    return sum(distinct_lengths) / len(distinct_lengths)

In [None]:
# Average length per label.
# NOTE(review): calculate_average_word_length() averages len() over the
# distinct elements it is given. The 'comment_text' columns passed here
# presumably hold whole comments, in which case these are average *comment*
# lengths in characters rather than word lengths - confirm. The
# *_comments frames and `df` are not defined in the visible cells.
toxic_avg = calculate_average_word_length(toxic_comments['comment_text'])
severe_toxic_avg = calculate_average_word_length(severe_toxic_comments['comment_text'])
threat_avg = calculate_average_word_length(threat_comments['comment_text'])
identity_hate_avg= calculate_average_word_length(identity_hate_comments['comment_text'])
obscene_avg= calculate_average_word_length(obscene_comments['comment_text'])
insult_avg = calculate_average_word_length(insult_comments['comment_text'])


# Order of the values follows the label order toxic, severe_toxic, obscene,
# threat, insult, identity_hate
df['average word-length'] = [toxic_avg, severe_toxic_avg, obscene_avg, threat_avg, insult_avg, identity_hate_avg]

In [None]:
# Visualise average word length per label
df.plot(x='labels', y='average word-length', kind='bar')

plt.title('Average word length')
# The y axis shows the 'average word-length' column, not a message count
plt.ylabel('Average word length')
plt.xlabel('Labels')

plt.show()

### Class balancing

In [None]:
# Down-sample the over-represented label combinations. Every mask is built
# from the frame left behind by the previous step.

# Rows labelled toxic and nothing else: drop the first 6000
toxic_only = (
    train["toxic"].eq(1)
    & train["severe_toxic"].ne(1)
    & train["threat"].ne(1)
    & train["obscene"].ne(1)
    & train["insult"].ne(1)
    & train["identity_hate"].ne(1)
)
train = train.drop(index=train.loc[toxic_only].index[:6000])

# Rows labelled exactly toxic + obscene: drop the first 3000
toxic_obscene_only = (
    train["toxic"].eq(1)
    & train["severe_toxic"].ne(1)
    & train["threat"].ne(1)
    & train["obscene"].eq(1)
    & train["insult"].ne(1)
    & train["identity_hate"].ne(1)
)
train = train.drop(index=train.loc[toxic_obscene_only].index[:3000])

# Rows labelled exactly toxic + insult: drop the first 3000
toxic_insult_only = (
    train["toxic"].eq(1)
    & train["severe_toxic"].ne(1)
    & train["threat"].ne(1)
    & train["obscene"].ne(1)
    & train["insult"].eq(1)
    & train["identity_hate"].ne(1)
)
train = train.drop(index=train.loc[toxic_insult_only].index[:3000])

# Rows with no label set at all: drop the first 100000
no_label = (
    train["toxic"].ne(1)
    & train["severe_toxic"].ne(1)
    & train["threat"].ne(1)
    & train["obscene"].ne(1)
    & train["insult"].ne(1)
    & train["identity_hate"].ne(1)
)
train = train.drop(index=train.loc[no_label].index[:100000])


### Class spread visualisation after balancing

### Cleaning dataset

In [None]:
# Remove everything that is not an ASCII letter or whitespace (punctuation,
# digits, symbols). Raw string: "\s" in a normal literal is an invalid
# escape sequence.
regex_str = r"[^a-zA-Z\s]"
train['comment_text'] = train['comment_text'].replace(regex=regex_str, value="")

# Collapse runs of whitespace into a single space
regex_space = r"\s+"
train['comment_text'] = train['comment_text'].replace(regex=regex_space, value=" ")

# Strip leading/trailing whitespace
train['comment_text'] = train['comment_text'].str.strip()

# Lowercase
train['comment_text'] = train['comment_text'].str.lower()

# Convert comment_text column into a list
comment_list = train['comment_text'].tolist()

# Spot-check one cleaned comment
print(comment_list[898])

### Stopword removal

To remove the stopwords a list of word ... <br>
The stopword list used here is the baseline; the upcoming stopword experiments compare it against different stopword lists such as ... 

In [None]:
# Remove stopwords, using stopword list from nltk
# NOTE(review): `stopwords` is only imported in a later cell and
# `tokenised_comment` is not defined in the visible cells - confirm.
stopword_list = set(stopwords.words('english'))
# Keep only the tokens that are not in the stopword set
removed_stopwords = [word for word in tokenised_comment if word not in stopword_list]

### Visualisation of top stopwords 

In [None]:
from nltk.corpus import stopwords

def plot_top_stopwords_barchart(text):
    """Plot a bar chart of the ten most frequent English stopwords in ``text``.

    Args:
        text: pandas Series of strings; each entry is split on whitespace.

    Nothing is drawn when ``text`` contains no stopword at all.
    """
    from collections import Counter

    stop = set(stopwords.words('english'))

    # Count every whitespace-separated word that is a stopword
    stopword_counts = Counter(
        word
        for words in text.str.split().values.tolist()
        for word in words
        if word in stop
    )

    top = sorted(stopword_counts.items(), key=lambda item: item[1], reverse=True)[:10]
    if not top:
        # zip(*[]) cannot be unpacked into two names; nothing to plot anyway
        return

    x, y = zip(*top)
    plt.bar(x, y)

In [None]:
# NOTE(review): `x` is not defined in the visible cells - presumably the
# data frame holding the comments; confirm.
plot_top_stopwords_barchart(x['comment_text'])

### Visualisation of top most frequent words after stopword removal

In [None]:
from collections import  Counter

def plot_top_non_stopwords_barchart(text):
    """Plot the non-stopwords found among the 40 most frequent words.

    Args:
        text: pandas Series of strings; each entry is split on whitespace.

    The 40 most common words are taken first and the English stopwords are
    filtered out of that selection afterwards.
    """
    english_stopwords = set(stopwords.words('english'))

    token_lists = text.str.split().values.tolist()
    all_words = [word for tokens in token_lists for word in tokens]

    most_frequent = Counter(all_words).most_common()[:40]
    kept = [(word, count) for word, count in most_frequent
            if word not in english_stopwords]

    words = [word for word, _ in kept]
    counts = [count for _, count in kept]

    sns.barplot(x=counts, y=words)

In [None]:
# NOTE(review): `x` is not defined in the visible cells - presumably the
# data frame holding the comments; confirm.
plot_top_non_stopwords_barchart(x['comment_text'])

### N-grams

In [None]:
# Gensim N-grams
# Create bigram model
# NOTE(review): `Phrases` and `Phraser` are not imported in the visible cells
# (presumably gensim.models.phrases), and `comment_token` is only assigned in
# the tokenise cell further down - confirm the intended cell order.
bigram = Phrases(comment_token, min_count=5, threshold=100)
bigram_model = Phraser(bigram)

### Visualisation of top N-grams 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns

def plot_top_ngrams_barchart(text, n=2):
    """Plot a bar chart of the ten most frequent word n-grams in ``text``.

    Args:
        text: Iterable of document strings (e.g. a pandas Series of comments);
            it is passed to CountVectorizer as-is.
        n: Size of the n-grams to count. Defaults to 2 (bigrams).
    """
    def _get_top_ngram(corpus, n):
        """Return the ten (ngram, count) pairs with the highest counts."""
        vec = CountVectorizer(ngram_range=(n, n)).fit(corpus)
        bag_of_words = vec.transform(corpus)
        # Column sums = total occurrences of every n-gram over all documents
        sum_words = bag_of_words.sum(axis=0)
        words_freq = [(word, sum_words[0, idx])
                      for word, idx in vec.vocabulary_.items()]
        words_freq.sort(key=lambda pair: pair[1], reverse=True)
        return words_freq[:10]

    top_ngrams = _get_top_ngram(text, n)
    ngrams, counts = map(list, zip(*top_ngrams))
    sns.barplot(x=counts, y=ngrams)

In [None]:
# NOTE(review): `x` is not defined in the visible cells - presumably the
# data frame holding the comments; confirm.
plot_top_ngrams_barchart(x['comment_text'], 2)

### Tokenise dataset 

This is done ...

In [None]:
# Tokenize function
def tokenize(text):
    """Apply word_tokenize to every entry of ``text`` and return the results as a list."""
    tokenised = []
    for document in text:
        tokenised.append(word_tokenize(document))
    return tokenised

In [None]:
# Convert comment_text column into a list
# NOTE(review): `train_dataset` is not defined in the visible cells; the
# cleaning cell builds the same list from `train` - confirm which is meant.
comment_list = train_dataset['comment_text'].tolist()

# Tokenize
comment_token = tokenize(comment_list)

### Lemmatisation 

Here we will be applying ...
One of the upcoming experiments will be implementing different stemmers to determine which has the best ...

In [None]:
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet POS constant for ``word``, falling back to noun."""
    # Tag the single word and keep only the first letter of its tag
    penn_tag = nltk.pos_tag([word])[0][1]
    initial = penn_tag[0].upper()

    # Translate the nltk tag initial into the WordNet notation
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns
    return wordnet.NOUN

In [None]:
# The POS tagger used by get_wordnet_pos() needs this nltk resource
nltk.download('averaged_perceptron_tagger')

# Lemmatise every token of every comment, using its POS tag as a hint;
# the result keeps one list of lemmas per comment.
comment_lemma = [
    [lemmatizer.lemmatize(word, pos=get_wordnet_pos(word)) for word in comment]
    for comment in comment_token_stop
]


### Saving preprocessed dataset

In [None]:
# Save lemmatised tokens
# The context manager closes (and flushes) the file even if pickling fails;
# a bare open() inside the call left the handle open.
with open("comment_lemma.pickle", "wb") as pickle_file:
    pickle.dump(comment_lemma, pickle_file)

### Wordcloud visualisation 

ddd

### Heatmap visualisation 

ffff

# Model 

The model ...

### Building model

ff

In [None]:
def build_model(num_words):
    """Build and compile the 1-D convolutional multi-label classifier.

    Args:
        num_words: Vocabulary size, used as the Embedding layer's input
            dimension.

    Returns:
        A compiled Sequential model with six sigmoid outputs (one per label),
        trained with binary cross-entropy and Adam.
    """
    epochs = 30
    init_lr = 1e-3

    model = Sequential()

    model.add(Embedding(num_words, 128))
    model.add(Dropout(0.4))
    model.add(Conv1D(128, 7, padding="valid", activation="relu", strides=3))
    model.add(Conv1D(128, 7, padding="valid", activation="relu", strides=3))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(128, activation="relu"))
    model.add(Dropout(0.5))
    # Sigmoid, not softmax: the six labels are independent and a sample can
    # carry several at once. Softmax forces the outputs to sum to 1, so at
    # most one label could ever exceed 0.5, which contradicts the
    # binary_crossentropy loss used below.
    model.add(Dense(6, activation='sigmoid'))

    # Equivalent of the legacy `decay` optimizer argument:
    # lr = init_lr / (1 + decay * step). `lr=` and `decay=` are deprecated
    # keyword arguments of the Keras optimizers.
    lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
        initial_learning_rate=init_lr,
        decay_steps=1,
        decay_rate=init_lr / epochs,
    )
    adam = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

    model.compile(loss='binary_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])

    return model

### ddd

In [None]:
# Vocabulary size kept by the tokenizer and the fixed sequence length
num_words = 20000
max_len = 80

# Fit the word index on the training texts
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(x_train)

# word -> index mapping and its inverse (index -> word)
corpus = tokenizer.word_index
reverse_corpus = {index: word for word, index in corpus.items()}

In [None]:
# Encode the training texts as integer sequences, then pad/truncate each
# one to max_len and hold the result as a numpy array
x_sequences_train = tokenizer.texts_to_sequences(x_train)
x_padded_train = np.array(
    keras.preprocessing.sequence.pad_sequences(x_sequences_train, maxlen=max_len)
)

In [None]:
# Labels: the pickled balanced data frame without its text column
y = pd.read_pickle('balanced_dataset.pickle').drop(columns="comment_text")

### Model compilation

dd

In [None]:
# Instantiate the network before training: `model` was never created in
# this section, so fit() had nothing to run on.
model = build_model(num_words)

model.fit(x_padded_train, y, batch_size=60, epochs=30)

### Saving model

In [None]:
# Persist the trained model under the name 'base_model'
model.save('base_model')

In [None]:
# Reload the model that was just saved and print its layer summary
new_model = tf.keras.models.load_model('base_model')
new_model.summary()

# Experiment 1 - Stemmers 

dd

### Experiment 1.1 - Snowball stemmer

ffff

### Compiling experiments

fggg

### Evaluation

ffff

# Experiment 2 - Stopword

ff

### Experiment 2.1 - Snowball stemmer

ffff

### Compiling experiments

fggg

### Evaluation

ffff

# Experiment 3 - Spelling correction 

dd

### Experiment 3.1 - Snowball stemmer

ffff

### Compiling experiments

fggg

### Evaluation

ffff

# Experiment 4 - ??? 

dd

### Experiment 4.1 - Snowball stemmer

ffff

### Compiling experiments

fggg

### Evaluation

ffff

In [None]:
# Gensim N-grams
# Create bigram model
# NOTE(review): `Phrases` and `Phraser` are not imported in the visible cells
# (presumably gensim.models.phrases) - confirm.
bigram = Phrases(comment_token, min_count=5, threshold=100)
bigram_model = Phraser(bigram)

In [None]:
# Remove stopwords from every comment. Membership has to be tested word by
# word: comparing whole comment strings against STOP_WORDS would only ever
# discard a comment that consists of a single stopword.
# NOTE(review): STOP_WORDS is not defined in the visible cells - confirm its
# source.
comment_stop = [
    " ".join(word for word in comment.split() if word not in STOP_WORDS)
    for comment in comment_list
]

# Tokenize the stopword-free comments
comment_token_stop = tokenize(comment_stop)

# Create Gensim n-grams: apply the bigram model to each token list
comment_bigrams = [bigram_model[tokens] for tokens in comment_token_stop]

In [None]:
# Stemmers implementation

# Model

In [None]:
# Vocabulary size kept by the tokenizer and the fixed sequence length
num_words = 20000
max_len = 80

# Fit the word index on the training texts
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(x_train)

# word -> index mapping and its inverse (index -> word)
corpus = tokenizer.word_index
reverse_corpus = {index: word for word, index in corpus.items()}

In [None]:
# Encode the training texts as integer sequences, then pad/truncate each
# one to max_len and hold the result as a numpy array
x_sequences_train = tokenizer.texts_to_sequences(x_train)
x_padded_train = np.array(
    keras.preprocessing.sequence.pad_sequences(x_sequences_train, maxlen=max_len)
)

In [None]:
# Labels: the pickled balanced data frame without its text column
y = pd.read_pickle('balanced_dataset.pickle').drop(columns="comment_text")

In [None]:
# Build a fresh model and train it on the padded sequences.
# NOTE(review): `utils` is only imported in the evaluation cell further
# down, while a build_model() is also defined directly in this notebook -
# confirm which one is meant.
model = utils.build_model(num_words)

model.fit(x_padded_train, y, batch_size=60, epochs=30)

In [None]:
# Persist the trained model under the name 'base_model'
model.save('base_model')

# Model evaluation

In [None]:
import utils
import pandas as pd
import keras
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Test comments and their separately stored labels
TEST_DATA = "test.csv"
TEST_LABELS = "test_labels.csv"
# NOTE(review): DATA_FIELD and LABEL_FIELDS are not referenced by the
# evaluation cells below (the label columns are listed inline there); unlike
# the earlier definitions, both lists include "id" here.
DATA_FIELD = ["id","comment_text"]
LABEL_FIELDS = ["id","toxic","severe_toxic","obscene","threat","insult","identity_hate"]
# Columns dropped from the combined test frame
REDUNDANT_FIELDS = ["id"]

In [None]:
# Load the trained model saved by the training cells. Building a new model
# here would evaluate randomly initialised, untrained weights.
model = keras.models.load_model('base_model')
model.summary()

# Read the test comments and their labels
test_data = pd.read_csv(TEST_DATA)
test_labels = pd.read_csv(TEST_LABELS)

# Combine test data and labels into one data frame
test_dataset = pd.concat([test_data, test_labels], axis=1)

# Remove redundant id field 
test_dataset = test_dataset.drop(columns=REDUNDANT_FIELDS)

# Remove samples with labels containing -1 in test dataset, this 
# is a placeholder for samples that were not assigned labels.
test_dataset = test_dataset.drop(test_dataset[(test_dataset.toxic == -1) |
                                              (test_dataset.severe_toxic == -1) |
                                              (test_dataset.obscene == -1) |
                                              (test_dataset.threat == -1) |
                                              (test_dataset.insult == -1) |
                                              (test_dataset.identity_hate == -1)].index)

x_test = test_dataset["comment_text"]
y_true = test_dataset[["toxic","severe_toxic","obscene","threat","insult","identity_hate"]]


# NOTE(review): this tokenizer is fitted on the *test* texts, so its word
# indices will generally not match the ones the model was trained with
# (the training tokenizer is fitted on x_train). The tokenizer fitted during
# training should presumably be persisted and reused here - confirm.
tokenizer = Tokenizer(20000)
tokenizer.fit_on_texts(x_test)
corpus = tokenizer.word_index
reverse_corpus = dict(map(reversed, corpus.items()))

# NOTE(review): sequences are padded to 150 here but to max_len = 80 in the
# training cells - confirm the intended length.
x_sequences_test = tokenizer.texts_to_sequences(x_test)
x_padded_test = keras.preprocessing.sequence.pad_sequences(x_sequences_test, maxlen= 150)
x_padded_test = np.array(x_padded_test)

print("Shape of test data:", x_padded_test.shape)
print("Shape of test labels:", y_true.shape)

In [None]:
from sklearn.metrics import roc_curve, auc, f1_score, accuracy_score
import matplotlib.pyplot as plt

# Round the model outputs to the nearest integer to get hard 0/1 labels
y_pred = model.predict(x_padded_test).round().astype(int)

true_labels = y_true.values

# Spot-check one sample, then show the full prediction and label arrays
print(y_pred[7])
print(true_labels[7])
print(y_pred)
print(true_labels)

# Macro-averaged F1 over the labels, plus accuracy on the full label rows
f1 = f1_score(true_labels, y_pred, average='macro')
accuracy = accuracy_score(y_true, y_pred)

print("F1: ", f1)
print("Accuracy: ", accuracy)


In [None]:
# Compute ROC curve and ROC area for each class.
# ROC analysis needs the continuous model scores: with predictions already
# rounded to 0/1 every curve collapses to a single operating point, so the
# raw outputs are obtained again here instead of reusing y_pred.
y_scores = model.predict(x_padded_test)

fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(6):
    fpr[i], tpr[i], _ = roc_curve(y_true.values[:, i], y_scores[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])


# Plot one ROC curve per class, with the chance diagonal for reference
for i in range(6):
    plt.figure()
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f)' % roc_auc[i])
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
