Importing Libraries

In [1]:
import time
import re, string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
from wordcloud import WordCloud
import os
from google.colab import drive
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
# Single shared lemmatizer instance; clean_text() calls lemmatizer.lemmatize().
lemmatizer = WordNetLemmatizer()
# Fetch the NLTK resources used by this notebook:
#   stopwords                   -> stopwords.words('english') in clean_text()
#   punkt                       -> tokenizer models (word_tokenize is imported,
#                                  though not called in the cells shown here)
#   wordnet                     -> corpus behind WordNetLemmatizer / wordnet POS constants
#   averaged_perceptron_tagger  -> nltk.pos_tag() in get_wordnet_pos()
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

Mounting to Google Drive Folder

In [0]:
# Mount Google Drive at /content/drive (Colab-only: relies on google.colab.drive).
drive.mount('/content/drive')
# Make the course folder the working directory so later relative paths
# (e.g. "train.csv") resolve inside it.
os.chdir('/content/drive/My Drive/DBA3803')
# Last expression of the cell: displays the working directory as a sanity check.
os.getcwd()

Reading training data and displaying a few rows of the training data

In [0]:
# Load the labelled comments from the current working directory.
train = pd.read_csv("train.csv")
# Row count of the raw training set.
print(train.shape[0])
# Preview the first five rows (head() defaults to 5).
train.head()

Function to clean the data and to detect the word's Part of Speech tag in order to accurately lemmatise the word based on its Part of Speech Tag

In [0]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag is mapped to a WordNet constant:
    J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other tag falls back to noun.

    Args:
        word: the token to tag.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB`` or
        ``wordnet.ADV``.
    """
    # pos_tag returns [(word, tag)]; keep only the tag of the single token.
    penn_tag = nltk.pos_tag([word])[0][1]
    initial = penn_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

# Ordered (pattern, replacement) pairs applied to every lower-cased comment.
# Order matters: the specific contractions ("won't", "can't") must be expanded
# before the generic "n't" rule, and the character filter runs before the
# slang / abbreviation normalisation.
_TEXT_SUBSTITUTIONS = (
    # --- English contractions ---
    (r"\'m", " am"),
    (r"\'s", " is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"won\'t", "will not"),
    (r"can\'t", "cannot"),
    (r"n\'t", " not"),
    (r"\'bout", "about"),
    (r"\'til", "until"),
    # --- replace every character outside the allowed set with a space ---
    (r"""[^a-zA-Z0-9-!$%^&@*()+|=`{}\[\]:;"'<>?,.\/\s]""", " "),
    # --- domain-specific abbreviations and spelling variants ---
    (r"\bft\b|\bforeigntalent\b", "foreign talent"),
    (r"\bgovt\b|\bgahment\b|\bgahmen\b|\bgov\b", "government"),
    (r"\bpapies\b|\bpapees\b", "pappies"),
    (r"\bchi bai\b|\bchibai\b|\bchee bye\b|\bcheebye\b", "cb"),
    (r"\bfk\b|\bfck\b", "fuck"),
    (r"\bkanina\b|\bkanena\b", "knn"),
    (r"\blky\b", "lee kuan yew"),
    (r"\blhl\b|\bpm lee\b", "lee hsien loong"),
)


def clean_text(sentences):
    """Normalise, stop-word-filter and lemmatise a collection of comments.

    Each item is converted with ``str()``, lower-cased, run through the
    ordered substitutions in ``_TEXT_SUBSTITUTIONS``, split on whitespace,
    stripped of English stop words and lemmatised using the part of speech
    reported by ``get_wordnet_pos``.

    The output always has exactly one string per input item, in the same
    order. A comment that is empty after normalisation yields ``""`` instead
    of being skipped; skipping would shorten the list and shift every later
    comment against its labels when the result is assigned back to a
    DataFrame column.

    Args:
        sentences: iterable of raw comments (anything ``str()`` accepts).

    Returns:
        list[str]: cleaned comments, tokens joined by single spaces.

    Side effects:
        Prints a start banner and the first three cleaned comments.
    """
    print('Preprocessing Begins...')
    # Build the stop-word lookup once: it does not change between comments,
    # and set membership is O(1) instead of scanning a list for every token.
    stop_words = set(stopwords.words('english'))

    result = []
    for text in sentences:
        text = str(text).lower()
        for pattern, replacement in _TEXT_SUBSTITUTIONS:
            text = re.sub(pattern, replacement, text)

        kept = [
            lemmatizer.lemmatize(word, get_wordnet_pos(word))
            for word in text.split()
            if word not in stop_words
        ]
        # An empty/whitespace-only comment produces no tokens and therefore
        # "", which keeps the output positionally aligned with the input.
        result.append(" ".join(kept))

    print(result[:3], '\n')
    return result



Cleaning the data with the earlier function

In [0]:
# Run the cleaning pipeline over every raw comment; returns a list of strings.
clean_train = clean_text(train["Comments"])
# Wrap the list in a Series; it receives a fresh 0..n-1 integer index.
clean_join = pd.Series(clean_train)
# NOTE(review): column assignment aligns on index labels, not on position.
# This is only row-correct if clean_text returned exactly one string per row
# of `train`, in the same order, and `train` still has its default integer
# index - verify before trusting the labels attached to each comment.
train["Comments"] = clean_join
# Drop every row that has a missing value in any column (including rows whose
# comment could not be filled by the assignment above).
train = train.dropna()
# Preview the first five cleaned comments.
train["Comments"][:5]

Creating a list for all the labels

In [0]:
# Names of the target columns in `train`; one binary classifier is fitted per entry.
label_cols = ['Insulting', 'Anti Government', 'Xenophobic', 'Racist', 'Sexual']

Using the TfidfVectorizer function to create a Term Frequency-Inverse Document Frequency Document Term Matrix. 

Creating Bigrams and Unigrams.

A feature must appear in at least 3 comments to be included in the Matrix.

Displaying some of the features in the Matrix

In [0]:
# Unigram + bigram TF-IDF features; a term must occur in at least 3 comments.
vec = TfidfVectorizer(ngram_range=(1, 2),
                      min_df=3, strip_accents='unicode')
trn_term_doc = vec.fit_transform(train["Comments"])

# Resolve the vocabulary once. `get_feature_names()` was removed in
# scikit-learn 1.2 in favour of `get_feature_names_out()`; fall back to the
# old name so the cell also runs on older installations.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out().tolist()
else:
    feature_names = vec.get_feature_names()

# Show a slice of the vocabulary and its total size.
print(feature_names[5000:5050])
print(len(feature_names))

Sample of the Term Frequency-Inverse Document Frequency Document-Term Matrix

In [0]:
# `x` is the global feature matrix read by pr(), get_mdl() and test_mdl().
x = trn_term_doc

# Column labels for the preview frame (API name depends on the scikit-learn version).
if hasattr(vec, "get_feature_names_out"):
    _term_names = vec.get_feature_names_out().tolist()
else:
    _term_names = vec.get_feature_names()

# pd.SparseDataFrame was removed in pandas 1.0; the sparse accessor
# constructor (pandas >= 0.25) is its replacement. Keep the legacy class only
# as a fallback for very old installations.
# Note: entries absent from the sparse matrix may display as 0.0 here where
# the legacy class showed NaN.
if hasattr(pd.DataFrame, "sparse"):
    df = pd.DataFrame.sparse.from_spmatrix(x, columns=_term_names)
else:
    df = pd.SparseDataFrame(x, columns=_term_names)

# Peek at a few rows for three example terms (label-based slice, so rows 0..5).
df.loc[:5, ["government", "foreign", "minister"]]

Creating a likelihood function: for a given class value, it sums each feature's TF-IDF weights over the comments that have that class value, adds 1 for smoothing, and divides by the number of such comments plus 1

In [0]:
def pr(y_i, y, x_mat=None):
    """Smoothed per-feature weight for the rows whose label equals ``y_i``.

    For every column, sums the values of the rows where ``y == y_i``, adds 1,
    and divides by the number of such rows plus 1.

    Args:
        y_i: the label value to select (the notebook uses 1 and 0).
        y: array of labels, one per row of the feature matrix.
        x_mat: optional feature matrix to use. When omitted, the notebook's
            global ``x`` is used, exactly as before; passing it explicitly
            removes the dependency on hidden notebook state.

    Returns:
        The smoothed column-wise ratio; its container type follows the
        matrix type (whatever ``sum(0)`` returns for it).
    """
    features = x if x_mat is None else x_mat
    in_class = (y == y_i)
    p = features[in_class].sum(0)
    # +1 in numerator and denominator: smoothing, so terms never seen in one
    # class do not produce a zero (and later a log(0) or division by zero).
    return (p + 1) / (in_class.sum() + 1)

Creating a function that takes the log of the ratio of the two likelihoods (class present vs. class absent) to form the Bayesian (log-count) ratio, and then multiplies the TF-IDF Document-Term Matrix by that ratio

Function creates a Logistic Regression model for each of the 5 labels with each label having their own y variable (either 1 or 0) and x variable (ratio adjusted TF-IDF Document Term Matrix)

In [0]:
# Accumulators filled by get_mdl(): one mean CV score and one CV standard
# deviation per label. Re-running this cell resets them.
scores = []
sd = []

def get_mdl(y, label=None):
    """Cross-validate and fit one ratio-scaled logistic regression for a label.

    Computes the log ratio ``r = log(pr(1, y) / pr(0, y))``, scales the global
    feature matrix ``x`` by it, scores a liblinear logistic regression with
    5-fold cross-validated ROC AUC, records the mean and standard deviation in
    the module-level ``scores`` / ``sd`` lists, prints them, and returns the
    model fitted on all rows.

    Args:
        y: pandas Series of 0/1 targets, one per row of ``x``.
        label: name shown in the printed summary. Defaults to ``y.name``
            (the column name when called as ``get_mdl(train[col])``), so the
            function no longer depends on a global loop variable.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    if label is None:
        label = getattr(y, "name", None)
    y = y.values
    r = np.log(pr(1, y) / pr(0, y))
    # NOTE(review): test_mdl() fits with C=4, dual=True, whereas the model
    # cross-validated here uses the default C - confirm which is intended.
    m = LogisticRegression(solver = "liblinear", max_iter = 1000)
    x_nb = x.multiply(r)
    # NOTE(review): `r` is estimated from all rows before the folds are split,
    # so each validation fold has influenced its own features; the reported
    # AUC is likely somewhat optimistic.
    # Run the cross-validation once and derive both statistics from the same
    # fold scores (previously it was executed twice).
    fold_scores = cross_val_score(
        m, x_nb, y, cv=5, scoring='roc_auc')
    cv_score = np.mean(fold_scores)
    cv_sd = np.std(fold_scores)
    scores.append(cv_score)
    sd.append(cv_sd)
    print('CV score for class {} is {}, standard deviation is {}'.format(label, cv_score, cv_sd))
    # Fit once on the full data (the earlier duplicate fit was discarded).
    return m.fit(x_nb, y)
  

Running earlier functions to create Bayesian ratio and to fit the ratio multiplied TF-IDF Document Term matrix and the classes (either 1 or 0) to 5 different logistic regression models representing each of the 5 labels.

Using 5-fold Cross Validation scoring by the Receiver Operating Characteristic, Area Under Curve measure we can gather the mean of the score and the mean of the standard deviation for each label.



In [0]:
# Start from empty accumulators so that re-running this cell does not stack
# the new results on top of those from a previous run.
scores.clear()
sd.clear()

# NOTE: the loop variable is deliberately still called `j`; check that
# get_mdl() does not read it as a global before renaming it.
for j in label_cols:
    print('fit', j)
    get_mdl(train[j])

print('Mean CV score is {}'.format(np.mean(scores)))

# Spread of the per-label mean scores (how much the labels differ from each other).
print('The Standard Deviation of the CV score is {}'.format(np.std(scores)))

# Average fold-to-fold spread within a label; `sd` was collected but never reported.
print('Mean within-label CV standard deviation is {}'.format(np.mean(sd)))

Creating similar function as the one earlier, except without CV scores and standard deviations

In [0]:
def test_mdl(y):
    """Fit the ratio-scaled logistic regression for one label, without CV.

    Args:
        y: pandas Series of 0/1 targets, one per row of the global ``x``.

    Returns:
        tuple: ``(fitted LogisticRegression, log ratio)``. The ratio is
        returned so that new feature rows can be scaled the same way before
        calling ``predict_proba``.
    """
    targets = y.values
    log_ratio = np.log(pr(1, targets) / pr(0, targets))
    # NOTE(review): C=4 and dual=True differ from the settings that get_mdl()
    # cross-validates - confirm the reported CV scores describe this model.
    classifier = LogisticRegression(C=4, dual=True, solver = "liblinear", max_iter = 1000)
    scaled_features = x.multiply(log_ratio)
    # fit() returns the estimator itself, so this is the same object as `classifier`.
    fitted = classifier.fit(scaled_features, targets)
    return fitted, log_ratio

Running a test comment on the model and getting the run time for the test comment's results to be generated

In [0]:
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
test_str = "Dumb idiot"
# Apply the same cleaning and the already-fitted vocabulary as for training.
test_str = clean_text([test_str])
test_com = pd.Series(test_str)
test_vec = vec.transform(test_com)
test_pred = np.zeros((len(test_com), len(label_cols)))

# NOTE: a model is re-fitted for every label here, so the measured run time
# includes training, not just prediction.
for i, j in enumerate(label_cols):
    m,r = test_mdl(train[j])
    # Scale the new row by the label's ratio, then keep P(class == 1).
    test_pred[:,i] = m.predict_proba(test_vec.multiply(r))[:,1]
    # Reduce the column explicitly: a bare `if array >= 0.5` only works when
    # exactly one comment is scored and raises for several.
    if (test_pred[:,i] >= 0.5).any():
      print("\n","This comment is",j)


score = pd.DataFrame(test_pred, columns = label_cols)
print("\n\n",score)


end = time.perf_counter()
print("\n\n","The run time is:",end-start)