<b>Ευριπίδης Παντελαίος - 1115201600124 </b>

In [1]:
import pandas as pd
import numpy as np
import scipy
import nltk

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import svm, datasets
from sklearn.ensemble import RandomForestClassifier

from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, f1_score

from nltk.stem import WordNetLemmatizer 

In [2]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

pd.options.display.max_colwidth = None

<br><b>Some useful functions </b><br>
<b> 1) Cleaning</b><br>
<b> 2) Lemmatization</b><br>
<b> 3) Remove stop words </b><br>
<b> 4) Part-of-Speech Tag</b><br>

In [3]:
#clean data and remove symbols, urls, unnecessary words
def cleanData(comments):
    
    StoredComments = []
    for line in comments:
      
        line = line.lower()
        
        #replace some words, symbols and letters that appear frequently and are useless
        line = line.replace('-', '')
        line = line.replace('_', '')
        line = line.replace('0', '')
        line = line.replace("\n", '')
        line = line.replace("\\", '')
        line = line.replace('XD', '')  
        line = line.replace('..', '') 
        line = line.replace('  ', ' ') 
        line = line.replace('https', '')
        line = line.replace('http', '')
                                                                    
        removeList = ['@', r'\x', '\\', 'corrup', '^', '#', '$', '%', '&']
        #for line in comments:
        words = ' '.join([word for word in line.split() if not any([phrase in word for phrase in removeList]) ])
        StoredComments.append(words)
           
    return StoredComments

In [4]:
#lemmatize the comments
def lemmatizer (comments):
    
    lemma = WordNetLemmatizer()
    
    StoredComments = []
    for line in comments:
        line = ' '.join([lemma.lemmatize(w) for w in nltk.word_tokenize(line)])
        StoredComments.append(line)
        
    return StoredComments

In [5]:
#remove stop words
def removeStopWords (comments):
    
    StoredComments=[]
    for line in comments:
        line = ' '.join([w for w in nltk.word_tokenize(line) if w not in stop_words])
        StoredComments.append(line)
        
    return StoredComments

In [6]:
#calculate Pos tags and the frequency of them
def posTag(comments):
    
    adjectiveFrequency=[]
    adverbFrequency=[]
    nounFrequency=[]
    verbFrequency=[]
    
    for comment in comments:
        
        adjectiveCounter=0
        adverbCounter=0
        nounCounter=0
        verbCounter=0
        
        #Pos tagging the words 
        words = nltk.word_tokenize(comment)
        words = nltk.pos_tag(words)
        cnt = len(words)
        
        for word in words:
            if(word[1][:1] == 'NN'):
                nounCounter = nounCounter+1
            
            elif(word[1][:1] == 'VV'):
                verbCounter = verbCounter+1 
                
            elif(word[1][:1] == 'RR'):
                adverbCounter = adverbCounter+1
                
            elif(word[1][:1] == 'JJ'):
                adjectiveCounter = adjectiveCounter+1
                
                
        #not divide with zero       
        if(cnt!=0):    #calculate the frequency of each tag
            nounFrequency.append(nounCounter/cnt)
            verbFrequency.append(verbCounter/cnt)
            adverbFrequency.append(adverbCounter/cnt)
            adjectiveFrequency.append(adjectiveCounter/cnt)
            
        else:
            nounFrequency.append(0)
            verbFrequency.append(0)
            adverbFrequency.append(0)
            adjectiveFrequency.append(0)
            
    return nounFrequency, verbFrequency, adverbFrequency, adjectiveFrequency

<br><br><b> Read csv files for train and test set and cleaning the data</b>

In [7]:
trainSet = pd.read_csv("data/train.csv")
testSet = pd.read_csv("data/impermium_verification_labels.csv")   #I dont use the file 'impermium_verification_set.csv' at all, 
                                                                    #because the other file named 'impermium_verification_labels.csv'
                                                                    #covers completely the requirements of the exercise.

#Cleaning the data and test set
trainSet['Comment'] = cleanData(trainSet['Comment'])
testSet['Comment'] = cleanData(testSet['Comment'])

<br><b>Train the train data with Bag of Words </b>

In [8]:
countVectorizer = CountVectorizer()

BagOfWordsTrain = countVectorizer.fit_transform(trainSet['Comment'].values)
BagOfWordsTrainArray = BagOfWordsTrain.toarray()

<br><b>Train the test data with Bag of Words </b>

In [9]:
BagOfWordsTest = countVectorizer.transform(testSet['Comment'].values)
BagOfWordsTestArray = BagOfWordsTest.toarray()

<br><br><b> Gaussian Naive Bayes classifier </b>

In [10]:
classifierNB = GaussianNB()

classifierNB.fit(BagOfWordsTrainArray, trainSet['Insult'])

BoWprediction = classifierNB.predict(BagOfWordsTestArray)

y_test = testSet['Insult']

<br><br><b> Gaussian Naive Bayes Scores</b>

In [11]:
print ('Accuracy Score:', accuracy_score(y_test, BoWprediction))
print('F1 Score:', f1_score(y_test, BoWprediction))

Accuracy Score: 0.5266219239373602
F1 Score: 0.5208333333333333


<br><br><b> Now I am doing 4 optimizations for Naive Bayes (Lemmatization, Remove stop words, Bigrams, Laplace Smoothing</b>

<b> 1) Lemmatization</b>

In [12]:
trainSet['commentLemmatization'] = lemmatizer(trainSet['Comment'])
testSet['commentLemmatization'] = lemmatizer(testSet['Comment'])

In [13]:
lemmazationTrain = countVectorizer.fit_transform(trainSet['commentLemmatization'])
lemmazationTrainArray = lemmazationTrain.toarray()

lemmazationTest = countVectorizer.transform(testSet['commentLemmatization'])
lemmazationTestArray = lemmazationTest.toarray()

classifierNB.fit(lemmazationTrainArray,trainSet['Insult'])
lemmatizationPredict = classifierNB.predict(lemmazationTestArray)

print('Accuracy Score:', accuracy_score(y_test, lemmatizationPredict))
print('F1 Score:', f1_score(y_test, lemmatizationPredict))

Accuracy Score: 0.5257270693512305
F1 Score: 0.5276292335115864


<br><b>2) Remove stop words </b>

In [14]:
trainSet['commentStopWords'] = removeStopWords(trainSet['Comment'])
testSet['commentStopWords'] = removeStopWords(testSet['Comment'])

In [15]:
stopWordsTrain = countVectorizer.fit_transform(trainSet['commentStopWords'])
stopWordsTrainArray = stopWordsTrain.toarray()

stopWordsTest = countVectorizer.transform(testSet['commentStopWords'])
stopWordsTestArray = stopWordsTest.toarray()

classifierNB.fit(stopWordsTrainArray,trainSet['Insult'])
stopWordPredict = classifierNB.predict(stopWordsTestArray)

print ('Accuracy Score:', accuracy_score(y_test, stopWordPredict))
print('F1 Score:', f1_score(y_test, stopWordPredict))

Accuracy Score: 0.5243847874720358
F1 Score: 0.5174761688606445


<br><b> 3) Bigrams</b>

In [16]:
bigramVectorizer = CountVectorizer(ngram_range=(2,2))

bigramTrain = bigramVectorizer.fit_transform(trainSet['Comment'])
bigramTrainArray = bigramTrain.toarray()

bigramTest= bigramVectorizer.transform(testSet['Comment'])
bigramTestArray = bigramTest.toarray()

classifierNB.fit(bigramTrainArray,trainSet['Insult'])
bigramPredict = classifierNB.predict(bigramTestArray)

print ('Accuracy Score:', accuracy_score(y_test, bigramPredict))
print('F1 Score:', f1_score(y_test, bigramPredict))

Accuracy Score: 0.556599552572707
F1 Score: 0.5292161520190024


<br><b> 4) Laplace Smoothing</b>

In [17]:
classifierMultinomialNB = MultinomialNB(alpha=1.0)
classifierMultinomialNB.fit(BagOfWordsTrainArray,trainSet['Insult'])
laplacePredict = classifierMultinomialNB.predict(BagOfWordsTestArray)

print ('Accuracy Score:', accuracy_score(y_test, laplacePredict))
print('F1 Score:', f1_score(y_test, laplacePredict))

Accuracy Score: 0.6769574944071588
F1 Score: 0.6143162393162394


<br><br> <b>Tf-idf Vectorizer </b> <br>

In [18]:
TfIdf = TfidfVectorizer()

TfIdfTrain = TfIdf.fit_transform(trainSet['Comment'])
TfIdfTest = TfIdf.transform(testSet['Comment'])

<br><br> <b>Part-of-Speech features for Train set </b><br>


In [19]:
AdjectiveTrain, AdverbTrain, NounTrain, VerbTrain = posTag(trainSet['Comment'])

<br><b>Append tf-idf and Part-of-Speech features for train set</b><br>

In [20]:
posTrainVectorizer = scipy.sparse.hstack((TfIdfTrain, scipy.sparse.csr_matrix(NounTrain).T))

posTrainVectorizer = scipy.sparse.hstack((posTrainVectorizer, scipy.sparse.csr_matrix(AdjectiveTrain).T))

posTrainVectorizer = scipy.sparse.hstack((posTrainVectorizer, scipy.sparse.csr_matrix(AdverbTrain).T))

posTrainVectorizer = scipy.sparse.hstack((posTrainVectorizer, scipy.sparse.csr_matrix(VerbTrain).T))

<br><br><b>Part-of-Speech features for Test set </b>

In [21]:
AdjectiveTest, AdverbTest, NounTest, VerbTest = posTag(testSet['Comment'])

<br><b>Append tf-idf and Part-of-Speech features for test set</b>

In [22]:
posTestVectorizer = scipy.sparse.hstack((TfIdfTest, scipy.sparse.csr_matrix(NounTest).T))

posTestVectorizer = scipy.sparse.hstack((posTestVectorizer, scipy.sparse.csr_matrix(AdjectiveTest).T))

posTestVectorizer = scipy.sparse.hstack((posTestVectorizer, scipy.sparse.csr_matrix(AdverbTest).T))

posTestVectorizer = scipy.sparse.hstack((posTestVectorizer, scipy.sparse.csr_matrix(VerbTest).T))

<br><b> Test score for Tf-idf PoS model</b>

In [23]:
classifierMultinomialNB.fit(posTrainVectorizer, trainSet['Insult'])
posVectorizerPredict = classifierMultinomialNB.predict(posTestVectorizer)

print('Accuracy Score:', accuracy_score(y_test, posVectorizerPredict))
print('F1 Score:', f1_score(y_test, posVectorizerPredict))

Accuracy Score: 0.545413870246085
F1 Score: 0.11343804537521815


<br><br><b>SVM </b>

In [24]:
svc = svm.SVC(kernel='linear', C=1.0, gamma=0.9)

In [25]:
svc.fit(posTrainVectorizer,trainSet['Insult']) 
posVectorizerSVM = svc.predict(posTestVectorizer)

print ('Accuracy Score:', accuracy_score(y_test, posVectorizerSVM))
print ('Test F1:', f1_score(y_test, posVectorizerSVM))

Accuracy Score: 0.6926174496644295
Test F1: 0.6094371802160318


<br><br><b> Random Decision Forest</b>

In [26]:
randomDecisionForest = RandomForestClassifier(n_estimators = 150)

randomDecisionForest.fit(posTrainVectorizer, trainSet['Insult']) 
posVectorizerRandomForest = randomDecisionForest.predict(posTestVectorizer)

print ('Accuracy Score:', accuracy_score(y_test, posVectorizerRandomForest))
print ('Test F1:', f1_score(y_test, posVectorizerRandomForest))

Accuracy Score: 0.6259507829977629
Test F1: 0.42024965325936203


<br><br><b> Beat the benchmark with proper data processing with lemmatization, remove stop words and using Tf-idf and SVM</b>

In [27]:
#I couldn't improve the scores much ...
#as there are many slang words and methods that are impossible to understand,
#even with modern improved algorithms,  if these words are offensive or not.
#If the values of dataset were labeled correct I could produce better results.

TfIdf = TfidfVectorizer(ngram_range=(1, 2))

trainSet['commentLemmatization'] = removeStopWords(trainSet['commentLemmatization'])
testSet['commentLemmatization'] = removeStopWords(testSet['commentLemmatization'])

TfIdfTrain = TfIdf.fit_transform(trainSet['commentLemmatization'])
TfIdfTest = TfIdf.transform(testSet['commentLemmatization'])

svc.fit(TfIdfTrain,trainSet['Insult'])
TfIdfPredict = svc.predict(TfIdfTest)

print ('Accuracy Score:', accuracy_score(y_test, TfIdfPredict))
print ('F1 Score:', f1_score(y_test, TfIdfPredict))

Accuracy Score: 0.6917225950782998
F1 Score: 0.6005797101449276
