In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn import metrics
from statistics import *
from scipy.sparse import csr_matrix

Reading and improving the quality of the data

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/impermium_verification_set.csv")
test_solution = pd.read_csv("data/impermium_verification_labels.csv")

In [3]:
test_solution['Comment'] = test_solution['Comment'].str.lower() # lowercase
test_solution['Comment'] = test_solution['Comment'].str.replace('[^\w\s]','') # remove punctuations
test_solution['Comment'] = test_solution['Comment'].str.replace('\n', '')
test_solution['Comment'] = test_solution['Comment'].str.replace('xa0', '') # remove 'xa0' string
test_solution['Comment'] = test_solution['Comment'].str.replace('\d+', '') # remove digits from the text
test_solution['Comment'] = test_solution['Comment'].fillna('')

answers = test_solution['Insult']

test_solution.head()

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,0,20120603163526Z,like this if you are a tribe fan,PrivateTest
1,2,1,20120531215447Z,youre idiot,PrivateTest
2,3,1,20120823164228Z,i am a woman babs and the only war on women i ...,PrivateTest
3,4,1,20120826010752Z,wow you benefitted so many wins this year fro...,PrivateTest
4,5,1,20120602223825Z,haha green me red you now loser whos winning n...,PrivateTest


In [4]:
train['Comment'] = train['Comment'].str.lower() # lowercase
train["Comment"] = train['Comment'].str.replace('[^\w\s]','') # remove punctuations
train['Comment'] = train['Comment'].str.replace('\n', '') # remove '\n'
train['Comment'] = train['Comment'].str.replace('xa0', '') # remove 'xa0'
train['Comment'] = train['Comment'].str.replace('\d+', '') # remove digits from the text
train['Comment'] = train['Comment'].fillna('')
train

Unnamed: 0,Insult,Date,Comment
0,1,20120618192155Z,you fuck your dad
1,0,20120528192215Z,i really dont understand your point it seems t...
2,0,,axcmajority of canadians can and has been wron...
3,0,,listen if you dont wanna get married to a man ...
4,0,20120619094753Z,cxec buean xuuedng uubueddng biuecu txecnh cx...
...,...,...,...
3942,1,20120502172717Z,you are both morons and that is never happening
3943,0,20120528164814Z,many toolbars include spell check like yahoo f...
3944,0,20120620142813Z,lambeauorwrigleykmossnsioux falls sd i told my...
3945,0,20120528205648Z,how about felix he is sure turning into one he...


In [5]:
test['Comment'] = test['Comment'].str.lower() # lowercase
test['Comment'] = test['Comment'].str.replace('[^\w\s]','') # remove punctuations
test['Comment'] = test['Comment'].str.replace('\n', '')
test['Comment'] = test['Comment'].str.replace('xa0', '')
test['Comment'] = test['Comment'].str.replace('\d+', '')
test['Comment'] = test['Comment'].fillna('')
test.head()

Unnamed: 0,id,Insult,Date,Comment,Usage
0,1,,20120603163526Z,like this if you are a tribe fan,PrivateTest
1,2,,20120531215447Z,youre idiot,PrivateTest
2,3,,20120823164228Z,i am a woman babs and the only war on women i ...,PrivateTest
3,4,,20120826010752Z,wow you benefitted so many wins this year fro...,PrivateTest
4,5,,20120602223825Z,haha green me red you now loser whos winning n...,PrivateTest


## Classification Naive Bayes

In [6]:
from sklearn.model_selection import train_test_split

target = train['Insult']

X_train, X_test, y_train, y_test = train_test_split(train.Comment, target, test_size=0.2)

In [7]:
len(X_train)

3157

In [8]:
len(X_test)

790

In [9]:
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [10]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB(alpha=0.00001) # without smoothing
model.fit(X_train_count, y_train)

MultinomialNB(alpha=1e-05, class_prior=None, fit_prior=True)

In [11]:
test_count = v.transform(test['Comment'])
y_pred = model.predict(test_count)

In [12]:
len(y_pred)

2235

In [13]:
len(answers)

2235

In [14]:
recall = recall_score(answers,y_pred)
precision = precision_score(answers,y_pred)

print(recall)
print(precision)

0.5088207985143919
0.569055036344756


#### the mean accuracy

In [15]:
X_test_count = v.transform(X_test)
print("Accuracy:", model.score(X_test_count, y_test))
print("F1 score:", 2*((precision*recall)/(precision+recall)))

Accuracy: 0.7569620253164557
F1 score: 0.5372549019607843


In [16]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer



class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, text):
        return [self.wnl.lemmatize(t) for t in word_tokenize(text)]

#train['Comment_lemmatized'] = train.Comment.apply(lemmatize_text)

#train['Comment_lemmatized']

X_train, X_test, y_train, y_test = train_test_split(train.Comment, target, test_size=0.2)


## with Lemmatization

In [17]:
v = CountVectorizer(tokenizer=LemmaTokenizer())
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [18]:
model = MultinomialNB(alpha=0.00001) # without smoothing
model.fit(X_train_count, y_train)

MultinomialNB(alpha=1e-05, class_prior=None, fit_prior=True)

In [19]:
test_count = v.transform(test['Comment'])
y_pred = model.predict(test_count)

recall = recall_score(answers,y_pred)
precision = precision_score(answers,y_pred)

#### the mean accuracy

In [20]:
X_test_count = v.transform(X_test)

print("Accuracy:", model.score(X_test_count, y_test))
print("F1 score:", 2*((precision*recall)/(precision+recall)))

Accuracy: 0.7253164556962025
F1 score: 0.546075085324232


## Remove stopwords

In [21]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

data = train['Comment']
data.apply(lambda x: [item for item in x if item not in stop])

X_train, X_test, y_train, y_test = train_test_split(data, target, test_size=0.2)

v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)
X_train_count.toarray()[:3]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [22]:
model = MultinomialNB(alpha=0.00001)
model.fit(X_train_count, y_train)

MultinomialNB(alpha=1e-05, class_prior=None, fit_prior=True)

In [23]:
test_count = v.transform(test['Comment'])
y_pred = model.predict(test_count)

recall = recall_score(answers,y_pred)
precision = precision_score(answers,y_pred)

#### the mean accuracy

In [24]:
X_test_count = v.transform(X_test)
print("Accuracy:", model.score(X_test_count, y_test))
print("F1 score:", 2*((precision*recall)/(precision+recall)))

Accuracy: 0.7506329113924051
F1 score: 0.5511887433284812


## With bigrams

In [25]:
v = CountVectorizer(ngram_range=(2, 2))
X_train_count = v.fit_transform(X_train.values)

model = MultinomialNB(alpha=0.0001) # without laplace smoothing
model.fit(X_train_count, y_train)

MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)

In [26]:
test_count = v.transform(test['Comment'])
y_pred = model.predict(test_count)

recall = recall_score(answers,y_pred)
precision = precision_score(answers,y_pred)

#### the mean accuracy

In [27]:
#the mean accuracy:

X_test_count = v.transform(X_test)
print("Accuracy:", model.score(X_test_count, y_test))
print("F1 score:", 2*((precision*recall)/(precision+recall)))

Accuracy: 0.7443037974683544
F1 score: 0.5318352059925093


## With  Laplace Smoothing

In [28]:
v = CountVectorizer()
X_train_count = v.fit_transform(X_train.values)

model = MultinomialNB() # by default alpha is 1
model.fit(X_train_count, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [29]:
test_count = v.transform(test['Comment'])
y_pred = model.predict(test_count)

recall = recall_score(answers,y_pred)
precision = precision_score(answers,y_pred)

#### the mean accuracy

In [30]:
#Score:

X_test_count = v.transform(X_test)
print("Accuracy:", model.score(X_test_count, y_test))
print("F1 score:", 2*((precision*recall)/(precision+recall)))

Accuracy: 0.8253164556962025
F1 score: 0.6051167964404894


Όπως φαίνεται από τα παραπάνω score όντως οι τεχνικές βελτιώνουν τον Naive Bayes

### part of speech and tf idf

In [31]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

train['Comment'].dropna(inplace=True)


data = train['Comment']
data.apply(lambda x: [item for item in x if item not in stop])

tokens = data.apply(word_tokenize) # tokenizing the data

tagged_tokens=[]
for token in tokens:
    tagged_tokens.append(nltk.pos_tag(token))
    
print(tagged_tokens)



#### Create tfidf

In [32]:
tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, max_features=1000, #cut down to 1000 for practicality
stop_words='english', ngram_range = (1,1))
tfidf = tfidf_vectorizer.fit_transform(train['Comment'])
tfidf = tfidf.toarray()

print(tfidf)

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.21831192 0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.21913759 0.         0.        ]]


In [33]:
from collections import Counter
i = 0
j = 0
array = np.zeros((3947,4))
while i < len(tagged_tokens):
    counts = Counter(x[1] for x in tagged_tokens[i])
    array[i][0] = array[i][0] + counts['RB']
    array[i][0] = array[i][0] + counts['RBR']
    array[i][0] = array[i][0] + counts['RBS']
    array[i][1] = array[i][1] + counts['VB']
    array[i][1] = array[i][1] + counts['VBD']
    array[i][1] = array[i][1] + counts['VBG']
    array[i][1] = array[i][1] + counts['VBN']
    array[i][1] = array[i][1] + counts['VBP']
    array[i][1] = array[i][1] + counts['VBZ']
    array[i][2] = array[i][2] + counts['JJ']
    array[i][2] = array[i][2] + counts['JJR']
    array[i][2] = array[i][2] + counts['JJS']
    array[i][3] = array[i][3] + counts['NN']
    array[i][3] = array[i][3] + counts['NNS']
    array[i][3] = array[i][3] + counts['NNP']
    array[i][3] = array[i][3] + counts['NNPS']
    array[i][0] = array[i][0]/len(tagged_tokens[i])
    array[i][1] = array[i][1]/len(tagged_tokens[i])
    array[i][2] = array[i][2]/len(tagged_tokens[i])
    array[i][3] = array[i][3]/len(tagged_tokens[i])
    i = i + 1

Create tfidf array and then combine it with our part of speech array

In [34]:
tfidf_vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, max_features = 1000)

tfidf_train = tfidf_vectorizer.fit_transform(train['Comment'])
train_correct = train['Insult'].to_numpy()
train_correct = train_correct.astype('int')

import scipy.sparse as sp

c = sp.hstack((array,tfidf_train))

c = c.toarray()

data = pd.DataFrame(c)

data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,994,995,996,997,998,999,1000,1001,1002,1003
0,0.000000,0.250000,0.000000,0.250000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.214498,0.0,0.0,0.0,0.306296,0.000000,0.0,0.0,0.0
1,0.066667,0.266667,0.066667,0.266667,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.144434,0.0,0.0,0.0,0.206247,0.000000,0.0,0.0,0.0
2,0.078125,0.156250,0.078125,0.234375,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.113241,0.0,0.0,0.0,0.161705,0.125251,0.0,0.0,0.0
3,0.033898,0.288136,0.084746,0.118644,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.221202,0.0,0.0,0.0,0.236901,0.000000,0.0,0.0,0.0
4,0.000000,0.013158,0.263158,0.723684,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3942,0.111111,0.333333,0.000000,0.111111,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.164255,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3943,0.076923,0.307692,0.076923,0.153846,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.181830,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3944,0.000000,0.250000,0.041667,0.333333,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
3945,0.083333,0.194444,0.027778,0.277778,0.0,0.136967,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0


### SVM

In [35]:
from sklearn import svm
from sklearn.model_selection import KFold

y = target = train['Insult']
X = data

c = svm.SVC()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 0.2 is 20%
kf = KFold(n_splits=10,shuffle=False)
kf.split(X)

accuracy_model = []
f1_model = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = c.fit(X_train, y_train)
    accuracy_model.append(accuracy_score(y_test, model.predict(X_test), normalize=True))
    f1_model.append(f1_score(y_test, model.predict(X_test), average='micro'))
print("Accuracy average:", mean(accuracy_model))
print("F1_score average:",mean(f1_model))

Accuracy average: 0.8358170018633939
F1_score average: 0.8358170018633939


### Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier

y = target = train['Insult']
X = data

c=RandomForestClassifier(n_estimators=100)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) # 0.2 is 20%
kf = KFold(n_splits=10,shuffle=False)
kf.split(X)

accuracy_model = []
f1_model = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = c.fit(X_train, y_train)
    accuracy_model.append(accuracy_score(y_test, model.predict(X_test), normalize=True))
    f1_model.append(f1_score(y_test, model.predict(X_test), average='micro'))
print("Accuracy average:", mean(accuracy_model))
print("F1_score average:",mean(f1_model))

Accuracy average: 0.8119989719205809
F1_score average: 0.8119989719205809


### Improve F1 score

Μετά από αρκετές δοκιμές δεν κατάφερα να ξεπεράσω το 0.952. Χρησιμοποίησα Kfolds ώστε να αξιοποιήσω καλύτερα το dataset(Όπου το χρησιμοποίησα και στους δύο παραπάνω αλγορίθμους) και είδα μια αισθητή βελτίωση στο accuracy και στο f1-score.
Το καλύτερο σκορ που πέτυχα είναι ~0.84

Επίσης το dataset έχει μερικά λάθη και ίσως γι αυτό να μην κατάφερα να ανεβάσω παραπάνω το score (Υπάρχουν προσβλητικα σχόλια που έχουν τη τιμή 0).

Μια άλλη ιδέα θα ήταν να βελτιώσω τον πίνακα part of speech - tfidf, χρησιμοποιώντας bigrams(φράσεις δύο λέξεων)

In [37]:
y = target = train['Insult']
X = data

c = svm.SVC()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 0.2 is 20%
kf = KFold(n_splits=9,shuffle=False)
kf.split(X)

accuracy_model = []
f1_model = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = c.fit(X_train, y_train)
    accuracy_model.append(accuracy_score(y_test, model.predict(X_test), normalize=True))
    f1_model.append(f1_score(y_test, model.predict(X_test), average='micro'))
print("Accuracy average:", mean(accuracy_model))
print("F1_score average:",mean(f1_model))

Accuracy average: 0.836321421430792
F1_score average: 0.836321421430792
