In [2]:
import numpy as np
import pandas as pd
import nltk
import re
import operator
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score,recall_score,f1_score,accuracy_score
# Fetch the NLTK resources used below (stop-word list, sentence/word
# tokeniser models, WordNet for lemmatisation).
nltk.download('stopwords') # If needed
nltk.download('punkt') # If needed
nltk.download('wordnet') # If needed

# Load the tab-separated corpus; only the 'text' and 'label' columns are used.
dataset_full=pd.read_csv("hateval.tsv",sep='\t')
dataset_text=dataset_full['text']
dataset_label=dataset_full['label']

# Pair each text with its label in a single linear pass. The previous nested
# loop visited every (i, j) combination just to keep the i == j ones.
dataset_list=list(zip(dataset_text,dataset_label))

# Copy of the texts with URLs removed. str.strip(chars) only trims a *set of
# characters* from both ends of a string, so it cannot delete a URL; substitute
# every match of the pattern instead.
# NOTE(review): dataset_fixed is not consumed by the code visible in this
# cell -- the classifier below is still trained on the raw texts in
# dataset_list. Confirm whether the cleaned texts were meant to be used.
link = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]{2}))+')
dataset_fixed=[link.sub('',text) for text in dataset_text]

# 10-fold splitter. KFold does not shuffle by default, so each test fold is a
# contiguous slice of the file.
# NOTE(review): the recorded output shows much lower scores on the later
# folds; if the file is grouped by source or label, consider
# KFold(n_splits=10, shuffle=True, random_state=<seed>) -- verify before changing.
kf = KFold(n_splits=10)
accuracy_total=0.0

lemmatizer = nltk.stem.WordNetLemmatizer()
def get_list_tokens(string): # Function to retrieve the list of tokens from a string
    """Return the lower-cased, lemmatised word tokens of *string*.

    The text is first split into sentences, each sentence is split into word
    tokens, and every token is passed through the module-level WordNet
    lemmatizer and then lower-cased. Tokens are returned in reading order;
    punctuation tokens produced by the tokeniser are kept.

    NOTE(review): lemmatisation happens before lower-casing, so capitalised
    tokens may reach the lemmatizer in their original case -- verify that this
    order is intended.
    """
    return [
        lemmatizer.lemmatize(word).lower()
        for sent in nltk.tokenize.sent_tokenize(string)
        for word in nltk.tokenize.word_tokenize(sent)
    ]

# Tokens ignored when building the vocabulary: NLTK's English stop words plus
# a handful of punctuation / symbol tokens emitted by the tokeniser.
stopwords=set(nltk.corpus.stopwords.words('english'))
stopwords.update([".", ",", "--", "``", "#", "@", "'"])

def get_vector_text(list_vocab,string):
    """Return the bag-of-words count vector of *string* over *list_vocab*.

    Args:
        list_vocab: sequence of vocabulary words; position i of the result
            corresponds to list_vocab[i].
        string: raw text, tokenised with get_list_tokens.

    Returns:
        A 1-D float NumPy array of length len(list_vocab) in which entry i is
        the number of times list_vocab[i] occurs among the tokens of *string*
        (0.0 when it does not occur).
    """
    # Local import so the block does not depend on a new file-level import.
    from collections import Counter

    # Count every token once up front instead of running a membership test
    # and list.count() over the whole token list for each vocabulary word.
    token_counts=Counter(get_list_tokens(string))
    # Counter returns 0 for missing keys, matching the zero-initialised vector.
    return np.array([token_counts[word] for word in list_vocab],dtype=float)

def get_vocabulary(training_set, num_features): # Function to retrieve vocabulary
    """Return the *num_features* most frequent non-stop-word tokens.

    Args:
        training_set: iterable of instances whose element 0 is the raw text.
        num_features: upper bound on the vocabulary size (used as a slice end).

    Returns:
        A list of tokens ordered by decreasing corpus frequency. Tokens with
        equal frequency keep the order in which they were first seen.
    """
    frequency_by_word={}
    for sample in training_set:
        for token in get_list_tokens(sample[0]):
            if token not in stopwords:
                frequency_by_word[token]=frequency_by_word.get(token,0)+1
    # sorted() is stable, so ties stay in first-seen (insertion) order.
    ranked=sorted(frequency_by_word.items(), key=lambda pair: pair[1], reverse=True)
    return [token for token,_ in ranked[:num_features]]

def train_svm_classifier(training_set, vocabulary): # Function for training our svm classifier
    """Fit and return a linear-kernel SVC on bag-of-words vectors.

    Args:
        training_set: iterable of instances; element 0 is the raw text and
            element 1 is the label.
        vocabulary: list of words defining the feature positions.

    Returns:
        The fitted SVC instance.
    """
    # Vectorise each instance exactly once, keeping its label alongside.
    encoded=[(get_vector_text(vocabulary,sample[0]),sample[1]) for sample in training_set]
    features=[vector for vector,_ in encoded]
    targets=[target for _,target in encoded]
    classifier=SVC(kernel="linear",gamma='auto')
    classifier.fit(features,targets)
    return classifier

# k-fold cross-validation: for every split produced by `kf`, build the
# vocabulary and train the SVM on the training part only, score the held-out
# part, and accumulate macro precision / recall / F1 and accuracy.
precision_total=0
recall_total=0
f1_total=0
accuracy_total=0.0
for train_index, test_index in kf.split(dataset_list):
    # Select instances by position directly. Testing `i in train_index`
    # for every instance scans the index array each time (quadratic per fold).
    train_set_fold=[dataset_list[i] for i in train_index]
    test_set_fold=[dataset_list[i] for i in test_index]
    # The vocabulary (top 500 tokens) is derived from the training fold only,
    # so no information from the test fold leaks into the features.
    vocabulary_fold=get_vocabulary(train_set_fold, 500)
    svm_clf_fold=train_svm_classifier(train_set_fold, vocabulary_fold)
    X_test_fold=np.asarray([get_vector_text(vocabulary_fold,instance[0]) for instance in test_set_fold])
    Y_test_fold_gold=np.asarray([instance[1] for instance in test_set_fold])
    Y_test_predictions_fold=svm_clf_fold.predict(X_test_fold)
    precision_fold=precision_score(Y_test_fold_gold, Y_test_predictions_fold, average='macro')
    recall_fold=recall_score(Y_test_fold_gold, Y_test_predictions_fold, average='macro')
    f1_fold=f1_score(Y_test_fold_gold, Y_test_predictions_fold, average='macro')
    accuracy_fold=accuracy_score(Y_test_fold_gold, Y_test_predictions_fold)
    precision_total+=precision_fold
    recall_total+=recall_fold
    f1_total+=f1_fold
    accuracy_total+=accuracy_fold
    print ("Fold completed.")
    print(precision_fold)
    print(recall_fold)
    print(f1_fold)
    print(accuracy_fold)
# Average over the number of folds the splitter actually produced rather than
# a hard-coded 10, so the means stay correct if n_splits is changed.
n_folds=kf.get_n_splits()
precision_accuracy=precision_total/n_folds
recall_accuracy=recall_total/n_folds
f1_accuracy=f1_total/n_folds
average_accuracy=accuracy_total/n_folds
print ("\nPrecision Accuracy: "+str(round(precision_accuracy,3)))
print ("\nRecall Accuracy: "+str(round(recall_accuracy,3)))
print ("\nf1 Accuracy: "+str(round(f1_accuracy,3)))
print ("\nAverage Accuracy: "+str(round(average_accuracy,3)))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Fold completed.
0.7873931923238549
0.7645975517278735
0.7715418652871893
0.7888888888888889
Fold completed.
0.8099448414684203
0.7825364933573888
0.7902659427845492
0.8055555555555556
Fold completed.
0.7714591356439879
0.7438699360341151
0.7506020757725245
0.77
Fold completed.
0.7804643917988843
0.740108541155188
0.7489111926237337
0.7744444444444445
Fold completed.
0.8063153388608508
0.780267579212105
0.7880936454849499
0.8044444444444444
Fold completed.
0.5035222767675448
0.536291157589563
0.3281777991478677
0.4411111111111111
Fold completed.
0.5218181818181818
0.6165501165501166
0.34306201897261496
0.42


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Fold completed.
0.5
0.3566666666666667
0.4163424124513619
0.7133333333333334


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


Fold completed.
0.5
0.3861111111111111
0.4357366771159875
0.7722222222222223
Fold completed.
0.5640840508182048
0.5799418873651287
0.5361685460430023
0.5555555555555556

Precision Accuracy: 0.655

Recall Accuracy: 0.629

f1 Accuracy: 0.591

Average Accuracy: 0.685
