In [None]:
import csv
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.corpus import stopwords
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import Perceptron
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Fetch the NLTK stop-word corpus; stopwords.words('english') reads from it
# when STOPWORDS is built further down in this notebook.
nltk.download('stopwords')


In [None]:
# We trained our models on Google Colab, so the data paths in this notebook point at a Colab drive mount.
# I could not adapt the paths for a local run, because that would require downloading several large libraries.
# If you would like to run this notebook yourself, please update the paths to match your environment.

In [None]:
# The link below opens our original Colab notebook, which should run without changing any paths.

#https://colab.research.google.com/drive/1oEW6oTTxAux3Sw4aORyVaDvhKJhRuC5G?usp=sharing

In [None]:

# Load the English hateval2019 training CSV and show it as the cell output.
en_train_csv = '/content/drive/MyDrive/hateval2019/hateval2019_en_train.csv'
df_en = pd.read_csv(en_train_csv)
df_en

In [None]:
# Load the `tr` (presumably Turkish) hate-speech CSV and show it as the cell output.
tr_csv = '/content/drive/MyDrive/tr_hate_eval.csv'
df_tr = pd.read_csv(tr_csv)

df_tr

In [None]:
# Settings shared by the tokenizing / padding step in preprocess_text.
vocab_size = 5000 # passed to Tokenizer(num_words=...): cap the vocabulary at the most common words
embedding_dim = 64 # not referenced by the cells shown here
max_length = 200 # every sequence is padded / truncated to this many tokens
trunc_type = 'post' # drop tokens from the end of sequences longer than max_length
padding_type = 'post' # add padding at the end of sequences shorter than max_length
oov_tok = '<OOV>' # OOV = Out of Vocabulary
training_portion = .8 # NOTE(review): not referenced by the cells shown here; preprocess_text hard-codes test_size=0.10

# English stop words; get_text_and_labels strips them from the texts it reads.
STOPWORDS = set(stopwords.words('english'))

In [None]:
def get_text_and_labels(en=True, path=None, stopwords=None):
  """Read a hate-speech CSV and return its cleaned texts and raw labels.

  Each text is lower-cased, stripped of the characters listed in
  `punctuation`, split on whitespace, filtered against the stop-word set and
  re-joined with single spaces. Filtering whole tokens (instead of replacing
  ' word ' substrings) also removes stop words at the very start or end of a
  text and immediately repeated stop words, which substring replacement
  leaves behind.

  Args:
    en: True reads the English hateval2019 layout (text in column 1, label in
      column 2); False reads the `tr` layout (text in column 0, label in
      column 1) and additionally strips '#'.
    path: CSV file to read. Defaults to the Google Drive path of the dataset
      selected by `en`.
    stopwords: Collection of lower-case words to drop. Defaults to the
      module-level STOPWORDS, which is the English list for both datasets.

  Returns:
    (texts, labels): two lists of equal length; labels are the raw strings
    found in the CSV.
  """
  if en:
    default_path = "/content/drive/MyDrive/hateval2019/hateval2019_en_train.csv"
    text_col, label_col = 1, 2
    # '\\,' spells the backslash explicitly; the bare '\,' is an invalid escape sequence.
    punctuation = '@/\\,;.:!?$123456789¿¡'
  else:
    default_path = "/content/drive/MyDrive/tr_hate_eval.csv"
    text_col, label_col = 0, 1
    punctuation = '#@/\\,;.:!?$123456789¿¡'
  # NOTE(review): the digit '0' is not in `punctuation`, so zeros survive cleaning — confirm this is intended.

  if path is None:
    path = default_path
  if stopwords is None:
    stopwords = STOPWORDS

  # A translation table mapping every listed character to None deletes it.
  table = str.maketrans(dict.fromkeys(punctuation))

  texts = []
  labels = []
  # Explicit UTF-8 keeps non-ASCII text intact regardless of the platform default;
  # newline='' is what the csv module expects so quoted line breaks parse correctly.
  with open(path, 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    next(reader, None)  # skip the header row (tolerates an empty file)
    for row in reader:
      if not row:
        continue  # csv.reader yields [] for blank lines
      labels.append(row[label_col])
      words = row[text_col].lower().translate(table).split()
      texts.append(' '.join(word for word in words if word not in stopwords))

  return texts, labels

In [None]:
def preprocess_text(texts, labels):
  """Split texts/labels into train and test sets and encode the texts as padded index sequences.

  Reads the module-level settings vocab_size, oov_tok, max_length,
  padding_type and trunc_type.

  Args:
    texts: sequence of cleaned text strings.
    labels: sequence of labels convertible to int (e.g. '0' / '1').

  Returns:
    (train_padded, y_train, test_padded, y_test): padded integer sequences of
    width max_length for each split, and the matching integer label arrays.
  """
  # 90/10 split with a fixed seed so repeated runs see the same partition.
  X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.10, random_state=42)

  # The np.int alias was removed in NumPy 1.24; the builtin int is what it pointed to.
  y_train = np.array(y_train, dtype=int)
  y_test = np.array(y_test, dtype=int)

  # The vocabulary is fit on the training texts only; test words outside it map to oov_tok.
  tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
  tokenizer.fit_on_texts(X_train)

  train_sequences = tokenizer.texts_to_sequences(X_train)
  train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
  test_sequences = tokenizer.texts_to_sequences(X_test)
  test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

  return train_padded, y_train, test_padded, y_test


In [None]:
# Read and clean both corpora: the English CSV (en=True) and the `tr` CSV (en=False).
text_en, labels_en = get_text_and_labels(en=True)
text_tr, labels_tr = get_text_and_labels(en=False)

In [None]:
# Build padded train/test matrices and integer label arrays for each corpus.
# Only the *_en arrays are used by the classifier cells shown in this notebook.
x_train_en, y_train_en, x_test_en, y_test_en = preprocess_text(text_en, labels_en)
x_train_tr, y_train_tr, x_test_tr, y_test_tr = preprocess_text(text_tr, labels_tr)

In [None]:
# starting with perceptron

# Hyperparameter values to combine into a grid of Perceptron models.

penalty = ['l2', 'l1', 'elasticnet', None]
alpha = [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
fit_intercept = [True, False]
tol = [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]

perceptrons = []
params_all = []

for pen in penalty:
    # Without a penalty, alpha is not passed to the model, so looping over it would
    # only create identical copies; build those models once per (fit_intercept, tol).
    alphas_for_pen = alpha if pen is not None else [None]
    for alp in alphas_for_pen:
        for fit_inter in fit_intercept:
            # Iterate the tol values themselves. Indexing tol with the penalty loop
            # index tied each penalty to one tolerance and produced duplicates.
            for tol_ in tol:
                if pen is not None:
                    perceptron = Perceptron(penalty=pen, alpha=alp, fit_intercept=fit_inter, tol=tol_)
                else:
                    perceptron = Perceptron(fit_intercept=fit_inter, tol=tol_)

                perceptrons.append(perceptron)



In [None]:
# Drop Perceptron objects whose configuration repeats an earlier entry of
# `perceptrons`, keeping the first occurrence of each configuration.

perceptrons_new = []
params_new = []

for candidate in perceptrons:
    # str() of the bound get_params method embeds the estimator's repr, which is
    # used here as a fingerprint of its hyperparameters.
    params_of_the_perceptron = str(candidate.get_params)
    if params_of_the_perceptron in params_new:
        continue
    params_new.append(params_of_the_perceptron)
    perceptrons_new.append(candidate)

perceptrons = perceptrons_new

In [None]:
# Fit every Perceptron on the English training split, score it on the English
# test split, and remember the index of the most accurate one.
predictions_perceptron = []
accs_perceptron = []
max_acc_perceptron = 0
max_acc_perceptron_ind = 0


for i, clf in enumerate(perceptrons):
    clf.fit(x_train_en, y_train_en)
    prediction = clf.predict(x_test_en)
    predictions_perceptron.append(prediction)

    score = accuracy_score(y_test_en, prediction)
    accs_perceptron.append(score)

    # Strict '>' keeps the earliest model when accuracies tie.
    if score > max_acc_perceptron:
        max_acc_perceptron, max_acc_perceptron_ind = score, i


In [None]:
# Order the perceptrons (and their accuracies) from lowest to highest test accuracy.
idx_acc_sorted_perceptron = sorted(range(len(accs_perceptron)), key=lambda pos: accs_perceptron[pos])
accs_sorted_perceptron = [accs_perceptron[pos] for pos in idx_acc_sorted_perceptron]
perceptrons_sorted = [perceptrons[pos] for pos in idx_acc_sorted_perceptron]


In [None]:
# Build a short hyperparameter label for every perceptron, in sorted order.

params_all_perceptron = []

for estimator in perceptrons_sorted:
    params = str(estimator.get_params)
    # Keep only the text between the first '(' and the first ')'.
    label_start = params.find("(") + 1
    label_stop = params.find(")")
    params = params[label_start:label_stop]
    params_all_perceptron.append(params)



In [None]:
# Plot only the 10 most accurate perceptrons; labelling every model would need
# too large a figure.

top_n = 10
positions = np.arange(top_n)

plt.figure(figsize=(20, 5))
plt.plot(positions, accs_sorted_perceptron[-top_n:])
plt.xticks(positions, params_all_perceptron[-top_n:], rotation=90)
plt.xlabel('hyperparameters')
plt.ylabel('accuracy')
plt.title('Perceptron accuracies with adjusted hyperparameters plotted')

In [None]:
# SVC training did not converge in a reasonable amount of time, so the SVM cells below are left commented out.

# svcs = []

# c_s = [0.9, 1, 1.1]
# kernels = ['linear', 'poly', 'rbf'] 
# degrees = [3]
# gammas = ['scale']

# for i in range(len(c_s)):
#     for j in range(len(kernels)):
#         c = c_s[i]
#         kernel = kernels[j]
        
#         if kernel == 'poly':
#             for k in range(len(degrees)):
#                 degree = degrees[k]
#                 for x in range(len(gammas)):
#                     gamma = gammas[x]
#                     svc_ = SVC(C=c, kernel=kernel, degree=degree, gamma=gamma)
#                     svcs.append(svc_)
                    
#         elif kernel == 'rbf' or kernel == 'sigmoid':
#             for k in range(len(gammas)):
#                 gamma = gammas[k]
#                 svc_ = SVC(C=c, kernel=kernel, gamma=gamma)
#                 svcs.append(svc_)

                
#         else:
#             svc_ = SVC(C=c, kernel=kernel)
#             svcs.append(svc_)

In [None]:
# predictions_svm = []
# accs_svm = []
# max_acc_svm = 0
# max_acc_svm_ind = 0

# for i in range(len(svcs)):
#     svcs[i].fit(x_train_en, y_train_en)
#     print('train i done', i)
#     prediction = svcs[i].predict(x_test_en)
#     print('test i done', i)
#     predictions_svm.append(prediction)

#     score = accuracy_score(y_test_en, predictions_svm[i])
    
#     if score > max_acc_svm:
#         max_acc_svm = score
#         max_acc_svm_ind = i
        
#     accs_svm.append(score)

# idx_acc_sorted_svm = sorted(range(len(accs_svm)),key=accs_svm.__getitem__)

In [None]:
# # preparing sorted list

# accs_sorted_svm = []
# svm_sorted = []

# for i in range(len(accs_svm)):
#     accs_sorted_svm.append(accs_svm[idx_acc_sorted_svm[i]])
#     svm_sorted.append(svcs[idx_acc_sorted_svm[i]])

In [None]:
# # getting svm objects parameters

# params_all_svm = []

# for i in range(len(svm_sorted)):
#     params = str(svm_sorted[i].get_params)
#     params = params[params.find("(")+1:params.find(")")]
#     params_all_svm.append(params)



In [None]:
# # plotting svm objects with ordered accuracies

# plt.figure(figsize=(20, 5))
# plt.plot(np.arange(len(accs_sorted_svm)), accs_sorted_svm)
# plt.xticks(np.arange(len(params_all_svm)), params_all_svm, rotation= 90)
# plt.xlabel('hyperparameters')
# plt.ylabel('accuracy')
# plt.title('SVM accuracies with adjusted hyperparameters plotted')

In [None]:
# Hyperparameter values to combine into a grid of decision trees.
criterions = ['entropy'] # gini was also tried initially but did not give the desired performance
splitters = ['random'] # random works better
max_depths = [10, 15, None]
# NOTE(review): scikit-learn reads a float min_samples_split as a fraction of the
# training samples, so 1.0 looks like "only a node holding every sample may split" — confirm this is intended.
min_samples_splits = [1.0, 2, 3]
min_samples_leafs = [1, 2, 3]
max_features = ['sqrt', 'log2'] # max_features=None did not yield good results, so it was dropped

decision_trees = []

for criterion in criterions:
    for splitter in splitters:
        for max_depth in max_depths:
            for min_samples_split in min_samples_splits:
                for min_samples_leaf in min_samples_leafs:
                    for max_feature in max_features:
                        # A fixed random_state makes the random splitter and feature
                        # subsampling repeatable, so the accuracy ranking is stable across runs.
                        decision_tree = DecisionTreeClassifier(
                            criterion=criterion,
                            splitter=splitter,
                            max_depth=max_depth,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf,
                            max_features=max_feature,
                            random_state=42,
                        )
                        decision_trees.append(decision_tree)

In [None]:
# Fit every decision tree on the English training split, score it on the English
# test split, and remember the index of the most accurate one.
predictions_decision_tree = []
accs_decision_tree = []
max_acc_decision_tree = 0
max_acc_decision_tree_ind = 0

for i in range(len(decision_trees)):
    decision_trees[i].fit(x_train_en, y_train_en)
    prediction = decision_trees[i].predict(x_test_en)
    predictions_decision_tree.append(prediction)

    # Score this tree's own predictions. Without this line the loop reuses whatever
    # `score` an earlier cell left behind, giving every tree the same accuracy
    # (or raising NameError on a fresh kernel).
    score = accuracy_score(y_test_en, prediction)

    if score > max_acc_decision_tree:
        max_acc_decision_tree = score
        max_acc_decision_tree_ind = i

    accs_decision_tree.append(score)

# Indices of the trees ordered from lowest to highest accuracy.
idx_acc_sorted_decision_tree = sorted(range(len(accs_decision_tree)),key=accs_decision_tree.__getitem__)


In [None]:
# Reorder the accuracies and the trees from lowest to highest test accuracy.

accs_sorted_decision_tree = [accs_decision_tree[pos] for pos in idx_acc_sorted_decision_tree]
decision_tree_sorted = [decision_trees[pos] for pos in idx_acc_sorted_decision_tree]

In [None]:
# Build a short hyperparameter label for every decision tree, in sorted order.

params_all_decision_tree = []

for tree in decision_tree_sorted:
    params = str(tree.get_params)
    # Keep only the text between the first '(' and the first ')'.
    label_start = params.find("(") + 1
    label_stop = params.find(")")
    params = params[label_start:label_stop]
    params_all_decision_tree.append(params)



In [None]:
# Plot the 10 most accurate decision trees, labelled with their hyperparameters.

top_n = 10
positions = np.arange(top_n)

plt.figure(figsize=(20, 5))
plt.plot(positions, accs_sorted_decision_tree[-top_n:])
plt.xticks(positions, params_all_decision_tree[-top_n:], rotation=90)
plt.xlabel('hyperparameters')
plt.ylabel('accuracy')
plt.title('Decision tree accuracies with adjusted hyperparameters plotted')

In [None]:
# Hyperparameter values to combine into a grid of random forests.
n_estimators_ = [50, 100]
criterions = ['entropy'] # gini was tested first; entropy worked better, so gini was dropped to save time
max_depths = [10, None]
# NOTE(review): scikit-learn reads a float min_samples_split as a fraction of the
# training samples, so 1.0 looks like "only a node holding every sample may split" — confirm this is intended.
min_samples_splits = [1.0, 2, 3]
min_samples_leafs = [1, 2, 3]
max_features_ = ['sqrt', 'log2'] # max_features=None was tested initially and did not give good results

random_forest_classifiers = []

for n_estimators in n_estimators_:
    for criterion in criterions:
        for max_depth in max_depths:
            for min_samples_split in min_samples_splits:
                for min_samples_leaf in min_samples_leafs:
                    for max_features in max_features_:
                        # A fixed random_state makes bootstrapping and feature subsampling
                        # repeatable, so the accuracy ranking is stable across runs.
                        random_forest_classfier = RandomForestClassifier(
                            n_estimators=n_estimators,
                            criterion=criterion,
                            max_depth=max_depth,
                            min_samples_split=min_samples_split,
                            min_samples_leaf=min_samples_leaf,
                            max_features=max_features,
                            random_state=42,
                        )
                        random_forest_classifiers.append(random_forest_classfier)

In [None]:
# Fit every random forest on the English training split, score it on the English
# test split, and remember the index of the most accurate one.
predictions_random_forest = []
accs_random_forest = []
max_acc_random_forest = 0
max_acc_random_forest_ind = 0


for i, forest in enumerate(random_forest_classifiers):
    forest.fit(x_train_en, y_train_en)
    prediction = forest.predict(x_test_en)
    predictions_random_forest.append(prediction)

    score = accuracy_score(y_test_en, prediction)
    accs_random_forest.append(score)

    # Strict '>' keeps the earliest forest when accuracies tie.
    if score > max_acc_random_forest:
        max_acc_random_forest, max_acc_random_forest_ind = score, i


In [None]:
# Indices of the random forests ordered from lowest to highest accuracy.
idx_acc_sorted_random_forest = sorted(range(len(accs_random_forest)), key=lambda pos: accs_random_forest[pos])

In [None]:
# Reorder the accuracies and the forests from lowest to highest test accuracy.

accs_sorted_random_forest = [accs_random_forest[pos] for pos in idx_acc_sorted_random_forest]
random_forest_sorted = [random_forest_classifiers[pos] for pos in idx_acc_sorted_random_forest]

In [None]:
# Build a short hyperparameter label for every random forest, in sorted order.

params_all_random_forest = []

for forest_model in random_forest_sorted:
    params = str(forest_model.get_params)
    # Keep only the text between the first '(' and the first ')'.
    label_start = params.find("(") + 1
    label_stop = params.find(")")
    params = params[label_start:label_stop]
    params_all_random_forest.append(params)



In [None]:
# Plot the 10 most accurate random forests, labelled with their hyperparameters.

top_n = 10
positions = np.arange(top_n)

plt.figure(figsize=(20, 5))
plt.plot(positions, accs_sorted_random_forest[-top_n:])
plt.xticks(positions, params_all_random_forest[-top_n:], rotation=90)
plt.xlabel('hyperparameters')
plt.ylabel('accuracy')
plt.title('Random forest accuracies with adjusted hyperparameters plotted')