In [1]:
import pandas as pd
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.metrics import f1_score, accuracy_score

from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# Hard Majority voting

In [2]:
results = pd.read_csv('../data/results.csv', index_col=0)

In [3]:
cols = list(results.columns)
olid = []
hasoc = []

for col in cols:
    if 'olid' in col:
        olid.append(col)
    if 'hasoc' in col:
        hasoc.append(col)


In [4]:
results['Hard Majority (OLID)'] = results.loc[:,olid].mean(axis=1).round().astype(int)
results['Hard Majority (HASOC)'] = results.loc[:,hasoc].mean(axis=1).round().astype(int)

# Soft Majority voting

In [5]:
cnn_lstm_probs = pd.read_csv('../results_probs.csv')
transformers_probs = pd.read_csv('../results_transformers_probs.csv')

hatebase_dict = pd.read_csv('../hatebase_dict.csv', sep=';', header=1).iloc[0:,1:3]

with open('../glove.6B.300d.txt') as f:
    glove = f.readlines()
    
glove = [vec.split()[0] for vec in glove]

In [6]:
results['CNN_olid_small'] = cnn_lstm_probs['CNN_olid_small'].values
results['CNN_hasoc'] = cnn_lstm_probs['CNN_hasoc'].values
results['BiLSTM_olid_small'] = cnn_lstm_probs['BiLSTM_olid_small'].values
results['BiLSTM_hasoc'] = cnn_lstm_probs['BiLSTM_hasoc'].values

results['roberta_olid_small'] = transformers_probs['roberta_olid_small'].values
results['hateBERT_olid_small'] = transformers_probs['hateBERT_olid_small'].values
results['roberta_hasoc'] = transformers_probs['roberta_hasoc'].values
results['hateBERT_hasoc'] = transformers_probs['hateBERT_hasoc'].values

In [7]:
results['Soft Majority (OLID)'] = results.loc[:,olid].mean(axis=1).round().astype(int)
results['Soft Majority (HASOC)'] = results.loc[:,hasoc].mean(axis=1).round().astype(int)

# Meta Model (Logistic Regression)

In [8]:
def count_uppercase(text):
    count = 0
    for c in text:
        if c.isupper():
            count += 1
    return count

def count_nonalpha(text):
    count = 0
    for c in text:
        if c.isalpha():
            count += 1
    return count

def count_exclamation(text):
    return text.count('!')

def in_hatebase(text):
    count = 0
    for ngram in hatebase_dict['ngram']:
        if ngram in text:
            count += 1
    return count

def in_glove(text):
    count = 0
    for word in text.split():
        if word in glove:
            count += 1
    return count

In [9]:
results['num_chars'] = results['text'].str.len()
results['num_words'] = results['text'].apply(str.split).apply(len)
results['uppercase_freq'] = results['text'].apply(count_uppercase) / results['num_chars']
results['nonalpha_freq'] = results['text'].apply(count_nonalpha) / results['num_chars']
results['exclamation_freq'] = results['text'].apply(count_exclamation) / results['num_chars']

results['words_in_hatebase'] = results['text'].apply(in_hatebase)
results['words_in_glove_freq'] = results['text'].apply(in_glove) / results['num_words']

extra_features = ['num_chars', 'num_words', 'uppercase_freq', 'nonalpha_freq', 'nonalpha_freq', 'exclamation_freq',
                 'words_in_hatebase', 'words_in_glove_freq']

In [10]:
models = {
    'NN' : MLPClassifier(random_state=0, max_iter=1000, hidden_layer_sizes=(100,10)),
    'Logistic Regression' : LogisticRegression(random_state=0),
    'SVM' : SVC(),
    'RF' : RandomForestClassifier(),
    'Adaboost' : AdaBoostClassifier()
}

In [2]:
LogisticRegression?

In [11]:
# Logistic Regression selected

X = results.loc[:,hasoc+extra_features].reset_index(drop=True)
y = results['labels'].reset_index(drop=True)

scores = pd.DataFrame(columns=['model','f1'])

for model_name, model in models.items():
        res = {'model':model_name}
        
        y_pred = cross_val_predict(model, X, y, cv=10)        
        
        f1 = f1_score(y, y_pred, average='macro')
        res['f1'] = f1
        
        print(model_name)
        print('F1:', f1, '\n')
        
        scores = scores.append(res, ignore_index=True)

NN
F1: 0.724931232808202 

Logistic Regression
F1: 0.7666502616997666 

SVM
F1: 0.4189189189189189 

RF
F1: 0.7464770220156786 

Adaboost
F1: 0.7287046109082271 



In [12]:
# model = LogisticRegression(random_state=0)
# res = {}

# for i in range(2, results.shape[0], 10):
#     y_pred = cross_val_predict(model, X, y, cv=i)        
        
#     f1 = f1_score(y, y_pred, average='macro')
#     print(i, f1)
#     res[i] = f1

In [13]:
# plt.plot(res.keys(), res.values())

In [14]:
model = LogisticRegression(random_state=0)

X = results.loc[:,olid+extra_features]
y = results['labels']
y_pred = cross_val_predict(model, X, y, cv=10) 
results['Meta Model (OLID)'] = y_pred
print('olid',f1_score(y, y_pred, average='macro'))

X = results.loc[:,hasoc+extra_features]
y = results['labels']
y_pred = cross_val_predict(model, X, y, cv=10)
results['Meta Model (HASOC)'] = y_pred
print('hasoc',f1_score(y, y_pred, average='macro'))

olid 0.7959799861973775
hasoc 0.7666502616997666


In [15]:
f1_score(y, results['Soft Majority (OLID)'], average='macro')

0.7630236455036306

In [16]:
f1_score(y, results['Hard Majority (OLID)'], average='macro')

0.7839860239482346

In [17]:
f1_score(y, results['Meta Model (OLID)'], average='macro')

0.7959799861973775

In [18]:
ensembles = ['Hard Majority (OLID)', 'Hard Majority (HASOC)', 'Soft Majority (OLID)', 'Soft Majority (HASOC)',
             'Meta Model (OLID)', 'Meta Model (HASOC)']

In [19]:
results_old = pd.read_csv('../data/results.csv', index_col=0)

In [20]:
results = pd.concat([results_old, results[ensembles]], axis=1)

In [21]:
results.to_csv('results_asg4.csv')

In [22]:
results

Unnamed: 0,text,labels,CNN_olid_small,CNN_hasoc,BiLSTM_olid_small,BiLSTM_hasoc,roberta_olid_small,hateBERT_olid_small,roberta_hasoc,hateBERT_hasoc,NB_olid_small,SVM_olid_small,NB_hasoc,SVM_hasoc,Hard Majority (OLID),Hard Majority (HASOC),Soft Majority (OLID),Soft Majority (HASOC),Meta Model (OLID),Meta Model (HASOC)
15923,# WhoIsQ # WheresTheServer #...,1,1,1,0,0,1,0,1,1,1,1,0,1,1,1,1,1,1,1
27014,# ConstitutionDay is revered by Conserv...,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
30530,# FOXNews # NRA # MAGA ...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13876,# Watching # Boomer getting the ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0
60133,# NoPasaran : Unity demo to oppo...,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73439,# DespicableDems lie again about rifles...,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
25657,# MeetTheSpeakers 🙌 @ USER will ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
67018,3 people just unfollowed me for talking about ...,1,1,1,1,1,0,1,0,0,1,1,0,1,1,0,1,1,1,1
50665,# WednesdayWisdom Antifa calls the righ...,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1
