In [1]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.utils import tokenize
from gensim.models import KeyedVectors
from gensim.models import fasttext
from timeit import default_timer as timer
from time import strftime as time
import pandas as pd
import itertools
import os
import re
import numpy as np
from scipy.spatial.distance import cosine

### Load Fasttext vectors
Pretrained Fasttext-Embeddings (common crawl) over 300 dimensions

In [2]:
def load_standard_model():
    fasttext_file = datapath("C:/Users/flohk/OneDrive/Uni/Projektseminar/data/crawl-300d-2M-subword/crawl-300d-2M-subword.bin")
    return fasttext.load_facebook_model(fasttext_file)

### Load twitter data
Data from https://github.com/GT-SALT/implicit-hate
Any explicit hate is filtered out so that there is only non-explicit hate and non-explicit non-hate left. It might be a good idea to continue training the model on all of the data, but for these purposes, we want to keep everything in-domain

In [3]:
df = pd.read_csv("data/implicit-hate-corpus/implicit_hate_v1_stg1_posts.tsv", delimiter="\t")
df = df[df["class"] != "explicit_hate"]
sents = [list(tokenize(item)) for item in list(df["post"])]

In [4]:
len(list(itertools.chain.from_iterable(sents)))

310440

Now, fasttext can be finetuned by reading in these additional lines. All the hyper-parameters will be left as is so that finetuning is performed in accordance with how the model is already trained (the standard hyperparameters seems to match the ones used by facebook in https://arxiv.org/pdf/1712.09405.pdf, namely a window size of 5 and usage of cbow instead of skipgram). However, the amount of epochs need to be accounted for.

Ideally, a grid-search of all relevant hyper-parameters (larning rate, window size and learning algorithm), each configuration being used to train and evaluate a classifier, would be performed. However, that would take anywhere from 15 minutes to over 5 hours on my machine and thus, only amount of epochs will be varied. 

Six different models will be created and later tested against each other.

In [5]:
def finetune(model, n_epochs):
    model.build_vocab(corpus_iterable=sents, update=True)
    model.train(sents, total_examples=len(sents), epochs=n_epochs)
    return model

In [6]:
def save_model(model, name):
    """
    # Old code for creating model names, might use again later
    models = [m for m in os.listdir("C:/Users/flohk/OneDrive/Uni/Projektseminar/data/fasttext-finetune/") if m.endswith(".model")]
    max_num = -1
    for m in models:
        try:
            max_num = max(int(re.search(r'model(\d+).model', m).group(1)), max_num)
        except AttributeError:
            pass
    max_num += 1
    """
    model.save(get_tmpfile("C:/Users/flohk/OneDrive/Uni/Projektseminar/data/fasttext-finetune/model" + name + ".model"))

Just to make sure that finetuning actually causes a significant difference in the vectors:

In [9]:
test_model = load_standard_model()

hate_old = np.copy(test_model.wv['hate'])
christ_old = np.copy(test_model.wv['christ'])
black_old = np.copy(test_model.wv['black'])
cat_old = np.copy(test_model.wv['cat'])
spruce_old = np.copy(test_model.wv['spruce'])

test_model = finetune(test_model, 5)

hate_new = np.copy(test_model.wv['hate'])
christ_new = np.copy(test_model.wv['christ'])
black_new = np.copy(test_model.wv['black'])
cat_new = np.copy(test_model.wv['cat'])
spruce_new = np.copy(test_model.wv['spruce'])

print("hate:", np.allclose(hate_old, hate_new, atol=1e-4), cosine(hate_old, hate_new))
print("christ:", np.allclose(christ_old, christ_new, atol=1e-4), cosine(christ_old, christ_new))
print("black:", np.allclose(black_old, christ_new, atol=1e-4), cosine(black_old, black_new))
print("cat:", np.allclose(cat_old, cat_new, atol=1e-4), cosine(cat_old, cat_new))
print("spruce:", np.allclose(spruce_old, spruce_new, atol=1e-4), cosine(spruce_old, spruce_new))

hate: False 0.348940372467041
christ: False 0.447144091129303
black: False 0.3751109838485718
cat: False 0.2610793709754944
spruce: False 0.3200603127479553


There is quite a significant difference after finetuning on only 10 epochs in words that are more strongly related to hatred and even (presumably) unrelated words, so anything more might be hardcore overfitting. However, still going to do it, just because. 

In [7]:
all_epochs = [1, 5, 25, 150, 625, 3125]
print("Started training at", time("%H:%M.%S"))
for epoch in all_epochs:
    save_model(finetune(load_standard_model(), epoch), str(epoch))
    print("Completed epoch", epoch, "at", time("%H:%M.%S"))

Started training at 05:28.41
Completed epoch 1 at 05:31.48
Completed epoch 5 at 05:35.04
Completed epoch 25 at 05:38.32
Completed epoch 150 at 05:43.19
Completed epoch 625 at 05:52.06
Completed epoch 3125 at 06:22.31


# Testing the implementations
For all embeddings, a model is trained with the parameters and hyperparameters previously found out. A 5 fold cross-validation is performed to evaluate the performance of each one.

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import KFold

from nltk import TweetTokenizer
from nltk.corpus import stopwords
from gensim.parsing.porter import PorterStemmer
from collections import Counter
import emoji

In [6]:
df = pd.read_csv("data/implicit-hate-corpus/implicit_hate_v1_stg1_posts.tsv", delimiter="\t")
df_id = pd.read_csv("data/implicit-hate-corpus/implicit_hate_v1_stg1.tsv", delimiter="\t")
df["ID"] = df_id["ID"]
df = df[df["class"] != "explicit_hate"]
df_class = pd.read_csv("data/implicit-hate-corpus/implicit_hate_v1_stg2.tsv", delimiter="\t")
df = pd.merge(df, df_class, on="ID", how="left")
df

Unnamed: 0,post,class,ID,implicit_class,extra_implicit_class
0,""" : jewish harvard professor noel ignatiev w...",implicit_hate,399886440588247041,white_grievance,
1,b.higher education is a part of european cult...,not_hate,929901925100937216,,
2,"has a problem with "" the whites "" "" and "" "" ...",not_hate,728678509497954304,,
3,is yasir qadhi a hate preacher for calling ch...,not_hate,625688620444180481,,
4,"rt "" : how three million germans mass murder...",not_hate,441089979322597376,,
...,...,...,...,...,...
20386,"Seeing prostitutes is morally wrong, but being...",implicit_hate,SAP_17154,irony,
20387,I wonder how many females I raped today I spen...,implicit_hate,SAP_17179,irony,
20388,Having a criminal record is more attractive to...,implicit_hate,SAP_17314,stereotypical,
20389,Another advantage of being a female: getting b...,implicit_hate,SAP_17343,inferiority,


In [7]:
tokenizer = TweetTokenizer()
stemmer = PorterStemmer()
stop = stopwords.words("English")
caps_tokens = re.compile(r"\w+(?:'\w+)?|[^\w\s]")

def vectorize(sent, clean_input=['"'], features=["word2vec"], model=None):
    
    # Remove unwanted substrings from input
    for subs in clean_input:
        sent = sent.replace(subs, "")
    
    feature_vector = []
    
    # Fasttext vectors
    if "fasttext" in features:
        tokens = [token for token in tokenizer.tokenize(sent) if token not in stop]
        vecs = np.zeros(300)
        for token in tokens:
            vecs += model[token]
        if len(tokens) != 0:
            vecs = vecs / len(tokens)
        feature_vector += list(vecs)
    
    # Punctuation counts marks
    if "punctuation" in features:
        puncts = []
        puncts.append(sent.count("."))
        puncts.append(sent.count("..."))
        puncts.append(sent.count(","))
        puncts.append(sent.count("("))
        puncts.append(sent.count(")"))
        puncts.append(sent.count(":"))
        puncts.append(sent.count(";"))
        puncts.append(sent.count('"'))
        feature_vector += puncts
        
    # Hashtag count
    if "hashtag" in features:
        feature_vector += [sent.count("#")]
        
    # Is Retweet
    if "retweet" in features:
        retweet = []
        if "b'RT " in sent or "bRT " in sent or " rt " in sent:
            retweet.append(1)
        else:
            retweet.append(0)
        feature_vector += retweet
        
    # ratio of words in all caps
    if "caps" in features:
        filterwords = [token for token in caps_tokens.findall(sent) if token.isalpha() and len(token) > 1]
        caps = [word for word in filterwords if word.isupper()]
        if len(filterwords) > 0:
            ratio = len(caps) / len(filterwords)
        else:
            ratio = 0
        feature_vector += [ratio]
    
    # Emoji count
    if "emoji" in features:
        emojis = 0
        for char in sent:
            if emoji.is_emoji(char):
                emojis += 1
        feature_vector += [emojis]
        
    
    return np.array(feature_vector)

In [8]:
y = np.array(df['class'] == "implicit_hate")

In [9]:
all_epochs = [1, 5, 25, 150, 625, 3125]
stats = pd.DataFrame(index=pd.Index([0] + all_epochs, name="epochs"), columns=["accuracy", "precision", "recall"])
print("Started testing at", time("%H:%M.%S"))
for epoch in [0] + all_epochs:
    epoch_model = fasttext.FastText.load(get_tmpfile("C:/Users/flohk/OneDrive/Uni/Projektseminar/data/fasttext-finetune/model" + str(epoch) + ".model")).wv if epoch != 0 else load_standard_model().wv
    X = StandardScaler().fit_transform(np.array([vectorize(tweet, features=["fasttext", "punctuation", "hashtag", "retweet", "caps", "emoji"], model=epoch_model) for tweet in df["post"]]))   
    kf = KFold(n_splits=5, shuffle=True)
    acc, prec, rec = 0, 0, 0
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        svm = SVC(kernel="rbf", C=100, gamma=0.005)
        svm.fit(X_train, y_train)
        test_labels = svm.predict(X_test)
        acc += accuracy_score(y_true=y_test, y_pred=test_labels)
        prec += precision_score(y_true=y_test, y_pred=test_labels, zero_division = 0)
        rec += recall_score(y_true=y_test, y_pred=test_labels)
    stats["accuracy"][epoch] = acc/5
    stats["precision"][epoch] = prec/5
    stats["recall"][epoch] = rec/5
    print("finished epoch", epoch, "at", time("%H:%M.%S"))

Started testing at 17:15.28
finished epoch 0 at 17:40.13
finished epoch 1 at 18:13.57
finished epoch 5 at 18:50.48
finished epoch 25 at 19:27.01
finished epoch 150 at 19:47.32
finished epoch 625 at 20:06.28
finished epoch 3125 at 20:24.53


In [10]:
stats

Unnamed: 0_level_0,accuracy,precision,recall
epochs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.73145,0.632315,0.546253
1,0.711785,0.593706,0.546138
5,0.712177,0.591931,0.558137
25,0.714433,0.595154,0.562163
150,0.737286,0.648467,0.536891
625,0.742582,0.668834,0.516974
3125,0.738806,0.666247,0.50023


Finetuning does not seem to have a significant effect on the semantic strength of the embeddings for the task at hand. However, we will keep the 150 epoch set for now. The others will be deleted, because the each take up almost 10 GB of disk space.