This notebook trains a custom fastText word-embedding model. It relies on a separate Git repository, available [here](https://github.com/alexandredupuy-zini/dynamic-topic-modeling), which is expected in the `dynamic-topic-modeling/` directory next to this notebook. Make sure that `kedro==0.15.4` is installed.

# Most similar words

In [None]:
import os

# Work from inside the repository directory so that the relative `data/...`
# paths used further down resolve. The guard keeps this cell re-runnable:
# an unguarded second chdir would look for a nested 'dynamic-topic-modeling/'
# directory and raise FileNotFoundError.
if os.path.basename(os.getcwd()) != 'dynamic-topic-modeling':
    os.chdir('dynamic-topic-modeling/')
os.environ['NUMEXPR_MAX_THREADS'] = '45'

In [None]:
# Run the `get_most_similar_fasttext` kedro pipeline through the shell.
# NOTE(review): presumably this step produces the `data/06_models/fasttext_model`
# file loaded later in this notebook — confirm against the pipeline definition.
!python -m kedro run --pipeline get_most_similar_fasttext

____
# Imports and additional processing

In [None]:
import fasttext
from collections import defaultdict
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import re
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pickle
import unidecode
import nltk
from nltk.corpus import stopwords
import swifter

In [None]:
# Load the pipe-separated dataset; `timestamp` is parsed as datetimes and
# `id` is read with an integer dtype.
df = pd.read_csv(
    "../dataset_tweet.csv",
    sep="|",
    parse_dates=['timestamp'],
    dtype={"id": int},
)

In [None]:
def tidy_text(text, stopwords):
    """Normalise a raw text and return it as a space-separated token string.

    Steps: lower-case, transliterate to ASCII with ``unidecode``, drop
    ``@username`` mentions, strip the ``#`` of hashtags, tokenise on
    word/hyphen characters, then drop stop words, purely numeric tokens and
    tokens of two characters or fewer.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stopwords : list, tuple, set or frozenset of str
        Tokens to discard. For backward compatibility, any other kind of
        object (for instance the nltk ``stopwords`` corpus reader) is ignored
        and the module-level ``stop_words`` list is used instead, which is
        what this function always did before.

    Returns
    -------
    str
        The kept tokens joined by single spaces.
    """
    # Honour the argument when it really is a collection of words; otherwise
    # fall back to the global list so that existing calls keep working.
    if isinstance(stopwords, (list, tuple, set, frozenset)):
        excluded = stopwords
    else:
        excluded = stop_words

    text = text.lower()  # convert text to lower-case
    text = unidecode.unidecode(text)  # transliterate accents / non-ASCII to ASCII
    # Raw strings: '\s' inside a normal string literal is an invalid escape sequence.
    text = re.sub(r'@[^\s]+', '', text)  # remove usernames
    text = re.sub(r'#([^\s]+)', r'\1', text)  # remove the # in #hashtag
    tkz = nltk.RegexpTokenizer(r"\b[\w-]+\b")
    tokens = tkz.tokenize(text)
    kept = [
        word for word in tokens
        if word not in excluded and not word.isnumeric() and len(word) > 2
    ]
    return ' '.join(kept)

In [None]:
# Extra tokens to discard on top of the NLTK French stop-word list.
add_stopwords = ["coronavirus", "19", "ici", "via", "selon", "plus", "ça", "pers", "blumenthal",
                 "not", "against", "fauci", "the", "maxi", "nan"]

stop_words = stopwords.words("french") + add_stopwords
# "pas" and "ne" are taken out of the stop-word list so that they stay in the
# cleaned text (presumably to preserve negations).
stop_words.remove("pas")
stop_words.remove("ne")

# Pass the word list built above (`stop_words`), not the nltk `stopwords`
# corpus object that the call previously handed over by mistake.
# NOTE(review): tidy_text strips accents before filtering, so accented entries
# such as "ça" can never match a token — consider normalising the list as well.
df["text"] = df["text"].swifter.apply(lambda x: tidy_text(str(x), stop_words))
df["date"] = df["timestamp"].apply(lambda x: x.date())

# Word selection by iteration

This section enriches the keyword vocabulary with COVID-related terms, using the word similarities learned by the model.

In [None]:
# Load the fastText model stored at `data/06_models/fasttext_model`
# (path relative to the current working directory).
model = fasttext.load_model("data/06_models/fasttext_model")

In [None]:
def get_cos_sim_from_model(word, model, top_n=20) :
    """Return the ``top_n`` vocabulary words closest to ``word``.

    Parameters
    ----------
    word : str
        Query word; its vector is read with ``model[word]``.
    model : fasttext model, or an object exposing ``model.wv.vocab``
        Embedding model. Vectors are looked up with ``model[...]`` in both cases.
    top_n : int, default 20
        Number of neighbours to keep.

    Returns
    -------
    dict
        ``{neighbour: cosine similarity}``, ordered from most to least
        similar. ``word`` itself is never part of the result. Empty when the
        vocabulary holds no other word.
    """
    query_vector = model[word]
    if isinstance(model, fasttext.FastText._FastText):
        all_words = model.words
    else:
        # NOTE(review): `wv.vocab` is the pre-4.0 gensim API — confirm the
        # gensim version in use if this branch is needed.
        all_words = list(model.wv.vocab.keys())

    candidates = [w for w in all_words if w != word]
    if not candidates:
        return {}

    # A single batched call instead of one cosine_similarity call per word:
    # same similarities, without the per-call overhead on every vocabulary word.
    candidate_vectors = np.vstack([model[w] for w in candidates])
    similarities = cosine_similarity(query_vector.reshape(1, -1), candidate_vectors).flatten()

    ranked = sorted(zip(candidates, similarities), key=lambda kv: kv[1], reverse=True)
    return dict(ranked[:top_n])

def get_most_similar(model, words):
    """Merge the nearest neighbours of every word in ``words``.

    For each seed word, its neighbours are fetched with
    ``get_cos_sim_from_model``. A neighbour is recorded only if none of the
    seed words, used as a regular expression, matches inside it; once
    recorded, its score is raised whenever a later seed yields a higher
    similarity.

    Returns a dict ``{neighbour: best similarity}`` sorted by decreasing score.
    """
    best_scores = {}
    for seed in tqdm(words):
        neighbours = get_cos_sim_from_model(seed, model)
        for candidate, score in neighbours.items():
            if candidate in best_scores:
                # Already recorded: keep the highest similarity seen so far.
                best_scores[candidate] = max(best_scores[candidate], score)
                continue
            # Skip candidates that merely contain one of the seed words.
            hits = sum(len(re.findall(pattern, candidate)) for pattern in words)
            if hits == 0:
                best_scores[candidate] = score
    ranking = sorted(best_scores.items(), key=lambda item: item[1], reverse=True)
    return dict(ranking)

In [None]:
# Seed keywords. Each entry is later used as a regular expression and searched
# inside the texts, so short entries such as 'virolog' or 'toxi' match any word
# containing them.
# The list was completed iteratively, starting from the word "vaccin", using
# the get_most_similar function defined above.
words = ["vaccin", "moderna", "astrazeneca", "pfizer", "gsk", "dose", 'sanofi', "oxford", "pharmaceutique",
         "gilead", "medicament", "novartis", "medoc", "remdesivir", "bigpharma", "traitement", "chloroquine",
         "hydroxychloroquine", "plaquenil", "azithromycin", "raoult_didier", "azythromycin", "raoult", "lancet",
         "didier", "antiviral", "antibiotique", "etude", "labos", "dexamethasone", "lobbies", "antiviraux",
         "lobby", "remede", "gates", "corrompu", "cobaye", "tisane", "artemisia", "charlatan", "puce",
         "rfid", "surgisphere", 'soros', 'virolog', "prophylaxie", 'potion', 'miracle', 'tocilizumab', 'antidote',
         "automedication", 'interet', '5g', 'inject', 'antenne', 'complot', 'competen', 'toxi', 'conspiration',
         'steroide', 'theorie', "mondialiste", 'traitre', 'essais', 'escroc', 'soigner', 'blackrock', 'conflit']

get_most_similar(model, words)

In [None]:
# Check how many texts contain the selected words, and keep their row positions.
#
# idx_words  : positions of the texts matching at least one keyword
# idx_vaccin : positions of the texts matching the keyword "vaccin"

idx_words = []
idx_vaccin = []

# Cast once so that the regex search always runs on strings.
texts = df['text'].astype(str)

for key in words:
    # Vectorised regex search over the whole column, replacing the
    # row-by-row `.iloc` loop.
    mask = texts.str.contains(key, regex=True).to_numpy()
    positions = np.flatnonzero(mask).tolist()

    if key == "vaccin":
        idx_vaccin.extend(positions)
    idx_words.extend(positions)

    print("Word {} is in {} texts".format(key, len(positions)))

# A text matching several keywords used to be listed once per keyword, which
# duplicated its row in the exported dataset. Keep each position once, in
# first-seen order.
idx_words = list(dict.fromkeys(idx_words))

In [None]:
# "vacciner" and "vaccination" are appended only so that they appear on the plot.
words_tmp = [*words, "vacciner", "vaccination"]
# One embedding vector per word, in the same order as words_tmp.
inputs = list(map(model.get_word_vector, words_tmp))

In [None]:
# Project the word vectors onto their first two principal components.
pca = PCA(n_components=2, random_state=2020)
outputs = pca.fit_transform(inputs)

df_ft = pd.DataFrame(outputs, index=words_tmp)

ax = df_ft.plot(
    x=0,
    y=1,
    style='o',
    kind="scatter",
    figsize=(20,10),
)

# Label colours: red for "vaccin", blue for the two words added for the
# visualisation, default colour for everything else.
label_colours = {"vaccin": 'r', "vacciner": 'b', "vaccination": 'b'}

for (x_coord, y_coord), label in zip(outputs, words_tmp):
    text_kwargs = {'c': label_colours[label]} if label in label_colours else {}
    ax.text(x_coord, y_coord, label, **text_kwargs)

In [None]:
# Persist the selected row positions. The `with` blocks make sure each file
# is flushed and closed, which the bare `open(...)` calls never did.
with open("idx_words.pkl", 'wb') as words_file:
    pickle.dump(idx_words, words_file)
with open("idx_vaccin.pkl", 'wb') as vaccin_file:
    pickle.dump(idx_vaccin, vaccin_file)

# Dataset small

In [None]:
# Use the 'Unnamed: 0' column (presumably the unnamed index column of the
# source CSV) as the index. The guard keeps the cell re-runnable: once the
# column has become the index, a second set_index would raise KeyError.
if 'Unnamed: 0' in df.columns:
    df.set_index(['Unnamed: 0'], inplace=True)
header = list(df.columns.values)

# Create the output directory if needed so that to_csv does not fail on a fresh checkout.
os.makedirs('../sentiment_dataset', exist_ok=True)

# idx_words / idx_vaccin hold row positions, hence `.iloc`.
df.iloc[idx_words].to_csv('../sentiment_dataset/dataset_small.csv', sep="|", header=header)
df.iloc[idx_vaccin].to_csv('../sentiment_dataset/dataset_vaccin.csv', sep="|", header=header)