In [None]:
# idea:
"""
read train files for frenchmed, from frenchpress
train embeddings using skipgram, cbow, fasttext
compare candidate words "patient, treatment, disease, solution, yellow" between frenchmed and frenchmed+frenchpress
    evaluate using spatial, gensim
"""

In [25]:
import fasttext
import gensim
import pandas as pd
import nltk

In [8]:
frenchmed_file = "./TP_ISD2020/QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl"

frenchpress_file = "./TP_ISD2020/QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl"

# read frenchmed
with open(frenchmed_file, "r") as f:
    frenchmed = f.readlines()

# read frenchpress
with open(frenchpress_file, "r") as f:
    frenchpress = f.readlines()


merged_corpora = frenchmed + frenchpress
merged_file = "merged_corpora.ospl"

#save merged corpora
with open(merged_file, "w") as f:
    f.writelines(merged_corpora)



In [9]:
#train fasttext skipgram model
frenchmed_skipgram_model = fasttext.train_unsupervised(frenchmed_file, model='skipgram')
merged_skipgram_model = fasttext.train_unsupervised(merged_file, model='skipgram')

#train fasttext cbow model
frenchmed_cbow_model = fasttext.train_unsupervised(frenchmed_file, model='cbow')
merged_cbow_model = fasttext.train_unsupervised(merged_file, model='cbow')





Read 0M words
Number of words:  1202
Number of labels: 0
Progress: 100.0% words/sec/thread:   43255 lr:  0.000000 avg.loss:  2.806062 ETA:   0h 0m 0s
Read 1M words
Number of words:  13707
Number of labels: 0
Progress: 100.0% words/sec/thread:   27079 lr:  0.000000 avg.loss:  2.237906 ETA:   0h 0m 0s
Read 0M words
Number of words:  1202
Number of labels: 0
Progress: 100.0% words/sec/thread:   65355 lr:  0.000000 avg.loss:  3.049119 ETA:   0h 0m 0s
Read 1M words
Number of words:  13707
Number of labels: 0
Progress: 100.0% words/sec/thread:   40252 lr:  0.000000 avg.loss:  2.282029 ETA:   0h 0m 0s 52.2% words/sec/thread:   40702 lr:  0.023884 avg.loss:  2.482991 ETA:   0h 0m11s


In [57]:
#tokenize the text
frenchmed_tokenized = [nltk.word_tokenize(sentence) for sentence in frenchmed]
merged_corpora_tokenized = [nltk.word_tokenize(sentence) for sentence in merged_corpora]

#train gensim skipgram model
frenchmed_gensim_skipgram_model = gensim.models.Word2Vec(frenchmed_tokenized, sg=1)
frenchmed_gensim_skipgram_model.train(frenchmed_file, epochs=10, total_words=frenchmed_gensim_skipgram_model.corpus_total_words)
merged_gensim_skipgram_model = gensim.models.Word2Vec(merged_corpora_tokenized, sg=1)
merged_gensim_skipgram_model.train(merged_file, epochs=10, total_words=merged_gensim_skipgram_model.corpus_total_words)


#train gensim cbow model
frenchmed_gensim_cbow_model = gensim.models.Word2Vec(frenchmed_tokenized, sg=0)
frenchmed_gensim_cbow_model.train(frenchmed_file, epochs=10, total_words=frenchmed_gensim_cbow_model.corpus_total_words)
merged_gensim_cbow_model = gensim.models.Word2Vec(merged_corpora_tokenized, sg=0) 
merged_gensim_cbow_model.train(merged_file, epochs=10, total_words=merged_gensim_cbow_model.corpus_total_words)


(161, 190)

In [58]:
words = ["patient", "traitement", "maladie", "solution", "jaune"]

dictionary_similarities = {}
for word in words:
    dictionary_similarities[word] = {
        #yes I could've used a dictionary to get all the models, but I wanted to keep it not simple to piss off my teammate
        "frenchmed_skipgram": [elements[1] for elements in frenchmed_skipgram_model.get_nearest_neighbors(word)] if word in frenchmed_skipgram_model.get_words() else "NaN",
        "merged_skipgram": [elements[1] for elements in merged_skipgram_model.get_nearest_neighbors(word)] if word in merged_skipgram_model.get_words() else "NaN",
        "frenchmed_cbow": [elements[1] for elements in frenchmed_cbow_model.get_nearest_neighbors(word)] if word in frenchmed_cbow_model.get_words() else "NaN",
        "merged_cbow": [elements[1] for elements in merged_cbow_model.get_nearest_neighbors(word)] if word in merged_cbow_model.get_words() else "NaN",
        "frenchmed_gensim_skipgram": [elements[0] for elements in frenchmed_gensim_skipgram_model.wv.most_similar(word)] if word in frenchmed_gensim_skipgram_model.wv else "NaN",
        "merged_gensim_skipgram": [elements[0] for elements in merged_gensim_skipgram_model.wv.most_similar(word)] if word in merged_gensim_skipgram_model.wv else "NaN",
        "frenchmed_gensim_cbow": [elements[0] for elements in frenchmed_gensim_cbow_model.wv.most_similar(word)] if word in frenchmed_gensim_cbow_model.wv else "NaN",
        "merged_gensim_cbow": [elements[0] for elements in merged_gensim_cbow_model.wv.most_similar(word)] if word in merged_gensim_cbow_model.wv else "NaN"
        
    }

df = pd.DataFrame(dictionary_similarities)

In [59]:
df

Unnamed: 0,patient,traitement,maladie,solution,jaune
frenchmed_skipgram,"[patients, moment, présentant, également, étan...","[Traitement, traitements, allaitement, médicam...","[malade, Maladie, inflammatoire, problème, urg...","[évolution, diminution, son, Evolution, lésion...","[fluoxétine, forme, poids, corporel, tolcapone..."
merged_skipgram,"[patients, médicament, détient, symptôme, SMN,...","[Traitement, allaitement, traitements, gratuit...","[Maladie, maladies, malade, virus, aiguë, SIDA...","[dissolution, Evolution, dilution, résolution,...","[H, Vélodrome, Laïla, boxe, Lindsay, maillot, ..."
frenchmed_cbow,"[patients, Patients, sont, doivent, souvent, p...","[Traitement, traitements, allaitement, immédia...","[maladies, malade, malades, importance, Maladi...","[évolution, Evolution, injection, diminution, ...","[une, tolcapone, ne, médecine, aucune, insulin..."
merged_cbow,"[associent, bénéficient, médicament, doivent, ...","[Traitement, médicament, étroitement, allaitem...","[théologie, Etude, chimiothérapie, pathologie,...","[évolution, Evolution, révolution, pollution, ...","[mène, héroïne, gène, Tribune, gêne, Mnouchkin..."
frenchmed_gensim_skipgram,"[pris, qui, recommandé, nécessaire, dès, gross...","[devra, TYSABRI, médicaments, le, Le, qui, sym...","[infection, charge, VIH, après, adulte, cancer...","[perfusion, flacon, contient, injectable, mg, ...","[initiale, goutte, diarrhée, journalière, verr..."
merged_gensim_skipgram,"[infection, Epivir, hépatique, LEMP, Quel, adm...","[TYSABRI, diagnostic, natalizumab, ’, Epivir, ...","[infection, orale, VIH, Parkinson, douleur, Ep...","[injectable, perfusion, poudre, dose, diluer, ...","[maillot, Marc-Vivien, Ittifak, Pinault, triom..."
frenchmed_gensim_cbow,"[qui, pendant, peut, est, que, Epivir, Reconci...","[Le, dans, en, des, risque, le, les, médicamen...","[en, avec, une, risque, un, à, ou, après, Une,...","[flacon, contient, perfusion, 100, 150, 20, po...","[comprimés, 100, LE, contenant, faible, Health..."
merged_gensim_cbow,"[allaiter, diagnostic, VIH, Epivir, sommeil, h...","[biais, hasard, bruit, VIH, virus, natalizumab...","[construction, étude, facture, modification, z...","[dose, perfusion, barrière, budgétaire, partic...","[United, Jiang, gardien, aigu, libéral, Roland..."


In [60]:
#tsne embeddings for 

In [63]:
words = list(frenchmed_gensim_skipgram_model.wv.index_to_key)

X = frenchmed_gensim_skipgram_model.wv[frenchmed_gensim_skipgram_model.wv.index_to_key]
from sklearn.manifold import TSNE
tsne = TSNE(n_components=3)
X_tsne = tsne.fit_transform(X)

In [64]:
import plotly.express as px
df_2 = pd.DataFrame(X_tsne, columns=['x', 'y', 'z'])
df_2['word'] = words
fig = px.scatter_3d(df_2, x='x', y='y', z='z', text='word')
fig.show()

In [52]:
#find coords of word maladie
df_2[df_2['word'] == "maladie"]

Unnamed: 0,x,y,z,word
74,-10.508781,1.015591,4.626809,maladie


In [65]:
#find coords of word maladie
df_2[df_2['word'] == "maladie"]

Unnamed: 0,x,y,z,word
74,-11.542339,2.330691,-3.399505,maladie
