In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
import fasttext
from sklearn.manifold import TSNE

In [2]:
LOAD_MODELS = False

In [3]:
if not LOAD_MODELS:
    # W2V, CBOW, med
    print("Training W2V, CBOW, med...")
    w2v = gensim.models.word2vec.Word2Vec(
        corpus_file='QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl',
        min_count=1,
        sg=0
    )

    # W2V, Skip-gram, med
    print("Training W2V, Skip-gram, med...")
    w2v_sg = gensim.models.word2vec.Word2Vec(
        corpus_file='QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl',
        min_count=1,
        sg=1
    )

    # W2V, CBOW, press
    print("Training W2V, CBOW, press...")
    w2v_press = gensim.models.word2vec.Word2Vec(
        corpus_file='QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl',
        min_count=1,
        sg=0
    )

    # W2V, Skip-gram, press
    print("Training W2V, Skip-gram, press...")
    w2v_sg_press = gensim.models.word2vec.Word2Vec(
        corpus_file='QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl',
        min_count=1,
        sg=1
    )

    w2v.train(epochs=20, corpus_file='QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl', total_words=w2v.corpus_total_words)
    w2v_sg.train(epochs=20, corpus_file='QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl', total_words=w2v_sg.corpus_total_words)
    w2v_press.train(epochs=20, corpus_file='QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl', total_words=w2v_press.corpus_total_words)
    w2v_sg_press.train(epochs=20, corpus_file='QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl', total_words=w2v_sg_press.corpus_total_words)

    # FastText, med
    print("Training FastText, med...")
    ft = fasttext.train_unsupervised(
        input='QUAERO_FrenchMed/QUAERO_FrenchMed_traindev.ospl',
        minCount=1
    )

    # FastText, press
    print("Training FastText, press...")
    ft_press = fasttext.train_unsupervised(
        input='QUAERO_FrenchPress/QUAERO_FrenchPress_traindev.ospl',
        minCount=1
    )


    # save all models
    w2v.save("w2v_cbow_med.model")
    w2v_sg.save("w2v_sg_med.model")
    w2v_press.save("w2v_cbow_press.model")
    w2v_sg_press.save("w2v_sg_press.model")
    ft.save_model("ft_med.bin")
    ft_press.save_model("ft_press.bin")
else:
    # load all models
    print("Loading models...")
    w2v = gensim.models.word2vec.Word2Vec.load("w2v_cbow_med.model")
    w2v_sg = gensim.models.word2vec.Word2Vec.load("w2v_sg_med.model")
    w2v_press = gensim.models.word2vec.Word2Vec.load("w2v_cbow_press.model")
    w2v_sg_press = gensim.models.word2vec.Word2Vec.load("w2v_sg_press.model")
    ft = fasttext.load_model("ft_med.bin")
    ft_press = fasttext.load_model("ft_press.bin")

Training W2V, CBOW, med...
Training W2V, Skip-gram, med...
Training W2V, CBOW, press...
Training W2V, Skip-gram, press...
Training FastText, med...


Read 0M words
Number of words:  9105
Number of labels: 0
Progress: 100.1% words/sec/thread:   14945 lr: -0.000026 avg.loss:  2.765205 ETA:   0h 0m 0sh 0m 0s

Training FastText, press...


Progress: 100.0% words/sec/thread:   14943 lr:  0.000000 avg.loss:  2.765205 ETA:   0h 0m 0s
Read 1M words
Number of words:  39655
Number of labels: 0
Progress: 100.0% words/sec/thread:   17943 lr:  0.000000 avg.loss:  2.166822 ETA:   0h 0m 0s  1.3% words/sec/thread:   10562 lr:  0.049366 avg.loss:  3.816779 ETA:   0h 1m26sh 1m 5s ETA:   0h 0m50sm42s 28.5% words/sec/thread:   15419 lr:  0.035741 avg.loss:  2.238445 ETA:   0h 0m42s avg.loss:  2.239072 ETA:   0h 0m42s ETA:   0h 0m24s words/sec/thread:   16861 lr:  0.022070 avg.loss:  2.215211 ETA:   0h 0m24s lr:  0.017265 avg.loss:  2.200073 ETA:   0h 0m18s  17442 lr:  0.015811 avg.loss:  2.195061 ETA:   0h 0m16s 83.0% words/sec/thread:   17871 lr:  0.008503 avg.loss:  2.171144 ETA:   0h 0m 8s


In [4]:
# # visualize embeddings of w2v with t-SNE
# print("Visualizing W2V, CBOW, med...")
# words = list(w2v.wv.index_to_key)

In [5]:
# if LOAD_MODELS:
#     # load X_tsne
#     X_tsne = np.load('X_tsne.npy')
# else:
#     # save X_tsne to a file
#     X = w2v.wv[w2v.wv.index_to_key]
#     tsne = TSNE(n_components=2)
#     X_tsne = tsne.fit_transform(X)
#     np.save('X_tsne.npy', X_tsne)


In [6]:
# #plot the embeddings interactively with plotly
# import plotly.express as px
# df = pd.DataFrame(X_tsne, columns=['x', 'y'])
# df['word'] = words
# fig = px.scatter(df, x='x', y='y', text='word')
# fig.show()

# Visualizing closest words

In [7]:
# load the fasttext models with gensim
ft_gensim = gensim.models.fasttext.FastText.load_fasttext_format("ft_med.bin")
ft_press_gensim = gensim.models.fasttext.FastText.load_fasttext_format("ft_press.bin")

  ft_gensim = gensim.models.fasttext.FastText.load_fasttext_format("ft_med.bin")
  ft_press_gensim = gensim.models.fasttext.FastText.load_fasttext_format("ft_press.bin")


In [13]:
wordlist = ["patient", "traitement", "maladie", "solution", "jaune"]

model_list = {
    "w2v_med_cbow": w2v,
    "w2v_med_skipgram": w2v_sg,
    "w2v_press_cbow": w2v_press,
    "w2v_press_skipgram": w2v_sg_press,
    "ft_med_cbow": ft_gensim,
    "ft_press_cbow": ft_press_gensim
}

worddataframe = pd.DataFrame(columns = wordlist, index = model_list)
worddataframe

Unnamed: 0,patient,traitement,maladie,solution,jaune
w2v_med_cbow,,,,,
w2v_med_skipgram,,,,,
w2v_press_cbow,,,,,
w2v_press_skipgram,,,,,
ft_med_cbow,,,,,
ft_press_cbow,,,,,


In [16]:
for word in wordlist:
    for model in model_list:
        mostsim = model_list[model].wv.most_similar(word, topn=10)
        #keep only the words
        mostsim = [x[0] for x in mostsim]
        worddataframe.at[model, word] = mostsim

In [18]:
pd.set_option('display.max_colwidth', None)

worddataframe

Unnamed: 0,patient,traitement,maladie,solution,jaune
w2v_med_cbow,"[répondu, qui, délai, TYSABRI, ils, médicament, Tasmar, efficace, également, carte]","[VIH, risque, début, infectées, arrêt, réponse, infection, généraliste, antirétroviraux, cancer]","[Parkinson, charge, juger, infection, maintient, décision, antirétroviraux, lors, antagoniste, grossesse]","[diluer, buvable, injectable, poudre, intraveineuse, conditionnée, perfusion, préparer, contient, reconstituée]","[1981, Phosphate, Plaquettes, méthyle, disciplinaires, PELLICULES, Polysorbate, COMPRIMES, ≤, 36]"
w2v_med_skipgram,"[carte, alerte, Montrez, spéciale, atteint, avoir, montrer, déjà, minimum, aptitude]","[premiers, instauration, second, six, concomitant, cycle, contrôlée, débuter, passage, antérieur]","[Parkinson, liée, avancée, Crohn, avancé, Bourneville, Recklinghausen, leucémie, affection, coronarienne]","[buvable, diluer, réchauffer, Voie, préparée, Lepirudine, poudre, ambiante, aseptique, bolus]","[pâle, fer, orange, Polysorbate, oxyde, 15ml, Chlorure, Hydroxyde, triacétine, concentré]"
w2v_press_cbow,"[concessionnaire, rescapé, axiome, cancéreux, reversion, morceau, prononciation, déferlement, lac, boulet]","[coût, financement, système, outil, collectif, logement, sida, statut, diagnostic, jalon]","[couverture, Caisse, caisse, douleur, publicité, contamination, population, créatine, propagation, nature]","[alternative, règle, représentation, foire, formule, démarche, difficulté, firme, bataille, stratégie]","[maillot, Lachhab, endossé, Saâdoune, Trier, von, Baden, métal, emparé, Lars]"
w2v_press_skipgram,"[hospitalisé, flagrant, humble, innocente, cash, mollah, admets, révolté, soignant, Coran]","[médicamenteux, générateurs, mineur, antidouleur, anti-douleur, asservissement, détournera, réaffirmation, préconisations, subalterne]","[virale, orpheline, pneumopathie, Alzheimer, pulmonaire, neurologique, coronarovirus, assurance, atteints, vieillesse]","[pacifique, consensuelle, lancinant, garantissant, suffisante, skier, sodium, carbure, esquisse, solutions]","[maillot, emparé, Pena, pois, Baden, mouiller, lauréat, grenadine, Cooke, 390]"
ft_med_cbow,"[Patient, comment, cent, segment, présentaient, présent, présentent, ciment, Comment, pèsent]","[Taaitement, Traitement, lentement, évitement, règlement, Allaitement, jugement, avortement, hautement, traitements]","[Maladie, malade, malignité, insulinothérapie, immunothérapie, maltraitance, immunodéficience, monothérapie, corticosurrenale, cortical]","[dilution, Dissolution, évolution, evolution, Pollution, Evolution, élocution, injection, exécution, conduction]","[hirudine, titane, Lepirudine, désulfohirudine, lépirudine, toluidine, quotidienne, lamivudine, microcristalline, hydroquinone]"
ft_press_cbow,"[impatient, patientent, impatientent, quotient, abstient, détient, échoient, côtoient, obtient, oublient]","[retraitement, logement, subitement, modestement, strictement, enfermement, immodestement, dépècement, dédommagement, vêtement]","[épidémiologie, malade, virus, malnutrie, pneumopathie, génie, épidémie, médecin, pneumonie, morphine]","[dissolution, résolution, dilution, évolution, pollution, révolution, caution, résorption, abolition, exhibition]","[Jeune, Love, Messine, Pampelune, Noé, Trotski, Pilote, brune, paillote, Fleuve]"
