# Word2Vec

In [None]:
import gensim 
import gensim.downloader as api
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

In [None]:
# Load the annotated dream reports and keep a reproducible 50% subsample.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle("../../data/dreams_annotated_cleaned.pkl")
# A fixed random_state makes "Restart Kernel -> Run All" select the same rows every time.
df = df.sample(frac=0.5, random_state=42)
# Each entry of "report_cleaned" is indexable; keep only its first element
# (presumably the cleaned report text -- TODO confirm against the dataset).
df["report_cleaned"] = df["report_cleaned"].apply(lambda x: x[0])

## Training your own model

https://radimrehurek.com/gensim/models/word2vec.html

For an implementation written almost from scratch, see: https://github.com/dcavar/python-tutorial-notebooks/blob/master/notebooks/Word2Vec.ipynb

In [None]:
# Tokenise every cleaned report: one list of word tokens per report.
documents = list(map(word_tokenize, df["report_cleaned"]))

In [None]:
# Passing the corpus to the constructor already builds the vocabulary and trains
# the model, so the number of epochs is set here. Calling `model.train(...)`
# again afterwards would continue training on top of the constructor's own run
# and restart the learning-rate schedule.
model = gensim.models.Word2Vec(
    sentences=documents,
    vector_size=150,  # dimensionality of the word vectors
    window=10,        # maximum distance between the current and the predicted word
    min_count=2,      # ignore words that occur fewer than 2 times
    workers=10,       # number of training threads
    epochs=10,        # passes over the corpus
)

In [None]:
# Persist the model to "custom_word2vec.model" (a path relative to the working directory).
model.save("custom_word2vec.model")

In [None]:
# Nearest neighbours of the query word in the custom embedding space.
w1 = "dream"
model.wv.most_similar(positive=[w1])

In [None]:
# Similarity score between two distinct words in the custom model.
model.wv.similarity("dream", "lucid")

In [None]:
# Sanity check: similarity of a word with itself.
model.wv.similarity("dirty", "dirty")

## Using pre-trained embeddings

https://huggingface.co/fse/word2vec-google-news-300

https://github.com/piskvorky/gensim-data

https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

In [None]:
# Load the pre-trained Google News vectors through gensim's downloader API.
w2v = api.load(name="word2vec-google-news-300")

In [None]:
# Analogy query: woman + king - man, keeping only the single best match.
w2v.most_similar(
    positive=["woman", "king"],
    negative=["man"],
    topn=1,
)

In [None]:
# Ask the model which word fits least with the others.
w2v.doesnt_match(words="breakfast cereal dinner lunch".split())

In [None]:
# Similarity score between two words in the pre-trained model.
w2v.similarity(w1="woman", w2="man")

In [None]:
# Nearest neighbours of the query word in the pre-trained embedding space.
w1 = "dream"
w2v.most_similar(positive=[w1])

In [None]:
# Similarity score between two distinct words in the pre-trained model.
w2v.similarity("dream", "lucid")

### From word to docs

https://github.com/sdimi/average-word2vec/blob/master/notebook.ipynb

In [None]:
def document_vector(word2vec_model, doc):
    """Embed a document as the mean of its in-vocabulary word vectors.

    Parameters
    ----------
    word2vec_model
        Keyed word vectors (indexable by a list of words and exposing
        ``key_to_index`` and ``vector_size``), or a model object that exposes
        such vectors through a ``wv`` attribute.
    doc
        Iterable of word tokens. A plain string is split on whitespace first
        so that it is not iterated character by character.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vector_size``. If no token of ``doc`` is in the
        vocabulary, a zero vector is returned instead of raising.
    """
    # Accept either the keyed vectors themselves or a model that wraps them.
    keyed_vectors = getattr(word2vec_model, "wv", word2vec_model)
    if isinstance(doc, str):
        doc = doc.split()
    # Filter against the vocabulary of the model actually used for the lookup
    # (not a notebook-level global); dict membership avoids rebuilding a list
    # of all keys for every token.
    vocabulary = keyed_vectors.key_to_index
    known_words = [word for word in doc if word in vocabulary]
    if not known_words:
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(keyed_vectors[known_words], axis=0)

In [None]:
# Positional lookup: after the random subsample, index label 1 may no longer exist.
df["report_cleaned"].iloc[1]

In [None]:
# Tokenise the report first: `document_vector` works on word tokens, whereas a raw
# string would be iterated character by character. `.iloc` avoids a KeyError when
# index label 1 was dropped by the random subsample.
document_vector(w2v, word_tokenize(df["report_cleaned"].iloc[1]))

In [None]:
# Shape of the document embedding. The report is tokenised first (a raw string would
# be iterated character by character) and selected positionally, because index
# label 1 may have been dropped by the random subsample.
document_vector(w2v, word_tokenize(df["report_cleaned"].iloc[1])).shape

### Exercise

Use the document embedding representation to train a classification model for the annotated dream dataset, and compare the results with those of the previous approaches.

(e.g. https://www.kaggle.com/code/ananyabioinfo/text-classification-using-word2vec)