## Import packages

You'll first need to install either the medium or the large ```spaCy``` model!

Run the following in a terminal:

```bash
cd cds-language
source ./lang101/bin/activate
python -m spacy download en_core_web_md
deactivate
```

In [None]:
# preprocessing
import os
import pandas as pd
from tqdm import tqdm

# nlp
import spacy
# load the medium English pipeline; it must already be installed
# (python -m spacy download en_core_web_md)
nlp = spacy.load("en_core_web_md")

# gensim
from gensim.models import Word2Vec
import gensim.downloader

## Using pretrained vectors in ```spaCy```

In [None]:
# show the vector the loaded spaCy pipeline assigns to the text "denmark"
nlp("denmark").vector

__Comparing individual words__

In [None]:
# build one spaCy Doc per word (two fruits, two countries) for pairwise comparison
banana, apple, scotland, denmark = (
    nlp(word) for word in ("banana", "apple", "scotland", "denmark")
)

__Inspect word similarities__

In [None]:
# similarity score between two fruits
banana.similarity(apple)

In [None]:
# similarity score between a fruit and a country
banana.similarity(scotland)

In [None]:
# similarity score between two countries
denmark.similarity(scotland)

__Document similarities__

In [None]:
# build one spaCy Doc per example sentence: two about fruit, two about countries
doc1, doc2, doc3, doc4 = (
    nlp(sentence)
    for sentence in (
        "I like bananas",
        "I like apples",
        "I come from Scotland",
        "I live in Denmark",
    )
)

In [None]:
# compare "I like bananas" with "I come from Scotland"
doc1.similarity(doc3)

In [None]:
# compare "I come from Scotland" with "I live in Denmark"
doc3.similarity(doc4)

## Working with ```gensim```

__List available pretrained models__

In [None]:
# names of the models listed by gensim's downloader
available_models = gensim.downloader.info()['models']
list(available_models)

__Download a pretrained model__

In [None]:
# fetch the 'glove-wiki-gigaword-100' vectors through gensim's downloader
# NOTE(review): presumably 100-dimensional GloVe vectors trained on
# Wikipedia + Gigaword, going by the model name — confirm in the model listing
pretrained_vectors = gensim.downloader.load('glove-wiki-gigaword-100')

__Inspect vector for specific word__

In [None]:
# look up the pretrained vector stored for 'denmark'
pretrained_vectors['denmark']

__Find most similar words to target__

In [None]:
# words whose pretrained vectors are most similar to that of 'denmark'
pretrained_vectors.most_similar('denmark')

__Compare specific words__

In [None]:
# similarity score for one pair of countries ...
pretrained_vectors.similarity('denmark', 'scotland')

In [None]:
# ... and for a second pair, to compare against the first score
pretrained_vectors.similarity('denmark', 'sweden')

__Vector algebra__

*Man* is to *woman* as *dog* is to ...

In [None]:
# analogy query: 'man' is to 'woman' as 'dog' is to ...?
# (words in `positive` pull the result towards them, words in `negative` push it away)
pretrained_vectors.most_similar(
    positive=['woman', 'dog'],
    negative=['man'],
    topn=1,
)

In [None]:
# analogy query: 'walked' is to 'walk' as 'swim' is to ...?
# NOTE(review): if the intent is to recover the past tense 'swam', the query
# would be positive=['walked', 'swim'], negative=['walk'] — confirm intent.
pretrained_vectors.most_similar(
    positive=['walk', 'swim'],
    negative=['walked'],
    topn=1,
)

In [None]:
# analogy query: 'germany' is to 'berlin' as 'denmark' is to ...?
pretrained_vectors.most_similar(
    positive=['berlin', 'denmark'],
    negative=['germany'],
    topn=1,
)

__Odd one out!__

In [None]:
# ask which word fits least with the rest (three countries and 'dog')
pretrained_vectors.doesnt_match(["france", "germany", "dog", "japan"])

## Train your own models

__Load data with pandas__

In [None]:
# relative path to the labelled news dataset, built in two steps:
# first the containing folder, then the CSV file inside it
data_dir = os.path.join("..", "data", "labelled_data")
filename = os.path.join(data_dir, "fake_or_real_news.csv")

In [None]:
# read the CSV at `filename` into a pandas DataFrame
data = pd.read_csv(filename)

In [None]:
# preview the first rows to check the columns that were loaded
data.head()

__Tokenize with ```spaCy```__

In [None]:
# Lowercase every entry of the "text" column, run it through spaCy and keep
# only the token strings: the result is one list of tokens per entry.
# tqdm wraps the column so a progress bar is shown while processing.
sentences = [
    [token.text for token in nlp(article.lower())]
    for article in tqdm(data["text"])
]

__Train model with ```gensim```__

In [None]:
# gensim 4.0 renamed Word2Vec's `size` argument to `vector_size`, and passing
# the old name raises a TypeError there. Pick the keyword from the installed
# major version so this cell runs on both gensim 3.x and 4.x.
_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"

model = Word2Vec(sentences=sentences,  # input data: one list of tokens per document
                 window=5,             # context window
                 sg=1,                 # training algorithm (cbow=0, skip-gram=1)
                 negative=5,           # number of negative samples
                 min_count=3,          # drop words that occur fewer than 3 times
                 workers=6,            # number of worker threads used for training
                 **{_dim_kwarg: 50})   # embedding size

__Inspect most similar words__

In [None]:
# the 10 words whose vectors in the model trained above are most similar to 'faith'
model.wv.most_similar('faith', topn=10)

__Compare words__

In [None]:
# similarity score between 'jesus' and 'god' in the model trained above
model.wv.similarity('jesus', 'god')