https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

# <b> Natural Language Processing and Word Embeddings using Spacy and Gensim

## <b> Word2Vec

Two words sharing similar contexts also share a similar meaning and consequently a similar vector representation from the model.  

Word2Vec can be used to find out the relations between words in a dataset, compute the similarity between them, or use 

the vector representation of those words as input for other applications such as text classification or clustering.

### <b> Imports

In [None]:
import re 
import pandas as pd  
from time import time 
from collections import defaultdict
import spacy 

from src import PreProcessing

### <b> Loading Data

In [None]:
# Load the Twitter dataset from a CSV path relative to the working directory.
df = pd.read_csv('../datasets/twitter.csv')

In [None]:
# Run the project's text cleaner over every entry of the 'Conteúdo' column.
df['Clean Sentences'] = df['Conteúdo'].apply(PreProcessing.clean_text)

In [None]:
df[['Título', 'Conteúdo', 'Clean Sentences']].head()

## <b> Processing Data with Spacy

#### Portuguese Model

In [None]:
# Load the small Portuguese spaCy pipeline with the 'ner' and 'parser'
# components disabled.
nlp = spacy.load('pt_core_news_sm', disable=['ner', 'parser']) 

#### Cleaning

In [None]:
def cleaning(sentence):
    """Lemmatise a tokenised sentence, leaving out stop words.

    Args:
        sentence: Iterable of tokens exposing ``lemma_`` and ``is_stop``.

    Returns:
        The kept lemmas joined by single spaces, or ``None`` when two or
        fewer lemmas remain.
    """
    lemmas = []
    for token in sentence:
        if token.is_stop:
            continue
        lemmas.append(token.lemma_)
    # Results with two or fewer lemmas are discarded.
    if len(lemmas) <= 2:
        return None
    return ' '.join(lemmas)

In [None]:
# Keep letters (including accented Latin ones such as "ú", "ç", "ã") and
# apostrophes; every other run of characters collapses into one space.
# An ASCII-only class here would split Portuguese words at each accent
# (e.g. "Conteúdo" -> "conte do") before they reach the Portuguese model.
non_letters = re.compile(r"[^A-Za-zÀ-ÖØ-öø-ÿ']+")
title_cleaning = (non_letters.sub(' ', str(row)).lower() for row in df['Título'])
text_cleaning = (non_letters.sub(' ', str(row)).lower() for row in df['Conteúdo'])

In [None]:
# Stream the cleaned strings through spaCy in batches of 5000 and pass each
# resulting document to `cleaning`.
title = list(map(cleaning, nlp.pipe(title_cleaning, batch_size=5000)))
text = list(map(cleaning, nlp.pipe(text_cleaning, batch_size=5000)))

In [None]:
# Pair cleaned bodies with cleaned titles, then drop rows with a missing
# value (sentences rejected by `cleaning`) and exact duplicate rows.
df_clean = (
    pd.DataFrame({'clean_text': text, 'clean_title': title})
    .dropna()
    .drop_duplicates()
)
df_clean.shape

In [None]:
df_clean.head()

# <b> Ngrams

In [None]:
from gensim.models.phrases import Phrases, Phraser

In [None]:
# Tokenise each entry of the 'Clean Sentences' column on whitespace.
sent = [row.split() for row in df['Clean Sentences']]

In [None]:
# Keep only tokens longer than two characters.
sent = [[word for word in tokens if len(word) > 2] for tokens in sent]

In [None]:
# Fit gensim's phrase detector on the tokenised sentences; min_count=30 and
# progress_per=100 are passed straight through to Phrases.
phrases = Phrases(sent, min_count=30, progress_per=100)

In [None]:
# Wrap the fitted Phrases model in a Phraser, which is what gets applied to
# the sentences.
bigram = Phraser(phrases)

In [None]:
# Apply the phrase model to the tokenised sentences; `sentences` is the corpus
# used for the frequency count and for Word2Vec.
sentences = bigram[sent]

In [None]:
# Print the first transformed sentence only. The loop variable is deliberately
# not named `sent`: that name holds the tokenised corpus fed to `bigram[...]`,
# and rebinding it here would corrupt any later re-run of that cell.
for first_sentence in sentences:
    print(first_sentence)
    break

#### Most Frequent Words

In [None]:
# Count how often each token (or detected phrase) occurs in the corpus.
# The loop variables avoid the name `sent`, which holds the tokenised corpus
# and must not be overwritten by a single sentence.
word_freq = defaultdict(int)
for tokens in sentences:
    for token in tokens:
        word_freq[token] += 1
# Number of distinct tokens.
len(word_freq)

In [None]:
# Show the ten tokens with the highest counts, most frequent first.
print(sorted(word_freq, key=word_freq.get, reverse=True)[:10])

## <b> Training Gensim

In [None]:
import multiprocessing
from gensim.models import Word2Vec

In [None]:
# CPU count reported by the system; used below to size the training workers.
cores = multiprocessing.cpu_count()
print(cores)

Parameters:

min_count = int - Ignores all words with total absolute frequency lower than this - (2, 100)

window = int - The maximum distance between the current and predicted word within a sentence. E.g. window words on the left and window words on the right of our target - (2, 10)

size = int - Dimensionality of the feature vectors. - (50, 300)

sample = float - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential. - (0, 1e-5)

alpha = float - The initial learning rate - (0.01, 0.05)

min_alpha = float - Learning rate will linearly drop to min_alpha as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00

negative = int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn. If set to 0, no negative 
sampling is used. - (5, 20)

workers = int - Use these many worker threads to train the model (=faster training with multicore machines)

In [None]:
# Only the hyper-parameters are set here; the vocabulary is built and the
# model trained in separate steps.
# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` — confirm the
# installed gensim version before upgrading.
w2v_model = Word2Vec(min_count=2,
                     window=2,
                     size=300,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20,
                     # Leave one core free, but never request fewer than one
                     # worker: on a single-core machine `cores - 1` is 0.
                     workers=max(1, cores - 1))

In [None]:
# Build the vocabulary from the corpus and report how long it took.
t = time()
w2v_model.build_vocab(sentences, progress_per=10000)
elapsed_minutes = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_minutes))

In [None]:
# Train for 30 epochs over the corpus and report how long it took.
t = time()
w2v_model.train(
    sentences,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
elapsed_minutes = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

In [None]:
# Memory efficient: with replace=True only the normalised vectors are kept.
# NOTE(review): per the gensim docs the model cannot be trained further after
# this call, and `init_sims` is deprecated in gensim 4 — confirm against the
# installed version.
w2v_model.init_sims(replace=True)

## <b> Exploring the model

#### Positives

In [None]:
# Words whose vectors are closest to "joy".
# NOTE(review): presumably raises KeyError when "joy" is not in the trained
# vocabulary — confirm the corpus language.
w2v_model.wv.most_similar(positive=["joy"])

#### Negatives

In [None]:
# Query with "hate" as a negative example only.
# NOTE(review): presumably raises KeyError when "hate" is not in the trained
# vocabulary — confirm the corpus language.
w2v_model.wv.most_similar(negative=["hate"])

#### Similarities

In [None]:
# Similarity score between the vectors of the two words.
# NOTE(review): both words must be in the trained vocabulary — confirm.
w2v_model.wv.similarity("hate", "angry")

## <b> t-SNE visualizations

In [None]:
from src import TSNE

In [None]:
# Names of the 20 nearest neighbours of "happy"; only the first element of
# each returned entry (the word itself) is kept.
words = [
    neighbour[0]
    for neighbour in w2v_model.wv.most_similar(positive=["happy"], topn=20)
]

In [None]:
print(words)

In [None]:
# Plot with the project's TSNE helper, passing the model, the query word and
# the list of neighbour words.
TSNE.tsnescatterplot(w2v_model, 'happy', words)

### <b> Save Embeddings Model

In [None]:
# Persist the full Word2Vec model.
# NOTE(review): assumes a `models/` directory already exists relative to the
# working directory — confirm.
w2v_model.save("models/word2vec.model")

### <b> Save Word Vectors

In [None]:
# Keep a handle on just the word vectors (the model's `wv` attribute).
word_vectors = w2v_model.wv

In [None]:
# Persist only the word vectors.
# NOTE(review): assumes an `embeddings/` directory already exists relative to
# the working directory — confirm.
word_vectors.save("embeddings/word2vec.wordvectors")

### <b> Load Word Vectors

In [None]:
from gensim.models import KeyedVectors
# Reload the saved word vectors; mmap='r' is forwarded to KeyedVectors.load
# (read-only memory mapping, per the gensim docs).
wv = KeyedVectors.load("embeddings/word2vec.wordvectors", mmap='r')

In [None]:
vector = wv['luz']  # Get numpy vector of a word