## GENSIM ----> word2vec

## Word2Vec Demo

In [None]:
# import gensim.downloader as api
# wv = api.load('word2vec-google-news-300')
# # A common operation is to retrieve the vocabulary of a model.
# for index, word in enumerate(wv.index_to_key):
#     if index == 10:
#         break
#     print(f"word #{index}/{len(wv.index_to_key)} is {word}")

# vec_king = wv['king']

# try:
#     vec_cameroon = wv['cameroon']
# except KeyError:
#     print("The word 'cameroon' does not appear in this model")
# vec_king


# pairs = [
#     ('car', 'minivan'),   # a minivan is a kind of car
#     ('car', 'bicycle'),   # still a wheeled vehicle
#     ('car', 'airplane'),  # ok, no wheels, but still a vehicle
#     ('car', 'cereal'),    # ... and so on
#     ('car', 'communism'),
# ]
# for w1, w2 in pairs:
#     print('%r\t%r\t%.2f' % (w1, w2, wv.similarity(w1, w2)))

# print(wv.most_similar(positive=['car', 'minivan'], topn=5))
# print(wv.doesnt_match(['fire', 'water', 'land', 'sea', 'air', 'car']))

## Training Your Own Model

In [None]:
from sklearn.decomposition import IncrementalPCA    # initial reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np      

In [None]:
import matplotlib.pyplot as plt
import random

from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go

In [None]:
def reduce_dimensions(model):
    """Project a trained model's word vectors down to two dimensions.

    The vectors and their keys are read from ``model.wv`` and the vectors are
    embedded with t-SNE (``random_state=0``).

    Returns:
        A ``(x_vals, y_vals, labels)`` tuple: two lists holding the first and
        second embedded coordinate of every vector, and a numpy array built
        from ``model.wv.index_to_key``.
    """
    target_dims = 2  # number of components requested from t-SNE

    # Pull the vectors and their keys out of the model as numpy arrays.
    word_vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    embedded = TSNE(n_components=target_dims, random_state=0).fit_transform(word_vectors)

    # Split the embedding into one coordinate list per axis.
    x_vals = list(embedded[:, 0])
    y_vals = list(embedded[:, 1])
    return x_vals, y_vals, labels




def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Draw the labels as a plotly text scatter at the given coordinates.

    Args:
        x_vals: x coordinate of each label.
        y_vals: y coordinate of each label.
        labels: text drawn at each point.
        plot_in_notebook: when true, initialise notebook mode and render with
            ``iplot``; otherwise call ``plot`` with the filename
            'word-embedding-plot.html'.
    """
    figure_data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(figure_data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(figure_data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels, label_fraction=0.75):
    """Scatter-plot the points and annotate a random subset of them.

    Args:
        x_vals: x coordinate of each point.
        y_vals: y coordinate of each point, aligned with ``x_vals``.
        labels: label of each point, aligned with ``x_vals``.
        label_fraction: share of the points to annotate. Defaults to 0.75,
            the value that was previously hard-coded. The resulting sample
            size is clamped to ``[0, len(labels)]``.
    """
    import matplotlib.pyplot as plt
    import random

    # Fixed seed so the same points are annotated on every call.
    # NOTE: this reseeds the module-level ``random`` generator.
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    # Annotate only a random subsample of the points (label_fraction of them).
    point_count = len(labels)
    sample_size = round(label_fraction * point_count)
    # random.sample raises ValueError outside [0, population size]; clamp instead.
    sample_size = max(0, min(point_count, sample_size))
    for i in random.sample(range(point_count), sample_size):
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

In [None]:
from gensim.test.utils import datapath
from gensim import utils
import gensim.models

class MyCorpus:
    """An iterable that yields sentences (lists of str) read from a text file.

    Every call to ``__iter__`` reopens the file, so the corpus can be
    iterated more than once.

    Args:
        path: file to read, one document per line. Defaults to
            "../data/t2.txt", the path that was previously hard-coded.
    """

    def __init__(self, path="../data/t2.txt"):
        self.path = path

    def __iter__(self):
        # corpus_path = datapath('lee_background.cor')
        # The context manager closes the file once iteration ends; a bare
        # open() in the for-statement never closed the handle explicitly.
        with open(self.path) as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield utils.simple_preprocess(line)

In [None]:

# Train a Word2Vec model (min_count=2) on the sentences streamed by MyCorpus.
sentences = MyCorpus()
print(type(sentences))
model = gensim.models.Word2Vec(sentences=sentences, min_count=2)
print(type(model))

print(sentences)

the_word = 'propulsion'

# Look the probe word up in the trained vectors and report the size and
# shape of its vector, or say so when the word is not in the model.
try:
    vec_word = model.wv[the_word]
except KeyError:
    print(f"The word {the_word} does not appear in this model")
else:
    print(f'size {vec_word.size}\nshape {vec_word.shape}')

In [None]:
# Similarity check: print the model's similarity score for a few word pairs.

pairs = [
    ('missions', 'oxygen'),
    ('access', 'crew'),
    ('zero', 'supply'),
]
for w1, w2 in pairs:
    try:
        score = model.wv.similarity(w1, w2)
    except KeyError:
        # Catch only the missing-key lookup failure; a bare `except:` would
        # also swallow KeyboardInterrupt and genuine programming errors.
        print('One of the keywords is not in the document')
    else:
        print('%r\t%r\t%.2f' % (w1, w2, score))

In [None]:
# model.wv.cosine_similarities(model.wv['rocket'],model.wv['shuttle','rocket','engine'])

In [None]:
# Reduce the trained model's word vectors to 2-D coordinates plus their labels.
x_vals, y_vals, labels = reduce_dimensions(model)

In [None]:
# Scatter-plot the reduced vectors and annotate a random subset of them with their labels.
plot_with_matplotlib(x_vals, y_vals, labels)

In [None]:
# plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True)

## DataCamp example

In [None]:
from gensim.corpora.dictionary import Dictionary
from nltk.tokenize import word_tokenize, sent_tokenize

# Read the raw text and split its lower-cased content into sentences.
with open("../data/t1.txt") as f:
    text = f.read()

sentences = sent_tokenize(text.lower())

# Small hand-written sample corpus.
my_documents = [
    'The movie was about a spaceship and aliens.',
    'I really liked the movie!',
    'Awesome action scenes, but boring characters.',
    'The movie was awful! I hate alien films.',
    'Space is cool! I liked the movie.',
    'More space films, please!',
]

# Use the sentences read from the file as the documents; this rebinds
# my_documents, so the sample list above is not used further.
my_documents = sentences

In [None]:
# Display the documents as a list.
list(my_documents)

In [None]:
# Lower-case every document and split it into word tokens.
tokenized_docs = [
    word_tokenize(document.lower())
    for document in my_documents
]

tokenized_docs

In [None]:
# Build a gensim Dictionary from the tokenized documents.
dictionary=Dictionary(tokenized_docs)

In [None]:
# Display the dictionary's token2id attribute.
dictionary.token2id

In [None]:
# Convert every tokenized document with dictionary.doc2bow and keep the results as a list.
corpus = list(map(dictionary.doc2bow, tokenized_docs))
corpus

In [None]:
# Number of documents in the corpus.
len(corpus)

In [None]:
# Save the fifth document (index 4) of the corpus: doc
doc = corpus[4]
doc


In [None]:
# Order the document's entries by their second element, largest first: bow_doc
bow_doc = list(doc)
bow_doc.sort(key=lambda entry: entry[1], reverse=True)
bow_doc

In [None]:
# Print the first five entries of bow_doc (sorted by count, highest first),
# looking each word id up in the dictionary and printing it alongside the count.
for word_id, word_count in bow_doc[:5]:
    print(dictionary.get(word_id), word_count)

In [None]:
import itertools
from collections import defaultdict
# Sum each word id's count over every document of the corpus: total_word_count
total_word_count = defaultdict(int)
for bow_document in corpus:
    for word_id, word_count in bow_document:
        total_word_count[word_id] += word_count

In [None]:
# Display the accumulated per-word-id totals.
total_word_count

### Tf-idf

In [None]:
from gensim.models.tfidfmodel import TfidfModel
# Build a TfidfModel from the corpus.
tfidf = TfidfModel(corpus)


In [None]:
# Display the num_docs attribute of the tf-idf model.
tfidf.num_docs

In [None]:
# Apply the tf-idf model to the sixth document (index 5) of the corpus.
tfidf[corpus[5]]

In [None]:
# Print each word of the third document (index 2) with its tf-idf weight.
# Only that one document is transformed; indexing list(tfidf[corpus]) would
# transform and materialise the whole corpus just to read a single entry.
for word_id, word_weight in tfidf[corpus[2]]:
    print(dictionary.get(word_id), word_weight)

In [None]:
# Apply the tf-idf model to the whole corpus and display the results as a list.
list(tfidf[corpus])

# Sidebar

In [None]:

# Scratch check: 75% of the number of labels, rounded. This is the same
# expression plot_with_matplotlib uses for its annotation sample size.
a=round(.75*len(labels))
a

In [None]:
# Scratch: display a range object.
range(5)

In [None]:
# Sample input: a single tokenized sentence wrapped in an outer list.
a = [[
    'experience', 'with', 'the', 'building', 'and', 'testing',
    'of', 'fluid', 'and/or', 'electrical', 'systems', '.',
]]

In [None]:
# Display the first (inner) list of a.
a[0]

In [None]:
# NOTE(review): `i` ranges over the OUTER list (len(a)) but is used to index the
# INNER list `x`, so when `a` holds a single sentence this yields only that
# sentence's first token, upper-cased. If the goal was to upper-case every
# token, `[tok.upper() for sent in a for tok in sent]` would do that; confirm intent.
[x[i].upper() for i in range(len(a)) for x in a ]