In [1]:
!pip install --upgrade nltk gensim bokeh



In [2]:
# download the data:
!wget https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1 -O ./quora.txt

--2023-10-05 22:01:01--  https://www.dropbox.com/s/obaitrix9jyu84r/quora.txt?dl=1
Resolving www.dropbox.com (www.dropbox.com)... 162.125.6.18, 2620:100:601c:18::a27d:612
Connecting to www.dropbox.com (www.dropbox.com)|162.125.6.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/dl/obaitrix9jyu84r/quora.txt [following]
--2023-10-05 22:01:01--  https://www.dropbox.com/s/dl/obaitrix9jyu84r/quora.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc0799962b5134539a886069c228.dl.dropboxusercontent.com/cd/0/get/CFDYpXhmS0vm8nGGf7ESGC6GCFVhdfkIDKaMJw7w5Ah2XvEi8fJF8xNA4DvKp0KvkuDGUzWlkabKgZbKKse16x1s4VkzcKMBouENPIIgIjhfu6hz3zYeS_xcX7WplL87rTw/file?dl=1# [following]
--2023-10-05 22:01:01--  https://uc0799962b5134539a886069c228.dl.dropboxusercontent.com/cd/0/get/CFDYpXhmS0vm8nGGf7ESGC6GCFVhdfkIDKaMJw7w5Ah2XvEi8fJF8xNA4DvKp0KvkuDGUzWlkabKgZbKKse16x1s4VkzcKMBouENPIIgIjhfu6hz3zYeS_xcX7WplL87rT

In [3]:
import numpy as np

with open("./quora.txt", encoding="utf-8") as file:
    data = list(file)

data[50]

"What TV shows or books help you read people's body language?\n"

In [4]:
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()

print(tokenizer.tokenize(data[50].lower()))

['what', 'tv', 'shows', 'or', 'books', 'help', 'you', 'read', 'people', "'", 's', 'body', 'language', '?']


In [5]:

data_tok = [tokenizer.tokenize(row.lower()) for row in data]

In [6]:
print([' '.join(row) for row in data_tok[:2]])

["can i get back with my ex even though she is pregnant with another guy ' s baby ?", 'what are some ways to overcome a fast food addiction ?']


In [7]:
from gensim.models import Word2Vec
model = Word2Vec(data_tok, 
                 vector_size=32,      # embedding vector size
                 min_count=5,  # consider words that occured at least 5 times
                 window=5).wv  # define context as a 5-word window around the target word

In [8]:
# now you can get word vectors !
model.get_vector('anything')

array([-3.6705184 ,  0.15038745,  1.6393185 ,  2.553356  ,  3.0439389 ,
        2.7595148 ,  1.4224917 , -4.3146915 ,  2.0184262 ,  1.6314199 ,
       -1.2848033 ,  3.319112  ,  4.0094404 ,  1.5808349 ,  2.5065715 ,
       -1.584282  , -0.5537907 , -1.0058753 ,  1.0366745 , -0.6948822 ,
       -3.1887562 ,  0.16117893, -1.3835508 , -2.2677662 ,  1.2314115 ,
       -1.7679586 , -0.96612245, -0.5095072 ,  0.28183788,  0.05451391,
       -1.0221356 , -0.8061581 ], dtype=float32)

In [9]:
# or query similar words directly. Go play with it!
model.most_similar('bread')

[('rice', 0.954008936882019),
 ('fruit', 0.9394801259040833),
 ('cheese', 0.9302470684051514),
 ('butter', 0.9259763956069946),
 ('beer', 0.9259364008903503),
 ('wine', 0.9229521751403809),
 ('sauce', 0.9184145331382751),
 ('beans', 0.9124134182929993),
 ('chocolate', 0.9117215275764465),
 ('orange', 0.9115733504295349)]

In [10]:
import gensim.downloader as api
model = api.load('glove-twitter-100')

In [11]:
words = model.index_to_key[:1000] 

print(words[::100])

['<user>', '_', 'please', 'apa', 'justin', 'text', 'hari', 'playing', 'once', 'sei']


In [12]:
# for each word, compute it's vector with model
word_vectors = np.array([model.get_vector(item) for item in words])

In [13]:
from sklearn.decomposition import PCA

# map word vectors onto 2d plane with PCA. Use good old sklearn api (fit, transform)
# after that, normalize vectors to make sure they have zero mean and unit variance
pca = PCA(n_components=2, svd_solver='full')
word_vectors_pca=pca.fit_transform(word_vectors)
mean=np.mean(word_vectors_pca,axis=0)
std=np.std(word_vectors_pca,axis=0)
word_vectors_pca=(word_vectors_pca-mean)/std

In [14]:
def get_phrase_embedding(phrase):
    """
    Convert phrase to a vector by aggregating it's word embeddings. See description above.
    """
    # 1. lowercase phrase
    # 2. tokenize phrase
    # 3. average word vectors for all words in tokenized phrase
    # skip words that are not in model's vocabulary
    # if all words are missing from vocabulary, return zeros
    
    vector = np.zeros([model.vector_size], dtype='float32')
    phrase=phrase.lower()
    phrase=tokenizer.tokenize(phrase)
    words=[model.get_vector(item) for item in phrase if item in model.key_to_index]
    if len(words):
        vector=np.array(words).mean(axis=0)
    
    # YOUR CODE
    
    return vector

In [15]:
vector = get_phrase_embedding("I'm very sure. This never happened to me before...")


In [16]:
# let's only consider ~5k phrases for a first run.
chosen_phrases = data[::len(data) // 1000]

#print(len(chosen_phrases))
# compute vectors for chosen phrases
phrase_vectors = [get_phrase_embedding(item) for item in chosen_phrases]
phrase_vectors= np.array(phrase_vectors)

In [17]:
# map vectors into 2d space with pca, tsne or your other method of choice
# don't forget to normalize

phrase_vectors_2d = TSNE().fit_transform(phrase_vectors)

phrase_vectors_2d = (phrase_vectors_2d - phrase_vectors_2d.mean(axis=0)) / phrase_vectors_2d.std(axis=0)

NameError: name 'TSNE' is not defined

In [18]:
# compute vector embedding for all lines in data
data_vectors = np.array([get_phrase_embedding(l) for l in data])

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
def find_nearest(query, k=10):
    """
    given text line (query), return k most similar lines from data, sorted from most to least similar
    similarity should be measured as cosine between query and line embedding vectors
    hint: it's okay to use global variables: data and data_vectors. see also: np.argpartition, np.argsort
    """
    q=get_phrase_embedding(query)
    q=q[None,]
    similarity=cosine_similarity(data_vectors,q)[...,0]
    topk=similarity.argsort()[::-1][:k]
    topk_phrase=[data[k] for k in topk]
    return topk_phrase

In [None]:
results = find_nearest(query="How do i enter the matrix?", k=10)

print(''.join(results))

assert len(results) == 10 and isinstance(results[0], str)
assert results[0] == 'How do I get to the dark web?\n'
assert results[3] == 'What can I do to save the world?\n'