# Text 4: Word2Vec
**Internet Analytics - Lab 4**

---

**Group:** *L*

**Names:**

* *Imane Benkamoun*
* *Gabriel Yehouda Gozlan*
* *Mathis Le Dortz*
* *Hervé Sérandour*

---

#### Instructions

*This is a template for part 4 of the lab. Clearly write your answers, comments and interpretations in Markdown cells. Don't forget that you can add $\LaTeX$ equations in these cells. Feel free to add or remove any cell.*

*Please properly comment your code. Code readability will be considered for grading. To avoid long cells of codes in the notebook, you can also embed long python functions and classes in a separate module. Don’t forget to hand in your module if that is the case. In multiple exercises, you are required to come up with your own method to solve various problems. Be creative and clearly motivate and explain your methods. Creativity and clarity will be considered for grading.*

In [1]:
import pickle
import re
import numpy as np
from scipy.sparse import csr_matrix
from collections import defaultdict
import json
from utils import *
import gensim
from sklearn.cluster import KMeans
from numpy.linalg import norm


# Course records and the stop-word collection used by the pre-processing below.
# load_json / load_pkl are presumably provided by the star import from utils — TODO confirm.
# Each course is later accessed through its "courseId", "name" and "description" keys.
courses = load_json('data/courses.txt')
stopwords = load_pkl('data/stopwords.pkl')

  LARGE_SPARSE_SUPPORTED = LooseVersion(scipy_version) >= '0.14.0'


 First, we'll do a little data exploration, to get an idea of the pre-processing required.

In [None]:
!hdfs dfs -ls /ix/

In [None]:
vectors = sc.textFile("/ix/model.txt")

In [None]:
def get_word_and_vector(row):
    """
    Parse one space-separated line into a (word, vector) pair.

    The first field is the word; every remaining field is converted to a
    float and the resulting coordinates are packed into a NumPy array.
    """
    word, *coordinates = row.split(" ")
    return (word, np.array(list(map(float, coordinates))))

In [None]:
words_and_vectors = vectors.map(get_word_and_vector)

In [None]:
# Collect only the word of each (word, vector) pair. The first collected entry is
# dropped — presumably the "<count> <dimension>" header line of the word2vec text
# format; TODO confirm against /ix/model.txt.
word_list = (words_and_vectors.map(lambda row: row[0]).collect())[1:]

In [None]:
# Show up to 20 words whose tail (everything after the first character) changes
# when lower-cased, i.e. words with an upper-case character past position 0.
[word for word in word_list if word[1:].lower()!=word[1:]][:20]

## Redo pre-processing

In [None]:
def to_remove(course):

    """
    Clean the description of a course.

    Punctuation, stopwords and one-character words are each replaced by a
    space; runs of whitespace are then collapsed to a single space and the
    result is stripped. Case is left untouched.
    """
    text = course["description"]

    # Applied in this exact order: each step works on the output of the previous one.
    substitutions = (
        (r'[^\w\s]', ' '),
        (r'\b(?:' + '|'.join(re.escape(stopword) for stopword in stopwords) + r')\b', ' '),
        (r'\b\w{1}\b', ' '),
        (r'\s+', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip()

In [None]:
courses_description = [to_remove(course) for course in courses]

In [None]:
courses_description[:2]

In [None]:
def unique_words(courses_description):

    """
    Return the distinct whitespace-separated words found in a list of
    cleaned course descriptions.

    The result is built from a set, so its order is arbitrary.
    """
    distinct_words = {
        word
        for description in courses_description
        for word in description.split()
    }

    return list(distinct_words)
    
    
    

In [None]:
unique_words_list = unique_words(courses_description)

In [None]:
len(unique_words_list)

## Exercise 4.12 : Clustering word vectors

In [None]:
# Load the word vectors from the text-format (binary=False) word2vec file.
model_path = '/ix/model.txt'
model = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=False)

In [None]:
def words_to_vectors(unique_words_list):

    """
    Split the corpus words according to whether the global `model` knows them.

    Returns a pair (found, missing): `found` maps each word that has a vector
    in the model to that vector, `missing` lists the words for which
    model.get_vector raised KeyError.
    """
    found, missing = {}, []

    for token in unique_words_list:
        try:
            vector = model.get_vector(token)
        except KeyError:
            missing.append(token)
        else:
            found[token] = vector

    return found, missing
    

def set_default_vector(words_as_keys, words_not_in_model):

    """
    Give every word that is missing from the model a default vector.

    The default is the mean of the vectors already present in `words_as_keys`
    (i.e. of the processed dataset, not of the entire model), so that it
    better matches our data.

    `words_as_keys` is updated in place and also returned; all missing words
    share the same default array object.
    """
    default_vector = np.mean(list(words_as_keys.values()), axis=0)
    words_as_keys.update(dict.fromkeys(words_not_in_model, default_vector))

    return words_as_keys


In [None]:
def normalize(vector):
    """
    Scale a vector to unit Euclidean length.

    Parameters
    ----------
    vector : array-like of numbers.

    Returns
    -------
    numpy.ndarray
        vector / ||vector||. A zero vector has no direction, so a copy of it
        is returned unchanged instead of None: callers multiply, average and
        cluster the result, all of which fail on None.
    """
    # Named `length` rather than `norm` to avoid shadowing numpy.linalg.norm,
    # which the file imports under that name.
    length = np.linalg.norm(vector)
    if length == 0:
        return np.array(vector)
    return vector / length


def normalize_all_vectors(all_words_and_vectors):
    """Return a new dict mapping every word to normalize(its vector)."""
    normalized = {}
    for word, vector in all_words_and_vectors.items():
        normalized[word] = normalize(vector)
    return normalized

In [None]:
# Look up a vector for every corpus word, then give the words that are missing
# from the model the default (mean) vector.
(words_as_keys, words_not_in_model) = words_to_vectors(unique_words_list)
all_words_and_vectors = set_default_vector(words_as_keys, words_not_in_model)

# We have chosen to normalize only after computing the default vector,
# so that the default vector takes into account the weight of the words in the processed dataset

all_words_and_vectors = normalize_all_vectors(all_words_and_vectors)

In [None]:
all_words_and_vectors["Studio"]

In [None]:
import matplotlib.pyplot as plt

def plot_for_opt_k(all_words_and_vectors, max_k=20):

    """
    Compute the within-cluster sum of squares (WCSS) needed for the elbow method.

    Visually, the optimal k is the one from which the slope of the WCSS curve
    no longer varies significantly. Despite its name, this function draws
    nothing: it only returns the values to plot.

    Parameters
    ----------
    all_words_and_vectors : dict mapping each word to its vector.
    max_k : exclusive upper bound of the tested cluster counts
        (default 20, i.e. k = 1 .. 19).

    Returns
    -------
    list
        KMeans inertia for k = 1 .. max_k - 1, in that order.
    """
    # The data does not depend on k: build it once instead of once per iteration.
    data = list(all_words_and_vectors.values())

    return [KMeans(n_clusters=k).fit(data).inertia_ for k in range(1, max_k)]

wcss = plot_for_opt_k(all_words_and_vectors)


In [None]:
# Elbow plot: WCSS as a function of the number of clusters.
fig, ax = plt.subplots(1,1)

# The x values must match the k values for which wcss was computed
# (range(1, 20) inside plot_for_opt_k).
ax.plot(range(1, 20), wcss, marker='o')
ax.set_xlabel("Number of clusters")
ax.set_ylabel("WCSS")
ax.set_title("Find the optimal k")

plt.show()

In [None]:
opt_k = 15 # we choose a number of clusters equal to 15

In [None]:
def clustering(all_words_and_vectors):

    """
    Fit a KMeans model with `opt_k` clusters (module-level setting) on the
    word vectors and return the fitted estimator.
    """
    estimator = KMeans(n_clusters = opt_k)
    estimator.fit(list(all_words_and_vectors.values()))

    return estimator

In [None]:
kmeans = clustering(all_words_and_vectors)
centers = kmeans.cluster_centers_

In [None]:
def top_10_for_a_cluster(model, unique_words_list, center, step):
    """
    Return up to 10 words of our dataset that are most similar to a cluster center.

    model.similar_by_vector returns neighbours in descending order of
    similarity, so the neighbours are scanned in batches of `step` and the
    first ten that also belong to the processed dataset are kept.

    Parameters
    ----------
    model : object exposing similar_by_vector(vector, topn=...), returning
        (word, similarity) pairs.
    unique_words_list : iterable of the words of the processed dataset.
    center : cluster center vector.
    step : number of additional neighbours requested at each round.

    Returns
    -------
    list
        At most 10 words, most similar first. Fewer than 10 are returned when
        the model runs out of neighbours before ten dataset words are found.
    """
    # Set membership is O(1); scanning the list for every neighbour is not.
    corpus_words = set(unique_words_list)
    similars = []
    already_kept = set()
    nb = 0
    while len(similars) < 10:
        neighbours = model.similar_by_vector(center, topn=nb + step)[nb:nb + step]
        if len(neighbours) == 0:
            # No neighbours left: stop instead of looping forever.
            break
        for neighbour in neighbours:
            word = neighbour[0]
            if word in corpus_words and word not in already_kept:
                already_kept.add(word)
                similars.append(word)
        nb += step
    return similars[:10]

In [None]:
def display_top_10_for_each_cluster(model, unique_words_list, centers):
    """
    Print, for every cluster center, a header followed by its top-10 words
    (one per line, taken from top_10_for_a_cluster with a step of 100) and
    a blank line.
    """
    i = 0
    for center in centers:
        closest_words = top_10_for_a_cluster(model, unique_words_list, center, 100)
        print(f"**Cluster number : {i}**")
        for closest_word in closest_words:
            print(closest_word)
        print()
        i += 1

In [None]:
display_top_10_for_each_cluster(model, unique_words_list, centers)

In [None]:
def labels_for_clusters(all_words_and_vectors, unique_words_list, model, centers):

    """
    Print a label for each of the first 10 cluster centers.

    The label is the single most similar word to the center, searched in the
    entire model this time (not only in our dataset).
    `all_words_and_vectors` and `unique_words_list` are accepted but not used.
    Nothing is returned.
    """
    first_centers = centers[:10]
    for i in range(len(first_centers)):
        best_match = model.similar_by_vector(first_centers[i], topn=1)[0]
        label = best_match[0]
        print(f"Cluster number {i} - Label : {label}")
        print()

In [None]:
labels_for_clusters(all_words_and_vectors, unique_words_list, model, centers)

Word2Vec and LSI produce cluster labels on similar subjects (science and methods). The notable differences are a cluster containing names (label: "Miller") and
a cluster containing numbers (label: "245"), both of which appear with Word2Vec but not with LSI.

It's probably because Word2Vec captures meaning better, even if the names or numbers don't appear in the same context, Word2Vec managed to make the association, which is a priori more complicated for LSI.

## Exercise 4.13 : Document similarity search

In [None]:
"""
This part on TF-IDF calculation has been taken from the first notebook,
but has been adapted for this one, since the pre-processing has been different.
"""


# Tokenised corpus: one list of words per cleaned course description
corpus = [description.split() for description in courses_description]


# Vocabulary (distinct words of the corpus) and its term-to-index mapping
vocabulary = {word for text in corpus for word in text}
vocab_index = dict(zip(vocabulary, range(len(vocabulary))))

# Two-way mapping between course IDs and their position in `courses`
doc_index_to_ID = dict(enumerate(course['courseId'] for course in courses))
doc_Id_to_index = {course_id: idx for idx, course_id in doc_index_to_ID.items()}

len(courses)

In [None]:
# NOTE(review): this rebinds vocab_index (just built from the re-processed corpus)
# to the mapping stored in dictionaries.pkl — presumably saved by an earlier notebook
# with a different pre-processing. Confirm it covers every word of the current
# vocabulary, since the TF-IDF matrix loop below indexes it by word.
# pickle.load can execute arbitrary code: only load a trusted dictionaries.pkl.
with open('dictionaries.pkl', 'rb') as file:
    vocab_index, doc_index = pickle.load(file)

In [None]:
import math

# Compute term frequencies (TF): term_frequencies[doc_id][word] is the number of
# occurrences of word in document doc_id. Nested defaultdicts are used so that
# looking up an absent (document, word) pair yields 0.
term_frequencies = defaultdict( lambda : defaultdict(int))
for doc_id, text in enumerate(corpus):
    for word in text:
        term_frequencies[doc_id][word] += 1

# Compute document frequencies (DF): number of documents containing each word.
# One pass over the distinct words of each document gives the same counts as
# testing every vocabulary word against every document, at a fraction of the cost.
df = defaultdict(int)
for word_counts in term_frequencies.values():
    for word in word_counts:
        df[word] += 1

# Compute the inverse document frequencies (IDF)
# Every vocabulary word occurs in at least one document, so df[word] >= 1.
N = len(courses)
idf = {word: math.log(N / df[word]) for word in vocabulary}

# Compute the TF-IDF matrix as (row, column, value) triplets:
# row = term index (from vocab_index), column = document index, value = TF * IDF.
rows, cols, data = [], [], []
for doc_id, word_counts in term_frequencies.items():
    for word, occurrences in word_counts.items():
        rows.append(vocab_index[word])
        cols.append(doc_id)
        data.append(occurrences * idf[word])

In [None]:
def convert_course_to_vector(course):

    """
    Build the vector representing a course.

    Every word occurrence of the cleaned description contributes its
    normalized word vector scaled by the word's TF-IDF weight in this
    document; the course vector is the mean of these contributions.

    Returns the pair (doc_id, course_vector).
    """
    doc_id = doc_Id_to_index[course["courseId"]]

    weighted_vectors = [
        (term_frequencies[doc_id][word] * idf[word]) * all_words_and_vectors[word]
        for word in to_remove(course).split()
    ]

    return (doc_id, np.mean(weighted_vectors, axis=0))

In [None]:
def average_tf_idf_for_word(word):

    """
    Return the average TF-IDF weight of a word over the documents containing it.

    We use the average TF-IDF to translate the average importance of the word
    in the corpus. Returns 1 if the word appears in no document of the
    processed dataset.

    The per-document counts are read from `term_frequencies`, which is built
    from the same cleaned descriptions (to_remove + split): a positive count
    is equivalent to the word being in the cleaned description, without
    re-running the regex pre-processing on every course at every call.
    """
    weights_for_word = []
    for doc_id in range(len(courses)):
        # .get() avoids inserting empty entries into the nested defaultdicts.
        occurrences = term_frequencies.get(doc_id, {}).get(word, 0)
        if occurrences:
            weights_for_word.append(occurrences * idf[word])

    if weights_for_word:
        return np.sum(weights_for_word) / len(weights_for_word)
    # the word is not in the processed dataset
    return 1
        
    
    

In [None]:
def convert_query_to_vector(query):

    """
    Build the vector representing a query.

    Each query word that belongs to the processed dataset contributes its
    normalized vector scaled by its average TF-IDF over the corpus; the query
    vector is the mean of these contributions. Other words are ignored.
    """
    weighted_vectors = [
        (average_tf_idf_for_word(word))*all_words_and_vectors[word]
        for word in query.split()
        if word in unique_words_list
    ]

    return np.mean(weighted_vectors, axis=0)
            

In [None]:
def cosine_similarity(vector1, vector2):
    '''
    Return the cosine of the angle between two vectors,
    or 0 when either of them has a zero norm.
    '''
    numerator = np.dot(vector1, vector2)
    length1, length2 = norm(vector1), norm(vector2)
    if not (length1 and length2):
        return 0
    return numerator / (length1 * length2)

In [None]:
def top_courses_for_query(query):

    """
    Rank every course against a query.

    Returns a list of (doc_id, cosine similarity) pairs covering all courses,
    sorted in descending order of similarity.
    """
    query_vector = convert_query_to_vector(query)

    ranking = [
        (doc_id, cosine_similarity(query_vector, course_vector))
        for doc_id, course_vector in map(convert_course_to_vector, courses)
    ]
    ranking.sort(key=lambda pair: pair[1], reverse=True)

    return ranking
        

In [None]:
def display(query):
    """Print the five courses most similar to the query, best match first."""
    # the ranking is sorted in descending order of similarity
    top_five = top_courses_for_query(query)[:5]
    print(f"Here are the top5 courses most closely related to the query - {query} :\n")
    for doc_index, similarity in top_five:
        matched_course = courses[doc_index]
        course_id = matched_course["courseId"]
        course_name = matched_course["name"]
        print(f"{course_id} : {course_name} - Similarity: {similarity}")

In [None]:
display("Markov chains")

In [None]:
print("Markov chains" in  courses[doc_Id_to_index["MGT-602"]]["description"])

In [None]:
print("Markov chains" in  courses[doc_Id_to_index["COM-512"]]["description"])

In comparison with LSI, we obtain almost the same list of courses for the query “Markov chains”. Only one course is different (COM-512 for LSI vs. MGT-602 for Word2Vec), and the course order and similarity values also differ. In fact, COM-512 isn't even in the top 10 courses with Word2Vec, even though it contains “Markov chains” in its description (as you can see above). This is undoubtedly because Word2Vec captures the relationships between words, and therefore captures the link between mathematical models for the supply chain and Markov chains in particular, which LSI does less well.

In [None]:
print("Markov" in  courses[doc_Id_to_index["EE-516"]]["description"])

In [None]:
print(courses[doc_Id_to_index["EE-516"]]["description"])

For VSM, we get the same order of courses as with Word2Vec, except that here again only one course differs (the 5th: MGT-602 for Word2Vec versus EE-516 for the vector space model). We can see that EE-516 talks about HMMs (Hidden Markov Models), which are an extension of Markov chains. However, it seems that VSM ranked EE-516 only because it contains
the term “Markov”, without making the link between HMMs and Markov chains.

In [None]:
display("Facebook")

In [None]:
print("social" in courses[doc_Id_to_index["HUM-432(a)"]]["description"])

In [None]:
print("media" in courses[doc_Id_to_index["HUM-432(a)"]]["description"])

In comparison with LSI, again, we get the same course with the greatest similarity. Only two courses are different between LSI and Word2Vec. Where “EE-552”, which is present with LSI but not with Word2Vec, remains relevant, because it concerns “Media Security”, the “HUM-432(a)” course turns out to be a misinterpretation for LSI, since this course is not directly related to Facebook, rather, it seems to have been chosen by LSI because the description contains the word “social” (but not “media”, so not “social media” either).

As for VSM, here again, we obtain the same course with the most similarities, but for the other 4 courses, the results don't seem relevant. Where Markov Chains was in the course description, for Facebook it's different, and VSM is clearly less relevant.

To conclude this analysis, from what we can see it seems that the method that best captures the meaning of words is Word2Vec. LSI also manages to make relevant associations, whereas VSM focuses too much on the presence of the words themselves, without paying attention to meaning. And this is consistent with the course, since Word2Vec was created to capture the meaning of words, whereas VSM is more effective at labelling documents in a corpus.

## Exercise 4.14: Document similarity search with outside terms

In [None]:
def generalized_convert_query_to_vector(model, query):

    """
    Build the vector representing a query that may contain words outside the corpus.

    A word of the processed dataset contributes its normalized vector scaled
    by its average TF-IDF over the corpus. A word outside the corpus but
    within the model contributes its normalized model vector with a weight
    of 1 (the default TF-IDF for such words). A word known to neither is
    skipped instead of raising KeyError.

    Parameters
    ----------
    model : object exposing get_vector(word), raising KeyError for unknown words.
    query : string of whitespace-separated words.

    Returns
    -------
    The mean of the contributions of the recognised words.
    """
    vectors_for_mean = []
    for word in query.split():
        if word in unique_words_list:
            vectors_for_mean.append((average_tf_idf_for_word(word))*all_words_and_vectors[word])
            continue
        try:
            outside_vector = model.get_vector(word)
        except KeyError:
            # Unknown to both the corpus and the model: ignore the word.
            continue
        vectors_for_mean.append(normalize(outside_vector))

    return np.mean(vectors_for_mean, axis=0)

In [None]:
def generalized_top_courses_for_query(model, query):
    """
    Rank every course against a query that may use words outside the corpus.

    Returns a list of (doc_id, cosine similarity) pairs covering all courses,
    sorted in descending order of similarity.
    """
    query_vector = generalized_convert_query_to_vector(model, query)

    ranking = [
        (doc_id, cosine_similarity(query_vector, course_vector))
        for doc_id, course_vector in map(convert_course_to_vector, courses)
    ]
    ranking.sort(key=lambda pair: pair[1], reverse=True)

    return ranking

In [None]:
def generalized_display(model, query):
    """
    Print the five courses most similar to a query that may use words
    outside the corpus, best match first.
    """
    top_five = generalized_top_courses_for_query(model, query)[:5]
    print(f"Here are the top5 courses most closely related to the query - {query} :\n")
    for doc_index, similarity in top_five:
        matched_course = courses[doc_index]
        course_id = matched_course["courseId"]
        course_name = matched_course["name"]
        print(f"{course_id} : {course_name} - Similarity: {similarity}")

In [None]:
generalized_display(model, "MySpace Orkut")

In [None]:
"MySpace" in  courses[doc_Id_to_index["EE-727"]]["description"]

In [None]:
display("Facebook")

In [None]:
"Facebook" in  courses[doc_Id_to_index["EE-727"]]["description"]

The results are the same for Facebook and MySpace Orkut, with the exception of one course, which is different, and a different order and values for the similarities. As for the difference in value, Facebook is notably present in the description of the first course, which is not the case for MySpace.

In [None]:
generalized_display(model, "coronavirus")

These courses are all in the biology section and talk globally about infections, diseases, viruses or bacteria, so it's consistent with coronavirus.