In [2]:
# Load the pretrained GLoVe embeddings (https://nlp.stanford.edu/projects/glove/)
# You can download and unzip the following archive: http://nlp.stanford.edu/data/glove.6B.zip
import os.path
import numpy as np
import nltk
import re
# Directory that contains the unzipped GloVe files ("" means the working directory).
GLOVE_DIR = ""

# Maps each word to its embedding vector (a float32 numpy array).
embeddings_index = {}

# A context manager closes the file even if a line fails to parse.
# The encoding is set explicitly so that reading does not depend on the
# platform default (assumes the GloVe text file is UTF-8 -- TODO confirm).
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), "r", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('In total, there are %s word vectors.' % len(embeddings_index))


In total, there are 8564 word vectors.


Remove unnecessary characters from the questions list

In [10]:
def clean(text):
    """Normalize a sentence before looking its words up in the embeddings.

    Steps, in order:
      1. lowercase the text;
      2. drop a trailing ``'s`` (possessive / contraction), e.g. ``john's`` -> ``john``;
      3. turn hyphens into spaces so hyphenated words become separate tokens;
      4. delete the characters ``@ _ ' " : / < > ! ? . ,`` and every digit.

    Whitespace is not collapsed, so removed tokens may leave double spaces;
    callers that use ``str.split()`` are unaffected.

    Parameters
    ----------
    text : str
        The raw sentence.

    Returns
    -------
    str
        The cleaned sentence.
    """
    text = text.lower()

    # Must run while the apostrophe is still present; the original applied this
    # rule after stripping apostrophes, so it could never match.
    # \b keeps an opening quote followed by an s-word (e.g. 'some) intact.
    text = re.sub(r"'s\b", "", text)

    text = text.replace("-", " ")

    # The original pattern '[0,9]' matched only '0', ',' and '9'. Use a real
    # digit range and keep the comma so commas are still removed as before.
    text = re.sub(r"""[@_'":/<>!?.,0-9]""", "", text)

    return text

Calculate the most similar sentences

-------------Function Cosine Similarity Token-------------

In [12]:
def cosine_similarity_token(a, b):
    """Return the cosine similarity between two vectors.

    Computed as ``dot(a, b) / (||a|| * ||b||)`` with Euclidean norms.

    Parameters
    ----------
    a, b : numpy.ndarray
        Vectors of the same length (for example two word embeddings).

    Returns
    -------
    numpy scalar
        The cosine of the angle between ``a`` and ``b``. No special handling
        is done for zero-length vectors.
    """
    dot_product = np.dot(a, b)

    # Product of the two Euclidean norms.
    norm_product = np.sqrt((a**2).sum()) * np.sqrt((b**2).sum())

    return dot_product / norm_product

-------------Function Cosine Similarity Sentence------------

In [13]:
import numpy as np

def cosine_similarity_sentence(a, b, embeddings=None):
    """Cosine similarity between two sentences using averaged word vectors.

    Each sentence is normalized with ``clean`` and split on whitespace. The
    vectors of the words found in ``embeddings`` are summed and divided by the
    total number of tokens in the sentence; the cosine of the two resulting
    vectors is returned.

    Parameters
    ----------
    a, b : str
        The sentences to compare.
    embeddings : dict, optional
        Mapping word -> numpy vector. Defaults to the module-level
        ``embeddings_index`` (the GloVe vectors loaded earlier).

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either sentence has no word in
        ``embeddings`` or averages to a zero vector. The original code divided
        by zero in that case and returned NaN with a RuntimeWarning.
    """
    if embeddings is None:
        embeddings = embeddings_index

    def mean_vector(sentence):
        # Average embedding of the sentence, or None when no word is known.
        tokens = clean(sentence).split()
        total = None
        for token in tokens:
            vector = embeddings.get(token)
            if vector is None:
                continue
            if total is None:
                # Take the dimension from the data instead of hard-coding 100;
                # np.zeros gives a float64 accumulator as before.
                total = np.zeros(len(vector))
            total += vector
        if total is None:
            return None
        # Out-of-vocabulary tokens still count in the denominator, as in the
        # original; this scaling does not change the cosine.
        return total / len(tokens)

    average_a = mean_vector(a)
    average_b = mean_vector(b)
    if average_a is None or average_b is None:
        return 0.0

    denominator = np.sqrt(np.sum(average_a**2)) * np.sqrt(np.sum(average_b**2))
    if denominator == 0:
        return 0.0

    return np.dot(average_a, average_b) / denominator


In [14]:
# Sanity check: cosine similarity between the vectors stored for 'desk' and 'table'.
cosine_similarity_token(embeddings_index['desk'], embeddings_index['table'])


0.4992076

In [15]:
last_max_cosine = 0

top_similar = list()

# NOTE(review): `questions_cleaned` is not defined in any cell visible here;
# it must come from an earlier cell -- confirm before Restart & Run All.

# Walk over consecutive question pairs and keep up to 10 questions whose
# similarity to the next question beats the best similarity seen so far.
# Stop at len - 1: the original iterated over every index and raised
# IndexError on questions_cleaned[i+1] when fewer than 10 were collected.
for i in range(len(questions_cleaned) - 1):
    if len(top_similar) >= 10:
        break
    # Compute the similarity once instead of twice per pair.
    similarity = cosine_similarity_sentence(questions_cleaned[i], questions_cleaned[i + 1])
    if similarity > last_max_cosine:
        top_similar.append(questions_cleaned[i])
        last_max_cosine = similarity



In [16]:
# Example: sentence-level similarity for two unrelated sentences.
print("The cosine similarity is: ",cosine_similarity_sentence("I am dead", "Having a desk is boring"))

The cosine similarity is:  0.6916605936245704


In [17]:
#We list all the auxiliary verbs in order to exclude them from our verb selection
aux_verbs = ["be","am","are","is","was","were","being", "can", "could", "do", "did", "does", "doing", "have", "had", "has", "having", "may", "might", "must", "shall", "should", "will", "would"]

def cosine_similarity_sentence_with_POS(a, b, embeddings=None):
    """Cosine similarity between two sentences using only nouns and verbs.

    Each sentence is lowercased, split on whitespace and tagged with
    ``nltk.pos_tag``. A word contributes its vector only if it is present in
    ``embeddings``, is not listed in ``aux_verbs``, and its tag is exactly
    ``"NN"`` or contains ``"V"``. The summed vectors are divided by the total
    number of tokens in the sentence before the cosine is computed.

    NOTE(review): unlike ``cosine_similarity_sentence`` this function does not
    call ``clean``, so punctuation stays attached to words, and plural/proper
    noun tags (NNS, NNP, ...) are excluded -- confirm both are intended.

    Parameters
    ----------
    a, b : str
        The sentences to compare.
    embeddings : dict, optional
        Mapping word -> numpy vector. Defaults to the module-level
        ``embeddings_index``.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either sentence has no word
        that passes the filter or averages to a zero vector. The original
        code returned NaN with a RuntimeWarning in that case.
    """
    if embeddings is None:
        embeddings = embeddings_index

    # Fetch the tagger only when it is missing; the original called
    # nltk.download() (and printed its log) on every invocation.
    try:
        nltk.data.find('taggers/averaged_perceptron_tagger')
    except LookupError:
        nltk.download('averaged_perceptron_tagger')

    def mean_content_vector(sentence):
        # Average vector of the selected nouns/verbs, or None if none qualify.
        # pos_tag turns each word into a (word, grammatical category) tuple.
        tagged = nltk.pos_tag(sentence.lower().split())
        total = None
        for word, tag in tagged:
            if word not in embeddings or word in aux_verbs:
                continue
            if tag == "NN" or "V" in tag:
                vector = embeddings[word]
                if total is None:
                    # Dimension taken from the data rather than from a
                    # hard-coded lookup of 'desk'.
                    total = np.zeros(len(vector))
                total += vector
        if total is None:
            return None
        # Divide by the full token count, as the original did.
        return total / len(tagged)

    average_a = mean_content_vector(a)
    average_b = mean_content_vector(b)
    if average_a is None or average_b is None:
        return 0.0

    # Denominator of the cosine similarity: product of the two norms.
    denominator = np.sqrt(np.sum(average_a**2)) * np.sqrt(np.sum(average_b**2))
    if denominator == 0:
        return 0.0

    # Numerator of the cosine similarity: dot product of the two averages.
    return np.dot(average_a, average_b) / denominator

In [18]:
# Example: similarity restricted to nouns and non-auxiliary verbs.
print("The cosine similarity is: ",cosine_similarity_sentence_with_POS("I have a table that is red", "Having a desk become boring"))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
The cosine similarity is:  0.5404111862926111


In [20]:

import json
import operator

top_similar = []

# Only pairs scoring above this threshold are kept.
last_max_cosine = 0.9

with open('dev-v2.0.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Best-scoring context fragment found so far for each question text.
# The original check `q1 not in top_pairs` compared a string against a list
# of [q1, q2, score] lists, so it was always True and never de-duplicated.
best_pair_by_question = {}

for article in data['data']:
  for paragraph in article['paragraphs']:
    # Split the context into fragments at commas and full stops, once per
    # paragraph rather than once per question.
    fragments = re.split(r',|\.', paragraph['context'])
    for question in paragraph['qas']:
      q1 = question['question']
      for q2 in fragments:
          if q1 == q2:
            continue
          # Compute the similarity once per pair.
          similarity = cosine_similarity_sentence(q1, q2)
          # Written as `>` so a NaN score is skipped, as in the original.
          if similarity > last_max_cosine:
            best = best_pair_by_question.get(q1)
            if best is None or similarity > best[2]:
              best_pair_by_question[q1] = [q1, q2, similarity]

# One [question, fragment, score] entry per question, highest score first.
top_pairs = list(best_pair_by_question.values())
top_pairs.sort(key = operator.itemgetter(2), reverse = True)


display(top_pairs[0:10])





[['When would a program not be useful for very small instances and in that sense the intractability of a problem is somewhat independent of technological progress?',
  ' the program would only be useful for very small instances and in that sense the intractability of a problem is somewhat independent of technological progress',
  0.9987318612702049],
 ['What compiles and reports on data about the size of design and construction companies?',
  ' ENR compiles and reports on data about the size of design and construction companies',
  0.9980851271323278],
 ['What is not used for a precise definition of what it means to solve a problem using a given amount of time and space?',
  'For a precise definition of what it means to solve a problem using a given amount of time and space',
  0.9979406167815413],
 ['When had the Brotherhood renounced violence as a means of achieving its goals?',
  ' the Brotherhood had renounced violence as a means of achieving its goals',
  0.9974885484685991],
 ['W

In [21]:
top_similar = []
last_max_cosine = 0
import json
with open('dev-v2.0.json', encoding='utf-8') as json_file:
    data = json.load(json_file)

# Collect the distinct cleaned question words in first-seen order. A dict gives
# constant-time membership tests and keeps insertion order (Python 3.7+); the
# original scanned a list for every word, which is quadratic.
seen_tokens = {}
for element in data['data']:
  for qas in element['paragraphs']:
    for question in qas['qas']:
      for word in clean(question['question']).split():
        seen_tokens.setdefault(word, None)
tokens = list(seen_tokens)

# Only tokens that have an embedding can take part in a comparison.
known_tokens = [token for token in tokens if token in embeddings_index]

# NOTE(review): last_max_cosine is never raised, so this gathers the first 10
# tokens whose similarity to an earlier token is positive, not the globally
# most similar ones -- confirm this is the intended result.
for token1 in known_tokens:
  if len(top_similar) >= 10:
    # The list is full; the original kept looping over every pair for nothing.
    break
  for token2 in known_tokens:
    if token1 != token2 and cosine_similarity_token(embeddings_index[token1],embeddings_index[token2]) > last_max_cosine:
      top_similar.append(token2)
      if len(top_similar) >= 10:
        break
print(top_similar)

['what', 'country', 'is', 'located', 'when', 'were', 'the', 'from', 'which', 'countries']


Comparison

In [None]:
from google.colab import drive
import numpy as np
drive.mount('/content/drive')

#The path of our shared drive in the folder named Proc Lleng Natural
data_path = '/content/drive/Shared drives/Proc Lleng Natural/Lab1/dataset/'

!ls '/content/drive/Shared drives/Proc Lleng Natural/Lab1/dataset'

#Load the dataset 
train_data = data_path + 'wiki-news-300d-1M-subword.vec'

word_vectors = {}
with open(train_data, 'r') as fin:
  for line in fin:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_vectors[word] = coefs
del word_vectors["999994"]
print('In total, there are %s word vectors.' % len(word_vectors))

In [None]:
import numpy as np

def cosine_similarity_sentence(a, b, embeddings=None):
    """Cosine similarity between two sentences using averaged word vectors.

    This redefinition replaces the earlier function of the same name and uses
    the module-level ``word_vectors`` dictionary by default.

    Each sentence is normalized with ``clean`` and split on whitespace. The
    vectors of the words found in ``embeddings`` are summed and divided by the
    total number of tokens in the sentence; the cosine of the two resulting
    vectors is returned.

    Parameters
    ----------
    a, b : str
        The sentences to compare.
    embeddings : dict, optional
        Mapping word -> numpy vector. Defaults to ``word_vectors``.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either sentence has no word in
        ``embeddings`` or averages to a zero vector. The original code divided
        by zero in that case and returned NaN with a RuntimeWarning.
    """
    if embeddings is None:
        embeddings = word_vectors

    def mean_vector(sentence):
        # Average embedding of the sentence, or None when no word is known.
        tokens = clean(sentence).split()
        total = None
        for token in tokens:
            vector = embeddings.get(token)
            if vector is None:
                continue
            if total is None:
                # Take the dimension from the data instead of looking up a
                # hard-coded word ('desk'), which fails if that word is absent.
                total = np.zeros(len(vector))
            total += vector
        if total is None:
            return None
        # Out-of-vocabulary tokens still count in the denominator, as in the
        # original; this scaling does not change the cosine.
        return total / len(tokens)

    average_a = mean_vector(a)
    average_b = mean_vector(b)
    if average_a is None or average_b is None:
        return 0.0

    denominator = np.sqrt(np.sum(average_a**2)) * np.sqrt(np.sum(average_b**2))
    if denominator == 0:
        return 0.0

    return np.dot(average_a, average_b) / denominator

In [None]:
import numpy as np

#We list all the auxiliary verbs in order to exclude them from our verb selection
aux_verbs = ["be","am","are","is","was","were","being", "can", "could", "do", "did", "does", "doing", "have", "had", "has", "having", "may", "might", "must", "shall", "should", "will", "would"]

def cosine_similarity_sentence_with_POS(a, b, embeddings=None):
    """Cosine similarity between two sentences using only nouns and verbs.

    This redefinition replaces the earlier function of the same name and uses
    the module-level ``word_vectors`` dictionary by default.

    Each sentence is lowercased, split on whitespace and tagged with
    ``nltk.pos_tag``. A word contributes its vector only if it is present in
    ``embeddings``, is not listed in ``aux_verbs``, and its tag is exactly
    ``"NN"`` or contains ``"V"``. The summed vectors are divided by the total
    number of tokens in the sentence before the cosine is computed.

    NOTE(review): this function does not call ``clean``, so punctuation stays
    attached to words, and plural/proper noun tags (NNS, NNP, ...) are
    excluded -- confirm both are intended.

    Parameters
    ----------
    a, b : str
        The sentences to compare.
    embeddings : dict, optional
        Mapping word -> numpy vector. Defaults to ``word_vectors``.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either sentence has no word
        that passes the filter or averages to a zero vector. The original
        code returned NaN with a RuntimeWarning in that case.
    """
    if embeddings is None:
        embeddings = word_vectors

    # Fetch the tagger only when it is missing; the original called
    # nltk.download() (and printed its log) on every invocation.
    try:
        nltk.data.find('taggers/averaged_perceptron_tagger')
    except LookupError:
        nltk.download('averaged_perceptron_tagger')

    def mean_content_vector(sentence):
        # Average vector of the selected nouns/verbs, or None if none qualify.
        # pos_tag turns each word into a (word, grammatical category) tuple.
        tagged = nltk.pos_tag(sentence.lower().split())
        total = None
        for word, tag in tagged:
            if word not in embeddings or word in aux_verbs:
                continue
            if tag == "NN" or "V" in tag:
                vector = embeddings[word]
                if total is None:
                    # Dimension taken from the data rather than from a
                    # hard-coded lookup of 'desk'.
                    total = np.zeros(len(vector))
                total += vector
        if total is None:
            return None
        # Divide by the full token count, as the original did.
        return total / len(tagged)

    average_a = mean_content_vector(a)
    average_b = mean_content_vector(b)
    if average_a is None or average_b is None:
        return 0.0

    # Denominator of the cosine similarity: product of the two norms.
    denominator = np.sqrt(np.sum(average_a**2)) * np.sqrt(np.sum(average_b**2))
    if denominator == 0:
        return 0.0

    # Numerator of the cosine similarity: dot product of the two averages.
    return np.dot(average_a, average_b) / denominator

# Repeat the three example comparisons; the token check reads word_vectors directly.
# NOTE(review): the two sentence functions resolve to whichever definition ran last -- run the cells in order.
print("The cosine similarity token is: ",cosine_similarity_token(word_vectors['desk'], word_vectors['table']))
print("The cosine similarity is: ",cosine_similarity_sentence("I am dead", "Having a desk is boring"))
print("The cosine similarity with POS is: ",cosine_similarity_sentence_with_POS("I have a table that is red", "Having a desk become boring"))

In [None]:
from google.colab import drive
import numpy as np
drive.mount('/content/drive')

#The path of our shared drive in the folder named Proc Lleng Natural
data_path = '/content/drive/Shared drives/Proc Lleng Natural/Lab1/dataset/'

!ls '/content/drive/Shared drives/Proc Lleng Natural/Lab1/dataset'

#Load the dataset 
train_data = data_path + 'GoogleNews-vectors-negative300.vec'

word_vectors2 = {}
with open(train_data, 'r') as fin:
  for line in fin:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_vectors2[word] = coefs
print('In total, there are %s word vectors.' % len(word_vectors2))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
GoogleNews-vectors-negative300.vec     wiki-news-300d-1M-subword.vec
GoogleNews-vectors-negative300.vec.gz
In total, there are 3000000 word vectors.


In [None]:
import numpy as np

def cosine_similarity_sentence(a, b, embeddings=None):
    """Cosine similarity between two sentences using averaged word vectors.

    This redefinition replaces the earlier functions of the same name and
    uses the module-level ``word_vectors2`` dictionary by default.

    Each sentence is normalized with ``clean`` and split on whitespace. The
    vectors of the words found in ``embeddings`` are summed and divided by the
    total number of tokens in the sentence; the cosine of the two resulting
    vectors is returned.

    Parameters
    ----------
    a, b : str
        The sentences to compare.
    embeddings : dict, optional
        Mapping word -> numpy vector. Defaults to ``word_vectors2``.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either sentence has no word in
        ``embeddings`` or averages to a zero vector. The original code divided
        by zero in that case and returned NaN with a RuntimeWarning.
    """
    if embeddings is None:
        embeddings = word_vectors2

    def mean_vector(sentence):
        # Average embedding of the sentence, or None when no word is known.
        tokens = clean(sentence).split()
        total = None
        for token in tokens:
            vector = embeddings.get(token)
            if vector is None:
                continue
            if total is None:
                # Take the dimension from the data instead of looking up a
                # hard-coded word ('desk'), which fails if that word is absent.
                total = np.zeros(len(vector))
            total += vector
        if total is None:
            return None
        # Out-of-vocabulary tokens still count in the denominator, as in the
        # original; this scaling does not change the cosine.
        return total / len(tokens)

    average_a = mean_vector(a)
    average_b = mean_vector(b)
    if average_a is None or average_b is None:
        return 0.0

    denominator = np.sqrt(np.sum(average_a**2)) * np.sqrt(np.sum(average_b**2))
    if denominator == 0:
        return 0.0

    return np.dot(average_a, average_b) / denominator

In [None]:
import numpy as np

#We list all the auxiliary verbs in order to exclude them from our verb selection
aux_verbs = ["be","am","are","is","was","were","being", "can", "could", "do", "did", "does", "doing", "have", "had", "has", "having", "may", "might", "must", "shall", "should", "will", "would"]

def cosine_similarity_sentence_with_POS(a, b, embeddings=None):
    """Cosine similarity between two sentences using only nouns and verbs.

    This redefinition replaces the earlier functions of the same name and
    uses the module-level ``word_vectors2`` dictionary by default.

    Each sentence is lowercased, split on whitespace and tagged with
    ``nltk.pos_tag``. A word contributes its vector only if it is present in
    ``embeddings``, is not listed in ``aux_verbs``, and its tag is exactly
    ``"NN"`` or contains ``"V"``. The summed vectors are divided by the total
    number of tokens in the sentence before the cosine is computed.

    NOTE(review): this function does not call ``clean``, so punctuation stays
    attached to words, and plural/proper noun tags (NNS, NNP, ...) are
    excluded -- confirm both are intended.

    Parameters
    ----------
    a, b : str
        The sentences to compare.
    embeddings : dict, optional
        Mapping word -> numpy vector. Defaults to ``word_vectors2``.

    Returns
    -------
    float
        The cosine similarity, or ``0.0`` when either sentence has no word
        that passes the filter or averages to a zero vector. The original
        code returned NaN with a RuntimeWarning in that case.
    """
    if embeddings is None:
        embeddings = word_vectors2

    # Fetch the tagger only when it is missing; the original called
    # nltk.download() (and printed its log) on every invocation.
    try:
        nltk.data.find('taggers/averaged_perceptron_tagger')
    except LookupError:
        nltk.download('averaged_perceptron_tagger')

    def mean_content_vector(sentence):
        # Average vector of the selected nouns/verbs, or None if none qualify.
        # pos_tag turns each word into a (word, grammatical category) tuple.
        tagged = nltk.pos_tag(sentence.lower().split())
        total = None
        for word, tag in tagged:
            if word not in embeddings or word in aux_verbs:
                continue
            if tag == "NN" or "V" in tag:
                vector = embeddings[word]
                if total is None:
                    # Dimension taken from the data rather than from a
                    # hard-coded lookup of 'desk'.
                    total = np.zeros(len(vector))
                total += vector
        if total is None:
            return None
        # Divide by the full token count, as the original did.
        return total / len(tagged)

    average_a = mean_content_vector(a)
    average_b = mean_content_vector(b)
    if average_a is None or average_b is None:
        return 0.0

    # Denominator of the cosine similarity: product of the two norms.
    denominator = np.sqrt(np.sum(average_a**2)) * np.sqrt(np.sum(average_b**2))
    if denominator == 0:
        return 0.0

    # Numerator of the cosine similarity: dot product of the two averages.
    return np.dot(average_a, average_b) / denominator

# Repeat the three example comparisons; the token check reads word_vectors2 directly.
# NOTE(review): the two sentence functions resolve to whichever definition ran last -- run the cells in order.
print("The cosine similarity token is: ",cosine_similarity_token(word_vectors2['desk'], word_vectors2['table']))
print("The cosine similarity is: ",cosine_similarity_sentence("I am dead", "Having a desk is boring"))
print("The cosine similarity with POS is: ",cosine_similarity_sentence_with_POS("I have a table that is red", "Having a desk become boring"))

The cosine similarity token is:  0.4045672
The cosine similarity is:  0.3065082589603107
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
The cosine similarity with POS is:  0.3179761596730724


# Nueva sección