### Load GloVe model

1) Download model from http://nlp.stanford.edu/data/glove.42B.300d.zip
2) Unzip the file
3) Obtain glove.42B.300d.txt from the folder
4) Place the .txt inside the same folder as this script

### Paper


In [38]:
import importlib
import glove
import numpy as np
# Reload the local `glove` module so that edits to glove.py are picked up
# without restarting the kernel.
importlib.reload(glove)

<module 'glove' from 'c:\\Users\\Esteban\\Documents\\Embeddings\\glove.py'>

In [32]:
# Build the embedding model from the GloVe text file placed next to this notebook.
# NOTE(review): the cells below call `glove(...)` (the imported module) instead of
# this `model` object — confirm which callable actually returns the embeddings.
model = glove.Glove("glove.42B.300d.txt")


Processing file glove.42B.300d.txt
Finished processing file glove.42B.300d.txt in 2.4789509812990826 minutes


NameError: name 'model' is not defined

### Generate feature vector


In [None]:
def get_feature_vector(a, b):
    """
    Build one vector describing the direction from word set `a` to word set `b`.

    a: list of n words on one end of the feature continuum
    b: list of m words on the other end of the continuum

    Returns the element-wise mean of all n x m pairwise differences
    (embedding of a `b` word minus embedding of an `a` word).

    example:
    get_feature_vector(["small", "little", "tiny"], ["large", "big", "huge"])

    NOTE(review): `glove` is imported as a module at the top of the notebook;
    confirm that it is callable and yields one embedding per word.
    """
    start_embeddings = glove(a)
    end_embeddings = glove(b)

    # every (start, end) pairing contributes one difference vector
    differences = [
        end_vec - start_vec
        for start_vec in start_embeddings
        for end_vec in end_embeddings
    ]

    # collapse the stack of differences into their average
    return np.mean(np.array(differences), axis=0)

In [None]:
# Feature vector pointing from the "small" words towards the "large" words.
size = get_feature_vector(["small", "little", "tiny"], ["large", "big", "huge"])
len(size)
# `size` is the last expression of the cell, so it is the value the notebook displays.
size

### Generate vector for each of the ends of the feature vector

In [None]:
def get_end_vector(a):
    """
    Average the embeddings of the words in `a` into a single vector.

    a: list of words describing one end of a feature continuum
    """
    embeddings = np.array(glove(a))
    return np.mean(embeddings, axis=0)

In [None]:
# Mean embedding of the three "small" words.
small = get_end_vector(["small", "little", "tiny"])


### Vector addition

In [None]:
def VectorAddition(a, b):
    """
    Element-wise sum of the embeddings of `a` and the embeddings of `b`.

    a, b: lists of words; each list is looked up with `glove` and the two
    results are combined with `np.add`.
    """
    first = glove(a)
    second = glove(b)
    return np.add(first, second)


### Project words onto feature subspace

In [None]:
def get_orthogonal_projection(u, v):
    """
    Return the component of vector u that lies along vector v.
    """
    # coefficient of v in the projection: (u . v) / (v . v)
    scale = np.dot(u, v) / np.dot(v, v)
    return scale * v

def get_word_projections(words, feature_set_1, feature_set_2):
    """
    Project the embedding of each word onto the feature_set_1 -> feature_set_2 axis.

    All params are lists of strings.

    example:
    get_word_projections(["mouse", "elephant"], ["small", "little", "tiny"], ["large", "big", "huge"])
    """
    # embeddings of the words to be projected
    embeddings = glove(words)

    # direction that spans the feature subspace
    axis = get_feature_vector(feature_set_1, feature_set_2)

    projections = []
    for embedding in embeddings:
        projections.append(get_orthogonal_projection(embedding, axis))
    return projections

In [None]:
# Projections of "mouse" and "elephant" onto the small -> large feature vector.
mouse_elephant = get_word_projections(["mouse", "elephant"],["small", "little", "tiny"], ["large", "big", "huge"])
print(mouse_elephant)

### Ranking words along a feature subspace

In [None]:
def get_projection_score(u, v):
    """
    Return the scalar coefficient of u along v: (u . v) / (v . v).
    """
    numerator = np.dot(u, v)
    denominator = np.dot(v, v)
    return numerator / denominator

def get_scores(words, feature_set_1, feature_set_2):
    """
    Score each word along the feature_set_1 -> feature_set_2 axis.

    All params are lists of strings; the result holds one projection score
    per word, in the order the words were given.

    example:
    get_scores(["mouse", "elephant"], ["small", "little", "tiny"], ["large", "big", "huge"])
    """
    # embeddings of the words to be scored
    embeddings = glove(words)

    # direction that spans the feature subspace
    axis = get_feature_vector(feature_set_1, feature_set_2)

    scores = []
    for embedding in embeddings:
        scores.append(get_projection_score(embedding, axis))
    return scores

def get_rankings(words, feature_set_1, feature_set_2):
    """
    Return the indices that sort `words` along the axis from feature 1 to feature 2.

    All params are lists of strings. The indices are those produced by
    `np.argsort` on the projection scores (lowest score first).

    example:
    get_rankings(["mouse", "elephant"], ["small", "little", "tiny"], ["large", "big", "huge"])
    """
    return np.argsort(get_scores(words, feature_set_1, feature_set_2))

def order_words_along_feature(words, feature_set_1, feature_set_2):
    """
    Return the words as a list, ordered by ascending score along the feature axis.
    """
    order = get_rankings(words, feature_set_1, feature_set_2)
    word_array = np.array(words)
    return list(word_array[order])

In [None]:
# Score food-related words on the axis running from the "sweet" words to the "sour" words.
get_scores(["sugar", "honey", "caramel","cheese","lemon","vinegar","milk","water","sweet","sour"],["sweet", "sugary", "candied"], ["sour", "acidic", "bitter"])

In [None]:
# Example: order four animals along the small -> large axis.
words = ["mouse", "elephant", "whale", "ant"]
feature_1 = ["small", "little", "tiny"]
feature_2 = ["large", "big", "huge"]
order_words_along_feature(words, feature_1, feature_2)

In [None]:
# Display the raw embeddings looked up for the current `words` list.
glove(words)

In [None]:
# Embeddings for two words, used as input to the t-SNE cell below.
embeddings =glove(('elephant', 'mouse'))

In [None]:
from sklearn.manifold import TSNE
from keras.datasets import mnist
from sklearn.datasets import load_iris
from numpy import reshape
import seaborn as sns
import pandas as pd  

# t-SNE requires perplexity < n_samples. `embeddings` holds only two vectors,
# so the requested perplexity of 15 is capped at n_samples - 1.
tsne_perplexity = min(15, len(embeddings) - 1)
Y = TSNE(n_components=2, random_state=0, perplexity=tsne_perplexity).fit_transform(np.asarray(embeddings))

### Perform PCA


In [None]:
def performPCA(words, n):
    """
    Reduce a collection of word vectors to `n` principal components.

    words: sequence of equal-length embedding vectors, one row per word
    n: number of principal components to keep

    Returns the array produced by sklearn's PCA.fit_transform (one row per
    input vector).

    Raises ValueError if `words` cannot be arranged as a 2-D (rows x dims) array.
    """
    from sklearn.decomposition import PCA

    vectors = np.asarray(words)
    if vectors.ndim != 2:
        raise ValueError(
            "words must be a sequence of equal-length vectors (a 2-D array)"
        )

    # The embedding width is taken from the data rather than fixed at 300, so
    # models of other dimensionality work too. The dtype promotion mirrors
    # appending the rows to an empty float32 array.
    arrays = vectors.astype(np.result_type(vectors.dtype, np.float32), copy=False)

    return PCA(n_components=n).fit_transform(arrays)


In [None]:
# NOTE(review): `GloVe_Model_42B` is not defined in the visible part of this notebook;
# other cells obtain embeddings through `glove(...)` — confirm which lookup is intended.
words_PCA = [GloVe_Model_42B["small"],GloVe_Model_42B["tiny"],GloVe_Model_42B["little"], GloVe_Model_42B["huge"], GloVe_Model_42B["big"], GloVe_Model_42B["enormous"]]
# Reduce the six word vectors to two principal components and display the result.
palabras = performPCA(words_PCA,2)
palabras

In [None]:
#from sklearn.decomposition import PCA


#arrays = np.empty((0, 300), dtype='f')

	# adds the vector of the query word
#arrays = np.append(arrays, [GloVe_Model_42B["mouse"],GloVe_Model_42B["elephant"],GloVe_Model_42B["whale"], GloVe_Model_42B["dog"], GloVe_Model_42B["tiny"], size], axis=0)

#principalComponents =  PCA(n_components=2).fit_transform(arrays)
#principalComponents


### Graph vectors

In [None]:
def graphVectors(wordsPCA, names):
    """
    Scatter-plot 2-D word coordinates and label the points.

    wordsPCA: output of performPCA with two components (one row per word)
    names: labels for the rows of wordsPCA, in the same order; it must not
           contain more entries than wordsPCA has rows, otherwise indexing
           the coordinates fails
    """
    import matplotlib.pyplot as plt

    # split the two-column array into its x and y coordinates
    x, y = wordsPCA.T

    plt.scatter(x, y)
    for i, label in enumerate(names):
        plt.annotate(label, (x[i], y[i]))

    plt.show()

In [None]:

# `palabras` holds six projected vectors, so exactly six labels are passed;
# a seventh label would index past the end of the coordinates.
graphVectors(palabras, ["small", "tiny", "little", "huge", "big", "enormous"])

# Images

### Hot-cold scale

In [None]:
# Mean embedding for each end of the cold -> hot scale

cold = get_end_vector(["cold", "frozen", "icy"])
hot = get_end_vector(["hot", "boiling", "burning"])

# Words to be placed on the scale

words = ["fire", "lava", "sun", "wood", "metal", "ice", "snow"]

# NOTE(review): the appends below require `glove(words)` to return a list — confirm
PCA_words = glove(words)

# Append both ends of the scale after the words (cold first, then hot),
# so they become the last two rows passed to PCA
PCA_words.append(cold)
PCA_words.append(hot)

# Reduce words and scale ends together to two principal components
words_2D = performPCA(PCA_words, 2)



In [None]:
# Inspect the container type that holds the embeddings plus the two appended end vectors.
type(PCA_words)

In [None]:
from sympy.abc import x

# Split the projected points into their first (X) and second (Y) components
X, Y = words_2D.T

# The two scale ends were appended after the words, so they are the last two
# rows. Addressing them from the end keeps this correct if the word list changes length.
x_c, y_c = words_2D[-2]
x_h, y_h = words_2D[-1]

# Line through both ends: slope m, intercept b, symbolic expression F in x
# (a vertical line, x_h == x_c, would divide by zero here)
m = (y_h - y_c) / (x_h - x_c)
b = y_c - (m * x_c)
F = (m * x) + b



Plot the linear function that passes through both ends of the scale.

In [None]:
import numpy as np

def plotFunction(F):
    """
    Sample the symbolic expression F at 100 evenly spaced x values in [-5, 5].

    F: expression exposing `.subs`, written in terms of the module-level
       symbol `x` (imported from sympy.abc elsewhere in the notebook)

    Returns (X_axis, Y_axis): the sample positions and F evaluated at each one.
    """
    X_axis = np.linspace(-5, 5, 100)
    Y_axis = np.zeros_like(X_axis)

    for i, x_value in enumerate(X_axis):
        # substitute the numeric value for the symbol x and store the result
        Y_axis[i] = F.subs(x, x_value)

    return X_axis, Y_axis


Function that evaluates F at the x coordinate of each point. Returns a list with the resulting y values.

In [None]:
def evaluateWords(words_2D, F):
    """
    Evaluate F at the first coordinate of every point in words_2D.

    words_2D: iterable of points whose element 0 is the x coordinate
    F: expression exposing `.subs`, written in terms of the module-level
       symbol `x` (imported from sympy.abc elsewhere in the notebook)

    Returns a list with one evaluated value per point, in input order.
    """
    return [F.subs(x, word[0]) for word in words_2D]
  

In [None]:
# Value of the scale line F at the x coordinate of every projected point.
Y_data_F = evaluateWords(words_2D, F)
Y_data_F

Plot everything

In [None]:
import matplotlib.pyplot as plt

# Scatter every projected point (the words plus both scale ends)
plt.scatter(X, Y)

# Label only the words; the two appended scale ends stay unlabelled
for idx, word_label in enumerate(words):
    plt.annotate(word_label, (X[idx], Y[idx]))

# Highlight the cold end, then the hot end
plt.scatter(x_c, y_c, color = 'yellow')
plt.scatter(x_h, y_h, color = 'orange')

# Segment joining the two ends
plt.plot([x_c, x_h], [y_c, y_h], color = 'red')

# The same line, drawn over the x range sampled by plotFunction
X_data, Y_data = plotFunction(F)
plt.plot(X_data, Y_data,  c ="red" )

# Vertical segment from each point to the scale line
for x_point, y_point, y_on_line in zip(X, Y, Y_data_F):
    plt.plot([x_point, x_point], [y_point, y_on_line])

plt.savefig('Hot_cold_scale')
plt.show()


### Animal size

In [None]:
# Mean embedding for each end of the small -> big scale

small = get_end_vector(["small", "tiny", "little"])
big = get_end_vector(["big", "huge", "large"])

# Animals to be placed on the scale

animals = ["mouse", "hamster", "ant", "dog", "rhino", "elephant", "whale"]
# NOTE(review): the appends below require `glove(animals)` to return a list — confirm
PCA_animals = glove(animals)

# Append both ends of the scale after the animals (small first, then big)
PCA_animals.append(small)
PCA_animals.append(big)

# Reduce animals and scale ends together to two principal components
animals_2D = performPCA(PCA_animals,2)

In [None]:
# Define x's and y's for scatter plot
X_animals, Y_animals = animals_2D.T

# The two scale ends were appended after the animals, so they are the last two
# rows. Addressing them from the end keeps this correct if the animal list changes length.
x_s, y_s = animals_2D[-2]
x_b, y_b = animals_2D[-1]

# Line through both ends: slope, intercept, symbolic expression in x
m_animals = (y_b - y_s) / (x_b - x_s)
b_animals = y_s - (m_animals * x_s)
F_animals = (m_animals * x) + b_animals

# Value of the line at the x coordinate of every projected point
Y_data_F_animals = evaluateWords(animals_2D, F_animals)

In [None]:
import matplotlib.pyplot as plt

# Scatter every projected point (the animals plus both scale ends)
plt.scatter(X_animals, Y_animals)

# Label only the animals; the two appended scale ends stay unlabelled
for idx, animal_label in enumerate(animals):
    plt.annotate(animal_label, (X_animals[idx], Y_animals[idx]))

# Highlight the small end, then the big end
plt.scatter(x_s, y_s, color = 'yellow')
plt.scatter(x_b, y_b)

# Segment joining the two ends
plt.plot([x_s, x_b], [y_s, y_b], color = 'red')

# The same line, drawn over the x range sampled by plotFunction
X_data_animals, Y_data_animals = plotFunction(F_animals)
plt.plot(X_data_animals, Y_data_animals,  c ="red" )

# Vertical segment from each point to the scale line
for x_point, y_point, y_on_line in zip(X_animals, Y_animals, Y_data_F_animals):
    plt.plot([x_point, x_point], [y_point, y_on_line])

plt.savefig('animal_size_scale.png')
plt.show()


### Sweet-Sour scale

In [None]:
# Mean embedding for each end of the sweet -> sour scale

sweet = get_end_vector(["sweet", "sugary", "candied"])
sour = get_end_vector(["acid", "tart", "bitter"])

# Foods to be placed on the scale

food = ["sugar", "honey", "salt", "lemon", "vinegar", "water", "milk"]
# NOTE(review): the appends below require `glove(food)` to return a list — confirm
PCA_food = glove(food)

# Append both ends of the scale after the foods (sweet first, then sour)
PCA_food.append(sweet)
PCA_food.append(sour)

# Reduce foods and scale ends together to two principal components
food_2D = performPCA(PCA_food,2)

In [None]:
# Define x's and y's for scatter plot
X_food, Y_food = food_2D.T

# The two scale ends were appended after the foods, so they are the last two
# rows. Addressing them from the end keeps this correct if the food list changes length.
x_sweet, y_sweet = food_2D[-2]
x_sour, y_sour = food_2D[-1]

# Line through both ends: slope, intercept, symbolic expression in x
m_food = (y_sour - y_sweet) / (x_sour - x_sweet)
b_food = y_sweet - (m_food * x_sweet)
F_food = (m_food * x) + b_food

# Value of the line at the x coordinate of every projected point
Y_data_F_food = evaluateWords(food_2D, F_food)

In [None]:
# Scatter every projected point (the foods plus both scale ends)
plt.scatter(X_food, Y_food)

# Label only the foods; the two appended scale ends stay unlabelled
for idx, food_label in enumerate(food):
    plt.annotate(food_label, (X_food[idx], Y_food[idx]))

# Highlight the sweet end, then the sour end
plt.scatter(x_sweet, y_sweet, color = 'yellow')
plt.scatter(x_sour, y_sour)

# Segment joining the two ends
plt.plot([x_sweet, x_sour], [y_sweet, y_sour], color = 'red')

# The same line, drawn over the x range sampled by plotFunction
X_data_food, Y_data_food = plotFunction(F_food)
plt.plot(X_data_food, Y_data_food,  c ="red" )

# Vertical segment from each point to the scale line
for x_point, y_point, y_on_line in zip(X_food, Y_food, Y_data_F_food):
    plt.plot([x_point, x_point], [y_point, y_on_line])

plt.show()

### Justice-Injustice scale

In [None]:
# Mean embedding for each end of the justice -> injustice scale
# ("oppression" is spelled correctly so the lookup uses the intended word
# rather than a misspelled token)

justice = get_end_vector(["justice", "equity", "legal"])
injustice = get_end_vector(["injustice", "inequity", "oppression"])

# Words to be placed on the scale

words_justice = ["gay", "migrant", "woman", "man", "american", "transexual", "latino"]
# NOTE(review): the appends below require `glove(words_justice)` to return a list — confirm
PCA_justice = glove(words_justice)

# Append both ends of the scale after the words (justice first, then injustice)
PCA_justice.append(justice)
PCA_justice.append(injustice)

# Reduce words and scale ends together to two principal components
justice_2D = performPCA(PCA_justice,2)

In [None]:
# Define x's and y's for scatter plot
X_justice, Y_justice = justice_2D.T

# The two scale ends were appended after the words, so they are the last two
# rows. Addressing them from the end keeps this correct if the word list changes length.
x_justice, y_justice = justice_2D[-2]
x_injustice, y_injustice = justice_2D[-1]

# Line through both ends: slope, intercept, symbolic expression in x
m_justice = (y_injustice - y_justice) / (x_injustice - x_justice)
b_justice = y_justice - (m_justice * x_justice)
F_justice = (m_justice * x) + b_justice

# Value of the line at the x coordinate of every projected point
Y_data_F_justice = evaluateWords(justice_2D, F_justice)

In [None]:
# Use a larger square canvas for this figure
plt.figure(figsize=(10,10))

# Scatter every projected point (the words plus both scale ends)
plt.scatter(X_justice, Y_justice)

# Label only the words; the two appended scale ends stay unlabelled
for idx, word_label in enumerate(words_justice):
    plt.annotate(word_label, (X_justice[idx], Y_justice[idx]))

# Highlight the justice end, then the injustice end
plt.scatter(x_justice, y_justice, color = 'yellow')
plt.scatter(x_injustice, y_injustice, color = 'orange')

# Segment joining the two ends
plt.plot([x_justice, x_injustice], [y_justice, y_injustice], color = 'red')

# The same line, drawn over the x range sampled by plotFunction
X_data_justice, Y_data_justice = plotFunction(F_justice)
plt.plot(X_data_justice, Y_data_justice,  c ="red" )

# Vertical segment from each point to the scale line
for x_point, y_point, y_on_line in zip(X_justice, Y_justice, Y_data_F_justice):
    plt.plot([x_point, x_point], [y_point, y_on_line])

plt.savefig('justice_injustice_scale.png')
plt.show()

## Find scale vector from centroids

In [None]:
# 1 Generate end vectors

small = glove(["small", "tiny", "little"])
big = glove(["big", "huge", "large"])

# NOTE(review): each group is reduced with its own PCA fit, so the two sets of
# 2-D coordinates may not share a coordinate system — confirm this is intended
small_2D = performPCA(small, 2)
big_2D = performPCA(big, 2)

# 2-D coordinates of "small", "tiny", "little"
x_s, y_s = small_2D[0].T
x_t, y_t = small_2D[1].T
x_l, y_l = small_2D[2].T

# 2-D coordinates of "big", "huge", "large" — all three are read from big_2D
# (reading the first two from small_2D would mix the small words into the big centroid)
x_b, y_b = big_2D[0].T
x_h, y_h = big_2D[1].T
x_large, y_large = big_2D[2].T



In [None]:
# Centroid of the small end: mean of the three points, per coordinate
x_avg_small, y_avg_small = (x_s + x_t + x_l) / 3, (y_s + y_t + y_l) / 3
small_centroid = [x_avg_small, y_avg_small]

small_centroid

# Centroid of the big end: mean of the three points, per coordinate
x_avg_big, y_avg_big = (x_b + x_h + x_large) / 3, (y_b + y_h + y_large) / 3
big_centroid = [x_avg_big, y_avg_big]

# last expression of the cell: this is the value the notebook displays
big_centroid

In [None]:
# Plot line between centroids.
# The coordinates use descriptive names so that the sympy symbol `x`, which
# plotFunction and evaluateWords read from the global scope, is not overwritten.
centroid_xs = (small_centroid[0], big_centroid[0])
centroid_ys = (small_centroid[1], big_centroid[1])

plt.plot(centroid_xs, centroid_ys)

Plot animals alongside SIZE scale

In [None]:
animals = ["mouse", "hamster", "ant", "dog", "rhino", "elephant", "whale"]

PCA_animals = glove(animals)

# Perform PCA on list
# NOTE(review): this PCA fit is separate from the fits behind the centroids, so
# the red segment and the points may not share a coordinate system — confirm
animals_2D = performPCA(PCA_animals, 2)

# Named animal_x / animal_y so that the sympy symbol `x`, which plotFunction
# and evaluateWords read from the global scope, is not rebound
animal_x, animal_y = animals_2D.T

plt.scatter(animal_x, animal_y)
plt.plot([small_centroid[0], big_centroid[0]], [small_centroid[1], big_centroid[1]], color = "red")

for i, label in enumerate(animals):
    plt.annotate(label, (animal_x[i], animal_y[i]))

plt.show()

In [None]:
'''
animals = ["mouse", "hamster", "ant", "dog", "rhino", "elephant", "whale"]
small_feature = get_end_vector(["small", "tiny", "little"])
big_feature = get_end_vector(["big", "huge", "large"])


PCA_animals = glove(animals)
PCA_animals.append(small_feature)
PCA_animals.append(big_feature)


#Perform PCA on list
animals_2D = performPCA(PCA_animals, 2)

x, y = animals_2D.T

x_s, y_s = animals_2D[7].T
x_b, y_b = animals_2D[8].T


plt.scatter(x, y)
plt.plot([small_centroid[0], big_centroid[0]], [small_centroid[1], big_centroid[1]], color = "purple")
for i, label in enumerate(animals):
	plt.annotate(label, (x[i], y[i]))

plt.scatter(x_s, y_s)
plt.scatter(x_b, y_b)
plt.plot([x_s, x_b], [y_s, y_b], color = 'red')

plt.show()
'''

In [None]:
animals = ["mouse", "hamster", "ant", "dog", "rhino", "elephant", "whale", "small", "big"]

PCA_animals = glove(animals)

# Perform PCA on list
animals_2D = performPCA(PCA_animals, 2)

# Named animal_x / animal_y so that the sympy symbol `x`, which plotFunction
# and evaluateWords read from the global scope, is not rebound
animal_x, animal_y = animals_2D.T

# Look the two scale words up by name instead of by a fixed row number,
# so reordering or extending `animals` cannot select the wrong rows
x_s, y_s = animals_2D[animals.index("small")]
x_b, y_b = animals_2D[animals.index("big")]

plt.scatter(animal_x, animal_y)

for i, label in enumerate(animals):
    plt.annotate(label, (animal_x[i], animal_y[i]))

# Re-draw the two scale words on top and join them with a red segment
plt.scatter(x_s, y_s)
plt.scatter(x_b, y_b)
plt.plot([x_s, x_b], [y_s, y_b], color = 'red')

plt.show()

https://plotly.com/python/3d-scatter-plots/


In [None]:
import plotly.graph_objects as go
import numpy as np

animals = ["mouse", "hamster", "ant", "dog", "rhino", "elephant", "whale", "small", "big"]

PCA_animals = glove(animals)

# Perform PCA on list, keeping three components for the 3-D scatter
# (the name animals_2D is kept from the earlier cells)
animals_2D = performPCA(PCA_animals, 3)

# Named *_3d so that the sympy symbol `x`, which plotFunction and
# evaluateWords read from the global scope, is not rebound
x_3d, y_3d, z_3d = animals_2D.T

# Label each marker with its word; the labels come straight from `animals`
# so the two lists cannot drift apart
data = go.Scatter3d(x=x_3d, y=y_3d, z=z_3d, mode='markers+text', text=animals)
fig = go.Figure(data = data)

fig.show()

To do:
* Lollipops
* Sums
* Heat maps


# Lollipops

In [None]:
import pandas as pd
import matplotlib.pyplot as plt


# The data frame must contain each word and its score on the scale.
# The scale vector is built by get_feature_vector(); get_scores() uses it to
# compute each word's projection score.

words = ["gay", "migrant", "woman", "man", "american", "transexual", "latino", "king", "queen", "traitor", "killer", "outsider", "heterosexual", "lesbian", "bisexual", "pansexual"]
b =  ["good", "valid", "legal"]
# "oppression" is spelled correctly so the lookup uses the intended word
a = ["bad", "odd", "oppression"]

# Scores run along the axis from the `a` words to the `b` words
word_rankings = get_scores(words, a , b)

words_and_projections = zip(words,word_rankings)

# Create a pandas data frame
df = pd.DataFrame(words_and_projections, columns = ["word", "score"])

# Sort data frame from lowest to highest
ordered_df = df.sort_values(by='score')
my_range=range(1,len(df.index)+1)

# For vertical plot
'''
# For vertical plot
plt.stem(ordered_df['score'])
plt.xticks( my_range, ordered_df['word'])
plt.show()
'''
# For horizontal plot:

# The horizontal plot is made using the hline function

# Give the entry for 'gay' its own colour and a larger marker
is_highlighted = ordered_df['word'] == 'gay'
my_color = np.where(is_highlighted, 'orange', 'skyblue')
my_size = np.where(is_highlighted, 70, 30)

# One horizontal stem per word, starting at 0 and ending at its score,
# with a dot marking the score
plt.figure(figsize=(8,8))
plt.hlines(y=my_range, xmin=0, xmax=ordered_df['score'], color=my_color, alpha=0.4)
plt.scatter(ordered_df['score'], my_range, color=my_color, s=my_size, alpha=1)

# Tick labels, title and axis names
plt.yticks(my_range, ordered_df['word'])
plt.title("Validity scale for different entities", loc='center')
plt.xlabel('validity score')
plt.ylabel('entity')

# Save to disk, then display
plt.savefig('justice_injustice_lollipops')
plt.show()



# Vector addition projections

### Projection scores for compound words

In [None]:
def get_projection_score_sum(u, v):
    """
    Return the scalar coefficient of u along v: (u . v) / (v . v).
    """
    numerator = np.dot(u, v)
    denominator = np.dot(v, v)
    return numerator / denominator

def get_scores_sum(words, feature_set_1, feature_set_2):
    """
    Score already-embedded vectors along the feature_set_1 -> feature_set_2 axis.

    words: iterable of embedding vectors (not strings); they are passed to the
           dot product as-is, without any embedding lookup
    feature_set_1, feature_set_2: lists of words for the two ends of the scale

    Returns a list with one projection score per input vector, in input order.
    """
    # direction that spans the feature subspace
    axis = get_feature_vector(feature_set_1, feature_set_2)

    scores = []
    for vector in words:
        scores.append(get_projection_score_sum(vector, axis))
    return scores



In [None]:
# Mean embedding of the three "big" words.
big = get_end_vector(["large", "big", "huge"])

In [None]:
# VectorAddition is given one-word lists, so np.add returns an array with a
# leading axis of length 1. Row 0 is taken so each compound is a plain vector,
# like the per-word entries it is scored alongside.
# (assumes glove([...]) yields one vector per word — TODO confirm)
big_dog = VectorAddition(["big"], ["dog"])[0]
small_dog = VectorAddition(["small"], ["dog"])[0]

In [None]:
words = ["dog","whale","mouse"]

# NOTE(review): the appends below require `glove(words)` to return a list — confirm
glove_words = glove(words)

# Append the two compound vectors after the single-word embeddings
glove_words.append(big_dog)
glove_words.append(small_dog)

# The result of this call is not assigned and is not the cell's last expression,
# so it is neither stored nor displayed.
get_scores_sum(glove_words, ["small", "little", "tiny"], ["large", "big", "huge"])

# Labels in the same order as the vectors in glove_words
word_labels = ["dog", "whale", "mouse", "big_dog", "small_dog"]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def LollipopsWordSum(words, word_label, a, b, title, x_axis, *, highlight='gay', filename=None):
    """
    Draw a horizontal lollipop chart of projection scores and save it to disk.

    words: embedding vectors to score (passed to get_scores_sum)
    word_label: labels for the vectors in `words`, in the same order
    a, b: word lists for the two ends of the scale (scores run from a to b)
    title: chart title
    x_axis: label for the score axis
    highlight: label drawn in orange with a larger marker (default 'gay')
    filename: path handed to plt.savefig; when omitted it is derived from
              `title`, so charts with different titles do not overwrite
              one another
    """
    word_rankings = get_scores_sum(words, a, b)

    # one row per (label, score) pair
    df = pd.DataFrame(list(zip(word_label, word_rankings)), columns=["word", "score"])

    # sort from lowest to highest score; rows are drawn at y = 1..N in that order
    ordered_df = df.sort_values(by='score')
    my_range = range(1, len(df.index) + 1)

    # A vertical variant could be drawn with:
    #   plt.stem(ordered_df['score'])
    #   plt.xticks(my_range, ordered_df['word'])

    # colour and size for the highlighted entry versus every other entry
    is_highlighted = ordered_df['word'] == highlight
    my_color = np.where(is_highlighted, 'orange', 'skyblue')
    my_size = np.where(is_highlighted, 70, 30)

    # horizontal stems from 0 to the score, with a dot at the score
    plt.figure(figsize=(8, 8))
    plt.hlines(y=my_range, xmin=0, xmax=ordered_df['score'], color=my_color, alpha=0.4)
    plt.scatter(ordered_df['score'], my_range, color=my_color, s=my_size, alpha=1)

    # tick labels, title and axis names
    plt.yticks(my_range, ordered_df['word'])
    plt.title(title, loc='center')
    plt.xlabel(x_axis)
    plt.ylabel('entity')

    # save, then display
    if filename is None:
        filename = title.replace(' ', '_') + '_lollipops'
    plt.savefig(filename)
    plt.show()


In [None]:
# Lollipop chart of the single-word and compound vectors on the small -> large scale.
LollipopsWordSum(glove_words, word_labels, ["small", "little", "tiny"], ["large", "big", "huge"], "animal size", "size score")