# Step 1: Setup

In [None]:
# Mount Google Drive inside the Colab runtime and change into the shared project
# folder, so that files opened by relative name in later cells resolve there.
# NOTE(review): the folder path is hard-coded to one specific shared drive.

from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/Shareddrives/Computational Semantics A3/Code'

Mounted at /content/drive
/content/drive/Shareddrives/Computational Semantics A3/Code


In [None]:
# Import Necessary Packages

import os
import pandas as pd
import numpy as np
import spacy
import gensim.downloader as api
from gensim.models import FastText
import string
import numpy as np
from scipy.linalg import orthogonal_procrustes
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances

In [None]:
# Load the two saved FastText models from the current working directory.
# old_model comes from 'coha_ft.model', modern_model from 'coca_ft.model'.
old_model = FastText.load('coha_ft.model')
modern_model = FastText.load('coca_ft.model')

Additional context on the models:

Both models were created using the same embedding algorithm, FastText, with the same hyperparameters: a context window of 5, an embedding size of 100, and 5 training epochs. The only difference between the two models is the training data: the "old_model" was trained on the COHA corpus, filtered to entries dated before 1910, while the "modern_model" was trained on COCA, which contains text from 1990-2010.

The COHA corpus was filtered because its original date range runs from 1810 to 2000, which overlaps with COCA. To keep the two corpora from overlapping and to get a clear separation between the two time periods, we restricted COHA to text from 1810 to 1910 and used that as our "old" corpus.

# Step 2: Initial Analysis

Replicate the global and local neighborhood measure that was developed by Hamilton et al. (https://drive.google.com/file/d/15KhHUT5QsmEiQR4rFLTt7Jyv4pKJ823n/view?usp=drive_link) Please see section 2.1 for more details.

This is just meant to be a starting point to identify what works and what doesn't and then discuss how we can further improve on the methodology.

In [None]:
# Words whose old-vs-modern embeddings are compared throughout the notebook.
# NOTE(review): presumably the first ten are expected-stable controls and the
# last ten are expected to have shifted in meaning — confirm with the authors.
target_words = [
    'accident', 'business', 'disease', 'face', 'girl',
    'governor', 'nice', 'pudding', 'thing', 'wife',
    'broadcast', 'bug', 'cool', 'gay', 'post',
    'queer', 'silly', 'terrific', 'virus', 'web',
]


In [None]:
def global_measure(word:str):
  """Cosine similarity between a word's raw vectors in the old and modern models.

  The vectors are read straight from `old_model.wv` and `modern_model.wv`; no
  alignment between the two embedding spaces is applied here.

  Args:
    word: The word to compare across the two models.

  Returns:
    The cosine similarity as a scalar, or None when either model raises
    KeyError for `word` (i.e. it cannot provide a vector for it).
  """
  try:
    old_wv = old_model.wv[word].reshape(1,-1)
    modern_wv = modern_model.wv[word].reshape(1,-1)
  except KeyError:
    # Only a failed vector lookup is treated as "no score". Anything else
    # (e.g. the models not being loaded) should surface instead of silently
    # turning into None.
    return None

  output = cosine_similarity(old_wv, modern_wv)
  return output[0][0]

def local_neighborhood_measure(word:str, k:int=50):
  """Local neighbourhood change of `word` between the old and modern models.

  Procedure:
    1. take the k nearest neighbours of `word` in each model,
    2. form the de-duplicated union of both neighbour lists,
    3. in each model, record the similarity of `word` to every word in the
       union; this list of scores is treated as a second order embedding,
    4. return the cosine distance between the two second order embeddings.

  Args:
    word: Target word.
    k: Number of nearest neighbours taken from each model. Defaults to 50,
      the value that used to be hard-coded.

  Returns:
    The cosine distance as a scalar (0 means identical second order
    embeddings), or None when one of the model lookups raises KeyError.
  """
  try:
    # get K-nearest-neighbors in each model
    old_similarity_pairs = old_model.wv.most_similar(word, topn=k)
    modern_similarity_pairs = modern_model.wv.most_similar(word, topn=k)

    # union of the neighbour words from both models; sorted so the component
    # order of the second order embeddings is the same on every run
    union_words = sorted(
        {neighbour for neighbour, _ in old_similarity_pairs}
        | {neighbour for neighbour, _ in modern_similarity_pairs}
    )

    # for each model get the similarity of the target word to each union word
    old_second_order_embedding = np.array([old_model.wv.similarity(word, w) for w in union_words]).reshape(1,-1)
    modern_second_order_embedding = np.array([modern_model.wv.similarity(word, w) for w in union_words]).reshape(1,-1)
  except KeyError:
    # Only failed vocabulary/vector lookups mean "no score"; other errors
    # are real problems and are allowed to propagate.
    return None

  # cosine DISTANCE (not similarity) between the two second order embeddings
  output = cosine_distances(old_second_order_embedding, modern_second_order_embedding)
  return output[0][0]


In [None]:
# Score every target word with the unaligned global measure, then print one
# line per word in the order the words appear in target_words.
global_measure_scores = {word: global_measure(word) for word in target_words}

for word, score in global_measure_scores.items():
  print(f'{word}: Global Measure = {score}')

accident: Global Measure = 0.1941033899784088
business: Global Measure = 0.1111987754702568
disease: Global Measure = 0.010570740327239037
face: Global Measure = 0.15259996056556702
girl: Global Measure = 0.0609852559864521
governor: Global Measure = 0.03833675757050514
nice: Global Measure = 0.03708072006702423
pudding: Global Measure = -0.09021998196840286
thing: Global Measure = 0.15583807229995728
wife: Global Measure = 0.09058663249015808
broadcast: Global Measure = 0.08077164739370346
bug: Global Measure = -0.04566376283764839
cool: Global Measure = 0.1561077982187271
gay: Global Measure = 0.15243610739707947
post: Global Measure = -0.09178338944911957
queer: Global Measure = -0.013361874036490917
silly: Global Measure = -0.04037310928106308
terrific: Global Measure = 0.24985845386981964
virus: Global Measure = 0.22041653096675873
web: Global Measure = -0.016600143164396286


In [None]:
# Repeat the per-word scoring, this time with the local neighborhood measure.
local_neighborhood_scores = {
  word: local_neighborhood_measure(word) for word in target_words
}

for word, score in local_neighborhood_scores.items():
  print(f'{word}: Local Neighborhood Measure = {score}')


accident: Local Neighborhood Measure = 0.0420495867729187
business: Local Neighborhood Measure = 0.007288217544555664
disease: Local Neighborhood Measure = 0.03096938133239746
face: Local Neighborhood Measure = 0.012492835521697998
girl: Local Neighborhood Measure = 0.03965604305267334
governor: Local Neighborhood Measure = 0.024483978748321533
nice: Local Neighborhood Measure = 0.08520054817199707
pudding: Local Neighborhood Measure = 0.03461003303527832
thing: Local Neighborhood Measure = 0.00948566198348999
wife: Local Neighborhood Measure = 0.02271634340286255
broadcast: Local Neighborhood Measure = 0.14218676090240479
bug: Local Neighborhood Measure = 0.049362242221832275
cool: Local Neighborhood Measure = 0.09286993741989136
gay: Local Neighborhood Measure = 0.40256887674331665
post: Local Neighborhood Measure = 0.08789438009262085
queer: Local Neighborhood Measure = 0.2586768865585327
silly: Local Neighborhood Measure = 0.07165944576263428
terrific: Local Neighborhood Measure = 

## Initial Analysis Discussion

Interestingly, the two measures give very different results. The global measure suggests that the old and modern embeddings are almost unrelated, with cosine similarity scores close to zero for every target word. However, this methodology has a clear problem, as outlined by Kutuzov et al., who state:

> "It usually does not make sense to, for example, directly calculate cosine similarities between embeddings of one and the same word in two different models. The reason is that most modern word embedding algorithms are inherently stochastic and the resulting embedding sets are invariant under rotation. Thus, even when trained on the same data, separate learning runs will produce entirely different numerical vectors (though with roughly the same pairwise similarities between vectors for particular words). This is expressed even stronger for models trained on different corpora. It means that even if word meaning is completely stable, the direct cosine similarity between its vectors from different time periods can still be quite low, simply because the random initializations of the two models were different."

## Next Steps...

### Fixing the Global Measure

To resolve the issue raised by Kutuzov et al. for the global measure, we can apply their suggested fix, which is to align the two embedding spaces before computing cosine similarity. The alignment can be found by solving the orthogonal Procrustes problem (https://en.wikipedia.org/wiki/Orthogonal_Procrustes_problem):

>  In its classical form, one is given two matrices $A$ and $B$ and asked to find an orthogonal matrix $\Omega$  which most closely maps $A$ to $B$

Given this we have two approaches for aligning the two matrices:
1. The traditional approach which is to find $\Omega$ that aligns the matrix of all the vectors for all the common words between A and B
2. Another approach which is to focus on aligning a smaller subset of words between A and B and using these as "anchor points" to align the two matrices.
    - In this case we will use stop words as "anchor points" then given the $\Omega$ that aligns vectors of the stop words between A and B we will use this $\Omega$ to align all the common words of A and B
    - To be able to do this we will use the transpose of $\Omega$
    - My intuition for this approach is that it does not force words that are not meant to be aligned to align. By focusing on a small subset of words that we are sure to have not undergone a semantic shift this will not "lose" the shifts when we align the two matrices

### Fixing the local neighborhood measure

As of now there aren't any fixes for the results we obtained in the local neighborhood measure. But, what we can do is to further investigate what went wrong by taking a deeper look into the similar words for each of our target words.

Although it is mentioned that aligning the matrices then performing the local neighborhood measure is shown to provide better results:

> Hamilton et al. (2016c) showed that these two approaches can be used simultaneously: they employed both 'second order embeddings' and orthogonal Procrustes transformations to align diachronic models

# Step 3.1 Fixing the Global Measure

## Step 3.1.1 Align everything

In [None]:
# Align the old embedding space onto the modern one using orthogonal Procrustes

# Words present in BOTH vocabularies. This must be the set intersection (&):
# with a union (|), words known to only one model would also be included,
# and FastText would fill in their vectors in the other model from character
# n-grams instead of from training data. The list is sorted so that row
# indices are the same on every kernel restart.
common_words = sorted(set(old_model.wv.index_to_key) & set(modern_model.wv.index_to_key))

# One row per common word, in the same order in both matrices, so that row i
# of old_matrix and row i of modern_matrix describe the same word.
old_matrix = np.vstack([old_model.wv[w] for w in common_words])
modern_matrix = np.vstack([modern_model.wv[w] for w in common_words])

# T is the orthogonal matrix that brings old_matrix @ T as close as possible
# to modern_matrix; the second return value (a scale) is not needed.
T, _ = orthogonal_procrustes(old_matrix, modern_matrix)

# Old vectors expressed in the modern model's coordinate system.
old_aligned = np.dot(old_matrix, T)

In [None]:
def aligned_global_measure(word:str, matrixA, matrixB):
  """Cosine similarity between the rows of two matrices that belong to `word`.

  The row is located through the position of `word` in the module-level
  `common_words` list, so both matrices are expected to be row-aligned with it.

  Args:
    word: Word to look up.
    matrixA: First embedding matrix (indexed by row).
    matrixB: Second embedding matrix (indexed by row).

  Returns:
    The cosine similarity as a scalar. When `word` is not in `common_words`
    a message is printed and 0 is returned.
    NOTE(review): 0 is also a valid similarity value, so callers cannot tell
    a missing word from orthogonal vectors — consider returning None.
  """
  try:
    row = common_words.index(word)
  except ValueError:
    print('word is not present in both corpora')
    return 0

  vector_a = matrixA[row].reshape(1,-1)
  vector_b = matrixB[row].reshape(1,-1)
  return cosine_similarity(vector_a, vector_b)[0][0]

In [None]:
# Score the target words again, now comparing the Procrustes-aligned old
# vectors (old_aligned) with the modern vectors.
global_measure_scores = {
  word: aligned_global_measure(word, old_aligned, modern_matrix)
  for word in target_words
}

for word, score in global_measure_scores.items():
  print(f'{word}: Aligned Global Measure = {score}')

accident: Aligned Global Measure = 0.6539490818977356
business: Aligned Global Measure = 0.7339191436767578
disease: Aligned Global Measure = 0.6245746612548828
face: Aligned Global Measure = 0.7371307611465454
girl: Aligned Global Measure = 0.7367798089981079
governor: Aligned Global Measure = 0.6273795962333679
nice: Aligned Global Measure = 0.6467378735542297
pudding: Aligned Global Measure = 0.633198618888855
thing: Aligned Global Measure = 0.7289757132530212
wife: Aligned Global Measure = 0.77433180809021
broadcast: Aligned Global Measure = 0.32349973917007446
bug: Aligned Global Measure = 0.35556283593177795
cool: Aligned Global Measure = 0.44020798802375793
gay: Aligned Global Measure = 0.15426336228847504
post: Aligned Global Measure = 0.322643518447876
queer: Aligned Global Measure = 0.23175914585590363
silly: Aligned Global Measure = 0.462099552154541
terrific: Aligned Global Measure = 0.4677477777004242
virus: Aligned Global Measure = 0.4765472412109375
web: Aligned Global M

We can see that, after alignment, the global measures are much higher, as expected. However, we still need to look at the distribution of the global measure over all words to see whether our target words fall outside the expected range, which would indicate the uncommon behavior of having undergone a semantic shift.

In [None]:
# Data for the histogram: the global measure of every common word.
# Each score compares a word's aligned old vector with its modern vector;
# the rows of both matrices follow the order of common_words.
gm_scores = [
  cosine_similarity(old_row.reshape(1,-1), modern_row.reshape(1,-1))[0][0]
  for old_row, modern_row in zip(old_aligned, modern_matrix)
]

# One row per word: the word itself and its global measure.
df = pd.DataFrame({
  'word': list(common_words),
  'global_measure': gm_scores
})

In [None]:
# NOTE(review): plotly is imported here rather than in the import cell at the
# top of the notebook; consider moving it there.
import plotly.express as px
# Histogram of the 'global_measure' column of df (one value per common word).
fig = px.histogram(df, x="global_measure")
fig.show()

Output hidden; open in https://colab.research.google.com to view.

In comparison to the original distribution of the unaligned matrices:

In [None]:
# Baseline for comparison: the same per-word cosine similarity, but computed
# on the raw old_matrix, i.e. without applying the Procrustes rotation.
gm_scores = [
  cosine_similarity(old_row.reshape(1,-1), modern_row.reshape(1,-1))[0][0]
  for old_row, modern_row in zip(old_matrix, modern_matrix)
]

# One row per word: the word itself and its unaligned global measure.
df2 = pd.DataFrame({
  'word': list(common_words),
  'global_measure': gm_scores
})

In [None]:
# Histogram of the 'global_measure' column of df2.
fig = px.histogram(df2, x="global_measure")
fig.show()

Output hidden; open in https://colab.research.google.com to view.

## Step 3.1.2 Align only stop words

In [None]:
# Estimate the Procrustes rotation from stop words only, then apply that
# rotation to the vectors of every common word.
import spacy
from spacy.lang.en import stop_words

# Row lookup built once, so each stop word is resolved with a dictionary
# access instead of scanning the whole common_words list twice.
common_word_rows = {w: row for row, w in enumerate(common_words)}

stop_idx = []

# get the rows in common_words of all the STOP_WORDS; stop words that are not
# in common_words are printed and left out of the anchor set. Sorted so the
# anchor order (and the printed list) is the same on every run.
for word in sorted(stop_words.STOP_WORDS):
  if word in common_word_rows:
    stop_idx.append(common_word_rows[word])
  else:
    print(word)

# Anchor matrices: only the stop-word rows of each embedding matrix.
old_matrix_subset = old_matrix[stop_idx, :]
modern_matrix_subset = modern_matrix[stop_idx, :]

# T is the orthogonal matrix that brings old_matrix_subset @ T as close as
# possible to modern_matrix_subset.
T, _ = orthogonal_procrustes(old_matrix_subset, modern_matrix_subset)

# Apply T itself, exactly as for the full alignment. Multiplying by T.T would
# apply the inverse rotation (T is orthogonal, so T.T is its inverse) and
# rotate the old vectors away from the modern space rather than onto it.
old_aligned_2 = np.dot(old_matrix, T)

whereafter
n’t
’d
’ve
hereupon
’m
’s
n‘t
‘d
‘m
‘ve
’re
‘re
’ll
‘ll
‘s


In [None]:
# Same scoring as for the full alignment; the only change is that the old
# vectors come from old_aligned_2 (stop-word alignment) instead of old_aligned.
global_measure_scores = {
  word: aligned_global_measure(word, old_aligned_2, modern_matrix)
  for word in target_words
}

for word, score in global_measure_scores.items():
  print(f'{word}: Aligned Global Measure = {score}')

accident: Aligned Global Measure = -0.01868441142141819
business: Aligned Global Measure = -0.1662360578775406
disease: Aligned Global Measure = -0.22333256900310516
face: Aligned Global Measure = 0.15100474655628204
girl: Aligned Global Measure = -0.07697530835866928
governor: Aligned Global Measure = -0.09049860388040543
nice: Aligned Global Measure = -0.1214643269777298
pudding: Aligned Global Measure = 0.053418781608343124
thing: Aligned Global Measure = 0.06506972759962082
wife: Aligned Global Measure = 0.015466730110347271
broadcast: Aligned Global Measure = -0.1257447749376297
bug: Aligned Global Measure = -0.045666199177503586
cool: Aligned Global Measure = -0.011359778232872486
gay: Aligned Global Measure = -0.031925052404403687
post: Aligned Global Measure = -0.18162915110588074
queer: Aligned Global Measure = 0.09371275454759598
silly: Aligned Global Measure = 0.08335381001234055
terrific: Aligned Global Measure = -0.06432860344648361
virus: Aligned Global Measure = 0.195737

In [None]:
# Distribution data for the stop-word-aligned matrices: the global measure of
# every common word, comparing old_aligned_2 rows with modern_matrix rows.
gm_scores = [
  cosine_similarity(old_row.reshape(1,-1), modern_row.reshape(1,-1))[0][0]
  for old_row, modern_row in zip(old_aligned_2, modern_matrix)
]

# One row per word: the word itself and its global measure.
df3 = pd.DataFrame({
  'word': list(common_words),
  'global_measure': gm_scores
})

In [None]:
# Histogram of the 'global_measure' column of df3.
fig = px.histogram(df3, x="global_measure")
fig.show()

Output hidden; open in https://colab.research.google.com to view.

Again, using a cutoff of around -0.25 to denote a semantic shift, none of our target words registers as having shifted. Given these results, I think we should use the methodology of aligning all the words together.

# Step 3.2 Investigate the similar words

In [None]:
def k_most_similar_words(word:str, k:int):
  """Print the nearest neighbours of `word` in both models, side by side.

  Each printed row shows the i-th neighbour (with its similarity score) from
  `old_model` on the left and from `modern_model` on the right.

  Args:
    word: Word whose neighbours are listed.
    k: Number of neighbours requested from each model.

  Returns:
    None; the table is written to stdout. If a model returns fewer than `k`
    neighbours, only as many rows as the shorter list are printed (instead
    of failing with an IndexError).
  """
  old_similarity_pairs = old_model.wv.most_similar(word, topn=k)
  modern_similarity_pairs = modern_model.wv.most_similar(word, topn=k)

  print(f'{word}:')
  print('\t\t\tOLD\t\t-\t\tMODERN')
  # Pair the two ranked lists row by row; zip stops at the shorter list.
  for (old_word, old_score), (modern_word, modern_score) in zip(old_similarity_pairs, modern_similarity_pairs):
    print(f'\t{old_word} ({old_score}) - {modern_word} ({modern_score})')

In [None]:
# Print the 50 nearest neighbours of every target word, with a separator
# after each word's table.
for target in target_words:
  k_most_similar_words(target, 50)
  print("------------\n------------")

accident:
			OLD		-		MODERN
	accident-or (0.850218653678894) - occident (0.9511099457740784)
	Accident (0.8386437892913818) - accidentaUy (0.9457829594612122)
	accidental (0.7438274621963501) - accident-no (0.9202734231948853)
	accidents (0.6518253684043884) - accidentslly (0.8879091143608093)
	accidentally (0.6251242160797119) - accidental (0.880827784538269)
	implication (0.5794974565505981) - Accident (0.8725574016571045)
	Incident (0.565422534942627) - accidently (0.8642246127128601)
	extermination (0.564968466758728) - accidentally (0.860022783279419)
	change (0.5617852210998535) - occidental (0.8322111368179321)
	inoment (0.5609436631202698) - incident (0.8239510655403137)
	obstruction (0.5575515031814575) - Occidental (0.8139122724533081)
	lameiitation (0.5536296963691711) - implosion (0.8052433729171753)
	hinderance (0.552862823009491) - Incident (0.8006395101547241)
	eventide (0.5472465753555298) - accidents (0.7918665409088135)
	accusation (0.5437805652618408) - accion (0.791

## Step 3.3: Investigate target word context

In [None]:
# Read both corpus files completely into memory, each as one UTF-8 decoded
# string. The file names are relative to the current working directory.
with open('coha_corpus.txt', "r", encoding="utf-8") as file:
  coha_corpus = file.read()

with open('coca_corpus.txt', "r", encoding="utf-8") as file:
  coca_corpus = file.read()

In [None]:
def find_substring_indices(main_string, substring):
    """Return the start index of every occurrence of `substring`.

    Occurrences may overlap: after each hit the search resumes one character
    later. The result is an ascending list; it is empty when there is no hit.
    """
    positions = []
    cursor = 0
    while True:
        hit = main_string.find(substring, cursor)
        if hit == -1:
            return positions
        positions.append(hit)
        # Step one character past the hit so overlapping matches are found too.
        cursor = hit + 1

def extract_context(corpus, target_word, window=50):
  """Return a text snippet around every occurrence of `target_word` in `corpus`.

  Args:
    corpus: The text to search.
    target_word: Substring to look for (occurrences are located with
      `find_substring_indices`).
    window: Number of characters kept on each side of the START of a match.
      The matched text itself is counted within the right-hand part.

  Returns:
    A list with one snippet per occurrence, in order of appearance.
  """
  indices = find_substring_indices(corpus, target_word)

  # Clamp the left edge at 0. Without the clamp, a match closer than `window`
  # characters to the start of the corpus yields a negative slice start,
  # which Python interprets as counting from the END of the string and so
  # produces an empty or unrelated snippet.
  output = [corpus[max(0, i - window):i + window] for i in indices]

  return output

In [None]:
# Collect the snippets around ' sick ' in the COHA corpus (default window of 50).
# The surrounding spaces are part of the search string, so only occurrences
# with a space on both sides are matched.
contexts = extract_context(coha_corpus, ' sick ')

# Step 4: Final improvements

Create a combined method using the local neighborhood and global measure to detect semantic shifts by using the aligned matrix and then getting the second order embeddings to find the words that have shifted.

In [None]:
# Display the position of 'thing' in common_words.
# NOTE(review): looks like a leftover sanity check — confirm it is still needed.
common_words.index('thing')

109315

In [None]:
# Aligned local neighborhood measure

def top_n_indices(lst, n):
    """Return the positions of the `n` largest values of `lst`, largest first.

    Positions whose values are equal keep their original relative order.
    If `lst` has fewer than `n` items, all positions are returned.
    """
    values = list(lst)
    # Rank the positions by the value stored at each of them, descending;
    # the sort is stable, so ties stay in ascending position order.
    ranked_positions = sorted(range(len(values)), key=values.__getitem__, reverse=True)
    return ranked_positions[:n]

def aligned_local_neighborhood_measure(word:str, k:int=50):
  """Local neighbourhood measure computed on the aligned embedding matrices.

  For `word`, the similarity to every common word is computed in the old
  space (`old_aligned`) and in the modern space (`modern_matrix`). The k most
  similar rows of each space are merged, the similarities to that merged set
  form one second order embedding per space, and the cosine distance between
  the two is returned.

  Args:
    word: Target word; must be in the module-level `common_words` list.
    k: Number of top-ranked rows taken from each space. Defaults to 50, the
      value that used to be hard-coded.

  Returns:
    The cosine distance as a scalar, or None (after printing a message) when
    `word` is not in `common_words`.

  NOTE(review): the row of `word` itself is part of the ranking, so the word
  is presumably always among its own top-k neighbours — confirm whether it
  should be excluded, as `most_similar` does in the unaligned measure.
  NOTE(review): if `old_aligned` is `old_matrix` times an orthogonal matrix,
  similarities inside the old space are unchanged by the alignment — confirm
  that this measure is meant to differ from the unaligned one.
  """
  try:
    idx = common_words.index(word)
  except ValueError:
    print('word is not present in both corpora')
    return None

  # One vectorised call per space: similarity of the target row to every row.
  # (Previously this was a Python loop issuing two calls per vocabulary word.)
  old_similarities = cosine_similarity(old_aligned[idx].reshape(1,-1), old_aligned)[0]
  modern_similarities = cosine_similarity(modern_matrix[idx].reshape(1,-1), modern_matrix)[0]

  old_top_indices = top_n_indices(old_similarities, k)
  modern_top_indices = top_n_indices(modern_similarities, k)

  # De-duplicated union of both neighbour sets; sorted so that the component
  # order of the second order embeddings is deterministic.
  union_indices = sorted(set(old_top_indices) | set(modern_top_indices))

  old_second_order_embedding = old_similarities[union_indices].reshape(1,-1)
  modern_second_order_embedding = modern_similarities[union_indices].reshape(1,-1)

  output = cosine_distances(old_second_order_embedding, modern_second_order_embedding)[0][0]
  return output

In [None]:
# Aligned local neighborhood measure for every target word.
# Warning: this cell takes really really long to run.
aligned_local_neighborhood_scores = {
  word: aligned_local_neighborhood_measure(word) for word in target_words
}

for word, score in aligned_local_neighborhood_scores.items():
  print(f'{word}: Aligned Local Neighborhood = {score}')

accident: Aligned Local Neighborhood = [0.04081875]
business: Aligned Local Neighborhood = [0.008573]
disease: Aligned Local Neighborhood = [0.03568202]
face: Aligned Local Neighborhood = [0.01795679]
girl: Aligned Local Neighborhood = [0.04351741]
governor: Aligned Local Neighborhood = [0.0220356]
nice: Aligned Local Neighborhood = [0.08502078]
pudding: Aligned Local Neighborhood = [0.0369342]
thing: Aligned Local Neighborhood = [0.00782627]
wife: Aligned Local Neighborhood = [0.01986361]
broadcast: Aligned Local Neighborhood = [0.13790423]
bug: Aligned Local Neighborhood = [0.05200964]
cool: Aligned Local Neighborhood = [0.12429684]
gay: Aligned Local Neighborhood = [0.37317693]
post: Aligned Local Neighborhood = [0.08332485]
queer: Aligned Local Neighborhood = [0.24697548]
silly: Aligned Local Neighborhood = [0.08229655]
terrific: Aligned Local Neighborhood = [0.06611282]
virus: Aligned Local Neighborhood = [0.04368043]
web: Aligned Local Neighborhood = [0.30129576]


In [None]:
# we examine the distribution of the local neighborhood measure metrics for the aligned matrices

# One score per common word (None for any word the measure could not score).
alnm_scores = [aligned_local_neighborhood_measure(word) for word in common_words]

# Store the scores that were just computed. The frame was previously filled
# from gm_scores, i.e. with the global measure left over from an earlier cell,
# so the local neighborhood scores never reached the DataFrame.
# NOTE(review): the frame name df3 and the column key 'global_measure' are kept
# unchanged so that any later cell referring to them keeps working, but the
# column now holds local neighborhood scores — consider renaming both.
df3 = pd.DataFrame({
  'word': list(common_words),
  'global_measure': alnm_scores
})