# Field of Research (FoR) Embedding Analysis 
* Authors: Donna & Khalid
* Date: September 14, 2020

This notebook provides various analysis and visualization of the FoR embeddings and how they change over time. We use Cosine similarity as a measure of proximity between FoR embeddings, and we analyze how the overall similarity and neighbourhood of FoRs change over time.

The notebook covers the following analysis:
1. **Topic similarity change over time** - For each year, pairwise Cosine similarity is computed for topic embeddings, and the average similarity is aggregated at different levels: overall, area, cluster, and level 1.

2. **Topic neighbourhood size change over time** - For each year, neighbourhood size is computed for each topic, and the average size is aggregated at different levels: overall, area, cluster, and level 1.

3. **Top and bottom topics with neighbourhood size**

4. **Top and bottom topics with neighbourhood size change** - The topics with the most & least neighbourhood size change from initial year to reference year are retrieved.

5. **Top and bottom topics with neighbourhood profile change** - The profile change is computed based on how the ordering of each topic's neighbours changed.

6. **Top and bottom topics with self-similarity change across time** - The topic embeddings in the initial and reference years are aligned, and self-similarity for each topic between initial and reference year is computed.




## Initialization

In [None]:
import os
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict, OrderedDict
import numpy as np
import pandas as pd
from statistics import mean
import matplotlib.pyplot as plt
import seaborn as sns

### Mounting gDrive

In [None]:
from google.colab import drive
drive.mount('/gdrive', force_remount = True) 
print("Authorized to access the data on Google Drive.")

In [None]:
%ls '/gdrive/My Drive/AI UK/data/'
#%ls '/gdrive/My Drive/SpringerNature-Google-Turing/data/Three-year-sg-d12-w2-ns1'


### Configuring parameters

In [None]:
# --- Input / output locations -------------------------------------------------
ROOT = '/gdrive/My Drive/AI UK/data/'
#ROOT = '/gdrive/My Drive/SpringerNature-Google-Turing/data'
BASE_DIR = '/gdrive/My Drive/SpringerNature-Google-Turing'

# --- Analysis parameters --------------------------------------------------------
# Number of trailing float values read from each embedding line
NUM_DIMENSIONS = 12
# Neighbourhood radius: a topic is a neighbour when similarity >= 1 - DISTANCE
DISTANCE = 0.2

# The output folder name is derived from DISTANCE so the two cannot drift apart
DIR_OUT = os.path.join(BASE_DIR, 'analysis', 'distance{}'.format(DISTANCE))
# exist_ok=True replaces the check-then-create pair
os.makedirs(DIR_OUT, exist_ok=True)
DIR_OUT2 = os.path.join(BASE_DIR, 'analysis', 'profile change')
os.makedirs(DIR_OUT2, exist_ok=True)

#EMBEDDING_LOCATIONS = os.path.join(ROOT, 'Three-year-sg-d12-w4-ns5')
EMBEDDING_LOCATIONS = os.path.join(ROOT, 'Three-year-sg-d12-w2-ns1')
FOR_ONTOLOGY_FILE = os.path.join(ROOT, 'FoR-Ontology.csv')

# Maps the period key found at the end of each embedding file name to its display label
year_labels = {
    '17_19':'2017-2019', 
    '14_16':'2014-2016', 
    '11_13':'2011-2013', 
    '08_10':'2008-2010', 
    '05_07':'2005-2007', 
    '02_04':'2002-2004', 
    '99_01':'1999-2001', 
    '96_98':'1996-1998', 
    '93_95':'1993-1995', 
    '90_92':'1990-1992'}

# Periods compared against each other in sections 3-6
reference_year_label = '2017-2019'
initial_year_label = '1990-1992'

os.listdir(EMBEDDING_LOCATIONS)

## Helper Functions

### Load embedding files
This function returns a dictionary **{topic: embedding array}**

In [None]:
def load_embeddings(embedding_file, num_dimensions=None):
  """Load topic embeddings from a space-separated text file.

  The first line of the file is skipped. Every other non-blank line is
  expected to hold a topic name (which may itself contain spaces) followed by
  `num_dimensions` float values.

  Args:
    embedding_file: path of the embedding text file.
    num_dimensions: number of trailing values forming the vector. When omitted,
      the notebook-level NUM_DIMENSIONS constant is used.

  Returns:
    dict mapping topic name -> list of floats.

  Raises:
    ValueError: if a line has no topic name in front of the vector, or a
      vector component is not a number.
  """
  if num_dimensions is None:
    num_dimensions = NUM_DIMENSIONS

  embeddings = {}
  with open(embedding_file) as file_reader:
    # skip the first line
    next(file_reader, None)

    # parse embedding lines (numbering starts at 2 because of the skipped line)
    for line_number, line in enumerate(file_reader, start=2):
      # drop the newline and any trailing separator so the last field is a number
      line = line.rstrip()
      if not line:
        continue

      parts = line.split(' ')
      if len(parts) <= num_dimensions:
        raise ValueError(
            "{}, line {}: expected a topic followed by {} values, found {} field(s)".format(
                embedding_file, line_number, num_dimensions, len(parts)))

      embedding = [float(value) for value in parts[-num_dimensions:]]
      topic = ' '.join(parts[:-num_dimensions])
      embeddings[topic] = embedding

  return embeddings

### Compute pairwise  similarity
This function returns a nested dictionary **{topic: {topic: similarity}}**


In [None]:
def compute_pairwise_similarities(embedding_dict):
  """Compute the cosine similarity between every pair of distinct topics.

  Args:
    embedding_dict: dict mapping topic -> embedding vector (sequence of floats).

  Returns:
    defaultdict mapping each topic to an OrderedDict {other_topic: similarity},
    ordered by decreasing similarity. A topic is never paired with itself.
    The result is empty when fewer than two topics are given.
  """
  # store pairwise similarities
  pairwise_embedding_similarities = defaultdict(dict)

  # get list of topics
  topics = list(embedding_dict.keys())
  if len(topics) < 2:
    return pairwise_embedding_similarities

  # One call on the stacked matrix yields all pairs at once, instead of
  # one library call per pair
  embedding_matrix = np.asarray([embedding_dict[topic] for topic in topics], dtype=float)
  similarity_matrix = cosine_similarity(embedding_matrix)

  for i, topic in enumerate(topics):
    # index scalars explicitly: float() on a 1x1 array is deprecated in recent NumPy
    others = [
        (other, float(similarity_matrix[i, j]))
        for j, other in enumerate(topics)
        if j != i
    ]
    # Order topics by similarity (stable, so ties keep the input order)
    pairwise_embedding_similarities[topic] = OrderedDict(
        sorted(others, key=lambda item: item[1], reverse=True))

  return pairwise_embedding_similarities

### Get neighbourhood
This function returns a list of **(topic, similarity)** pairs for the topics that are within a certain distance of a given one.

In [None]:
def get_neighbourhood(similarities, topic, distance):
  """Return the (other_topic, similarity) pairs of `topic` with similarity >= 1 - distance.

  Pairs are returned in the iteration order of `similarities[topic]`;
  an entry for `topic` itself is never included.
  """
  return [
      (other, score)
      for other, score in similarities[topic].items()
      if score >= 1 - distance and topic != other
  ]

## Computation

### Load FoR Ontology

In [None]:
ontology = pd.read_csv(FOR_ONTOLOGY_FILE)
print("L2 value count: {}".format(len(ontology.f2.unique())))
print("L1 value count: {}".format(len(ontology.f1.unique())))
print("cluster value count: {}".format(len(ontology.cluster.unique())))
print("Area value count: {}".format(len(ontology.area.unique())))

In [None]:
ontology.head()

### Query ontology

This function returns a list of level 2 FoR given a group name and ontology level

In [None]:
def get_topics_by_group(group_name, level_name, ontology_df=None):
  """Return the level 2 FoRs (column `f2`) that belong to a group.

  Args:
    group_name: value to look for in the `level_name` column.
    level_name: ontology level of the group; one of "f1", "cluster", "area".
    ontology_df: ontology DataFrame to query. When omitted, the notebook-level
      `ontology` frame is used.

  Returns:
    list of `f2` values, in ontology row order (empty if the group is unknown).

  Raises:
    ValueError: if `level_name` is not a supported level.
  """
  if level_name not in ["f1", "cluster", "area"]:
    # it is the level, not the group, that is restricted to these three values
    raise ValueError("Error - Level name must be f1, cluster or area")
  if ontology_df is None:
    ontology_df = ontology
  return list(ontology_df[ontology_df[level_name] == group_name].f2)

This function returns the group name given Level 2 FoR and group level

In [None]:
def get_group_by_topic(topic_name, level_name, ontology_df=None):
  """Return the group a level 2 FoR belongs to at the given ontology level.

  Args:
    topic_name: level 2 FoR to look up in the `f2` column.
    level_name: ontology level to report; one of "f1", "cluster", "area".
    ontology_df: ontology DataFrame to query. When omitted, the notebook-level
      `ontology` frame is used.

  Returns:
    The `level_name` value of the first matching ontology row, or 'Unknown'
    when the topic is not in the ontology.

  Raises:
    ValueError: if `level_name` is not a supported level.
  """
  if level_name not in ["f1", "cluster", "area"]:
    # it is the level, not the group, that is restricted to these three values
    raise ValueError("Error - Level name must be f1, cluster or area")
  if ontology_df is None:
    ontology_df = ontology

  results = list(ontology_df[ontology_df.f2 == topic_name][level_name])
  if len(results) > 0:
    return results[0]
  return 'Unknown'

### Load embeddings by year

In [None]:
embeddings_by_year = dict()
# sorted() makes the load order (and the printed log) reproducible
for f in sorted(os.listdir(EMBEDDING_LOCATIONS)):
  embedding_file = os.path.join(EMBEDDING_LOCATIONS, f)
  # the period key (e.g. '17_19') is the five characters before a 4-character suffix
  period_key = embedding_file[-9:-4]
  # skip sub-folders and stray files instead of failing with a KeyError
  if not os.path.isfile(embedding_file) or period_key not in year_labels:
    print('Skipping {}: not a recognised embedding file'.format(f))
    continue
  year_label = year_labels[period_key]
  embeddings = load_embeddings(embedding_file)
  print('{} topic embeddings loaded for year {}'.format(len(embeddings),year_label ))
  embeddings_by_year[year_label] = embeddings


### Compute topic similarities by year

In [None]:
pairwise_similarity_by_year = dict()
for year_label in year_labels.values():
  year_embeddings = embeddings_by_year[year_label]
  pairwise_similarity = compute_pairwise_similarities(year_embeddings)
  print("Pairwise Cosine similarities computed for year {}".format(year_label))
  pairwise_similarity_by_year[year_label] = pairwise_similarity

In [None]:
# Flatten {year: {topic1: {topic2: similarity}}} into one row per (year, topic1, topic2)
topic_similarity_df = pd.DataFrame.from_records(
    [
        (level1, level2, level3, leaf)
        for level1, level2_dict in pairwise_similarity_by_year.items()
        for level2, level3_dict in level2_dict.items()
        for level3, leaf in level3_dict.items()
    ],
    columns=['year', 'topic1', 'topic2', 'similarity']
)

# Attach the ontology columns twice: first for topic1, then for topic2.
# After the second merge the clashing ontology columns carry the pandas default
# suffixes: _x for topic1's values and _y for topic2's.
# The inner joins drop every pair in which a topic has no row in the ontology.
topic_similarity_df = pd.merge(topic_similarity_df, ontology, left_on='topic1',right_on='f2', how='inner')
topic_similarity_df = pd.merge(topic_similarity_df, ontology, left_on='topic2',right_on='f2', how='inner')
# f2_x / f2_y duplicate topic1 / topic2, so they are removed
topic_similarity_df = topic_similarity_df.drop(['f2_x', 'f2_y'], axis=1)
topic_similarity_df.head(10).T

### Get topic neighbours by year

In [None]:
# Build {year: {topic: [(neighbour, similarity), ...]}}, keeping for every topic
# only the neighbours whose similarity is at least 1 - DISTANCE
topic_neighbours_by_year = dict()
for label in year_labels.values():
  pairwise_similarity = pairwise_similarity_by_year[label]
  topic_neighbours = dict()
  for topic in pairwise_similarity:
    topic_neighbours[topic] = get_neighbourhood(
        pairwise_similarity, topic, DISTANCE)
  
  topic_neighbours_by_year[label] = topic_neighbours

In [None]:
topic_neighbours_df = pd.DataFrame.from_records(
    [
        (level1, level2, len(leaf))
        for level1, level2_dict in topic_neighbours_by_year.items()
        for level2, leaf in level2_dict.items()
    ],
    columns=['year', 'topic',  'neighbourhood_size']
)

topic_neighbours_df = pd.merge(topic_neighbours_df, ontology, left_on='topic',right_on='f2', how='inner')
topic_neighbours_df.head()

## Embedding Visualization

In [None]:
from sklearn import manifold

n_neighbors = 10
n_components = 2

In [None]:
initial_embeddings = embeddings_by_year[initial_year_label]
reference_embeddings = embeddings_by_year[reference_year_label]

def _to_nparray(embedding_dict):
  """Split a {topic: vector} mapping into parallel outputs.

  Returns:
    (topics, array): the topic names in dictionary order, and a NumPy array
    built from the corresponding vectors in the same order.
  """
  topics = list(embedding_dict.keys())
  vectors = [embedding_dict[name] for name in topics]
  return topics, np.array(vectors)

initial_topics, initial_embd_arrays = _to_nparray(initial_embeddings)
reference_topics, reference_embd_arrays = _to_nparray(reference_embeddings)

In [None]:
tsne = manifold.TSNE(n_components=n_components, init='pca', random_state=0)
# Each fit_transform call fits a new projection, so the two periods are embedded
# independently of each other
initial_compressed = tsne.fit_transform(initial_embd_arrays)
reference_compressed = tsne.fit_transform(reference_embd_arrays)
data_list = [initial_compressed, reference_compressed]
# Panel titles (the method is spelled t-SNE)
titles = ['t-SNE: {}'.format(initial_year_label), 
          't-SNE: {}'.format(reference_year_label)]

In [None]:
def visualize(data_list, titles, label_list):
  """Draw one scatter plot per projection, side by side, coloured by label.

  Args:
    data_list: sequence of arrays; columns 0 and 1 of each are used as x and y.
    titles: one panel title per entry of `data_list`.
    label_list: one sequence of point labels per entry of `data_list`,
      used as the colour (hue) of the points.

  Only the last panel keeps its legend, placed outside the axes on the right;
  the legends of the other panels are hidden.
  """
  num_panels = len(data_list)
  if num_panels == 0:
    # nothing to draw; plt.subplots cannot create a zero-column grid
    return

  # squeeze=False keeps `axs` two-dimensional whatever the number of panels
  fig, axs = plt.subplots(ncols=num_panels, figsize=(25, 8), squeeze=False)

  for p, data in enumerate(data_list):
    ax = axs[0][p]

    sns.scatterplot(
        x=data[:, 0], 
        y=data[:, 1], 
        hue=label_list[p],
        ax=ax)

    # the title was previously looked up but never applied to the panel
    ax.set_title(titles[p])

    if p < num_panels - 1:
      # get_legend() returns None when no legend was drawn
      legend = ax.get_legend()
      if legend is not None:
        legend.set_visible(False)
    else:
      ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    # ax.set(ylim=(-12, 12))
    # ax.set(xlim=(-12, 12))

    

### Visualize by Area

In [None]:
label_list =[
  [get_group_by_topic(v, 'area') for v in initial_topics],
  [get_group_by_topic(v, 'area') for v in reference_topics]
  ]
  
visualize(data_list, titles, label_list)
# save image as FoR_areas.png

### Visualize by Cluster

In [None]:
label_list =[
  [get_group_by_topic(v, 'cluster') for v in initial_topics],
  [get_group_by_topic(v, 'cluster') for v in reference_topics]
  ]
  
visualize(data_list, titles, label_list)
# save image as FoR_clusters.png

### Visualize by Level 1

In [None]:
label_list =[
  [get_group_by_topic(v, 'f1') for v in initial_topics],
  [get_group_by_topic(v, 'f1') for v in reference_topics]
  ]
  
visualize(data_list, titles, label_list)
# save image as FoR_level1.png

## Analysis

In [None]:

def plot_line_chart(dataframes, titles, ylabel, ncolumns=3):
  """Plot each DataFrame as a line chart in a grid of subplots and show the figure.

  Args:
    dataframes: list of wide-form DataFrames; each is passed to seaborn's
      lineplot as `data`, and its index is used for the x tick labels.
    titles: one chart title per DataFrame.
    ylabel: y-axis label shared by all charts.
    ncolumns: number of grid columns. Ignored when there are only one or two
      charts, which are laid out in a single row.

  Raises:
    ValueError: if `ncolumns` is smaller than 1.
  """
  num_charts = len(dataframes)
  if num_charts == 0:
    # nothing to draw; plt.subplots cannot create an empty grid
    return
  if num_charts in [1, 2]:
    ncolumns = num_charts
  if ncolumns < 1:
    raise ValueError("ncolumns must be at least 1")

  # ceiling division: the last row may be only partly filled
  nrows = -(-num_charts // ncolumns)

  # squeeze=False always returns a 2-D grid, so every chart gets its own axes
  # (a single-column layout used to draw every chart onto the current axes)
  figure, axs = plt.subplots(
      ncols=ncolumns, nrows=nrows, figsize=(25, 7+(2*nrows)), squeeze=False)
  figure.tight_layout(h_pad=7.0)

  # row-major order: chart i lands in row i // ncolumns, column i % ncolumns
  axes = axs.flatten()

  for i, dataframe in enumerate(dataframes):
    g = sns.lineplot(
        data=dataframe,
        sort=False,
        ax=axes[i],
        dashes=False,
        marker=False
        )

    g.set(
        xlabel='', 
        ylabel=ylabel, 
        title=titles[i],
        )

    g.set_xticklabels(labels=dataframe.index, rotation=90)

  # hide the grid cells that did not receive a chart
  for unused_ax in axes[num_charts:]:
    unused_ax.set_visible(False)

  plt.show()


## 1) FoR Similarity change over time

### 1.1. Overall Cosine similarity change over time

In [None]:
data = topic_similarity_df.groupby('year').agg({'similarity':'mean'}).sort_values(by='year', ascending=True)
plot_line_chart(
    [data], ["Overall Trend"], 'Average Pairwise Cosine Similarity')
# save this figure as Cos-sim-trend.png

### 1.2. Overall Cosine similarity change over time - Areas

In [None]:
data = topic_similarity_df[topic_similarity_df['area_x']==topic_similarity_df['area_y']].drop('area_y', axis=1)
data.rename(columns = {'area_x':'area'}, inplace = True)
data = data.groupby(['year', 'area']).agg({'similarity':'mean'}).sort_values(by='year', ascending=True).unstack()
data.columns = list(data.columns.get_level_values(1))
plot_line_chart(
    [data], ["Overall Area Trends"], 'Average Pairwise Cosine Similarity')
# save this as Cos-sim-trend-areas.png

In [None]:
data

### 1.3. Overall Cosine similarity change over time - Clusters

In [None]:
result_dataframes = []
areas = list(ontology.area.unique())
data = topic_similarity_df[topic_similarity_df['area_x']==topic_similarity_df['area_y']].drop('area_y', axis=1)
data.rename(columns = {'area_x':'area'}, inplace = True)
for area in areas:
  current = data[data['area']==area].drop('area', axis=1)
  current = current[current['cluster_x']==current['cluster_y']].drop('cluster_y', axis=1)
  current.rename(columns = {'cluster_x':'cluster'}, inplace = True)
  current = current.groupby(['year', 'cluster']).agg({'similarity':'mean'}).sort_values(by='year', ascending=True).unstack()
  current.columns = list(current.columns.get_level_values(1))
  result_dataframes.append(current)

plot_line_chart(
    result_dataframes, areas, 'Average Pairwise Cosine Similarity')
# save image as Cos-sim-trend-clusters.png

### 1.4. Overall Cosine similarity change over time - Level 1s 

In [None]:
result_dataframes = []
clusters = list(ontology.cluster.unique())
data = topic_similarity_df[topic_similarity_df['cluster_x']==topic_similarity_df['cluster_y']].drop('cluster_y', axis=1)
data.rename(columns = {'cluster_x':'cluster'}, inplace = True)
for cluster in clusters:
  current = data[data['cluster']==cluster].drop('cluster', axis=1)
  current = current[current['f1_x']==current['f1_y']].drop('f1_y', axis=1)
  current.rename(columns = {'f1_x':'level_1'}, inplace = True)
  current = current.groupby(['year', 'level_1']).agg({'similarity':'mean'}).sort_values(by='year', ascending=True).unstack()
  current.columns = list(current.columns.get_level_values(1))
  result_dataframes.append(current)

plot_line_chart(
    result_dataframes, clusters, 'Average Pairwise Cosine Similarity')
# save image as Cos-sim-trend-level1.png

## 2) Neighbourhood Size Change over time

### 2.1. Overall Neighbourhood Size change over time

In [None]:
data = topic_neighbours_df.groupby('year').agg({'neighbourhood_size':'mean'}).sort_values(by='year', ascending=True)
plot_line_chart(
    [data], ["Overall Trend"], 'Average topic Neighbourhood size')

### 2.2. Overall Neighbourhood Size change over time - Area

In [None]:
data = topic_neighbours_df.groupby(['year', 'area']).agg({'neighbourhood_size':'mean'}).sort_values(by='year', ascending=True).unstack()
data.columns = list(data.columns.get_level_values(1))
plot_line_chart(
    [data], ["Overall Area Trends"], 'Average topic Neighbourhood size')
# image Neigh-trend-areas.png

### 2.3. Overall Neighbourhood Size change over time - Cluster

In [None]:
result_dataframes = []
areas = list(ontology.area.unique())
data = topic_neighbours_df
for area in areas:
  current = data[data['area']==area].drop('area', axis=1)
  current = current.groupby(['year', 'cluster']).agg({'neighbourhood_size':'mean'}).sort_values(by='year', ascending=True).unstack()
  current.columns = list(current.columns.get_level_values(1))
  result_dataframes.append(current)

plot_line_chart(
    result_dataframes, areas, 'Average topic Neighbourhood size')
# image Neigh-trend-cluster.png

### 2.4. Overall Neighbourhood Size change over time - Level 1s

In [None]:
result_dataframes = []
clusters = list(ontology.cluster.unique())
data = topic_neighbours_df
for cluster in clusters:
  current = data[data['cluster']==cluster].drop('cluster', axis=1)
  current = current.groupby(['year', 'f1']).agg({'neighbourhood_size':'mean'}).sort_values(by='year', ascending=True).unstack()
  current.columns = list(current.columns.get_level_values(1))
  result_dataframes.append(current)

plot_line_chart(
    result_dataframes, clusters, 'Average topic Neighbourhood size')
# image Neigh-trend-level1.png

### 2.5. Overall Neighbourhood Size change over time - Level 2s

In [None]:
result_dataframes = []
l1s = list(ontology.f1.unique())
# Dedicated name: assigning to `titles` here would overwrite the panel titles
# that the embedding visualization cells read
level2_titles = []
# Maximum number of level 2 topics (lines) drawn in one chart
topics_per_chart = 6
data = topic_neighbours_df
for l1 in l1s:
  current = data[data['f1']==l1].drop('f1', axis=1)
  current = current.groupby(['year', 'f2']).agg({'neighbourhood_size':'mean'}).sort_values(by='year', ascending=True).unstack()
  current.columns = list(current.columns.get_level_values(1))
  columns = list(current.columns)
  # split the topics of this level 1 group into numbered charts
  for counter, start in enumerate(range(0, len(columns), topics_per_chart), start=1):
    chunk = columns[start:start + topics_per_chart]
    result_dataframes.append(current[chunk])
    level2_titles.append("{} - {}".format(l1, counter))

plot_line_chart(
   result_dataframes, level2_titles, 'Average topic Neighbourhood size', ncolumns=3)
# image Neigh-trend-level2.png
# image Neigh-trend-level2.png

## 3) Top and Bottom Topics with Neighbourhood size

In [None]:
_, axs = plt.subplots(ncols=2, nrows=1, figsize=(25, 8))

top_initial = topic_neighbours_df[topic_neighbours_df['year']==initial_year_label].sort_values(
    by='neighbourhood_size', ascending=False).head(15)[['topic', 'neighbourhood_size']]

g1 = sns.barplot(data=top_initial, x='topic', y='neighbourhood_size', ax=axs[0])
g1.set(title="Top Topics with Neighbourhood size - {}".format(initial_year_label))
_=g1.set_xticklabels(g1.get_xticklabels(), rotation=90)

bottom_initial = topic_neighbours_df[topic_neighbours_df['year']==initial_year_label].sort_values(
    by='neighbourhood_size', ascending=True).head(15)[['topic', 'neighbourhood_size']]

g2 = sns.barplot(data=bottom_initial, x='topic', y='neighbourhood_size', ax=axs[1])
g2.set(title="Bottom Topics with Neighbourhood size - {}".format(initial_year_label))
_=g2.set_xticklabels(g2.get_xticklabels(), rotation=90)


In [None]:
top_initial.topic

In [None]:
_, axs = plt.subplots(ncols=2, nrows=1, figsize=(25, 8))

top_reference = topic_neighbours_df[topic_neighbours_df['year']==reference_year_label].sort_values(
    by='neighbourhood_size', ascending=False).head(15)[['topic', 'neighbourhood_size']]

g1 = sns.barplot(data=top_reference, x='topic', y='neighbourhood_size', ax=axs[0])
g1.set(title="Top Topics with Neighbourhood size - {}".format(reference_year_label))
_=g1.set_xticklabels(g1.get_xticklabels(), rotation=90)

bottom_reference = topic_neighbours_df[topic_neighbours_df['year']==reference_year_label].sort_values(
    by='neighbourhood_size', ascending=True).head(15)[['topic', 'neighbourhood_size']]

g2 = sns.barplot(data=bottom_reference, x='topic', y='neighbourhood_size', ax=axs[1])
g2.set(title="Bottom Topics with Neighbourhood size - {}".format(reference_year_label))
_=g2.set_xticklabels(g2.get_xticklabels(), rotation=90)

In [None]:
top_removed = set(top_initial.topic) - set(top_reference.topic)
top_added = set(top_reference.topic) - set(top_initial.topic)
top_same = set(top_reference.topic).intersection(set(top_initial.topic))
# sorted(): set iteration order is arbitrary, so sort for a reproducible listing
print("Top topics that were in the initial year but are not in the reference year:")
for t in sorted(top_removed):
  print("-", t)
print("------------------------------------------------------------------------")
print("Top topics that are in the reference year but were not in the initial year:")
for t in sorted(top_added):
  print("-", t)

print("------------------------------------------------------------------------")
print("Top topics that are in both the initial and the reference year:")
for t in sorted(top_same):
  print("-", t)

In [None]:
bottom_removed = set(bottom_initial.topic) - set(bottom_reference.topic)
bottom_added = set(bottom_reference.topic) - set(bottom_initial.topic)
bottom_same = set(bottom_reference.topic).intersection(set(bottom_initial.topic))
# sorted(): set iteration order is arbitrary, so sort for a reproducible listing
print("Bottom topics that were in the initial year but are not in the reference year:")
for t in sorted(bottom_removed):
  print("-", t)
print("------------------------------------------------------------------------")
print("Bottom topics that are in the reference year but were not in the initial year:")
for t in sorted(bottom_added):
  print("-", t)

print("------------------------------------------------------------------------")
print("Bottom topics that are in both the initial and the reference year:")
for t in sorted(bottom_same):
  print("-", t)

## 4) Top and Bottom Topics with Neighbourhood size change over time

In [None]:
initial_data = topic_neighbours_df[topic_neighbours_df['year']==initial_year_label]
reference_data = topic_neighbours_df[topic_neighbours_df['year']==reference_year_label]

data = pd.merge(initial_data, reference_data, on='topic')[['topic', 'neighbourhood_size_x', 'neighbourhood_size_y']]

data['change'] = abs(data.neighbourhood_size_x - data.neighbourhood_size_y)

_, axs = plt.subplots(ncols=2, nrows=1, figsize=(25, 8))

top_change = data.sort_values(by='change', ascending=False).head(15)[['topic', 'change']]
g1 = sns.barplot(data=top_change, x='topic', y='change', ax=axs[0])
g1.set(title="Top Topics with Neighbourhood size change")
_=g1.set_xticklabels(g1.get_xticklabels(), rotation=90)


bottom_change = data.sort_values(by='change', ascending=True).head(15)[['topic', 'change']]
g2 = sns.barplot(data=bottom_change, x='topic', y='change', ax=axs[1])
g2.set(title="Bottom Topics with Neighbourhood size change")
_=g2.set_xticklabels(g2.get_xticklabels(), rotation=90)

In [None]:
print("Top topic change:")
for t in top_change.topic:
  print("-", t)
print("-----------------")
print("Bottom topic change:")
for t in bottom_change.topic:
  print("-", t)

## 5) Profile Change over time 

In [None]:
import math

# Build {year: {topic: [neighbour, ...]}} for the two years being compared,
# listing every other topic in the order stored in the pairwise similarity dict
neighbours_by_year = dict()
for year_label in [initial_year_label, reference_year_label]:
  pairwise_similarity = pairwise_similarity_by_year[year_label]
  topic_neighbours = dict()
  for topic in pairwise_similarity:
    # distance=math.inf turns the threshold into 1 - inf = -inf, so no neighbour
    # is filtered out; item[0] keeps the topic name and drops the similarity
    topic_neighbours[topic] = [item[0] for item in get_neighbourhood(
        pairwise_similarity, topic, math.inf)]
  
  neighbours_by_year[year_label] = topic_neighbours


def compute_ranking_change(initial_topic_neighbours, reference_topic_neighbours):
  """Score how much a neighbour ranking changed between two ordered lists.

  For every neighbour, the absolute difference between its 1-based positions
  in the two lists is divided by the smaller of the two positions, so moves
  near the top of the ranking weigh more. The score is the sum over all
  neighbours in `initial_topic_neighbours`.

  Args:
    initial_topic_neighbours: ordered list of neighbours in the initial year.
    reference_topic_neighbours: ordered list of neighbours in the reference year.

  Returns:
    The total change (0 when both rankings are identical or empty).

  Raises:
    ValueError: if a neighbour of the initial list is missing from the
      reference list.
  """
  # 1-based position of every neighbour in the reference ranking, built once so
  # that each lookup is O(1); setdefault keeps the first occurrence, like list.index
  reference_positions = {}
  for position, neighbour in enumerate(reference_topic_neighbours, start=1):
    reference_positions.setdefault(neighbour, position)

  total_change = 0

  for position1, topic in enumerate(initial_topic_neighbours, start=1):
    if topic not in reference_positions:
      raise ValueError(
          "{!r} is not in the reference neighbour list".format(topic))
    position2 = reference_positions[topic]

    total_change += abs(position1 - position2) / min(position1, position2)

  return total_change


topic_profile_change_score = {}

initial_neighbours = neighbours_by_year[initial_year_label]
reference_neighbours = neighbours_by_year[reference_year_label]
# Rankings can only be compared for topics that exist in both years; when the
# two vocabularies are identical this filter changes nothing
common_topics = set(initial_neighbours) & set(reference_neighbours)

for topic in initial_neighbours:
  if topic not in common_topics:
    continue

  initial_topic_neighbours = [t for t in initial_neighbours[topic] if t in common_topics]
  reference_topic_neighbours = [t for t in reference_neighbours[topic] if t in common_topics]
  total_change = compute_ranking_change(initial_topic_neighbours, reference_topic_neighbours)
  topic_profile_change_score[topic] = total_change


# Building the frame from (topic, score) pairs also works when no topic was scored
topic_profile_change_score_df = pd.DataFrame(
    list(topic_profile_change_score.items()),
    columns=['topic', 'profile_change_score'])
topic_profile_change_score_df.sort_values(by='profile_change_score', ascending=False).head(10)

In [None]:
_, axs = plt.subplots(ncols=2, nrows=1, figsize=(25, 8))

top_change = topic_profile_change_score_df.sort_values(by='profile_change_score', ascending=False).head(15)
g1 = sns.barplot(data=top_change, x='topic', y='profile_change_score', ax=axs[0])
g1.set(title="Top Topics with Neighbourhood Profile change")
_=g1.set_xticklabels(g1.get_xticklabels(), rotation=90)


bottom_change = topic_profile_change_score_df.sort_values(by='profile_change_score', ascending=True).head(15)
g2 = sns.barplot(data=bottom_change,  x='topic', y='profile_change_score', ax=axs[1])
g2.set(title="Bottom Topics with Neighbourhood Profile change")
_=g2.set_xticklabels(g2.get_xticklabels(), rotation=90)

In [None]:
top_topics_with_profile_change = list(top_change.topic)
top_topics_with_profile_change

In [None]:
#topic = top_topics_with_profile_change[0]

for topic in top_topics_with_profile_change[:2]:
  print("Neighbourhood Profile of {} in {}:".format(topic, initial_year_label))
  for t in neighbours_by_year[initial_year_label][topic][:10]:
    print("-", t)
  print()
  print("Neighbourhood Profile of {} in {}:".format(topic, reference_year_label))
  for t in neighbours_by_year[reference_year_label][topic][:10]:
    print("-", t)
  print("============================================================")
  print("")

## 6) Topic embedding self-similarity across time

In [None]:
def diff(first, second):
  """Return the items of `first`, in their original order, that are not in `second`."""
  excluded = set(second)
  remaining = []
  for candidate in first:
    if candidate not in excluded:
      remaining.append(candidate)
  return remaining

def intersection_align(m1,m2):
  """Restrict two {topic: embedding} mappings to the topics they share.

  Args:
    m1: first mapping of topic -> embedding.
    m2: second mapping of topic -> embedding.

  Returns:
    (common_vocab, new_m1, new_m2) where `common_vocab` is the sorted list of
    topics present in both mappings, and `new_m1` / `new_m2` are new
    dictionaries holding only those topics, in that order. The inputs are
    never returned directly, so changing a result dictionary cannot alter them.
  """
  # Find the common vocabulary
  common_vocab = sorted(set(m1.keys()) & set(m2.keys()))

  # Each model is replaced with a new version, only defined on the intersection
  # of the vocabularies
  new_m1 = {field: m1[field] for field in common_vocab}
  new_m2 = {field: m2[field] for field in common_vocab}

  return (common_vocab, new_m1, new_m2)

def smart_procrustes_align(base_embed, other_embed, num_dimensions_par):
  """Rotate `other_embed` onto `base_embed` with an orthogonal transformation.

  Both mappings are first restricted to their shared topics. The rotation is
  the product u.dot(vh) of the SVD factors of other^T . base (the
  orthogonal Procrustes construction), applied to the `other` matrix only.

  Args:
    base_embed: dict topic -> embedding used as the fixed target.
    other_embed: dict topic -> embedding to be rotated.
    num_dimensions_par: length of every embedding vector.

  Returns:
    (base_embeddings_aligned, other_embeddings_aligned): two dicts keyed by the
    shared topics, holding the unchanged base vectors and the rotated other
    vectors as NumPy arrays.

  Raises:
    ValueError: if an embedding does not have `num_dimensions_par` values.
  """
  # make sure vocabularies are aligned
  common_vocab, in_base_embed, in_other_embed = intersection_align(base_embed, other_embed)

  # get the embedding matrices, one row per shared topic; building them in one
  # go avoids re-copying the whole matrix for every appended row.
  # reshape keeps the (n, d) shape for an empty vocabulary and rejects vectors
  # of the wrong length
  matrix_shape = (len(common_vocab), num_dimensions_par)
  base_array = np.array(
      [in_base_embed[field] for field in common_vocab], dtype=float).reshape(matrix_shape)
  other_array = np.array(
      [in_other_embed[field] for field in common_vocab], dtype=float).reshape(matrix_shape)

  # cross-covariance between the two spaces
  m = other_array.T.dot(base_array)
  # numpy returns the third factor already transposed (vh)
  u, _, vh = np.linalg.svd(m)
  # orthogonal matrix that maps the other space onto the base space
  ortho = u.dot(vh)
  # Replace original array with modified one
  # i.e. multiplying the embedding matrix by "ortho"
  other_array = other_array.dot(ortho)

  base_embeddings_aligned = dict(zip(common_vocab, base_array))
  other_embeddings_aligned = dict(zip(common_vocab, other_array))

  return base_embeddings_aligned, other_embeddings_aligned



In [None]:
initial_year_embeddings_aligned, reference_year_embeddings_aligned = smart_procrustes_align(
    embeddings_by_year[initial_year_label], embeddings_by_year[reference_year_label], NUM_DIMENSIONS)

In [None]:
# Cosine similarity of every topic with itself across the two aligned spaces
topic_self_similarity = dict()
for topic in initial_year_embeddings_aligned:
  embed1 = initial_year_embeddings_aligned[topic]
  embed2 = reference_year_embeddings_aligned[topic]
  # cosine_similarity returns a 1x1 matrix; take its single element explicitly,
  # because calling float() on a non-scalar array is deprecated in recent NumPy
  similarity = cosine_similarity([embed1], [embed2])
  topic_self_similarity[topic] = float(similarity[0, 0])

In [None]:
topic_self_similarity_df = pd.DataFrame.from_dict(topic_self_similarity, orient='index').reset_index()
topic_self_similarity_df.columns = ['topic', 'self_similarity']
topic_self_similarity_df.sort_values(by='self_similarity', ascending=False).head(10)

In [None]:
_, axs = plt.subplots(ncols=2, nrows=1, figsize=(25, 8))

top_similarity = topic_self_similarity_df.sort_values(by='self_similarity', ascending=False).head(15)
g1 = sns.barplot(data=top_similarity, x='topic', y='self_similarity', ax=axs[0])
g1.set(title="Top Topics with self similarity (less change over time)")
_=g1.set_xticklabels(g1.get_xticklabels(), rotation=90)


bottom_similarity = topic_self_similarity_df.sort_values(by='self_similarity', ascending=True).head(15)
g2 = sns.barplot(data=bottom_similarity,  x='topic', y='self_similarity', ax=axs[1])
g2.set(title="Bottom Topics with self similarity (more change over time)")
_=g2.set_xticklabels(g2.get_xticklabels(), rotation=90)
# image self-similarity.png