This code analyses FoR embeddings within each time period, using cosine and neighbourhood similarity measures.

* Author: Gard
* Date: July 16
* Based on code written by Khalid and Donna + notebook by Barbara (https://colab.research.google.com/drive/1zZG6_0z_ESRUol0X7nuhtD04rSb4H-8f?usp=sharing).



# Initialization


In [3]:
from google.colab import drive
drive.mount('/gdrive', force_remount = True) 
%ls '/gdrive/My Drive/SpringerNature-Google-Turing/data/'

Mounted at /gdrive
 [0m[01;34mFive-year-sg-d100-w2-ns5[0m/            [01;34mThree-year-sg-d100-w2-ns5[0m/
 [01;34mFive-year-sg-d12-w4-ns1[0m/             [01;34mThree-year-sg-d12-w2-ns1[0m/
 [01;34mFive-year-sg-d12-w4-ns5[0m/             [01;34mThree-year-sg-d12-w2-ns2[0m/
 for_co-occurrence_counts_11-13.csv   [01;34mThree-year-sg-d12-w2-ns3[0m/
 for_co-occurrence_counts_14-16.csv   [01;34mThree-year-sg-d12-w2-ns5[0m/
 for_co-occurrence_counts_17-19.csv   [01;34mThree-year-sg-d12-w4-ns1[0m/
'FoR mapping - Extract 1.csv'         [01;34mThree-year-sg-d12-w4-ns2[0m/
'FoR mapping - Extract 1.gsheet'      [01;34mThree-year-sg-d12-w4-ns3[0m/
 for_nsize_negcorr.csv                [01;34mThree-year-sg-d12-w4-ns5[0m/
 FoR-Ontology.csv                     [01;34mThree-year-sg-journals[0m/
 for_regression_coef.csv              wv_11_13.txt
 FoR_W2V_v01.html                     wv_14_16.txt
 [01;34mMean-title-centroids-3yrs[0m/           wv_17_19.txt
 [01;34mMean-tit

In [4]:
import os
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict, OrderedDict
import numpy as np
import pandas as pd
from statistics import median, mean
from os import listdir
from os.path import isfile, join
import re
import plotly.express as px
import matplotlib.pyplot as plt
import statistics
import glob
from datetime import datetime
from random import randrange
from pandas import Series
from matplotlib import pyplot
from statsmodels.tsa.seasonal import seasonal_decompose
import scipy.stats



pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.



## Parameters

Note that window_size = 2 and negative_sampling = 1 are the best hyperparameters according to the intrinsic evaluation, so we choose these.

In [5]:
# options are: 
#     "forembbyarticle" for co-occurrence-based FoR embeddings at the article level
#     "title" for aggregated title embeddings
#     "forembbyjournal" for FoR-code co-occurrences aggregated at the journal level.
embedding_type = "forembbyjournal" 
f_code = "f2" # level of FoR codes for which the embeddings are created, can be f1 or f2 can be f1 or f2
if embedding_type == "title":
  year_intervals = 3
  num_dimensions = 512
elif embedding_type == "forembbyarticle":
  year_intervals = 3 
  num_dimensions = 12 # number of dimensions of embeddings
  window_size = 2 
  negative_sampling = 1
  f_code = "f2" # level of FoR codes for which the embeddings are created; it is f2
elif embedding_type == "forembbyjournal":
  year_intervals = 3
  num_dimensions = 100
  f_code = "f2"


In [6]:
last_year = 2019
#if year_intervals == 3:
first_year = 1990 
#if year_intervals == 5:
#  first_year = 1950 # this doesn't apply because we have excluded old embeddings from our analysis

In [7]:
BASE_DIR = '/gdrive/My Drive/SpringerNature-Google-Turing'
DATA_DIR = BASE_DIR + '/data'
if embedding_type == "forembbyarticle":
  if year_intervals == 3:
    years_words = "Three"
  elif year_intervals == 5:
    years_words = "Five"
  DATA_DIR_yrs = DATA_DIR + '/' + years_words + '-year-sg-d' + str(num_dimensions) + '-w' + str(window_size) + '-ns' + str(negative_sampling)
elif embedding_type == "title":
  DATA_DIR_yrs = os.path.join(DATA_DIR, 'Mean-title-centroids-3yrs-v02')
elif embedding_type == "forembbyjournal":
  DATA_DIR_yrs = os.path.join(DATA_DIR, "Three-year-sg-journals")

# Functions

Function for loading embeddings

In [8]:
# Function that loads the embeddings from the embedding file (From Khalid's and Donna's script)
def load_embeddings(embedding_file):

  embeddings = {}
  with open(os.path.join(DATA_DIR_yrs, embedding_file)) as file_reader:
    lines = file_reader.readlines()
    
    # skip the first line
    lines = lines[1:]

    # parse embedding lines
    for line in lines:
      parts = line.split(' ')
      embedding = [float(value) for value in parts[-num_dimensions:]]
      field = ' '.join(parts[:-num_dimensions])
      embeddings[field] = embedding

  return embeddings

Function for calculating pairwise cosine similarity scores


In [9]:
def compute_pairwise_similarities(embedding_dict):
  # store pairwise similarities
  pairwise_embedding_similarities = defaultdict(dict)

  # get list of topics
  topics = list(embedding_dict.keys())

  total_similarity = 0
  similarity_distribution = list() # list of similarity values
  count = 0

  for i in range(len(topics)):
    #for j in range(i + 1, len(topics)):
    for j in range(len(topics)):
      topic_1 = topics[i]
      topic_2 = topics[j]
      #topic_1, topic_2 = max(topic_1, topic_2), min(topic_1, topic_2)

      # get first topic's embedding
      embed1 = embedding_dict[topic_1]
      # get second topic's embedding
      embed2 = embedding_dict[topic_2]
      # compute cosine similarity
      similarity = cosine_similarity([embed1], [embed2])
      # store in dictionary
      pairwise_embedding_similarities[topic_1][topic_2] = float(similarity)
      total_similarity += float(similarity)
      similarity_distribution.append(float(similarity))
      count += 1
  
  # Order topics by similarity
  for topic in pairwise_embedding_similarities:
    dictionary = pairwise_embedding_similarities[topic]
    sorted_dictionary = OrderedDict(sorted(dictionary.items(), key=lambda x: x[1], reverse=True))
    pairwise_embedding_similarities[topic] = sorted_dictionary

  return pairwise_embedding_similarities, total_similarity / count, median(similarity_distribution)

Functions for calculating neighbourhood similarity scores


In [10]:
def get_nearest_neighbours(similarities, topic, k):
  return list(similarities[topic].items())[1:k+1]

In [11]:
def get_neighbourhood(similarities, topic, distance):
  results = []
  for item in similarities[topic].items():
    t, similarity = item
    if similarity >= 1- distance and topic != t :
      results.append(item)
  return results

In [12]:
def get_neighbourhood_sizes(similarities, distance):
  results = {}
  sizes = []
  topics = list(similarities.keys())
  for topic in topics:
    results[topic] = len(get_neighbourhood(similarities, topic, distance))
    sizes.append(results[topic])
  return results, mean(sizes), median(sizes)


In [13]:
def get_average_neighbourhood_sizes(similarities, distances=np.arange(0.01, 0.31, 0.01)):
  sizes = {}
  for distance in distances:
    _, avg_size, median_size = get_neighbourhood_sizes(similarities, distance)
    sizes[round(distance, 3)] = round(avg_size, 5)
  return sizes

def get_median_neighbourhood_sizes(similarities, distances=np.arange(0.01, 0.31, 0.01)):
  sizes = {}
  for distance in distances:
    _, avg_size, median_size = get_neighbourhood_sizes(similarities, distance)
    sizes[round(distance, 3)] = round(median_size, 5)
  return sizes


jitter for plotting

In [14]:
def rand_jitter(arr, s=0.01):
    stdev = s*(max(arr)-min(arr))
    return arr + np.random.randn(len(arr)) * stdev

In [15]:
def retention_rate(df_list):

  retention_rates = []
  for di in range(0, len(df_list)):
    d = df_list[di]
    if d.year.unique()[0] == 1990:
      rr = 1.0
    else:
      d_prev = df_list[di-1]
      rr = len(set(d.field.unique()).intersection(d_prev.field.unique()))/d.field.nunique()
    retention_rates.append(rr)
  
  return retention_rates




# Read data

Read embedding files


In [16]:
if embedding_type in ["forembbyarticle", "forembbyjournal"]:
  embeddingfiles = [f for f in listdir(DATA_DIR_yrs) if isfile(join(DATA_DIR_yrs, f))]
elif embedding_type == "title":
  embeddingfiles = glob.glob(os.path.join(DATA_DIR_yrs, "*{}.csv".format(f_code)))
print("There are", str(len(embeddingfiles)), "embedding files")


There are 10 embedding files


In [17]:
year2embeddings = dict() # dictionary that maps a year_start to the embedding file for that (year, year + year_intervals) period
for f in embeddingfiles:
  if embedding_type in ["forembbyarticle", "forembbyjournal"]:
    m = re.match(r'wv_(\d\d)_(\d\d)', f)
  elif embedding_type == "title":
    m = re.match(r'.*?wv_(\d\d)_(\d\d).*?', f)
  year_start = m.group(1)
  #year_end = m.group(2)
  #print("file", f, "Year start:", str(year_start), "Year end:", str(year_end))
  year2embeddings[int(year_start)] = f
print(str(year2embeddings))

{2: 'wv_02_04.txt', 99: 'wv_99_01.txt', 96: 'wv_96_98.txt', 93: 'wv_93_95.txt', 90: 'wv_90_92.txt', 17: 'wv_17_19.txt', 14: 'wv_14_16.txt', 11: 'wv_11_13.txt', 8: 'wv_08_10.txt', 5: 'wv_05_07.txt'}


## Load embeddings


In [18]:
all_embeddings = dict() # Dictionary that maps a year_start with the embeddings for the (year, year + year_intervals-1) period; 
# the embeddings are dictionaries that map a FoR to the list of its embedding's numeric values
#print(str(first_year), "!")
#print(str(last_year), "!")
for y in range(first_year, last_year, year_intervals):
  #print("Year", str(y))
  y0 = int(str(y)[2:4])
  #print("y0=", str(y0))
  embeddings = load_embeddings(year2embeddings[y0])
  all_embeddings[y] = embeddings
  

In [19]:
# test
#pair_sim = compute_pairwise_similarities()
#get_neighbourhood_sizes(similarities=pair_sim[0], distance=0.2)

# Qualitative neighbourhood analysis for selected FoRs

In [20]:
NHOOD = 0.8

## Computer hardware

In [21]:
YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
df_ch_res = pd.DataFrame()
list_of_years = []
list_of_fields = []
list_of_sims = []

for y in YEAR_LIST:
  y_res = compute_pairwise_similarities(all_embeddings[y])
  print("---------------------------------------------")

  for k,v in y_res[0].items():
    if k == "Computer Hardware":
      for_res = v 
      for f in for_res:
        if for_res[f] >= NHOOD and f != "Computer Hardware":
          print(y, "CH sim =", f, "->", for_res[f])
          list_of_years.append(y)
          list_of_fields.append(f)
          list_of_sims.append(for_res[f])
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
df_ch_res["year"] = list_of_years 
df_ch_res["field"] = list_of_fields
df_ch_res["cosine"] = list_of_sims  

---------------------------------------------
1990 CH sim = Building -> 0.9936205995964467
1990 CH sim = Aerospace Engineering -> 0.9850150578539109
1990 CH sim = Statistics -> 0.9793698207185287
1990 CH sim = Other Engineering -> 0.9701554299718732
1990 CH sim = Applied Mathematics -> 0.970042825247036
1990 CH sim = Manufacturing Engineering -> 0.9697176447491981
1990 CH sim = Distributed Computing -> 0.9696352279255454
1990 CH sim = Automotive Engineering -> 0.9688719625521501
1990 CH sim = Communication and Media Studies -> 0.9681671816282849
1990 CH sim = Film, Television and Digital Media -> 0.9668313366819836
1990 CH sim = Artificial Intelligence and Image Processing -> 0.9653804038469446
1990 CH sim = Civil Engineering -> 0.9652131096105012
1990 CH sim = Electrical and Electronic Engineering -> 0.9639489031825157
1990 CH sim = Other Agricultural and Veterinary Sciences -> 0.9624409712876945
1990 CH sim = Classical Physics -> 0.9620789372452213
1990 CH sim = Pure Mathematics -> 0

In [22]:
df_ch_res.head()

Unnamed: 0,year,field,cosine
0,1990,Building,0.993621
1,1990,Aerospace Engineering,0.985015
2,1990,Statistics,0.97937
3,1990,Other Engineering,0.970155
4,1990,Applied Mathematics,0.970043


In [23]:
ch_x = pd.crosstab(df_ch_res.year, df_ch_res.field, margins=True)
ch_x

field,Aerospace Engineering,Agricultural Biotechnology,"Agriculture, Land and Farm Management",Applied Economics,Applied Mathematics,Architecture,Art Theory and Criticism,Artificial Intelligence and Image Processing,Astronomical and Space Sciences,Atmospheric Sciences,"Atomic, Molecular, Nuclear, Particle and Plasma Physics",Automotive Engineering,Building,Chemical Engineering,Civil Engineering,Classical Physics,Communication and Media Studies,Communications Technologies,Computation Theory and Mathematics,Computer Software,Condensed Matter Physics,Curatorial and Related Studies,Data Format,Design Practice and Management,Distributed Computing,Ecological Applications,Econometrics,Education Systems,Electrical and Electronic Engineering,Environmental Biotechnology,Environmental Engineering,"Film, Television and Digital Media",Geochemistry,Geology,Geomatic Engineering,Geophysics,Industrial Biotechnology,Information Systems,Interdisciplinary Engineering,Journalism and Professional Writing,Law,Library and Information Studies,Literary Studies,Manufacturing Engineering,Maritime Engineering,Materials Engineering,Mathematical Physics,Mechanical Engineering,Nanotechnology,Numerical and Computational Mathematics,Oceanography,Optical Physics,Other Agricultural and Veterinary Sciences,Other Biological Sciences,Other Built Environment and Design,Other Chemical Sciences,Other Engineering,Other Environmental Sciences,"Other Language, Communication and Culture",Other Law and Legal Studies,Other Philosophy and Religious Studies,Other Physical Sciences,Other Psychology and Cognitive Sciences,Other Technology,Physical Chemistry (incl. Structural),Physical Geography and Environmental Geoscience,Psychology,Pure Mathematics,Quantum Physics,Resources Engineering and Extractive Metallurgy,Statistics,Theoretical and Computational Chemistry,Tourism,Transportation and Freight Services,Urban and Regional Planning,Visual Arts and Crafts,All
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1
1990,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,0,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,66
1993,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,0,1,1,0,0,0,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,64
1996,1,1,0,0,1,1,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,0,0,1,0,0,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,0,1,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,1,1,1,1,0,1,1,1,50
1999,1,0,0,0,1,0,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,0,0,0,1,0,0,1,0,0,1,1,0,1,1,0,0,1,0,1,1,1,1,1,1,1,0,1,0,0,1,0,1,0,0,0,0,0,1,1,1,0,0,1,1,1,1,1,0,1,0,1,45
2002,1,0,0,0,1,0,0,1,1,0,1,1,1,0,1,1,0,1,1,1,1,0,1,1,1,0,0,0,1,1,0,1,0,0,0,1,0,1,1,0,0,0,0,1,1,1,1,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,1,0,0,36
2005,1,0,0,0,1,0,1,1,1,0,1,1,0,0,1,1,1,1,1,1,1,0,1,1,1,0,0,0,1,0,0,1,0,0,0,1,0,1,1,0,0,0,0,1,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,1,1,0,1,0,0,36
2008,1,0,0,0,1,0,0,0,1,0,1,1,0,0,1,0,0,1,1,1,1,0,1,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,26
2011,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,1,0,1,1,1,1,0,1,1,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,1,0,0,26
2014,1,0,0,0,1,0,0,0,1,0,1,1,0,0,1,1,0,1,1,1,1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,1,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,27
2017,1,0,0,0,1,0,0,0,1,0,1,1,1,0,1,1,0,1,1,1,1,0,1,0,1,0,0,0,1,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,1,1,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,1,0,0,0,0,29


In [24]:
ch_x.loc["All"][ch_x.loc["All"] == 10].to_frame().index.to_list()

['Aerospace Engineering',
 'Applied Mathematics',
 'Astronomical and Space Sciences',
 'Atomic, Molecular, Nuclear, Particle and Plasma Physics',
 'Civil Engineering',
 'Communications Technologies',
 'Computation Theory and Mathematics',
 'Computer Software',
 'Condensed Matter Physics',
 'Data Format',
 'Distributed Computing',
 'Electrical and Electronic Engineering',
 'Manufacturing Engineering',
 'Maritime Engineering',
 'Mathematical Physics',
 'Mechanical Engineering',
 'Numerical and Computational Mathematics',
 'Optical Physics',
 'Pure Mathematics',
 'Quantum Physics',
 'Theoretical and Computational Chemistry']

In [25]:
ch_variable_neighbours = ch_x.loc["All"][ch_x.loc["All"] < 10].to_frame().index.to_list()
ch_variable_neighbours

['Agricultural Biotechnology',
 'Agriculture, Land and Farm Management',
 'Applied Economics',
 'Architecture',
 'Art Theory and Criticism',
 'Artificial Intelligence and Image Processing',
 'Atmospheric Sciences',
 'Automotive Engineering',
 'Building',
 'Chemical Engineering',
 'Classical Physics',
 'Communication and Media Studies',
 'Curatorial and Related Studies',
 'Design Practice and Management',
 'Ecological Applications',
 'Econometrics',
 'Education Systems',
 'Environmental Biotechnology',
 'Environmental Engineering',
 'Film, Television and Digital Media',
 'Geochemistry',
 'Geology',
 'Geomatic Engineering',
 'Geophysics',
 'Industrial Biotechnology',
 'Information Systems',
 'Interdisciplinary Engineering',
 'Journalism and Professional Writing',
 'Law',
 'Library and Information Studies',
 'Literary Studies',
 'Materials Engineering',
 'Nanotechnology',
 'Oceanography',
 'Other Agricultural and Veterinary Sciences',
 'Other Biological Sciences',
 'Other Built Environmen

In [26]:
px.scatter(df_ch_res[df_ch_res.field.isin(ch_variable_neighbours)], x="year", y="cosine", text="field")

## AI and image processing

In [27]:
'Artificial Intelligence and Image Processing'
YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
df_ai_res = pd.DataFrame()
list_of_years = []
list_of_fields = []
list_of_sims = []

for y in YEAR_LIST:
  y_res = compute_pairwise_similarities(all_embeddings[y])
  print("---------------------------------------------")
  for k,v in y_res[0].items():
    if k == 'Artificial Intelligence and Image Processing':
      for_res = v 
      for f in for_res:
        if for_res[f] >= NHOOD and f != 'Artificial Intelligence and Image Processing':
          print(y, "AI sim =", f, "->", for_res[f])
          list_of_years.append(y)
          list_of_fields.append(f)
          list_of_sims.append(for_res[f])

df_ai_res["year"] = list_of_years 
df_ai_res["field"] = list_of_fields
df_ai_res["cosine"] = list_of_sims  

---------------------------------------------
1990 AI sim = Electrical and Electronic Engineering -> 0.9720000536127029
1990 AI sim = Computer Hardware -> 0.9653804038469446
1990 AI sim = Statistics -> 0.9578049469955123
1990 AI sim = Building -> 0.9559127864395447
1990 AI sim = Other Engineering -> 0.953513533309985
1990 AI sim = Automotive Engineering -> 0.9517334608672834
1990 AI sim = Manufacturing Engineering -> 0.9516128625726559
1990 AI sim = Applied Mathematics -> 0.9514577804471543
1990 AI sim = Aerospace Engineering -> 0.9461605614182533
1990 AI sim = Communications Technologies -> 0.9453070915470897
1990 AI sim = Distributed Computing -> 0.9440077797513826
1990 AI sim = Classical Physics -> 0.9434405393278887
1990 AI sim = Design Practice and Management -> 0.9418290169534773
1990 AI sim = Film, Television and Digital Media -> 0.9416533054741523
1990 AI sim = Pure Mathematics -> 0.941580281547729
1990 AI sim = Communication and Media Studies -> 0.9410530046008944
1990 AI sim 

In [28]:
ai_x = pd.crosstab(df_ai_res.year, df_ai_res.field, margins=True)
ai_x

field,Aerospace Engineering,Agricultural Biotechnology,"Agriculture, Land and Farm Management",Applied Economics,Applied Mathematics,Architecture,Art Theory and Criticism,Astronomical and Space Sciences,"Atomic, Molecular, Nuclear, Particle and Plasma Physics",Automotive Engineering,Biomedical Engineering,Building,Chemical Engineering,Civil Engineering,Classical Physics,Clinical Sciences,Commercial Services,Communication and Media Studies,Communications Technologies,Computation Theory and Mathematics,Computer Hardware,Computer Software,Condensed Matter Physics,Curatorial and Related Studies,Data Format,Design Practice and Management,Distributed Computing,Education Systems,Electrical and Electronic Engineering,Environmental Biotechnology,Environmental Engineering,"Film, Television and Digital Media",Geology,Geomatic Engineering,Industrial Biotechnology,Information Systems,Inorganic Chemistry,Interdisciplinary Engineering,Journalism and Professional Writing,Law,Library and Information Studies,Literary Studies,Macromolecular and Materials Chemistry,Manufacturing Engineering,Maritime Engineering,Materials Engineering,Mathematical Physics,Mechanical Engineering,Nanotechnology,Neurosciences,Numerical and Computational Mathematics,Optical Physics,Other Agricultural and Veterinary Sciences,Other Built Environment and Design,Other Education,Other Engineering,Other Environmental Sciences,"Other Language, Communication and Culture",Other Law and Legal Studies,Other Philosophy and Religious Studies,Other Physical Sciences,Other Psychology and Cognitive Sciences,Other Studies In Creative Arts and Writing,Other Technology,Physical Chemistry (incl. Structural),Psychology,Pure Mathematics,Quantum Physics,Resources Engineering and Extractive Metallurgy,Statistics,Theoretical and Computational Chemistry,Tourism,Transportation and Freight Services,Urban and Regional Planning,Visual Arts and Crafts,All
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1
1990,1,1,1,1,1,1,0,1,1,1,0,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,1,1,1,0,1,1,1,0,1,0,1,1,1,1,1,1,0,1,1,1,0,0,1,0,0,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,57
1993,1,1,0,0,1,1,0,1,1,1,1,1,0,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,0,0,1,1,0,1,0,0,0,0,0,1,1,1,1,1,1,0,1,1,1,0,0,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,0,1,1,1,52
1996,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,0,1,0,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1,0,1,0,1,1,0,0,1,1,1,1,1,1,1,0,1,1,1,58
1999,1,1,0,0,1,0,1,0,1,1,1,1,0,1,1,1,0,1,1,1,1,1,0,0,1,1,1,1,1,0,0,1,0,0,1,1,0,1,0,0,1,1,0,1,0,1,1,1,1,1,1,1,0,1,0,1,0,0,1,0,1,1,0,1,1,1,1,1,0,1,1,0,1,0,1,49
2002,1,1,0,0,1,0,0,0,1,1,0,1,0,1,1,0,1,0,1,1,1,1,0,0,1,0,1,1,1,1,0,1,0,0,1,1,0,1,0,0,1,0,0,1,0,1,1,1,0,0,1,0,0,1,0,1,0,0,1,0,0,1,0,1,1,0,1,1,0,1,0,0,1,0,1,39
2005,1,1,0,0,1,0,1,1,1,1,1,0,0,1,1,0,0,1,1,1,1,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,0,0,0,0,1,0,1,1,0,1,1,1,1,1,0,1,1,0,1,1,0,38
2008,0,1,0,0,1,0,1,0,1,1,0,0,0,1,1,1,1,1,1,1,0,1,0,0,0,1,1,0,1,1,0,1,0,0,0,1,0,1,0,0,1,0,0,1,0,1,0,1,0,0,1,0,0,1,0,1,0,1,1,0,1,1,0,1,1,0,1,1,0,1,0,0,0,0,0,36
2011,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,0,1,1,1,0,1,0,0,0,1,0,0,0,0,0,17
2014,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,13
2017,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,12


In [29]:
ai_variable_neighbours = ai_x.loc["All"][ai_x.loc["All"] < 10].to_frame().index.to_list()
ai_variable_neighbours

['Aerospace Engineering',
 'Agricultural Biotechnology',
 'Agriculture, Land and Farm Management',
 'Applied Economics',
 'Applied Mathematics',
 'Architecture',
 'Art Theory and Criticism',
 'Astronomical and Space Sciences',
 'Atomic, Molecular, Nuclear, Particle and Plasma Physics',
 'Automotive Engineering',
 'Biomedical Engineering',
 'Building',
 'Chemical Engineering',
 'Civil Engineering',
 'Clinical Sciences',
 'Commercial Services',
 'Communication and Media Studies',
 'Communications Technologies',
 'Computation Theory and Mathematics',
 'Computer Hardware',
 'Computer Software',
 'Condensed Matter Physics',
 'Curatorial and Related Studies',
 'Data Format',
 'Design Practice and Management',
 'Distributed Computing',
 'Education Systems',
 'Electrical and Electronic Engineering',
 'Environmental Biotechnology',
 'Environmental Engineering',
 'Film, Television and Digital Media',
 'Geology',
 'Geomatic Engineering',
 'Industrial Biotechnology',
 'Information Systems',
 'Inor

In [30]:
px.scatter(df_ai_res[df_ai_res.field.isin(ai_variable_neighbours)], x="year", y="cosine", text="field")

In [31]:
px.scatter(df_ai_res, x="year", y="cosine", text="field")

In [32]:
y_2_mean = df_ai_res.groupby("year")["cosine"].mean().to_frame()
y_2_mean

Unnamed: 0_level_0,cosine
year,Unnamed: 1_level_1
1990,0.900381
1993,0.892145
1996,0.901706
1999,0.876828
2002,0.858672
2005,0.863798
2008,0.858029
2011,0.845681
2014,0.853474
2017,0.833197


In [33]:
df_ai_res["mean_cosine"] = df_ai_res.year.apply(lambda x : y_2_mean.loc[x])
df_ai_res

Unnamed: 0,year,field,cosine,mean_cosine
0,1990,Electrical and Electronic Engineering,0.972000,0.900381
1,1990,Computer Hardware,0.965380,0.900381
2,1990,Statistics,0.957805,0.900381
3,1990,Building,0.955913,0.900381
4,1990,Other Engineering,0.953514,0.900381
...,...,...,...,...
366,2017,Other Psychology and Cognitive Sciences,0.820234,0.833197
367,2017,Electrical and Electronic Engineering,0.819171,0.833197
368,2017,Agricultural Biotechnology,0.805690,0.833197
369,2017,Other Law and Legal Studies,0.804204,0.833197


In [34]:
px.scatter(df_ai_res, x="mean_cosine", y="cosine", text="year")

# Path similarity

In [35]:
from google.colab import auth
auth.authenticate_user()
print("Authenticated!")
# project id
pid = "springer-nature-analytics"

Authenticated!


In [36]:
%%bigquery --project $pid df_for_relations

SELECT *, 
  CASE
    WHEN for_cluster IN ("PCE", "EE", "MIC", "BB", "MHS", "AVS") THEN "STEM"
    ELSE "HSS"
    END AS for_area
    FROM (
  SELECT *, 

  -- clusters documented here: https://www.newcastle.edu.au/research-and-innovation/resources/era/for-codes
    CASE 
      WHEN f1 IN ("Physical Sciences", "Chemical Sciences", "Earth Sciences") THEN "PCE" -- Physical, Chemical and Earth Sciences (PCE) 
      WHEN f1 IN ("Built Environment and Design", "Law and Legal Studies", "Studies in Creative Arts and Writing", "Language, Communication and Culture",
      "History and Archaeology", "Philosophy and Religious Studies") THEN "HCA" -- Humanities and Creative Arts (HCA)
      WHEN f1 IN ("Environmental Sciences", "Engineering", "Technology", "Medical Biotechnology") 
        AND f2 NOT IN ("Agricultural Biotechnology", "Environmental Biotechnology", "Industrial Biotechnology", "Medical Biotechnology") THEN "EE" -- Engineering and Environmental Sciences (EE)
      WHEN f1 IN ("Education", "Studies in Human Society") THEN "EHS" -- Education and Human Society (EHS) 
      WHEN f1 IN ("Economics", "Commerce, Management, Tourism and Services") THEN "EC" -- Economics and Commerce (EC) 
      WHEN f1 IN ("Mathematical Sciences", "Information and Computing Sciences") THEN "MIC" -- Mathematics, Information and Communication Sciences (MIC) 
      WHEN (f1 IN ("Biological Sciences", "Agriculture and Veterinary Sciences") 
        OR f2 IN ("Agricultural Biotechnology", "Environmental Biotechnology", "Industrial Biotechnology") ) THEN "BB" -- Biological Sciences and Biotechnology (BB) 
      WHEN (f1 IN ("Medical and Health Sciences", "Psychology and Cognitive Sciences") OR f2 IN ("Medical Biotechnology")) THEN "MHS" -- Medical and Health Sciences (MHS) 
      WHEN f1 = "Agricultural and Veterinary Sciences" THEN "AVS" -- not found in the clusters documented above


      END AS for_cluster
  FROM (  
    SELECT DISTINCT(f.second_level.name) AS f2, f.first_level.name AS f1 FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
    LEFT JOIN UNNEST(`for`) AS f
    WHERE year = 2019
  ) WHERE f2 IS NOT NULL 
  ORDER BY f1
)

In [37]:
df_for_relations

Unnamed: 0,f2,f1,for_cluster,for_area
0,Horticultural Production,Agricultural and Veterinary Sciences,AVS,STEM
1,Forestry Sciences,Agricultural and Veterinary Sciences,AVS,STEM
2,Animal Production,Agricultural and Veterinary Sciences,AVS,STEM
3,Fisheries Sciences,Agricultural and Veterinary Sciences,AVS,STEM
4,Crop and Pasture Production,Agricultural and Veterinary Sciences,AVS,STEM
...,...,...,...,...
149,Communications Technologies,Technology,EE,STEM
150,Industrial Biotechnology,Technology,BB,STEM
151,Environmental Biotechnology,Technology,BB,STEM
152,Other Technology,Technology,EE,STEM


In [38]:
df_for_relations[df_for_relations.f2=="Forestry Sciences"].f1.item()

'Agricultural and Veterinary Sciences'

In [39]:
import math 

def for_sim(f, ff, df, sim_type="naive"):
  
  assert sim_type in ("naive", "lc", "basic")

  """ Function for path similiarity, FoR codes. See also https://link.springer.com/article/10.1007/s10844-017-0479-y
  :param f: a level 2 FoR code
  :param f: another level 2 FoR code
  :param df: a pandas data frame with mappings between ontology levels
  :param lc: use basic path similarity or (True) Leacock & Chodrow
  """

  max_depth = len(df.columns)
  if f == ff:
    dist = 0.0
  
  elif (df[df.f2 == f].f1.item()) == (df[df.f2 == ff].f1.item()):
    dist = 1.0
  
  elif (df[df.f2 == f].for_cluster.item()) == (df[df.f2 == ff].for_cluster.item()):
    dist = 2.0

  elif (df[df.f2 == f].for_area.item()) == (df[df.f2 == ff].for_area.item()):
    dist = 3.0

  else:
    dist = 4.0
  
  if sim_type == "naive":
    sim = 1 - (dist/max_depth) # old version. 
  elif sim_type == "lc":
    sim = -math.log(dist/(max_depth*2))
  else:
    sim = (max_depth * 2) - dist 
  return sim 



In [40]:
for_sim(f="Medical Biochemistry and Metabolomics", ff="Automotive Engineering", df=df_for_relations)

0.25

In [41]:
for_sim(f="Nanotechnology", ff="Computer Hardware", df=df_for_relations)

0.75

In [42]:
for_sim(f="Medical Biochemistry and Metabolomics", ff="Automotive Engineering", df=df_for_relations, sim_type="lc")

0.9808292530117262

## Computer Hardware

### 1990

In [43]:
df_ch_res_90 = df_ch_res[df_ch_res["year"] == 1990].copy()
df_ch_res_90["ch_path_sim"] = df_ch_res_90.field.apply(lambda x: for_sim(f="Computer Hardware", ff=x, df=df_for_relations))
df_ch_res_90["ch_path_sim_jitter"] = rand_jitter(df_ch_res_90["ch_path_sim"]) #.apply(lambda x: rand_jitter(x))
df_ch_res_90

Unnamed: 0,year,field,cosine,ch_path_sim,ch_path_sim_jitter
0,1990,Building,0.993621,0.00,-0.002966
1,1990,Aerospace Engineering,0.985015,0.50,0.494162
2,1990,Statistics,0.979370,0.25,0.244386
3,1990,Other Engineering,0.970155,0.50,0.494064
4,1990,Applied Mathematics,0.970043,0.25,0.239151
...,...,...,...,...,...
61,1990,Psychology,0.818343,0.25,0.244529
62,1990,Applied Economics,0.815873,0.00,0.011821
63,1990,"Atomic, Molecular, Nuclear, Particle and Plasm...",0.814487,0.25,0.253876
64,1990,Other Chemical Sciences,0.808469,0.25,0.239430


In [44]:
#px.scatter(df_ch_res_90[df_ch_res_90.field.isin(ch_variable_neighbours)], x="ch_path_sim_jitter", y="cosine", 
#           text="field", title="1990 Computer Hardware: cosine vs. path similarity")

px.scatter(df_ch_res_90, x="ch_path_sim_jitter", y="cosine", 
           text="field", title="1990 Computer Hardware: cosine vs. path similarity")

### 1993

In [45]:
df_ch_res_93 = df_ch_res[df_ch_res["year"] == 1993].copy()
df_ch_res_93["ch_path_sim"] = df_ch_res_93.field.apply(lambda x: for_sim(f="Computer Hardware", ff=x, df=df_for_relations))
df_ch_res_93["ch_path_sim_jitter"] = rand_jitter(df_ch_res_93["ch_path_sim"]) #.apply(lambda x: rand_jitter(x))


### 1996

In [46]:
df_ch_res_96 = df_ch_res[df_ch_res["year"] == 1996].copy()
df_ch_res_96["ch_path_sim"] = df_ch_res_96.field.apply(lambda x: for_sim(f="Computer Hardware", ff=x, df=df_for_relations))
df_ch_res_96["ch_path_sim_jitter"] = rand_jitter(df_ch_res_96["ch_path_sim"]) #.apply(lambda x: rand_jitter(x))


### 1999

In [47]:
df_ch_res_99 = df_ch_res[df_ch_res["year"] == 1999].copy()
df_ch_res_99["ch_path_sim"] = df_ch_res_99.field.apply(lambda x: for_sim(f="Computer Hardware", ff=x, df=df_for_relations))
df_ch_res_99["ch_path_sim_jitter"] = rand_jitter(df_ch_res_99["ch_path_sim"]) #.apply(lambda x: rand_jitter(x))


### 2002

In [48]:
df_ch_res_02 = df_ch_res[df_ch_res["year"] == 2002].copy()
df_ch_res_02["ch_path_sim"] = df_ch_res_02.field.apply(lambda x: for_sim(f="Computer Hardware", ff=x, df=df_for_relations))
df_ch_res_02["ch_path_sim_jitter"] = rand_jitter(df_ch_res_02["ch_path_sim"]) #.apply(lambda x: rand_jitter(x))
df_ch_res_02

Unnamed: 0,year,field,cosine,ch_path_sim,ch_path_sim_jitter
225,2002,Numerical and Computational Mathematics,0.992612,0.25,0.24567
226,2002,Mechanical Engineering,0.991234,0.5,0.4967
227,2002,Pure Mathematics,0.988069,0.25,0.246142
228,2002,Mathematical Physics,0.981244,0.25,0.249723
229,2002,Computation Theory and Mathematics,0.978522,0.25,0.252572
230,2002,Communications Technologies,0.976228,0.75,0.737881
231,2002,Applied Mathematics,0.96622,0.25,0.236457
232,2002,Aerospace Engineering,0.965472,0.5,0.504573
233,2002,Manufacturing Engineering,0.963961,0.5,0.498196
234,2002,Quantum Physics,0.958322,0.25,0.244374


In [49]:
px.scatter(df_ch_res_02[df_ch_res_02.field.isin(ch_variable_neighbours)], x="ch_path_sim_jitter", y="cosine", 
           text="field", title="2002 Computer Hardware: cosine vs. path similarity")

In [50]:
df_ch_res_02.cosine.mean()

0.9028578896315399

### 2005

In [51]:
df_ch_res_05 = df_ch_res[df_ch_res["year"] == 2005].copy()
df_ch_res_05["ch_path_sim"] = df_ch_res_05.field.apply(lambda x: for_sim(f="Computer Hardware", ff=x, df=df_for_relations))
df_ch_res_05["ch_path_sim_jitter"] = rand_jitter(df_ch_res_05["ch_path_sim"]) #.apply(lambda x: rand_jitter(x))


### 2008

In [52]:
df_ch_res_08 = df_ch_res[df_ch_res["year"] == 2008].copy()
df_ch_res_08["ch_path_sim"] = df_ch_res_08.field.apply(lambda x: for_sim(f="Computer Hardware", ff=x, df=df_for_relations))
df_ch_res_08["ch_path_sim_jitter"] = rand_jitter(df_ch_res_08["ch_path_sim"]) #.apply(lambda x: rand_jitter(x))


### 2011

In [53]:
df_ch_res_11 = df_ch_res[df_ch_res["year"] == 2011].copy()
df_ch_res_11["ch_path_sim"] = df_ch_res_11.field.apply(lambda x: for_sim(f="Computer Hardware", ff=x, df=df_for_relations))
df_ch_res_11["ch_path_sim_jitter"] = rand_jitter(df_ch_res_11["ch_path_sim"]) #.apply(lambda x: rand_jitter(x))


### 2014

In [54]:
df_ch_res_14 = df_ch_res[df_ch_res["year"] == 2014].copy()
df_ch_res_14["ch_path_sim"] = df_ch_res_14.field.apply(lambda x: for_sim(f="Computer Hardware", ff=x, df=df_for_relations))
df_ch_res_14["ch_path_sim_jitter"] = rand_jitter(df_ch_res_14["ch_path_sim"]) #.apply(lambda x: rand_jitter(x))


### 2017

In [55]:
df_ch_res_17 = df_ch_res[df_ch_res["year"] == 2017].copy()
df_ch_res_17["ch_path_sim"] = df_ch_res_17.field.apply(lambda x: for_sim(f="Computer Hardware", ff=x, df=df_for_relations))
df_ch_res_17["ch_path_sim_jitter"] = rand_jitter(df_ch_res_17["ch_path_sim"]) #.apply(lambda x: rand_jitter(x))
df_ch_res_17

Unnamed: 0,year,field,cosine,ch_path_sim,ch_path_sim_jitter
376,2017,Mathematical Physics,0.982275,0.25,0.23783
377,2017,Aerospace Engineering,0.960566,0.5,0.502466
378,2017,Mechanical Engineering,0.960404,0.5,0.506658
379,2017,Quantum Physics,0.957612,0.25,0.254709
380,2017,Communications Technologies,0.952185,0.75,0.754577
381,2017,Electrical and Electronic Engineering,0.949532,0.5,0.501853
382,2017,Pure Mathematics,0.944129,0.25,0.261853
383,2017,Numerical and Computational Mathematics,0.936927,0.25,0.253009
384,2017,Manufacturing Engineering,0.935971,0.5,0.512087
385,2017,Applied Mathematics,0.934776,0.25,0.24542


In [56]:
#px.scatter(df_ch_res_17[df_ch_res_17.field.isin(ch_variable_neighbours)], x="ch_path_sim_jitter", y="cosine", 
#           text="field", title="2017 Computer Hardware: cosine vs. path similarity")

px.scatter(df_ch_res_17, x="ch_path_sim_jitter", y="cosine", 
           text="field", title="2017 Computer Hardware: cosine vs. path similarity")

In [57]:
df_ch_res_17.cosine.mean()

0.9005559655872458

### Trend chart

In [58]:
len(df_ch_res_90.cosine)

66

In [59]:
len(df_ch_res_02.cosine)

36

In [60]:
len(df_ch_res_17.cosine)

29

In [61]:
ch_trend_df = pd.DataFrame({"year": [1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017],
                            "nbrs": [len(df_ch_res_90.cosine),
                                     len(df_ch_res_93.cosine),
                                     len(df_ch_res_96.cosine), 
                                     len(df_ch_res_99.cosine),
                                     len(df_ch_res_02.cosine), 
                                     len(df_ch_res_05.cosine),
                                     len(df_ch_res_08.cosine),
                                     len(df_ch_res_11.cosine),
                                     len(df_ch_res_14.cosine),
                                     len(df_ch_res_17.cosine)],
                            "density": [
                                        len(df_ch_res_90[df_ch_res_90.ch_path_sim >= 0.5]),
                                        len(df_ch_res_93[df_ch_res_93.ch_path_sim >= 0.5]),
                                        len(df_ch_res_96[df_ch_res_96.ch_path_sim >= 0.5]),
                                        len(df_ch_res_99[df_ch_res_99.ch_path_sim >= 0.5]),
                                        len(df_ch_res_02[df_ch_res_02.ch_path_sim >= 0.5]),
                                        len(df_ch_res_05[df_ch_res_05.ch_path_sim >= 0.5]),
                                        len(df_ch_res_08[df_ch_res_08.ch_path_sim >= 0.5]),
                                        len(df_ch_res_11[df_ch_res_11.ch_path_sim >= 0.5]),
                                        len(df_ch_res_14[df_ch_res_14.ch_path_sim >= 0.5]),
                                        len(df_ch_res_17[df_ch_res_17.ch_path_sim >= 0.5])
                            ],
                            "retention": retention_rate([
                                                         df_ch_res_90,
                                                         df_ch_res_93,
                                                         df_ch_res_96,
                                                         df_ch_res_99,
                                                         df_ch_res_02,
                                                         df_ch_res_05,
                                                         df_ch_res_08,
                                                         df_ch_res_11,
                                                         df_ch_res_14,
                                                         df_ch_res_17
                                                         ])
                            })

In [62]:
px.scatter(ch_trend_df, x="density", y="nbrs", 
           text="year", labels={"nbrs": "Num. neighbours > cosine cutoff",
                                "density": "Num. neighbours < path sim cutoff"},
           size="retention",
           title="Computer Hardware")

## AI and image processing

### 1990

In [63]:
df_ai_res_90 = df_ai_res[df_ai_res["year"] == 1990].copy()
df_ai_res_90["ai_path_sim"] = df_ai_res_90.field.apply(lambda x: for_sim(f="Artificial Intelligence and Image Processing", ff=x, df=df_for_relations))
df_ai_res_90

Unnamed: 0,year,field,cosine,mean_cosine,ai_path_sim
0,1990,Electrical and Electronic Engineering,0.972,0.900381,0.25
1,1990,Computer Hardware,0.96538,0.900381,0.25
2,1990,Statistics,0.957805,0.900381,0.5
3,1990,Building,0.955913,0.900381,0.0
4,1990,Other Engineering,0.953514,0.900381,0.25
5,1990,Automotive Engineering,0.951733,0.900381,0.25
6,1990,Manufacturing Engineering,0.951613,0.900381,0.25
7,1990,Applied Mathematics,0.951458,0.900381,0.5
8,1990,Aerospace Engineering,0.946161,0.900381,0.25
9,1990,Communications Technologies,0.945307,0.900381,0.25


In [64]:
#px.scatter(df_ai_res_90[df_ai_res_90.field.isin(ai_variable_neighbours)], x="ai_path_sim", y="cosine", 
#           text="field", title="1990 AI/image processing: cosine vs. path similarity")

In [65]:
px.scatter(df_ai_res_90, x="ai_path_sim", y="cosine", 
           text="field", title="1990 AI/image processing: cosine vs. path similarity")

### 1993

In [66]:
df_ai_res_93 = df_ai_res[df_ai_res["year"] == 1993].copy()
df_ai_res_93["ai_path_sim"] = df_ai_res_93.field.apply(lambda x: for_sim(f="Artificial Intelligence and Image Processing", ff=x, df=df_for_relations))


### 1996

In [67]:
df_ai_res_96 = df_ai_res[df_ai_res["year"] == 1996].copy()
df_ai_res_96["ai_path_sim"] = df_ai_res_96.field.apply(lambda x: for_sim(f="Artificial Intelligence and Image Processing", ff=x, df=df_for_relations))
df_ai_res_96

Unnamed: 0,year,field,cosine,mean_cosine,ai_path_sim
109,1996,Classical Physics,0.971451,0.901706,0.25
110,1996,"Film, Television and Digital Media",0.971307,0.901706,0.0
111,1996,Communications Technologies,0.96412,0.901706,0.25
112,1996,Materials Engineering,0.962419,0.901706,0.25
113,1996,Electrical and Electronic Engineering,0.960192,0.901706,0.25
114,1996,Other Environmental Sciences,0.956404,0.901706,0.25
115,1996,Building,0.953157,0.901706,0.0
116,1996,Communication and Media Studies,0.949944,0.901706,0.0
117,1996,Computer Hardware,0.949165,0.901706,0.25
118,1996,Distributed Computing,0.949068,0.901706,0.75


### 1999

In [68]:
df_ai_res_99 = df_ai_res[df_ai_res["year"] == 1999].copy()
df_ai_res_99["ai_path_sim"] = df_ai_res_99.field.apply(lambda x: for_sim(f="Artificial Intelligence and Image Processing", ff=x, df=df_for_relations))
df_ai_res_99

Unnamed: 0,year,field,cosine,mean_cosine,ai_path_sim
167,1999,Statistics,0.980348,0.876828,0.5
168,1999,Information Systems,0.949398,0.876828,0.75
169,1999,Classical Physics,0.944627,0.876828,0.25
170,1999,Other Psychology and Cognitive Sciences,0.937998,0.876828,0.25
171,1999,Electrical and Electronic Engineering,0.927269,0.876828,0.25
172,1999,Computer Hardware,0.925593,0.876828,0.25
173,1999,Other Technology,0.923459,0.876828,0.25
174,1999,Communications Technologies,0.922493,0.876828,0.25
175,1999,Other Engineering,0.921675,0.876828,0.25
176,1999,Transportation and Freight Services,0.920618,0.876828,0.0


### 2002

In [69]:
df_ai_res_02 = df_ai_res[df_ai_res["year"] == 2002].copy()
df_ai_res_02["ai_path_sim"] = df_ai_res_02.field.apply(lambda x: for_sim(f="Artificial Intelligence and Image Processing", ff=x, df=df_for_relations))
df_ai_res_02

Unnamed: 0,year,field,cosine,mean_cosine,ai_path_sim
216,2002,Communications Technologies,0.921305,0.858672,0.25
217,2002,Information Systems,0.918099,0.858672,0.75
218,2002,Statistics,0.916852,0.858672,0.5
219,2002,Applied Mathematics,0.911047,0.858672,0.5
220,2002,Electrical and Electronic Engineering,0.905664,0.858672,0.25
221,2002,Other Engineering,0.901584,0.858672,0.25
222,2002,Environmental Biotechnology,0.894469,0.858672,0.25
223,2002,Computer Software,0.89169,0.858672,0.75
224,2002,Automotive Engineering,0.884729,0.858672,0.25
225,2002,Other Technology,0.883176,0.858672,0.25


In [70]:
px.scatter(df_ai_res_02[df_ai_res_02.field.isin(ai_variable_neighbours)], x="ai_path_sim", y="cosine", 
           text="field", title="2002 AI/image processing: cosine vs. path similarity")

### 2005

In [71]:
df_ai_res_05 = df_ai_res[df_ai_res["year"] == 2005].copy()
df_ai_res_05["ai_path_sim"] = df_ai_res_05.field.apply(lambda x: for_sim(f="Artificial Intelligence and Image Processing", ff=x, df=df_for_relations))


### 2008

In [72]:
df_ai_res_08 = df_ai_res[df_ai_res["year"] == 2008].copy()
df_ai_res_08["ai_path_sim"] = df_ai_res_08.field.apply(lambda x: for_sim(f="Artificial Intelligence and Image Processing", ff=x, df=df_for_relations))
df_ai_res_08

Unnamed: 0,year,field,cosine,mean_cosine,ai_path_sim
293,2008,Classical Physics,0.940859,0.858029,0.25
294,2008,Other Technology,0.926759,0.858029,0.25
295,2008,"Other Language, Communication and Culture",0.917282,0.858029,0.0
296,2008,Other Engineering,0.913805,0.858029,0.25
297,2008,Other Psychology and Cognitive Sciences,0.911156,0.858029,0.25
298,2008,Distributed Computing,0.902824,0.858029,0.75
299,2008,Statistics,0.897741,0.858029,0.5
300,2008,Other Law and Legal Studies,0.893289,0.858029,0.0
301,2008,Applied Mathematics,0.893045,0.858029,0.5
302,2008,Environmental Biotechnology,0.884992,0.858029,0.25


### 2011

In [73]:
df_ai_res_11 = df_ai_res[df_ai_res["year"] == 2011].copy()
df_ai_res_11["ai_path_sim"] = df_ai_res_11.field.apply(lambda x: for_sim(f="Artificial Intelligence and Image Processing", ff=x, df=df_for_relations))
df_ai_res_11

Unnamed: 0,year,field,cosine,mean_cosine,ai_path_sim
329,2011,Other Technology,0.883376,0.845681,0.25
330,2011,Statistics,0.881919,0.845681,0.5
331,2011,Library and Information Studies,0.877446,0.845681,0.75
332,2011,Other Engineering,0.875515,0.845681,0.25
333,2011,Other Psychology and Cognitive Sciences,0.869353,0.845681,0.25
334,2011,Classical Physics,0.866184,0.845681,0.25
335,2011,Commercial Services,0.86343,0.845681,0.0
336,2011,Other Studies In Creative Arts and Writing,0.86158,0.845681,0.0
337,2011,Art Theory and Criticism,0.84077,0.845681,0.0
338,2011,"Other Language, Communication and Culture",0.838921,0.845681,0.0


### 2014

In [74]:
df_ai_res_14 = df_ai_res[df_ai_res["year"] == 2014].copy()
df_ai_res_14["ai_path_sim"] = df_ai_res_14.field.apply(lambda x: for_sim(f="Artificial Intelligence and Image Processing", ff=x, df=df_for_relations))
df_ai_res_14

Unnamed: 0,year,field,cosine,mean_cosine,ai_path_sim
346,2014,Statistics,0.908982,0.853474,0.5
347,2014,"Other Language, Communication and Culture",0.884925,0.853474,0.0
348,2014,Other Engineering,0.883458,0.853474,0.25
349,2014,Art Theory and Criticism,0.871637,0.853474,0.0
350,2014,Other Psychology and Cognitive Sciences,0.867794,0.853474,0.25
351,2014,Other Education,0.858597,0.853474,0.0
352,2014,Environmental Biotechnology,0.85389,0.853474,0.25
353,2014,Classical Physics,0.83924,0.853474,0.25
354,2014,Communications Technologies,0.833727,0.853474,0.25
355,2014,Electrical and Electronic Engineering,0.831106,0.853474,0.25


### 2017

In [75]:
df_ai_res_17 = df_ai_res[df_ai_res["year"] == 2017].copy()
df_ai_res_17["ai_path_sim"] = df_ai_res_17.field.apply(lambda x: for_sim(f="Artificial Intelligence and Image Processing", ff=x, df=df_for_relations))
df_ai_res_17

Unnamed: 0,year,field,cosine,mean_cosine,ai_path_sim
359,2017,Statistics,0.883995,0.833197,0.5
360,2017,Classical Physics,0.862143,0.833197,0.25
361,2017,Other Engineering,0.848404,0.833197,0.25
362,2017,Environmental Biotechnology,0.842252,0.833197,0.25
363,2017,Library and Information Studies,0.841561,0.833197,0.75
364,2017,Commercial Services,0.836394,0.833197,0.0
365,2017,"Other Language, Communication and Culture",0.832218,0.833197,0.0
366,2017,Other Psychology and Cognitive Sciences,0.820234,0.833197,0.25
367,2017,Electrical and Electronic Engineering,0.819171,0.833197,0.25
368,2017,Agricultural Biotechnology,0.80569,0.833197,0.25


In [76]:
#px.scatter(df_ai_res_17[df_ai_res_17.field.isin(ai_variable_neighbours)], x="ai_path_sim", y="cosine", 
#           text="field", title="2017 AI/image processing: cosine vs. path similarity")

In [77]:
px.scatter(df_ai_res_17, x="ai_path_sim", y="cosine", 
           text="field", title="2017 AI/image processing: cosine vs. path similarity")

### Trend chart

In [78]:
ai_trend_df = pd.DataFrame({"year": [1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017],
                            "nbrs": [len(df_ai_res_90.cosine),
                                     len(df_ai_res_93.cosine), 
                                     len(df_ai_res_96.cosine),
                                     len(df_ai_res_99.cosine),
                                     len(df_ai_res_02.cosine), 
                                     len(df_ai_res_05.cosine),
                                     len(df_ai_res_08.cosine), 
                                     len(df_ai_res_11.cosine),
                                     len(df_ai_res_14.cosine),
                                     len(df_ai_res_17.cosine)],
                            "density": [
                                        len(df_ai_res_90[df_ai_res_90.ai_path_sim >= 0.5]),
                                        len(df_ai_res_93[df_ai_res_93.ai_path_sim >= 0.5]),
                                        len(df_ai_res_96[df_ai_res_96.ai_path_sim >= 0.5]),
                                        len(df_ai_res_99[df_ai_res_99.ai_path_sim >= 0.5]),
                                        len(df_ai_res_02[df_ai_res_02.ai_path_sim >= 0.5]),
                                        len(df_ai_res_05[df_ai_res_05.ai_path_sim >= 0.5]),
                                        len(df_ai_res_08[df_ai_res_08.ai_path_sim >= 0.5]),
                                        len(df_ai_res_11[df_ai_res_11.ai_path_sim >= 0.5]),
                                        len(df_ai_res_14[df_ai_res_14.ai_path_sim >= 0.5]),
                                        len(df_ai_res_17[df_ai_res_17.ai_path_sim >= 0.5])
                            ],
                            "retention": retention_rate([
                                                         df_ai_res_90,
                                                         df_ai_res_93,
                                                         df_ai_res_96,
                                                         df_ai_res_99,
                                                         df_ai_res_02,
                                                         df_ai_res_05,
                                                         df_ai_res_08,
                                                         df_ai_res_11,
                                                         df_ai_res_14,
                                                         df_ai_res_17
                                                         ])
                            })

ai_trend_df

Unnamed: 0,year,nbrs,density,retention
0,1990,57,10,1.0
1,1993,52,10,0.903846
2,1996,58,11,0.827586
3,1999,49,11,0.918367
4,2002,39,11,0.948718
5,2005,38,9,0.789474
6,2008,36,9,0.833333
7,2011,17,4,0.882353
8,2014,13,3,0.846154
9,2017,12,2,0.666667


In [79]:
px.scatter(ai_trend_df, x="density", y="nbrs", 
           text="year", labels={"nbrs": "Num. neighbours > cosine cutoff",
                                "density": "Num. neighbours < path sim cutoff"},
           size="retention",
           title="AI and imagine processing")

In [80]:
import statsmodels.api as sm
lowess = sm.nonparametric.lowess

z = lowess(ai_trend_df.nbrs, ai_trend_df.density, frac=0.8)
z

array([[ 2.        , 10.23329754],
       [ 3.        , 14.11677445],
       [ 4.        , 18.09209776],
       [ 9.        , 39.86497704],
       [ 9.        , 39.86497704],
       [10.        , 46.68943459],
       [10.        , 46.68943459],
       [11.        , 53.28689522],
       [11.        , 53.28689522],
       [11.        , 53.28689522]])

In [81]:
[item[1] for item in z]

[10.233297535558405,
 14.116774446865103,
 18.09209776436882,
 39.86497703710361,
 39.86497703710361,
 46.68943459135748,
 46.68943459135748,
 53.2868952153387,
 53.2868952153387,
 53.2868952153387]

In [82]:
import plotly.graph_objects as go

fig = go.Figure()
fig.add_trace(go.Scatter(x=ai_trend_df.density, y=ai_trend_df.nbrs,
                    mode='markers'))
fig.add_trace(go.Scatter(x=[item[0] for item in z], y=[item[1] for item in z],
                    mode='lines'))
fig.show()

In [83]:
z2 = lowess(ch_trend_df.nbrs, ch_trend_df.density)
z2

array([[ 8.        , 25.28289363],
       [10.        , 27.41511274],
       [10.        , 27.41511274],
       [11.        , 30.18412926],
       [12.        , 33.63466652],
       [13.        , 37.36377791],
       [15.        , 44.01263828],
       [17.        , 51.49286128],
       [18.        , 55.49533916],
       [20.        , 63.52944993]])

## Linguistics (little or no change)

In [84]:
YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
df_ll_res = pd.DataFrame()
list_of_years = []
list_of_fields = []
list_of_sims = []

for y in YEAR_LIST:
  y_res = compute_pairwise_similarities(all_embeddings[y])
  print("---------------------------------------------")
  for k,v in y_res[0].items():
    if k == "Linguistics":
      for_res = v 
      for f in for_res:
        if for_res[f] >= NHOOD and f != "Linguistics":
          print(y, "sim =", f, "->", for_res[f])
          list_of_years.append(y)
          list_of_fields.append(f)
          list_of_sims.append(for_res[f])
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
df_ll_res["year"] = list_of_years 
df_ll_res["field"] = list_of_fields
df_ll_res["cosine"] = list_of_sims  

---------------------------------------------
1990 sim = Cognitive Sciences -> 0.9972271491234735
1990 sim = Philosophy -> 0.98353257378931
1990 sim = Language Studies -> 0.9764754588016252
1990 sim = History and Philosophy of Specific Fields -> 0.9627038912442895
1990 sim = Applied Ethics -> 0.9598549764430336
1990 sim = Accounting, Auditing and Accountability -> 0.9549783090090461
1990 sim = Social Work -> 0.9537226604126277
1990 sim = Performing Arts and Creative Writing -> 0.951803766493138
1990 sim = Marketing -> 0.9474652317599486
1990 sim = Business and Management -> 0.9452140623879381
1990 sim = Curriculum and Pedagogy -> 0.9404523081972775
1990 sim = Criminology -> 0.9403120370093729
1990 sim = Literary Studies -> 0.9396120189138238
1990 sim = Policy and Administration -> 0.9334940641183768
1990 sim = Specialist Studies In Education -> 0.9326160582163547
1990 sim = Law -> 0.9283332474753994
1990 sim = Journalism and Professional Writing -> 0.9274227498482825
1990 sim = Religio

In [85]:
def y_df(df, y, field):
  my_res = df[df["year"] == y].copy()
  my_res["path_sim"] = my_res.field.apply(lambda x: for_sim(f=field, ff=x, df=df_for_relations))
  return my_res

In [86]:
df_ll_res_90 = y_df(df_ll_res, 1990, "Linguistics")
df_ll_res_93 = y_df(df_ll_res, 1993, "Linguistics")
df_ll_res_96 = y_df(df_ll_res, 1996, "Linguistics")
df_ll_res_99 = y_df(df_ll_res, 1999, "Linguistics")
df_ll_res_02 = y_df(df_ll_res, 2002, "Linguistics")
df_ll_res_05 = y_df(df_ll_res, 2005, "Linguistics")
df_ll_res_08 = y_df(df_ll_res, 2008, "Linguistics")
df_ll_res_11 = y_df(df_ll_res, 2011, "Linguistics")
df_ll_res_14 = y_df(df_ll_res, 2014, "Linguistics")
df_ll_res_17 = y_df(df_ll_res, 2017, "Linguistics")

In [87]:
ll_trend_df = pd.DataFrame({"year": [1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017],
                            "nbrs": [len(df_ll_res_90.cosine),
                                     len(df_ll_res_93.cosine), 
                                     len(df_ll_res_96.cosine),
                                     len(df_ll_res_99.cosine),
                                     len(df_ll_res_02.cosine), 
                                     len(df_ll_res_05.cosine),
                                     len(df_ll_res_08.cosine), 
                                     len(df_ll_res_11.cosine),
                                     len(df_ll_res_14.cosine),
                                     len(df_ll_res_17.cosine)],
                            "density": [
                                        len(df_ll_res_90[df_ll_res_90.path_sim >= 0.5]),
                                        len(df_ll_res_93[df_ll_res_93.path_sim >= 0.5]),
                                        len(df_ll_res_96[df_ll_res_96.path_sim >= 0.5]),
                                        len(df_ll_res_99[df_ll_res_99.path_sim >= 0.5]),
                                        len(df_ll_res_02[df_ll_res_02.path_sim >= 0.5]),
                                        len(df_ll_res_05[df_ll_res_05.path_sim >= 0.5]),
                                        len(df_ll_res_08[df_ll_res_08.path_sim >= 0.5]),
                                        len(df_ll_res_11[df_ll_res_11.path_sim >= 0.5]),
                                        len(df_ll_res_14[df_ll_res_14.path_sim >= 0.5]),
                                        len(df_ll_res_17[df_ll_res_17.path_sim >= 0.5])
                            ],
                            "retention": retention_rate([
                                                         df_ll_res_90,
                                                         df_ll_res_93,
                                                         df_ll_res_96,
                                                         df_ll_res_99,
                                                         df_ll_res_02,
                                                         df_ll_res_05,
                                                         df_ll_res_08,
                                                         df_ll_res_11,
                                                         df_ll_res_14,
                                                         df_ll_res_17
                                                         ])
                            })

ll_trend_df

Unnamed: 0,year,nbrs,density,retention
0,1990,37,14,1.0
1,1993,35,15,0.942857
2,1996,31,13,0.935484
3,1999,24,10,0.958333
4,2002,25,12,0.88
5,2005,25,11,0.8
6,2008,19,11,1.0
7,2011,23,12,0.826087
8,2014,18,10,1.0
9,2017,14,8,0.928571


In [88]:
px.scatter(ll_trend_df, x="density", y="nbrs", 
           text="year", labels={"nbrs": "Num. neighbours > cosine cutoff",
                                "density": "Num. neighbours < path sim cutoff"},
           size="retention",
           title="Linguistics")

## Language studies (little or no change)

In [89]:
YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
df_ls_res = pd.DataFrame()
list_of_years = []
list_of_fields = []
list_of_sims = []

for y in YEAR_LIST:
  y_res = compute_pairwise_similarities(all_embeddings[y])
  print("---------------------------------------------")
  for k,v in y_res[0].items():
    if k == "Language Studies":
      for_res = v 
      for f in for_res:
        if for_res[f] >= NHOOD and f != "Language Studies":
          print(y, "sim =", f, "->", for_res[f])
          list_of_years.append(y)
          list_of_fields.append(f)
          list_of_sims.append(for_res[f])
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
df_ls_res["year"] = list_of_years 
df_ls_res["field"] = list_of_fields
df_ls_res["cosine"] = list_of_sims  

---------------------------------------------
1990 sim = Social Work -> 0.9930720996054557
1990 sim = Philosophy -> 0.9925059273343948
1990 sim = History and Philosophy of Specific Fields -> 0.9919130306048798
1990 sim = Applied Ethics -> 0.9882266664704372
1990 sim = Criminology -> 0.9869458717755557
1990 sim = Cognitive Sciences -> 0.9853375873108132
1990 sim = Policy and Administration -> 0.9845369759283511
1990 sim = Linguistics -> 0.9764754588016252
1990 sim = Religion and Religious Studies -> 0.9724502273463935
1990 sim = Performing Arts and Creative Writing -> 0.9721796092756125
1990 sim = Sociology -> 0.9719459364671973
1990 sim = Accounting, Auditing and Accountability -> 0.9715438434168229
1990 sim = Business and Management -> 0.9714700568318518
1990 sim = Curriculum and Pedagogy -> 0.9693158974836465
1990 sim = Political Science -> 0.9691682801868557
1990 sim = Marketing -> 0.9659157249816545
1990 sim = Demography -> 0.9642562237094964
1990 sim = Specialist Studies In Educat

In [90]:
df_ls_res_90 = y_df(df_ls_res, 1990, "Language Studies")
df_ls_res_93 = y_df(df_ls_res, 1993, "Language Studies")
df_ls_res_96 = y_df(df_ls_res, 1996, "Language Studies")
df_ls_res_99 = y_df(df_ls_res, 1999, "Language Studies")
df_ls_res_02 = y_df(df_ls_res, 2002, "Language Studies")
df_ls_res_05 = y_df(df_ls_res, 2005, "Language Studies")
df_ls_res_08 = y_df(df_ls_res, 2008, "Language Studies")
df_ls_res_11 = y_df(df_ls_res, 2011, "Language Studies")
df_ls_res_14 = y_df(df_ls_res, 2014, "Language Studies")
df_ls_res_17 = y_df(df_ls_res, 2017, "Language Studies")

In [91]:
ls_trend_df = pd.DataFrame({"year": [1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017],
                            "nbrs": [len(df_ls_res_90.cosine),
                                     len(df_ls_res_93.cosine), 
                                     len(df_ls_res_96.cosine),
                                     len(df_ls_res_99.cosine),
                                     len(df_ls_res_02.cosine), 
                                     len(df_ls_res_05.cosine),
                                     len(df_ls_res_08.cosine), 
                                     len(df_ls_res_11.cosine),
                                     len(df_ls_res_14.cosine),
                                     len(df_ls_res_17.cosine)],
                            "density": [
                                        len(df_ls_res_90[df_ls_res_90.path_sim >= 0.5]),
                                        len(df_ls_res_93[df_ls_res_93.path_sim >= 0.5]),
                                        len(df_ls_res_96[df_ls_res_96.path_sim >= 0.5]),
                                        len(df_ls_res_99[df_ls_res_99.path_sim >= 0.5]),
                                        len(df_ls_res_02[df_ls_res_02.path_sim >= 0.5]),
                                        len(df_ls_res_05[df_ls_res_05.path_sim >= 0.5]),
                                        len(df_ls_res_08[df_ls_res_08.path_sim >= 0.5]),
                                        len(df_ls_res_11[df_ls_res_11.path_sim >= 0.5]),
                                        len(df_ls_res_14[df_ls_res_14.path_sim >= 0.5]),
                                        len(df_ls_res_17[df_ls_res_17.path_sim >= 0.5])
                            ],
                            "retention": retention_rate([
                                                         df_ls_res_90,
                                                         df_ls_res_93,
                                                         df_ls_res_96,
                                                         df_ls_res_99,
                                                         df_ls_res_02,
                                                         df_ls_res_05,
                                                         df_ls_res_08,
                                                         df_ls_res_11,
                                                         df_ls_res_14,
                                                         df_ls_res_17
                                                         ])
                            })

ls_trend_df

Unnamed: 0,year,nbrs,density,retention
0,1990,35,13,1.0
1,1993,38,14,0.894737
2,1996,33,13,0.969697
3,1999,36,16,0.888889
4,2002,32,14,0.96875
5,2005,30,13,0.966667
6,2008,27,13,0.925926
7,2011,26,12,0.961538
8,2014,26,11,0.961538
9,2017,24,12,0.958333


In [92]:
px.scatter(ll_trend_df, x="density", y="nbrs", 
           text="year", labels={"nbrs": "Num. neighbours > cosine cutoff",
                                "density": "Num. neighbours < path sim cutoff"},
           size="retention",
           title="Language studies")

In [93]:
#px.scatter(ll_trend_df, x="density_prop", y="nbrs_prop", 
#           text="year", labels={"nbrs": "Num. neighbours > cosine cutoff",
#                                "density": "Num. neighbours < path sim cutoff"},
#           title="Language studies")

# Media and Comm studies

In [94]:
YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
df_mc_res = pd.DataFrame()
list_of_years = []
list_of_fields = []
list_of_sims = []

for y in YEAR_LIST:
  y_res = compute_pairwise_similarities(all_embeddings[y])
  print("---------------------------------------------")
  for k,v in y_res[0].items():
    if k == "Communication and Media Studies":
      for_res = v 
      for f in for_res:
        if for_res[f] >= NHOOD and f != "Communication and Media Studies":
          print(y, "sim =", f, "->", for_res[f])
          list_of_years.append(y)
          list_of_fields.append(f)
          list_of_sims.append(for_res[f])
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
df_mc_res["year"] = list_of_years 
df_mc_res["field"] = list_of_fields
df_mc_res["cosine"] = list_of_sims  

---------------------------------------------
1990 sim = Distributed Computing -> 0.99906081121405
1990 sim = Classical Physics -> 0.9977761263520997
1990 sim = Other Engineering -> 0.9972144803574561
1990 sim = Automotive Engineering -> 0.9966755933978068
1990 sim = Other Agricultural and Veterinary Sciences -> 0.9932069335145391
1990 sim = Other Law and Legal Studies -> 0.9912271558131807
1990 sim = Visual Arts and Crafts -> 0.9897168245434664
1990 sim = Statistics -> 0.9864330343011303
1990 sim = Other Psychology and Cognitive Sciences -> 0.9855471420435817
1990 sim = Agriculture, Land and Farm Management -> 0.9841257383917132
1990 sim = Urban and Regional Planning -> 0.9799850799142695
1990 sim = Curatorial and Related Studies -> 0.9792413325068877
1990 sim = Architecture -> 0.9776136151901317
1990 sim = Transportation and Freight Services -> 0.9764174405252067
1990 sim = Building -> 0.9745239031562866
1990 sim = Education Systems -> 0.968572896882261
1990 sim = Computer Hardware -

In [95]:
df_mc_res_90 = y_df(df_mc_res, 1990, "Communication and Media Studies")
df_mc_res_93 = y_df(df_mc_res, 1993, "Communication and Media Studies")
df_mc_res_96 = y_df(df_mc_res, 1996, "Communication and Media Studies")
df_mc_res_99 = y_df(df_mc_res, 1999, "Communication and Media Studies")
df_mc_res_02 = y_df(df_mc_res, 2002, "Communication and Media Studies")
df_mc_res_05 = y_df(df_mc_res, 2005, "Communication and Media Studies")
df_mc_res_08 = y_df(df_mc_res, 2008, "Communication and Media Studies")
df_mc_res_11 = y_df(df_mc_res, 2011, "Communication and Media Studies")
df_mc_res_14 = y_df(df_mc_res, 2014, "Communication and Media Studies")
df_mc_res_17 = y_df(df_mc_res, 2017, "Communication and Media Studies")

In [96]:
mc_trend_df = pd.DataFrame({"year": [1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017],
                            "nbrs": [len(df_mc_res_90.cosine),
                                     len(df_mc_res_93.cosine), 
                                     len(df_mc_res_96.cosine),
                                     len(df_mc_res_99.cosine),
                                     len(df_mc_res_02.cosine), 
                                     len(df_mc_res_05.cosine),
                                     len(df_mc_res_08.cosine), 
                                     len(df_mc_res_11.cosine),
                                     len(df_mc_res_14.cosine),
                                     len(df_mc_res_17.cosine)],
                            "density": [
                                        len(df_mc_res_90[df_mc_res_90.path_sim >= 0.5]),
                                        len(df_mc_res_93[df_mc_res_93.path_sim >= 0.5]),
                                        len(df_mc_res_96[df_mc_res_96.path_sim >= 0.5]),
                                        len(df_mc_res_99[df_mc_res_99.path_sim >= 0.5]),
                                        len(df_mc_res_02[df_mc_res_02.path_sim >= 0.5]),
                                        len(df_mc_res_05[df_mc_res_05.path_sim >= 0.5]),
                                        len(df_mc_res_08[df_mc_res_08.path_sim >= 0.5]),
                                        len(df_mc_res_11[df_mc_res_11.path_sim >= 0.5]),
                                        len(df_mc_res_14[df_mc_res_14.path_sim >= 0.5]),
                                        len(df_mc_res_17[df_mc_res_17.path_sim >= 0.5])
                            ],
                            "retention": retention_rate([
                                                         df_mc_res_90,
                                                         df_mc_res_93,
                                                         df_mc_res_96,
                                                         df_mc_res_99,
                                                         df_mc_res_02,
                                                         df_mc_res_05,
                                                         df_mc_res_08,
                                                         df_mc_res_11,
                                                         df_mc_res_14,
                                                         df_mc_res_17
                                                         ])
                            })

mc_trend_df

Unnamed: 0,year,nbrs,density,retention
0,1990,79,13,1.0
1,1993,77,14,0.883117
2,1996,60,10,0.833333
3,1999,42,14,0.714286
4,2002,38,10,0.736842
5,2005,47,17,0.595745
6,2008,46,17,0.869565
7,2011,39,19,0.769231
8,2014,37,16,0.810811
9,2017,37,17,0.918919


In [97]:
#mc_trend_df["nbrs_prop"] = mc_trend_df.nbrs/df_for_relations.f2.nunique()
#mc_trend_df["density_prop"] = mc_trend_df.density/df_for_relations.f2.nunique()

In [98]:
px.scatter(mc_trend_df, x="density", y="nbrs", 
           text="year", labels={"nbrs": "Num. neighbours > cosine cutoff",
                                "density": "Num. neighbours < path sim cutoff"},
           size="retention",
           title="Communication and Media studies")

In [99]:
#px.scatter(mc_trend_df, x="density_prop", y="nbrs_prop", 
#           text="year", labels={"nbrs": "Num. neighbours > cosine cutoff",
 #                               "density": "Num. neighbours < path sim cutoff"},
  #         title="Communication and Media studies")

In [100]:
#mc_x = pd.crosstab(df_mc_res.year, df_mc_res.field, margins=True)
#mc_variable_neighbours = mc_x.loc["All"][mc_x.loc["All"] < 10].to_frame().index.to_list()

In [101]:
px.scatter(df_mc_res_90, x="path_sim", y="cosine", 
           text="field", title="1990 Comms & media: cosine vs. path similarity")

In [102]:
px.scatter(df_mc_res_17, x="path_sim", y="cosine", 
           text="field", title="2017 Comms & media: cosine vs. path similarity")

# Cognitive Science

In [103]:
YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
df_co_res = pd.DataFrame()
list_of_years = []
list_of_fields = []
list_of_sims = []

for y in YEAR_LIST:
  y_res = compute_pairwise_similarities(all_embeddings[y])
  print("---------------------------------------------")
  for k,v in y_res[0].items():
    if k == "Cognitive Sciences":
      for_res = v 
      for f in for_res:
        if for_res[f] >= NHOOD and f != "Cognitive Sciences":
          print(y, "sim =", f, "->", for_res[f])
          list_of_years.append(y)
          list_of_fields.append(f)
          list_of_sims.append(for_res[f])
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
df_co_res["year"] = list_of_years 
df_co_res["field"] = list_of_fields
df_co_res["cosine"] = list_of_sims  

---------------------------------------------
1990 sim = Linguistics -> 0.9972271491234735
1990 sim = Philosophy -> 0.9895183421521271
1990 sim = Language Studies -> 0.9853375873108132
1990 sim = History and Philosophy of Specific Fields -> 0.9738736793855293
1990 sim = Applied Ethics -> 0.9671663014756668
1990 sim = Social Work -> 0.9645456598023142
1990 sim = Accounting, Auditing and Accountability -> 0.9643522062764096
1990 sim = Business and Management -> 0.9571422994941693
1990 sim = Marketing -> 0.9570694887858425
1990 sim = Performing Arts and Creative Writing -> 0.9557956658741058
1990 sim = Criminology -> 0.9525003824033426
1990 sim = Policy and Administration -> 0.9492096504010638
1990 sim = Curriculum and Pedagogy -> 0.9468991637722257
1990 sim = Specialist Studies In Education -> 0.9356016271788623
1990 sim = Political Science -> 0.935589647994605
1990 sim = Literary Studies -> 0.9353829435936258
1990 sim = Law -> 0.9329619729987811
1990 sim = Religion and Religious Studies

In [104]:
df_co_res_90 = y_df(df_co_res, 1990, "Cognitive Sciences")
df_co_res_93 = y_df(df_co_res, 1993, "Cognitive Sciences")
df_co_res_96 = y_df(df_co_res, 1996, "Cognitive Sciences")
df_co_res_99 = y_df(df_co_res, 1999, "Cognitive Sciences")
df_co_res_02 = y_df(df_co_res, 2002, "Cognitive Sciences")
df_co_res_05 = y_df(df_co_res, 2005, "Cognitive Sciences")
df_co_res_08 = y_df(df_co_res, 2008, "Cognitive Sciences")
df_co_res_11 = y_df(df_co_res, 2011, "Cognitive Sciences")
df_co_res_14 = y_df(df_co_res, 2014, "Cognitive Sciences")
df_co_res_17 = y_df(df_co_res, 2017, "Cognitive Sciences")

In [105]:
co_trend_df = pd.DataFrame({"year": [1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017],
                            "nbrs": [len(df_co_res_90.cosine),
                                     len(df_co_res_93.cosine), 
                                     len(df_co_res_96.cosine),
                                     len(df_co_res_99.cosine),
                                     len(df_co_res_02.cosine), 
                                     len(df_co_res_05.cosine),
                                     len(df_co_res_08.cosine), 
                                     len(df_co_res_11.cosine),
                                     len(df_co_res_14.cosine),
                                     len(df_co_res_17.cosine)],
                            "density": [
                                        len(df_co_res_90[df_co_res_90.path_sim >= 0.5]),
                                        len(df_co_res_93[df_co_res_93.path_sim >= 0.5]),
                                        len(df_co_res_96[df_co_res_96.path_sim >= 0.5]),
                                        len(df_co_res_99[df_co_res_99.path_sim >= 0.5]),
                                        len(df_co_res_02[df_co_res_02.path_sim >= 0.5]),
                                        len(df_co_res_05[df_co_res_05.path_sim >= 0.5]),
                                        len(df_co_res_08[df_co_res_08.path_sim >= 0.5]),
                                        len(df_co_res_11[df_co_res_11.path_sim >= 0.5]),
                                        len(df_co_res_14[df_co_res_14.path_sim >= 0.5]),
                                        len(df_co_res_17[df_co_res_17.path_sim >= 0.5])
                            ],
                            "retention": retention_rate([
                                                         df_co_res_90,
                                                         df_co_res_93,
                                                         df_co_res_96,
                                                         df_co_res_99,
                                                         df_co_res_02,
                                                         df_co_res_05,
                                                         df_co_res_08,
                                                         df_co_res_11,
                                                         df_co_res_14,
                                                         df_co_res_17
                                                         ])
                            })

co_trend_df

Unnamed: 0,year,nbrs,density,retention
0,1990,35,1,1.0
1,1993,30,1,0.9
2,1996,22,2,0.909091
3,1999,17,1,0.823529
4,2002,15,1,0.866667
5,2005,19,2,0.684211
6,2008,7,0,1.0
7,2011,12,0,0.583333
8,2014,10,0,0.9
9,2017,5,0,1.0


In [106]:
px.scatter(co_trend_df, x="density", y="nbrs", 
           text="year", labels={"nbrs": "Num. neighbours > cosine cutoff",
                                "density": "Num. neighbours < path sim cutoff"},
           size="retention",
           title="Cognitive Sciences")

In [107]:
df_for_relations[df_for_relations.f2=="Cognitive Sciences"]

Unnamed: 0,f2,f1,for_cluster,for_area
128,Cognitive Sciences,Psychology and Cognitive Sciences,MHS,STEM


In [108]:
df_for_relations[df_for_relations.f1=="Psychology and Cognitive Sciences"]

Unnamed: 0,f2,f1,for_cluster,for_area
128,Cognitive Sciences,Psychology and Cognitive Sciences,MHS,STEM
129,Other Psychology and Cognitive Sciences,Psychology and Cognitive Sciences,MHS,STEM
130,Psychology,Psychology and Cognitive Sciences,MHS,STEM


# All

In [109]:
df_trend_all = pd.concat([co_trend_df, mc_trend_df,  ch_trend_df]) # ai_trend_df, , ll_trend_df
fields = [ "CogSci" for x in range(0, 10)] + [ "Media" for x in range(0, 10)]  + [ "CHardw" for x in range(0, 10) ]  # + [ "AI" for x in range(0, 10)] + ["Ling" for x in range(0, 10)]
df_trend_all["field"] = fields
df_trend_all["fy"] = df_trend_all["field"] + "_" + df_trend_all["year"].astype(str)
df_trend_all

Unnamed: 0,year,nbrs,density,retention,field,fy
0,1990,35,1,1.0,CogSci,CogSci_1990
1,1993,30,1,0.9,CogSci,CogSci_1993
2,1996,22,2,0.909091,CogSci,CogSci_1996
3,1999,17,1,0.823529,CogSci,CogSci_1999
4,2002,15,1,0.866667,CogSci,CogSci_2002
5,2005,19,2,0.684211,CogSci,CogSci_2005
6,2008,7,0,1.0,CogSci,CogSci_2008
7,2011,12,0,0.583333,CogSci,CogSci_2011
8,2014,10,0,0.9,CogSci,CogSci_2014
9,2017,5,0,1.0,CogSci,CogSci_2017


In [110]:
px.scatter(df_trend_all, x="density", y="nbrs", text = "fy", color="field",
           trendline="ols",
           labels={"nbrs": "Num. similar neighbours > cosine cutoff",
                                "density": "Num. similar & related neighbours"},
           size="retention")

In [111]:
px.scatter(df_trend_all, x="density", y="retention", text = "fy", color="field",
           labels={"nbrs": "Num. similar neighbours > cosine cutoff",
                                "density": "Num. similar & related neighbours"},
           size="nbrs"
          )

In [112]:
px.scatter(df_trend_all, x="nbrs", y="retention", text = "fy", color="field",
           labels={"nbrs": "Num. similar neighbours > cosine cutoff",
                                "density": "Num. similar & related neighbours"},
           size="density"
          )

In [113]:
px.line(df_trend_all[df_trend_all.field.isin(["Media", "CHardw"])], x="nbrs", y="retention", text = "fy", color="field",
           labels={"nbrs": "Num. similar neighbours > cosine cutoff",
                                "density": "Num. similar & related neighbours",
                   "retention": "Neigbhour stability"}
          )

In [114]:
px.line(df_trend_all[df_trend_all.field.isin(["Media", "CHardw"])], x="density", y="retention", text = "fy", color="field",
           labels={"nbrs": "Num. similar neighbours > cosine cutoff",
                                "density": "Num. similar & related neighbours"}
          )

In [115]:
px.scatter_3d(df_trend_all, x='density', y='nbrs', z='retention',
              color='field', text="fy", labels={"nbrs": "Num. similar neighbours > cosine cutoff",
                                "density": "Num. similar & related neighbours"})

# Correlation of path similarity and cosine

In [116]:
def cp_corr(df_list, m="kendall"):
  assert m.lower() in ("kendall", "pearson", "spearman")
  corr_list = []
  for d in df_list:
    if "ai_path_sim" in d.columns:
      if d['ai_path_sim'].sum() == 0:
        my_corr = 0.0
      else:
        my_corr = d['cosine'].corr(d['ai_path_sim'], method=m.lower())
    elif "ch_path_sim" in d.columns:
      if d['ch_path_sim'].sum() == 0:
        my_corr = 0.0
      else:
        my_corr = d['cosine'].corr(d['ch_path_sim'], method=m.lower())
    else:
      if d['path_sim'].sum() == 0:
        my_corr = 0.0
      else:
        my_corr = d['cosine'].corr(d['path_sim'], method=m.lower())
    corr_list.append(my_corr)
  return corr_list 
      

In [117]:
df_co_res_90

Unnamed: 0,year,field,cosine,path_sim
0,1990,Linguistics,0.997227,0.0
1,1990,Philosophy,0.989518,0.0
2,1990,Language Studies,0.985338,0.0
3,1990,History and Philosophy of Specific Fields,0.973874,0.0
4,1990,Applied Ethics,0.967166,0.0
5,1990,Social Work,0.964546,0.0
6,1990,"Accounting, Auditing and Accountability",0.964352,0.0
7,1990,Business and Management,0.957142,0.0
8,1990,Marketing,0.957069,0.0
9,1990,Performing Arts and Creative Writing,0.955796,0.0


In [118]:
co_c = cp_corr([
                                                         df_co_res_90,
                                                         df_co_res_93,
                                                         df_co_res_96,
                                                         df_co_res_99,
                                                         df_co_res_02,
                                                         df_co_res_05,
                                                         df_co_res_08,
                                                         df_co_res_11,
                                                         df_co_res_14,
                                                         df_co_res_17
                                                         ], m="spearman")
co_c 

[-0.15802573898701655,
 0.17075571100143747,
 -0.218297021102055,
 0.00437587526258753,
 -0.13884677041968485,
 -0.38251496234127946,
 0.0,
 0.0,
 0.0,
 0.0]

In [119]:
sum(co_c)/len(co_c)

-0.07225529065860109

In [120]:
df_co_res_93


Unnamed: 0,year,field,cosine,path_sim
35,1993,Linguistics,0.989723,0.0
36,1993,Literary Studies,0.948437,0.0
37,1993,Philosophy,0.946049,0.0
38,1993,Performing Arts and Creative Writing,0.93595,0.0
39,1993,Language Studies,0.932489,0.0
40,1993,Specialist Studies In Education,0.930748,0.0
41,1993,History and Philosophy of Specific Fields,0.926956,0.0
42,1993,Curriculum and Pedagogy,0.926401,0.0
43,1993,Psychology,0.922949,0.75
44,1993,Marketing,0.902857,0.0


In [121]:
mc_c = cp_corr([
                                                         df_mc_res_90,
                                                         df_mc_res_93,
                                                         df_mc_res_96,
                                                         df_mc_res_99,
                                                         df_mc_res_02,
                                                         df_mc_res_05,
                                                         df_mc_res_08,
                                                         df_mc_res_11,
                                                         df_mc_res_14,
                                                         df_mc_res_17
                                                         ], m="spearman")
mc_c

[0.28924517737692446,
 0.15305409804903972,
 0.3107745924098556,
 -0.0017851429222197437,
 0.42354472412898936,
 0.22452819889108347,
 0.2606742255365508,
 -0.015664445343137823,
 0.08134829179565416,
 0.04644379561183819]

In [122]:
sum(mc_c)/len(mc_c)

0.1772163515534578

In [123]:
ch_c = cp_corr([
                                                         df_ch_res_90,
                                                         df_ch_res_93,
                                                         df_ch_res_96,
                                                         df_ch_res_99,
                                                         df_ch_res_02,
                                                         df_ch_res_05,
                                                         df_ch_res_08,
                                                         df_ch_res_11,
                                                         df_ch_res_14,
                                                         df_ch_res_17
                                                         ], m="spearman")
ch_c

[0.10903232180474033,
 -0.05879789746393065,
 -0.04595327176763223,
 0.19947857777411182,
 0.15198063018869584,
 0.2416617049294942,
 0.15641536524154662,
 0.25687830911047127,
 0.175558854157954,
 0.2989770156361245]

In [124]:
sum(ch_c)/len(ch_c)

0.14852316096115756

In [125]:
ll_c = cp_corr([
                                                         df_ll_res_90,
                                                         df_ll_res_93,
                                                         df_ll_res_96,
                                                         df_ll_res_99,
                                                         df_ll_res_02,
                                                         df_ll_res_05,
                                                         df_ll_res_08,
                                                         df_ll_res_11,
                                                         df_ll_res_14,
                                                         df_ll_res_17
                                                         ], m="spearman")
ll_c

[0.08165654745017129,
 0.09631841256711549,
 0.15913311791438803,
 0.25053516779513624,
 0.13361747545217603,
 0.3088949375335999,
 -0.09375513405213959,
 0.19734731539313508,
 -0.007632666536364692,
 0.33993096369499426]

In [126]:
sum(ll_c)/len(ll_c)

0.14660461372122122

In [127]:
ls_c = cp_corr([
                                                         df_ls_res_90,
                                                         df_ls_res_93,
                                                         df_ls_res_96,
                                                         df_ls_res_99,
                                                         df_ls_res_02,
                                                         df_ls_res_05,
                                                         df_ls_res_08,
                                                         df_ls_res_11,
                                                         df_ls_res_14,
                                                         df_ls_res_17
                                                         ], m="spearman")
ls_c

[0.14015119836122505,
 0.3753455980791538,
 0.3783043194044756,
 0.24537326073814944,
 0.3294196678330847,
 0.35887059605800864,
 0.15866641313365132,
 0.42252105116558614,
 0.3673066489756596,
 0.32331399749802264]

In [128]:
sum(ls_c)/len(ls_c)

0.3099272751247017

In [129]:
df_trend_all["corr"] = co_c + mc_c + ch_c 

px.scatter(df_trend_all, x="nbrs", y="corr", color="field",
           text="fy", size="retention")


In [130]:
df_trend_all2 = pd.concat([co_trend_df, mc_trend_df,  ch_trend_df, ll_trend_df, ls_trend_df]) # ai_trend_df, , ll_trend_df
fields2 = [ "CogSci" for x in range(0, 10)] + [ "Media" for x in range(0, 10)]  + [ "CHardw" for x in range(0, 10) ] + ["Ling" for x in range(0, 10)] + ["LSt" for x in range(0, 10)]  # + [ "AI" for x in range(0, 10)] + ["Ling" for x in range(0, 10)]
df_trend_all2["field"] = fields2
df_trend_all2["fy"] = df_trend_all2["field"] + "_" + df_trend_all2["year"].astype(str)
df_trend_all2["corr"] = co_c + mc_c + ch_c + ll_c + ls_c
df_trend_all2

Unnamed: 0,year,nbrs,density,retention,field,fy,corr
0,1990,35,1,1.0,CogSci,CogSci_1990,-0.158026
1,1993,30,1,0.9,CogSci,CogSci_1993,0.170756
2,1996,22,2,0.909091,CogSci,CogSci_1996,-0.218297
3,1999,17,1,0.823529,CogSci,CogSci_1999,0.004376
4,2002,15,1,0.866667,CogSci,CogSci_2002,-0.138847
5,2005,19,2,0.684211,CogSci,CogSci_2005,-0.382515
6,2008,7,0,1.0,CogSci,CogSci_2008,0.0
7,2011,12,0,0.583333,CogSci,CogSci_2011,0.0
8,2014,10,0,0.9,CogSci,CogSci_2014,0.0
9,2017,5,0,1.0,CogSci,CogSci_2017,0.0


In [131]:
px.scatter(df_trend_all2, x="nbrs", y="corr", color="field",
           text="fy", size="density")

## Look at average number of neighbours

In [132]:
YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)


mean_nns = {}

for y in YEAR_LIST:
  #already_processed = []
  nns_list = []
  y_res = compute_pairwise_similarities(all_embeddings[y])
  print("--------------------{}-----------------------".format(y))
  for k,v in y_res[0].items():
    nns = 0
    #print (k, v)
    #if k in already_processed:
    #  continue
    #else:
    for_res = v 
   # print(v)
    for f in for_res:
      # print(f)
      # already_processed.append(f)
      if for_res[f] >= 0.9 and f != k:
        # print(y, "sim =", f, "->", for_res[f])
        nns += 1
      nns_list.append(nns)
  mean_nns[y] = (sum(nns_list)/len(nns_list))

mean_nns

--------------------1990-----------------------
--------------------1993-----------------------
--------------------1996-----------------------
--------------------1999-----------------------
--------------------2002-----------------------
--------------------2005-----------------------
--------------------2008-----------------------
--------------------2011-----------------------
--------------------2014-----------------------
--------------------2017-----------------------


{1990: 22.730308758664147,
 1993: 19.627454870065463,
 1996: 18.154619988934158,
 1999: 17.236659392977582,
 2002: 14.993200231481481,
 2005: 14.245410190634274,
 2008: 11.75410225921522,
 2011: 10.101426158753988,
 2014: 9.241087583744926,
 2017: 8.770592206790123}

In [133]:
nbrs_diff_list = []
for i in range(0, len(df_trend_all2)):
  nbr_diff = (df_trend_all2.iloc[i].nbrs - mean_nns[df_trend_all2.iloc[i].year])/mean_nns[df_trend_all2.iloc[i].year]
  nbrs_diff_list.append(nbr_diff)

df_trend_all2["nbr_diff"] = nbrs_diff_list
df_trend_all2

Unnamed: 0,year,nbrs,density,retention,field,fy,corr,nbr_diff
0,1990,35,1,1.0,CogSci,CogSci_1990,-0.158026,0.539794
1,1993,30,1,0.9,CogSci,CogSci_1993,0.170756,0.528471
2,1996,22,2,0.909091,CogSci,CogSci_1996,-0.218297,0.211813
3,1999,17,1,0.823529,CogSci,CogSci_1999,0.004376,-0.01373
4,2002,15,1,0.866667,CogSci,CogSci_2002,-0.138847,0.000454
5,2005,19,2,0.684211,CogSci,CogSci_2005,-0.382515,0.333763
6,2008,7,0,1.0,CogSci,CogSci_2008,0.0,-0.404463
7,2011,12,0,0.583333,CogSci,CogSci_2011,0.0,0.187951
8,2014,10,0,0.9,CogSci,CogSci_2014,0.0,0.082124
9,2017,5,0,1.0,CogSci,CogSci_2017,0.0,-0.429913


In [134]:
# [(df_trend_all2.field != "Ling") ] & (df_trend_all2.field != "LSt")
px.scatter(df_trend_all2, x="nbr_diff", y="corr", color="field",
           text="fy", size="retention",
           labels={"corr": "Neighbour cosine / path similarity correlation",
                   "nbr_diff": "Neighbourhood relative deviation from average"})

## Look at neighbourhood density = N neighbours / cosine range

### Linguistics

In [135]:
#YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
#df_ll_res = pd.DataFrame()
#list_of_years = []
#list_of_fields = []
list_of_sims = []
ll_density_dict = {}
#for y in YEAR_LIST:
y = 2017
y_res = compute_pairwise_similarities(all_embeddings[y])
print("-----------------{}-------------------------".format(y))
for k,v in y_res[0].items():
  if k == "Linguistics":
    for_res = v 
    for f in for_res:
      if f != "Linguistics":
        list_of_sims.append(for_res[f])
        ll_density_dict[f] = for_res[f]
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
nnbrs = 0
max_cos = 0
for k,v in ll_density_dict.items():
  if v >= 0.9:
    nnbrs += 1
    if v > max_cos:
      max_cos = v 

my_range = max_cos - 0.9
print("Range:", my_range)
print("N neighbours:", nnbrs)
print("Density:", nnbrs/(1-my_range))
print("Density 2:", nnbrs/(math.pi *(1-my_range)**2) )
# pd.Series(list_of_sims).quantile(q=[.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0])

-----------------2017-------------------------
Range: 0.04419420394686879
N neighbours: 3
Density: 3.1387129188670837
Density 2: 1.04527860794918


### Computer Hardware

In [136]:
#YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
#df_ll_res = pd.DataFrame()
#list_of_years = []
#list_of_fields = []
list_of_sims = []
ll_density_dict = {}
#for y in YEAR_LIST:
y = 2017
y_res = compute_pairwise_similarities(all_embeddings[y])
print("-----------------{}-------------------------".format(y))
for k,v in y_res[0].items():
  if k == "Computer Hardware":
    for_res = v 
    for f in for_res:
      if f != "Computer Hardware":
        list_of_sims.append(for_res[f])
        ll_density_dict[f] = for_res[f]
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
nnbrs = 0
max_cos = 0
for k,v in ll_density_dict.items():
  if v >= 0.9:
    nnbrs += 1
    if v > max_cos:
      max_cos = v 

my_range = max_cos - 0.9
print("Range:", my_range)
print("N neighbours:", nnbrs)
print("Density:", nnbrs/(1-my_range))
print("Density 2:", nnbrs/(math.pi *(1-my_range)**2) )
# pd.Series(list_of_sims).quantile(q=[.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0])

-----------------2017-------------------------
Range: 0.08227498240181241
N neighbours: 16
Density: 17.434416293754524
Density 2: 6.047069612060654


### AI

In [137]:
#YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
#df_ll_res = pd.DataFrame()
#list_of_years = []
#list_of_fields = []
list_of_sims = []
ll_density_dict = {}
#for y in YEAR_LIST:
y = 2017
y_res = compute_pairwise_similarities(all_embeddings[y])
print("-----------------{}-------------------------".format(y))
for k,v in y_res[0].items():
  if k == "AI and Image Processing":
    for_res = v 
    for f in for_res:
      if f != "AI and Image Processing":
        list_of_sims.append(for_res[f])
        ll_density_dict[f] = for_res[f]
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
nnbrs = 0
max_cos = 0
for k,v in ll_density_dict.items():
  if v >= 0.9:
    nnbrs += 1
    if v > max_cos:
      max_cos = v 

my_range = max_cos - 0.9
print("Range:", my_range)
print("N neighbours:", nnbrs)
print("Density 2:", nnbrs/(math.pi *(1-my_range)**2) )
# pd.Series(list_of_sims).quantile(q=[.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0])

-----------------2017-------------------------
Range: -0.9
N neighbours: 0
Density 2: 0.0


### Communication and media studies

In [138]:
#YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
#df_ll_res = pd.DataFrame()
#list_of_years = []
#list_of_fields = []
list_of_sims = []
ll_density_dict = {}
#for y in YEAR_LIST:
y = 2017
y_res = compute_pairwise_similarities(all_embeddings[y])
print("-----------------{}-------------------------".format(y))
for k,v in y_res[0].items():
  if k == "Communication and Media Studies":
    for_res = v 
    for f in for_res:
      if f != "Communication and Media Studies":
        list_of_sims.append(for_res[f])
        ll_density_dict[f] = for_res[f]
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
nnbrs = 0
max_cos = 0
for k,v in ll_density_dict.items():
  if v >= 0.9:
    nnbrs += 1
    if v > max_cos:
      max_cos = v 

my_range = max_cos - 0.9
print("Range:", my_range)
print("N neighbours:", nnbrs)
print("Density 2:", nnbrs/(math.pi *(1-my_range)**2) )
# pd.Series(list_of_sims).quantile(q=[.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0])

-----------------2017-------------------------
Range: 0.08298957117426498
N neighbours: 15
Density 2: 5.67796664328004


### All fields, 2017

In [139]:
#YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
#df_ll_res = pd.DataFrame()
#list_of_years = []
#list_of_fields = []
list_of_sims = []
ll_density_dict = {}
#for y in YEAR_LIST:
y = 2017
y_res = compute_pairwise_similarities(all_embeddings[y])
print("-----------------{}-------------------------".format(y))

item_list = []
nnbrs_list = []
density_list = []

for k,v in y_res[0].items():
  item_list.append(k)
  nnbrs = 0
  #max_cos = 0
  cos_list = []
  for_res = v 
  for f in for_res:
    if f != k:
      if for_res[f] >= 0.9:
        nnbrs += 1
        cos_list.append((1-for_res[f]))
        #if for_res[f] > max_cos:
        #  max_cos = for_res[f]
  #my_range = max_cos - 0.9
  #my_density = nnbrs/(1-my_range)
  if len(cos_list) > 0:
    print("Mean distance:", sum(cos_list)/len(cos_list))
    my_density = nnbrs/( sum(cos_list)/len(cos_list) )
  else:
    my_density = 0.0
  nnbrs_list.append(nnbrs)
  density_list.append(my_density)


df_nbrs_density_17 = pd.DataFrame({"field": item_list, "nnbrs": nnbrs_list, "density": density_list})
df_nbrs_density_17.head()

-----------------2017-------------------------
Mean distance: 0.094360998776432
Mean distance: 0.09303943623567701
Mean distance: 0.0966128544074416
Mean distance: 0.09818323717737287
Mean distance: 0.09026133306101658
Mean distance: 0.09000324967355981
Mean distance: 0.0761885774736088
Mean distance: 0.08362220262484217
Mean distance: 0.06177828782624528
Mean distance: 0.08968689652829014
Mean distance: 0.06628516875089698
Mean distance: 0.09169815729029003
Mean distance: 0.05609702675405562
Mean distance: 0.07755154230315463
Mean distance: 0.07683051671305391
Mean distance: 0.08750930179193539
Mean distance: 0.07319482205646047
Mean distance: 0.07617767914821762
Mean distance: 0.08494636073424032
Mean distance: 0.07811567085522506
Mean distance: 0.06838916364035673
Mean distance: 0.07441121613488225
Mean distance: 0.0758572865361698
Mean distance: 0.07538389230062702
Mean distance: 0.0749242405945593
Mean distance: 0.06617627016830181
Mean distance: 0.07309680566542939
Mean distance:

Unnamed: 0,field,nnbrs,density
0,Public Health and Health Services,0,0.0
1,Psychology,0,0.0
2,Clinical Sciences,0,0.0
3,Artificial Intelligence and Image Processing,0,0.0
4,Genetics,0,0.0


In [140]:
px.scatter(df_nbrs_density_17, x="nnbrs", y="density")

In [141]:
df_nbrs_density_17.nnbrs.mean()

9.194444444444445

In [142]:
df_nbrs_density_17["nnbrs_mean_diff"] = df_nbrs_density_17.nnbrs - df_nbrs_density_17.nnbrs.mean()
df_nbrs_density_17

Unnamed: 0,field,nnbrs,density,nnbrs_mean_diff
0,Public Health and Health Services,0,0.000000,-9.194444
1,Psychology,0,0.000000,-9.194444
2,Clinical Sciences,0,0.000000,-9.194444
3,Artificial Intelligence and Image Processing,0,0.000000,-9.194444
4,Genetics,0,0.000000,-9.194444
...,...,...,...,...
139,Other Law and Legal Studies,9,215.218407,-0.194444
140,Art Theory and Criticism,8,236.095046,-1.194444
141,Commercial Services,10,262.305826,0.805556
142,Library and Information Studies,9,231.997979,-0.194444


In [143]:
df_nbrs_density_17["density_mean_diff"] = df_nbrs_density_17.density - df_nbrs_density_17.density.mean()
df_nbrs_density_17

Unnamed: 0,field,nnbrs,density,nnbrs_mean_diff,density_mean_diff
0,Public Health and Health Services,0,0.000000,-9.194444,-142.993466
1,Psychology,0,0.000000,-9.194444,-142.993466
2,Clinical Sciences,0,0.000000,-9.194444,-142.993466
3,Artificial Intelligence and Image Processing,0,0.000000,-9.194444,-142.993466
4,Genetics,0,0.000000,-9.194444,-142.993466
...,...,...,...,...,...
139,Other Law and Legal Studies,9,215.218407,-0.194444,72.224941
140,Art Theory and Criticism,8,236.095046,-1.194444,93.101580
141,Commercial Services,10,262.305826,0.805556,119.312360
142,Library and Information Studies,9,231.997979,-0.194444,89.004514


In [144]:
px.scatter(df_nbrs_density_17, x="nnbrs_mean_diff", y="density_mean_diff", hover_name="field",
           title="2017", labels={"density_mean_diff": "Difference from average density",
                                 "nnbrs_mean_diff": "Difference from average neighbourhood size"})

In [145]:
px.scatter(df_nbrs_density_17, x="nnbrs_mean_diff", y="density_mean_diff", text="field")

### All fields 1990

In [146]:
#YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
#df_ll_res = pd.DataFrame()
#list_of_years = []
#list_of_fields = []
list_of_sims = []
ll_density_dict = {}
#for y in YEAR_LIST:
y = 1990
y_res = compute_pairwise_similarities(all_embeddings[y])
print("-----------------{}-------------------------".format(y))

item_list = []
nnbrs_list = []
density_list = []

for k,v in y_res[0].items():
  item_list.append(k)
  nnbrs = 0
  #max_cos = 0
  cos_list = []
  for_res = v 
  for f in for_res:
    if f != k:
      if for_res[f] >= 0.9:
        nnbrs += 1
        cos_list.append((1-for_res[f]))
        #if for_res[f] > max_cos:
        #  max_cos = for_res[f]
  #my_range = max_cos - 0.9
  #my_density = nnbrs/(1-my_range)
  if len(cos_list) > 0:
    # print("Mean distance:", sum(cos_list)/len(cos_list))
    my_density = nnbrs/( sum(cos_list)/len(cos_list) )
  else:
    my_density = 0.0
  nnbrs_list.append(nnbrs)
  density_list.append(my_density)


df_nbrs_density_90 = pd.DataFrame({"field": item_list, "nnbrs": nnbrs_list, "density": density_list})
df_nbrs_density_90["density_mean_diff"] = df_nbrs_density_90.density - df_nbrs_density_90.density.mean()
df_nbrs_density_90["nnbrs_mean_diff"] = df_nbrs_density_90.nnbrs - df_nbrs_density_90.nnbrs.mean()
df_nbrs_density_90.head()

-----------------1990-------------------------


Unnamed: 0,field,nnbrs,density,density_mean_diff,nnbrs_mean_diff
0,Clinical Sciences,23,384.066662,-154.320609,-2.289855
1,Psychology,23,347.645548,-190.741724,-2.289855
2,Public Health and Health Services,14,184.915936,-353.471336,-11.289855
3,Biochemistry and Cell Biology,36,603.45939,65.072119,10.710145
4,Neurosciences,20,483.202282,-55.184989,-5.289855


In [147]:
fig = px.scatter(df_nbrs_density_90, x="nnbrs_mean_diff", y="density_mean_diff", hover_name="field",
                 title="1990 (rectangle = 2017 plot borders)", labels={"density_mean_diff": "Difference from average density",
                                 "nnbrs_mean_diff": "Difference from average neighbourhood size"})
fig.add_shape(
        # Line Horizontal
            type="line",
            x0=df_nbrs_density_17.nnbrs_mean_diff.min(),
            y0=df_nbrs_density_17.density_mean_diff.min(),
            x1=df_nbrs_density_17.nnbrs_mean_diff.max(),
            y1=df_nbrs_density_17.density_mean_diff.min(),
            line=dict(
                color="black",
                width=3 #,dash="dashdot",
            ),
    )

fig.add_shape(
        # Line Horizontal
            type="line",
            x0=df_nbrs_density_17.nnbrs_mean_diff.min(),
            y0=df_nbrs_density_17.density_mean_diff.max(),
            x1=df_nbrs_density_17.nnbrs_mean_diff.max(),
            y1=df_nbrs_density_17.density_mean_diff.max(),
            line=dict(
                color="black",
                width=3 #,dash="dashdot",
            ),
    )

fig.add_shape(
        # Line Vertical
        dict(
            type="line",
            x0=df_nbrs_density_17.nnbrs_mean_diff.min(),
            y0=df_nbrs_density_17.density_mean_diff.min(),
            x1=df_nbrs_density_17.nnbrs_mean_diff.min(),
            y1=df_nbrs_density_17.density_mean_diff.max(),
            line=dict(
                color="black",
                width=3
            )
))

fig.add_shape(
        # Line Vertical
        dict(
            type="line",
            x0=df_nbrs_density_17.nnbrs_mean_diff.max(),
            y0=df_nbrs_density_17.density_mean_diff.min(),
            x1=df_nbrs_density_17.nnbrs_mean_diff.max(),
            y1=df_nbrs_density_17.density_mean_diff.max(),
            line=dict(
                color="black",
                width=3
            )
))

fig.show()

### Compare quadrants

In [148]:
q_list_90 = []
q_list_17 = []
stable_list = []
for f in df_nbrs_density_17.field.unique():
  my_90 = df_nbrs_density_90[df_nbrs_density_90.field == f]
  my_17 = df_nbrs_density_17[df_nbrs_density_17.field == f]
  try:
    if my_90.nnbrs_mean_diff.item() < 0 and my_90.density_mean_diff.item() < 0:
      q_90 = "Q1"
    elif my_90.nnbrs_mean_diff.item() < 0 and my_90.density_mean_diff.item() >= 0:
      q_90 = "Q2"
    elif my_90.nnbrs_mean_diff.item() >= 0 and my_90.density_mean_diff.item() >= 0:
      q_90 = "Q3"
    else:
      q_90 = "Q3"
    if my_17.nnbrs_mean_diff.item() < 0 and my_17.density_mean_diff.item() < 0:
      q_17 = "Q1"
    elif my_17.nnbrs_mean_diff.item() < 0 and my_17.density_mean_diff.item() >= 0:
      q_17 = "Q2"
    elif my_17.nnbrs_mean_diff.item() >= 0 and my_17.density_mean_diff.item() >= 0:
      q_17 = "Q3"
    else:
      q_17 = "Q3"
    q_list_17.append(q_17)
    q_list_90.append(q_90)
    if q_90 != q_17:
      print("{} changed from {} to {}".format(f, q_90, q_17))
      stable_status = False
    else:
      print("== STABLE: {} ({})".format(f, q_90))
      stable_status = True
    stable_list.append(stable_status)
  except ValueError:
    print("--- Error with {}".format(f))
    stable_list.append(False)


== STABLE: Public Health and Health Services (Q1)
== STABLE: Psychology (Q1)
== STABLE: Clinical Sciences (Q1)
Artificial Intelligence and Image Processing changed from Q3 to Q1
Genetics changed from Q3 to Q1
Statistics changed from Q3 to Q1
== STABLE: Neurosciences (Q1)
Biochemistry and Cell Biology changed from Q3 to Q1
Applied Economics changed from Q3 to Q1
Cardiorespiratory Medicine and Haematology changed from Q2 to Q1
== STABLE: Information Systems (Q1)
== STABLE: Other Physical Sciences (Q1)
Physical Chemistry (incl. Structural) changed from Q3 to Q1
Oncology and Carcinogenesis changed from Q2 to Q1
== STABLE: Paediatrics and Reproductive Medicine (Q1)
Immunology changed from Q2 to Q3
Sociology changed from Q3 to Q1
== STABLE: Materials Engineering (Q3)
Historical Studies changed from Q3 to Q1
== STABLE: Biomedical Engineering (Q1)
Pharmacology and Pharmaceutical Sciences changed from Q3 to Q1
Medical Microbiology changed from Q2 to Q1
== STABLE: Microbiology (Q1)
Law changed f

In [149]:
#df_nbrs_density_17["q_90"] = q_list_90
#df_nbrs_density_17["q_17"] = q_list_17
df_nbrs_density_17["stable"] = stable_list

px.scatter(df_nbrs_density_17[df_nbrs_density_17.stable==True], x="nnbrs_mean_diff", y="density_mean_diff", hover_name="field",
           title="2017", labels={"density_mean_diff": "Difference from average density",
                                 "nnbrs_mean_diff": "Difference from average neighbourhood size"})

### Compare quadrant changes

In [150]:

#for f in df_nbrs_density_17[df_nbrs_density_17.stable==True].field.unique():
#  my_90 = df_nbrs_density_90[df_nbrs_density_90.field == f]
#  my_17 = df_nbrs_density_17[df_nbrs_density_17.field == f]
df_nbrs_density_90["field2"] = df_nbrs_density_90["field"] + "_90"
df_nbrs_density_17["field2"] = df_nbrs_density_17["field"] + "_17"
df_delta = pd.concat([df_nbrs_density_90[df_nbrs_density_90.field.isin(df_nbrs_density_17[df_nbrs_density_17.stable==False].field.unique())][["density_mean_diff",	"nnbrs_mean_diff", "field", "field2"]],
          df_nbrs_density_17[df_nbrs_density_17.field.isin(df_nbrs_density_17[df_nbrs_density_17.stable==False].field.unique())][["density_mean_diff",	"nnbrs_mean_diff", "field", "field2"]]
]
)

In [151]:
px.scatter(df_delta, x="nnbrs_mean_diff", y="density_mean_diff",
           color="field", trendline="ols", hover_name="field2"
           )

# Look at neighbourhood density = N neigbhours shared with other disciplines

# Look at neighbours defined by top decile

In [152]:
#YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
#df_ll_res = pd.DataFrame()
#list_of_years = []
#list_of_fields = []
list_of_sims = []

#for y in YEAR_LIST:
y = 2017
y_res = compute_pairwise_similarities(all_embeddings[y])
print("-----------------{}-------------------------".format(y))
for k,v in y_res[0].items():
  if k == "Linguistics":
    for_res = v 
    for f in for_res:
      if f != "Linguistics":
        list_of_sims.append(for_res[f])
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
pd.Series(list_of_sims).quantile(q=[.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0])

-----------------2017-------------------------


0.1    0.100216
0.2    0.182251
0.3    0.251916
0.4    0.365152
0.5    0.403300
0.6    0.546356
0.7    0.618676
0.8    0.712158
0.9    0.796677
1.0    0.944194
dtype: float64

In [153]:
#YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
#df_ll_res = pd.DataFrame()
#list_of_years = []
#list_of_fields = []
list_of_sims = []

#for y in YEAR_LIST:
y = 2017
y_res = compute_pairwise_similarities(all_embeddings[y])
print("-----------------{}-------------------------".format(y))
for k,v in y_res[0].items():
  if k == "Artificial Intelligence and Image Processing":
    for_res = v 
    for f in for_res:
      if f != "Artificial Intelligence and Image Processing":
        list_of_sims.append(for_res[f])
  # print(y, ":", all_embeddings[y]["Computer Hardware"][0])
pd.Series(list_of_sims).quantile(q=[.1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0])

-----------------2017-------------------------


0.1    0.454681
0.2    0.501157
0.3    0.535254
0.4    0.575888
0.5    0.620375
0.6    0.651804
0.7    0.685184
0.8    0.715096
0.9    0.780209
1.0    0.883995
dtype: float64

# MI

## Computer Hardware

### 1990

#### Actual co-occurrence

In [154]:
%%bigquery --project $pid

SELECT DISTINCT(f2) FROM (
  SELECT doi, f.second_level.name AS f2 FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
  LEFT JOIN UNNEST(`for`) AS f
  WHERE doi IN (
  SELECT DISTINCT(doi) FROM (  
    SELECT doi, f.second_level.name AS f2 FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
    LEFT JOIN UNNEST(`for`) AS f
    WHERE year = 1990
  ) WHERE f2 = "Computer Hardware"
 )
)

Unnamed: 0,f2
0,Communications Technologies
1,Electrical and Electronic Engineering
2,Distributed Computing
3,Information Systems
4,Computer Software
5,Computer Hardware


In [155]:
%%bigquery --project $pid

SELECT DISTINCT(f2) FROM (
  SELECT doi, f.second_level.name AS f2 FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
  LEFT JOIN UNNEST(`for`) AS f
  WHERE doi IN (
  SELECT DISTINCT(doi) FROM (  
    SELECT doi, f.second_level.name AS f2 FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
    LEFT JOIN UNNEST(`for`) AS f
    WHERE year = 2017
  ) WHERE f2 = "Computer Hardware"
 )
)

Unnamed: 0,f2
0,Artificial Intelligence and Image Processing
1,Computer Hardware
2,Other Information and Computing Sciences
3,Electrical and Electronic Engineering
4,Information Systems
5,Communications Technologies
6,Data Format
7,Computer Software
8,Computation Theory and Mathematics
9,Distributed Computing


#### MI example

In [156]:
%%bigquery --project $pid ch_mi_test

SELECT COUNT(DISTINCT(doi)) AS n, "Computer Hardware" AS field FROM (  
  SELECT doi, f.second_level.name AS f2 FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
  LEFT JOIN UNNEST(`for`) AS f
  WHERE year = 1990
  AND ARRAY_LENGTH(`for`) = 1
) WHERE f2 = "Computer Hardware"
UNION ALL 
SELECT COUNT(DISTINCT(doi)) AS n, "Communications Technologies" AS field FROM (  
  SELECT doi, f.second_level.name AS f2 FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
  LEFT JOIN UNNEST(`for`) AS f
  WHERE year = 1990
  AND ARRAY_LENGTH(`for`) = 1
) WHERE f2 = "Communications Technologies"
UNION ALL 
SELECT COUNT(DISTINCT(doi)) AS n, "CT and CH" AS field FROM (  

  SELECT DISTINCT(doi) FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
  LEFT JOIN UNNEST(`for`) AS f
  WHERE year = 1990
  AND f.second_level.name = "Computer Hardware"
  AND doi IN (

  SELECT DISTINCT(doi) FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
  LEFT JOIN UNNEST(`for`) AS f
  WHERE year = 1990
  AND f.second_level.name = "Communications Technologies"
  )

  
  
) 

UNION ALL

SELECT COUNT(DISTINCT(doi)) AS n, "All" AS field FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
WHERE year = 1990



In [157]:
ch_mi_test

Unnamed: 0,n,field
0,3536,Communications Technologies
1,1060525,All
2,104,Computer Hardware
3,130,CT and CH


In [158]:
ch_mi_test[ch_mi_test.field=="CT and CH"].n.item()

130

In [159]:
ch_mi_test[ch_mi_test.field=="All"].n.item()

1060525

In [160]:
import math

joint_prob = ch_mi_test[ch_mi_test.field=="CT and CH"].n.item()/ch_mi_test[ch_mi_test.field=="All"].n.item()
joint_prob
ind_prob = (ch_mi_test[ch_mi_test.field=="Computer Hardware"].n.item()/ch_mi_test[ch_mi_test.field=="All"].n.item()) * (ch_mi_test[ch_mi_test.field=="Communications Technologies"].n.item()/ch_mi_test[ch_mi_test.field=="All"].n.item())
math.log(joint_prob/ind_prob)

5.9266667540516424

## 2017

In [161]:
%%bigquery --project $pid ch_mi_test2

SELECT COUNT(DISTINCT(doi)) AS n, "Computer Hardware" AS field FROM (  
  SELECT doi, f.second_level.name AS f2 FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
  LEFT JOIN UNNEST(`for`) AS f
  WHERE year = 2017
  AND ARRAY_LENGTH(`for`) = 1
) WHERE f2 = "Computer Hardware"
UNION ALL 
SELECT COUNT(DISTINCT(doi)) AS n, "Communications Technologies" AS field FROM (  
  SELECT doi, f.second_level.name AS f2 FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
  LEFT JOIN UNNEST(`for`) AS f
  WHERE year = 2017
  AND ARRAY_LENGTH(`for`) = 1
) WHERE f2 = "Communications Technologies"
UNION ALL 
SELECT COUNT(DISTINCT(doi)) AS n, "CT and CH" AS field FROM (  

  SELECT DISTINCT(doi) FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
  LEFT JOIN UNNEST(`for`) AS f
  WHERE year = 2017
  AND f.second_level.name = "Computer Hardware"
  AND doi IN (

  SELECT DISTINCT(doi) FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
  LEFT JOIN UNNEST(`for`) AS f
  WHERE year = 2017
  AND f.second_level.name = "Communications Technologies"
  )

  
  
) 

UNION ALL

SELECT COUNT(DISTINCT(doi)) AS n, "All" AS field FROM `springer-nature-analytics.DS_dimensions.publications_full_refresh`
WHERE year = 2017



In [162]:
joint_prob2 = ch_mi_test2[ch_mi_test2.field=="CT and CH"].n.item()/ch_mi_test2[ch_mi_test2.field=="All"].n.item()
joint_prob2
ind_prob2 = (ch_mi_test2[ch_mi_test2.field=="Computer Hardware"].n.item()/ch_mi_test2[ch_mi_test2.field=="All"].n.item()) * (ch_mi_test[ch_mi_test.field=="Communications Technologies"].n.item()/ch_mi_test[ch_mi_test.field=="All"].n.item())
math.log(joint_prob2/ind_prob2)

4.256604219801107

# Calculate density and compare with neighbourhood size

Density = N first degree relative FoR codes / N neighbours

## Iterate over all fields in all years

In [163]:
YEAR_LIST = (1990, 1993, 1996, 1999, 2002, 2005, 2008, 2011, 2014, 2017)
df_all_nbrs_density_res = pd.DataFrame()
list_of_years = []
list_of_fields = []
list_of_nsize = []
list_of_density = []


for y in YEAR_LIST:
  #y = 1990
  # Get all pairwise similarities for a given year
  y_res = compute_pairwise_similarities(all_embeddings[y])
  print("-----------------{}-------------------------".format(y))

  # iterate over similarities to calculate neighborhoods and density
  for k,v in y_res[0].items():
    list_of_fields.append(k)
    list_of_years.append(y)
    density_count = 0
    nnbrs = 0
    for_res = v 
    for f in for_res:
      if f != k:
        # neighborhood size
        # a neighbour has a similiarty of at least this threshold
        if for_res[f] >= NHOOD:
          nnbrs += 1
          # density
          my_for_sim = for_sim(f=f, ff=k, df=df_for_relations)
          if my_for_sim == 0.75:
            density_count += 1

    list_of_nsize.append(nnbrs)
    try:
      my_density = density_count/nnbrs 
    except ZeroDivisionError:
      my_density = 0.0
    list_of_density.append(my_density)


df_all_nbrs_density_res["year"] = list_of_years
df_all_nbrs_density_res["field"] = list_of_fields
df_all_nbrs_density_res["nsize"] = list_of_nsize
df_all_nbrs_density_res["density"] = list_of_density

df_all_nbrs_density_res.head()

-----------------1990-------------------------
-----------------1993-------------------------
-----------------1996-------------------------
-----------------1999-------------------------
-----------------2002-------------------------
-----------------2005-------------------------
-----------------2008-------------------------
-----------------2011-------------------------
-----------------2014-------------------------
-----------------2017-------------------------


Unnamed: 0,year,field,nsize,density
0,1990,Clinical Sciences,53,0.320755
1,1990,Psychology,57,0.035088
2,1990,Public Health and Health Services,52,0.307692
3,1990,Biochemistry and Cell Biology,59,0.101695
4,1990,Neurosciences,37,0.459459


In [165]:
px.scatter(df_all_nbrs_density_res, x="nsize", y="density", 
           color="year", hover_name="field",
           labels={"nsize": "Proximity", "density": "Relatedness"})

In [166]:
px.scatter(df_all_nbrs_density_res, x="nsize", y="density", color="field", hover_name="year",
           trendline="ols",
           labels={"nsize": "Proximity", "density": "Relatedness"})

In [167]:
df_all_nbrs_density_res[["nsize", "density"]].corr()

Unnamed: 0,nsize,density
nsize,1.0,-0.363528
density,-0.363528,1.0


In [168]:
df_all_nbrs_density_res[["nsize", "density", "year"]].corr()

Unnamed: 0,nsize,density,year
nsize,1.0,-0.363528,-0.591088
density,-0.363528,1.0,0.137783
year,-0.591088,0.137783,1.0


In [169]:
# fields that have changed:
delta_for = [
'Architecture',
'Astronomical and Space Sciences',
'Automotive Engineering',
'Biomedical Engineering',
'Building',
'Cardiorespiratory Medicine and Haematology',
'Communication and Media Studies',
'Complementary and Alternative Medicine',
'Computer Hardware',
'Distributed Computing',
'Education Systems',
'Food Sciences',
'Immunology',
'Medical Biochemistry and Metabolomics',
'Medical Biotechnology',
'Medical Physiology',
'Medicinal and Biomolecular Chemistry',
'Neurosciences',
'Nutrition and Dietetics',
'Oncology and Carcinogenesis',
'Ophthalmology and Optometry',
'Other Medical and Health Sciences',
'Pharmacology and Pharmaceutical Sciences',
'Transportation and Freight Services',
'Visual Arts and Crafts'
]

In [170]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

changed_bin = []
coef = []
r2 = []
for_code = []
neg_coef = []

for n in sorted(df_all_nbrs_density_res.field.unique()):
  print("--- {} ---".format(n))
  for_code.append(n)
  if n in delta_for:
    print("--- CHANGED ---")
    changed_bin.append(True)
  else:
    changed_bin.append(False)
  my_df = df_all_nbrs_density_res[df_all_nbrs_density_res.field==n].copy()
  my_fit = LinearRegression().fit(np.array(my_df.density).reshape(-1,1), np.array(my_df.nsize).reshape(-1,1))
  print("Beta (density):", my_fit.coef_[0][0])
  coef.append(my_fit.coef_[0][0])
  if my_fit.coef_[0][0] < 0:
    neg_coef.append(True)
  else:
    neg_coef.append(False)
  my_r2 = r2_score(np.array(my_df.nsize).reshape(-1,1), my_fit.predict(np.array(my_df.density).reshape(-1,1)))
  print("R^2:", my_r2)
  r2.append(my_r2)
  print("#")

reg_df = pd.DataFrame({"field": for_code, "change": changed_bin, "r2": r2, 
                       "coef": coef, "negCoef": neg_coef})
reg_df.head()

--- Accounting, Auditing and Accountability ---
Beta (density): -111.00331276039567
R^2: 0.7428833697404494
#
--- Aerospace Engineering ---
Beta (density): -202.6450629725895
R^2: 0.8677756951027318
#
--- Agricultural Biotechnology ---
Beta (density): -281.77873927174585
R^2: 0.04066262427611156
#
--- Agriculture, Land and Farm Management ---
Beta (density): -123.88923545455437
R^2: 0.2858903296393426
#
--- Analytical Chemistry ---
Beta (density): -217.03078484703192
R^2: 0.9571472618678787
#
--- Animal Production ---
Beta (density): -86.26051431703675
R^2: 0.8226638836413618
#
--- Anthropology ---
Beta (density): -115.13213636655956
R^2: 0.9537258807197215
#
--- Applied Economics ---
Beta (density): -293.01176292420536
R^2: 0.07920946989825361
#
--- Applied Ethics ---
Beta (density): -271.30221895862144
R^2: 0.9722797664350136
#
--- Applied Mathematics ---
Beta (density): -469.24142094423047
R^2: 0.6464336259120864
#
--- Archaeology ---
Beta (density): -61.22324910316213
R^2: 0.012130


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.


R^2 score is not well-defined with less than two samples.



Unnamed: 0,field,change,r2,coef,negCoef
0,"Accounting, Auditing and Accountability",False,0.742883,-111.003313,True
1,Aerospace Engineering,False,0.867776,-202.645063,True
2,Agricultural Biotechnology,False,0.040663,-281.778739,True
3,"Agriculture, Land and Farm Management",False,0.28589,-123.889235,True
4,Analytical Chemistry,False,0.957147,-217.030785,True


In [171]:
from scipy.stats import ttest_ind
reg_df_clean = reg_df.dropna()
t_res = ttest_ind(reg_df_clean[reg_df_clean.change==True].r2, reg_df_clean[reg_df_clean.change==False].r2)
t_res

Ttest_indResult(statistic=2.7093815528069967, pvalue=0.007565007983293096)

In [172]:
import math
import statistics 

g1 = reg_df_clean[reg_df_clean.change==True]
g2 = reg_df_clean[reg_df_clean.change==False]

sd_pool = math.sqrt((((len(g1) - 1) * statistics.stdev(g1.r2)) +  ((len(g2) - 1) * statistics.stdev(g2.r2)))/(len(g1) + len(g2) - 2))
print("SD pool:", sd_pool)
d = t_res[0]/sd_pool 
print("d:", d)

SD pool: 0.5697265665828946
d: 4.755582259499189


In [173]:
obs = pd.crosstab(reg_df_clean.change, reg_df_clean.negCoef)
obs

negCoef,False,True
change,Unnamed: 1_level_1,Unnamed: 2_level_1
False,14,106
True,4,21


In [174]:
from scipy.stats import chi2_contingency
# X2, p, df, exp
chi2_contingency(obs)

(0.06990467337416163,
 0.7914756260592861,
 1,
 array([[ 14.89655172, 105.10344828],
        [  3.10344828,  21.89655172]]))

In [175]:
res_corr = df_all_nbrs_density_res.groupby("field")["nsize", 'density'].corr()
res_corr.reset_index(inplace=True)
res_corr


Indexing with multiple keys (implicitly converted to a tuple of keys) will be deprecated, use a list instead.



Unnamed: 0,field,level_1,nsize,density
0,"Accounting, Auditing and Accountability",nsize,1.000000,-0.861907
1,"Accounting, Auditing and Accountability",density,-0.861907,1.000000
2,Aerospace Engineering,nsize,1.000000,-0.931545
3,Aerospace Engineering,density,-0.931545,1.000000
4,Agricultural Biotechnology,nsize,1.000000,-0.201650
...,...,...,...,...
293,Veterinary Sciences,density,-0.980976,1.000000
294,Visual Arts and Crafts,nsize,1.000000,-0.696670
295,Visual Arts and Crafts,density,-0.696670,1.000000
296,Zoology,nsize,1.000000,-0.886965


In [176]:
res_corr2 = res_corr[res_corr.level_1=="nsize"].copy()
res_corr2["changed"] = res_corr2.field.apply(lambda x: True if x in delta_for else False)
res_corr2.head(n=20)

Unnamed: 0,field,level_1,nsize,density,changed
0,"Accounting, Auditing and Accountability",nsize,1.0,-0.861907,False
2,Aerospace Engineering,nsize,1.0,-0.931545,False
4,Agricultural Biotechnology,nsize,1.0,-0.20165,False
6,"Agriculture, Land and Farm Management",nsize,1.0,-0.534687,False
8,Analytical Chemistry,nsize,1.0,-0.978339,False
10,Animal Production,nsize,1.0,-0.907008,False
12,Anthropology,nsize,1.0,-0.976589,False
14,Applied Economics,nsize,1.0,-0.281442,False
16,Applied Ethics,nsize,1.0,-0.986042,False
18,Applied Mathematics,nsize,1.0,-0.804011,False


In [177]:
#res_corr2.to_csv('/gdrive/My Drive/SpringerNature-Google-Turing/data/for_nsize_negcorr.csv')

In [178]:
#reg_df_clean.to_csv('/gdrive/My Drive/SpringerNature-Google-Turing/data/for_regression_coef.csv')

In [179]:
reg_df_clean.columns

Index(['field', 'change', 'r2', 'coef', 'negCoef'], dtype='object')

In [180]:
df_all_nbrs_density_res.to_csv('/gdrive/My Drive/SpringerNature-Google-Turing/data/for_nsize_density.csv')