In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
import warnings
import os
warnings.filterwarnings('ignore')
from torch_geometric.loader import NeighborLoader

# Select the CUDA device when torch reports one is available, else the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
os.chdir("..")
%pwd

In [None]:
from pprint import pprint
import json
import copy

import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

import multiprocessing
import random
import xxhash

In [None]:
def procrustes(X, Y, scaling=True, reflection='best'):
    """
    A port of MATLAB's `procrustes` function to Numpy.

    Procrustes analysis determines a linear transformation (translation,
    reflection, orthogonal rotation and scaling) of the points in Y to best
    conform them to the points in matrix X, using the sum of squared errors
    as the goodness of fit criterion.

        d, Z, tform = procrustes(X, Y)

    Inputs:
    ------------
    X, Y
        2-D arrays of target and input coordinates. They must have equal
        numbers of points (rows), but Y may have fewer dimensions
        (columns) than X; the missing columns are treated as zeros.

    scaling
        if False, the scaling component of the transformation is forced
        to 1

    reflection
        if 'best' (default), the transformation solution may or may not
        include a reflection component, depending on which fits the data
        best. setting reflection to True or False forces a solution with
        reflection or no reflection respectively.

    Outputs
    ------------
    d
        the residual sum of squared errors, normalized according to a
        measure of the scale of X, ((X - X.mean(0))**2).sum()

    Z
        the matrix of transformed Y-values

    tform
        a dict with keys 'rotation' (T), 'scale' (b) and 'translation' (c)
        describing the map from Y onto X:  Z = b * Y.dot(T) + c

    Raises
    ------------
    ValueError
        if X and Y do not have the same number of rows.

    """

    n, m = X.shape
    ny, my = Y.shape

    if ny != n:
        raise ValueError(
            "X and Y must have the same number of rows (got %d and %d)" % (n, ny)
        )

    muX = X.mean(0)
    muY = Y.mean(0)

    # The subtractions allocate new arrays, so the in-place divisions below
    # never modify the caller's X or Y.
    X0 = X - muX
    Y0 = Y - muY

    ssX = (X0**2.).sum()
    ssY = (Y0**2.).sum()

    # centred Frobenius norm
    normX = np.sqrt(ssX)
    normY = np.sqrt(ssY)

    # scale to equal (unit) norm
    X0 /= normX
    Y0 /= normY

    if my < m:
        # Pad Y with zero COLUMNS so both point sets have m dimensions.
        # (np.zeros takes the shape as a single tuple, and the padding must be
        # appended along axis 1, not stacked as extra rows.)
        Y0 = np.concatenate((Y0, np.zeros((n, m - my), dtype=Y0.dtype)), axis=1)

    # optimum rotation matrix of Y
    A = np.dot(X0.T, Y0)
    U, s, Vt = np.linalg.svd(A, full_matrices=False)
    V = Vt.T
    T = np.dot(V, U.T)

    if reflection != 'best':

        # does the current solution use a reflection?
        have_reflection = np.linalg.det(T) < 0

        # if that's not what was specified, force another reflection
        if reflection != have_reflection:
            V[:, -1] *= -1
            s[-1] *= -1
            T = np.dot(V, U.T)

    traceTA = s.sum()

    if scaling:

        # optimum scaling of Y
        b = traceTA * normX / normY

        # standardised distance between X and b*Y*T + c
        d = 1 - traceTA**2

        # transformed coords
        Z = normX*traceTA*np.dot(Y0, T) + muX

    else:
        b = 1
        d = 1 + ssY/ssX - 2 * traceTA * normY / normX
        Z = normY*np.dot(Y0, T) + muX

    # transformation matrix: drop the rows that correspond to padded columns
    # so that T can be applied directly to the original (narrower) Y
    if my < m:
        T = T[:my, :]
    c = muX - b*np.dot(muY, T)

    # transformation values
    tform = {'rotation': T, 'scale': b, 'translation': c}

    return d, Z, tform

In [None]:
import numpy as np
from gensim.models import Word2Vec

def align_word2vec_models(models, overlap_percentage):
    """Align several Word2Vec models to the first one via Procrustes analysis.

    A random sample of the vocabulary shared by all models serves as anchor
    words. For every model after the first, `procrustes` is fitted from that
    model's anchor vectors onto the first model's anchor vectors, and the
    resulting rotation, scale and translation are applied to ALL of the
    model's vectors.

    Parameters
    ----------
    models : sequence
        At least two gensim Word2Vec models (anything exposing
        ``wv.index_to_key``, ``wv[word]`` and ``wv.vectors``).
    overlap_percentage : float
        Percentage (0-100) of the shared vocabulary to sample as anchors.

    Returns
    -------
    list
        ``models[0]`` itself, followed by one aligned deep copy per remaining
        model. The input models are not modified.

    Raises
    ------
    ValueError
        If fewer than two models are given, or if the requested percentage
        selects no anchor word at all.
    """
    import copy  # local import so this cell does not depend on another cell

    # Check if there are at least two models to align
    if len(models) < 2:
        raise ValueError("At least two models are required for alignment.")

    # Determine the overlapping vocabulary
    common_vocab = set(models[0].wv.index_to_key)
    for model in models[1:]:
        common_vocab &= set(model.wv.index_to_key)

    # Set iteration order of strings varies between interpreter sessions, so
    # fix the candidate order using the first model's own vocabulary order;
    # otherwise the seeded sample below would not be reproducible.
    ordered_vocab = [word for word in models[0].wv.index_to_key if word in common_vocab]

    # Calculate the sample size based on the specified percentage
    sample_size = int(len(ordered_vocab) * overlap_percentage / 100)
    if sample_size < 1:
        raise ValueError(
            "No anchor words selected: %d shared words at %s%% overlap."
            % (len(ordered_vocab), overlap_percentage)
        )

    # A private, seeded generator keeps the sample reproducible without
    # touching (and then irreversibly reseeding) NumPy's global RNG state.
    rng = np.random.RandomState(42)
    sampled_vocab = rng.choice(ordered_vocab, size=sample_size, replace=False)

    # Anchor vectors for the sampled vocabulary, one matrix per model
    anchor_vectors = [
        np.array([model.wv[word] for word in sampled_vocab]) for model in models
    ]

    # The first model is the reference and is returned untouched
    aligned_models = [models[0]]
    for model, anchors in zip(models[1:], anchor_vectors[1:]):
        _, _, tform = procrustes(anchor_vectors[0], anchors, scaling=True)
        aligned_vectors = (
            np.dot(model.wv.vectors, tform['rotation']) * tform['scale']
            + tform['translation']
        )

        # Deep-copy instead of rebuilding the vocabulary from a single
        # pseudo-sentence: in such a sentence every word has count 1, so a
        # min_count above 1 would discard words and leave the vector matrix
        # out of step with the vocabulary.
        aligned_model = copy.deepcopy(model)
        aligned_model.wv.vectors = aligned_vectors.astype(
            model.wv.vectors.dtype, copy=False
        )
        # Drop any vector norms cached on the copy; they belong to the
        # untransformed vectors.
        aligned_model.wv.norms = None

        aligned_models.append(aligned_model)

    return aligned_models

In [None]:
from itertools import combinations

def calculate_disparity(models):
    """Pairwise sum of squared differences between models on shared words.

    Parameters
    ----------
    models : sequence
        Models exposing ``wv.index_to_key`` and ``wv[word]``.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(len(models), len(models))`` matrix with a zero diagonal.
        Entry ``[i, j]`` is the sum, over every word present in ALL models,
        of the squared Euclidean distance between that word's vector in
        model ``i`` and in model ``j``. If no word is shared, all entries
        are zero.
    """
    # Find common vocabulary
    common_vocab = set(models[0].wv.index_to_key)
    for model in models[1:]:
        common_vocab &= set(model.wv.index_to_key)

    # Fixed word order (that of the first model) so rows line up across models
    # and the summation order does not vary between interpreter sessions.
    shared_words = [word for word in models[0].wv.index_to_key if word in common_vocab]

    # Build each model's matrix once instead of once per pair; float64 keeps
    # the accumulated sum of squares from losing precision.
    matrices = [
        np.array([model.wv[word] for word in shared_words], dtype=np.float64)
        for model in models
    ]

    # Calculate disparity between each pair of models
    num_models = len(models)
    disparities = np.zeros((num_models, num_models))

    for i, j in combinations(range(num_models), 2):
        diff = matrices[i] - matrices[j]
        # Summing over all elements also covers the empty-vocabulary case,
        # where the matrices are 1-D and an axis=1 reduction would fail.
        disparity = float(np.sum(diff * diff))
        disparities[i, j] = disparity
        disparities[j, i] = disparity

    return disparities

In [None]:
def print_rounded_matrix(matrix):
    """Print each row of `matrix` as a list of rounded values, then a blank gap.

    Every value is passed through the built-in `round` with no digits
    argument. After the last row a newline string is printed.
    """
    rounded_rows = (list(map(round, row)) for row in matrix)
    for rounded in rounded_rows:
        print(rounded)
    print('\n')

In [None]:
# Load the three saved Word2Vec models (paths are relative to the current
# working directory) and report their pairwise disparity before any alignment.
model_paths = (
    "Content_FL_Exp/051.txt.model",
    "Content_FL_Exp/201.txt.model",
    "Content_FL_Exp/501.txt.model",
)
model1, model2, model3 = map(Word2Vec.load, model_paths)

models = [model1, model2, model3]

disparities = calculate_disparity(models)

print("Disparity Matrix:")
print_rounded_matrix(disparities)

In [None]:
# Sweep the anchor-sample size from 10% to 90% of the shared vocabulary in
# steps of 10, re-aligning the models each time and printing the resulting
# pairwise disparity matrix.
overlap_percentages = range(10, 100, 10)
for overlap in overlap_percentages:
    aligned_models = align_word2vec_models(
        [model1, model2, model3],
        overlap_percentage=overlap,
    )

    disparities = calculate_disparity(aligned_models)

    print("Disparity Matrix:")
    print_rounded_matrix(disparities)