In [33]:
import numpy as np
import os
from sklearn.cross_decomposition import PLSSVD
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.simplefilter("ignore", category=ConvergenceWarning)

import pandas as pd
from scipy.stats import ttest_ind_from_stats

In [2]:
def read_embeddings(suffix:str, dir:str):
    '''
    Load the train and test vector files for one embedding run.

    Reads ``train_vectors{suffix}.txt`` followed by ``test_vectors{suffix}.txt``
    from ``dir``. Every line is split on whitespace; the first token is
    discarded and the remaining tokens are parsed as floats.

    Returns a float64 array with one row per line, train rows first.
    '''
    rows = []
    ordered_paths = (
        os.path.join(dir, f"train_vectors{suffix}.txt"),
        os.path.join(dir, f"test_vectors{suffix}.txt"),
    )
    for path in ordered_paths:
        with open(path) as handle:
            # Drop the leading token of each line and keep the numeric fields.
            rows.extend(line.split()[1:] for line in handle)
    return np.array(rows, dtype=np.float64)

def register(rec, type, tag, value):
    '''
    Append one measurement to ``rec`` as a dict with keys
    ``type``, ``tag`` and ``value``. Mutates ``rec`` in place; returns nothing.
    '''
    rec.append({"type": type, "tag": tag, "value": value})

def get_sim_mean(a, b, include_diag=False):
    '''
    Average pairwise cosine similarity between the rows of ``a`` and ``b``.

    The full (rows of a) x (rows of b) cosine matrix is built, then:

    * ``include_diag=False``: mean of all off-diagonal entries.
    * ``include_diag=True``: (sum of all entries + sum of the diagonal)
      divided by ``n*n + n``; for a symmetric cosine matrix (e.g. ``a is b``)
      this is the mean over unordered pairs i <= j.

    ``n`` is taken from ``a.shape[0]``, so the denominators are only
    meaningful when ``a`` and ``b`` have the same number of rows.
    '''
    row_norms = np.linalg.norm(a, axis=1, keepdims=True)  # column vector: scales rows
    col_norms = np.linalg.norm(b, axis=1)                 # 1-D: scales columns
    cos = a @ b.T / row_norms / col_norms

    n = a.shape[0]
    total = cos.sum()
    diag_total = cos.diagonal().sum()
    if not include_diag:
        return (total - diag_total) / (n * n - n)
    return (total + diag_total) / (n * n + n)

def lr_test_cv(emb):
    '''
    Fit a cross-validated logistic regression on the first 25000 rows of
    ``emb`` and return its accuracy on rows 25000-49999.

    Labels are fixed by position: four consecutive blocks of 12500 rows
    labelled 1, 0, 1, 0. Rows are shuffled within each 25000-row half (never
    across halves) before fitting and scoring.

    The shuffle uses a private ``RandomState(12)`` rather than
    ``np.random.seed(12)``: it draws the same two permutations, but no longer
    resets the process-wide NumPy RNG as a side effect of every call.
    '''
    n_half = 25000
    y = np.repeat([1,0,1,0], 12500)

    rng = np.random.RandomState(12)
    # Two successive draws from the same stream: one per half.
    p = np.concatenate([
        rng.permutation(n_half),
        rng.permutation(n_half) + n_half,
    ])
    emb = emb[p]
    y = y[p]

    lr = LogisticRegressionCV(Cs=17, scoring="accuracy", n_jobs=-1)
    lr.fit(emb[:n_half], y[:n_half])
    return lr.score(emb[n_half:], y[n_half:])

def cos_sim(a, b):
    '''
    Row-wise cosine similarity: entry i is the cosine between ``a[i]`` and
    ``b[i]``. Returns a 1-D array with one value per row.
    '''
    dots = (a * b).sum(axis=1)
    return dots / np.linalg.norm(a, axis=1) / np.linalg.norm(b, axis=1)

def plssvd_cos(a, b):
    '''
    Mean row-wise cosine similarity between the PLSSVD scores of ``a`` and ``b``.

    Both inputs are first scaled per feature to unit variance without
    centering (``StandardScaler(with_mean=False)``, refit for each input),
    then a 500-component PLSSVD is fit on the pair and the two score
    matrices are compared row by row with ``cos_sim``.
    '''
    scaler = StandardScaler(with_mean=False)
    a_scaled = scaler.fit_transform(a)
    b_scaled = scaler.fit_transform(b)  # fit_transform refits, so a's statistics are not reused

    model = PLSSVD(n_components=500, scale=False)
    a_scores, b_scores = model.fit_transform(a_scaled, b_scaled)
    return cos_sim(a_scores, b_scores).mean()


In [3]:
# Reference embeddings (no suffix) live under ../files_root.
original = read_embeddings("", "../files_root")

# Three runs of each variant are read from ./vectors, keyed by file suffix.
cross_blocks = [read_embeddings(f"_cb{i}", "vectors") for i in range(3)]
in_blocks = [read_embeddings(f"_ib{i}", "vectors") for i in range(3)]

In [4]:
# tag -> list of embedding arrays; "original" is wrapped in a list so every
# tag can be iterated the same way.
embs = {
    "original": [original],
    "cross_blocks": cross_blocks,
    "in_blocks": in_blocks,
}

In [5]:
# Mean L2 norm of the rows of every embedding array.
# `records` is (re)created here; the metric cells that follow append to it.
records = []
for tag, embeds in embs.items():
    row_norm_means = (np.linalg.norm(embed, axis=1).mean() for embed in embeds)
    for mean_norm in row_norm_means:
        register(records, "mean_norm", tag, mean_norm)

In [6]:
# Mean cosine similarity within and across the two row groups.
# Row groups: pos = rows 0-12499 and 25000-37499, neg = rows 12500-24999 and
# 37500-49999. NOTE(review): presumably these are the positive / negative
# examples of the train and test halves (same layout as the labels in
# lr_test_cv) -- confirm against the vector files.
pos_inds = np.concatenate([np.arange(12500), np.arange(25000, 37500)])
neg_inds = np.concatenate([np.arange(12500, 25000), np.arange(37500, 50000)])

for tag, embeds in embs.items():
    for embed in embeds:
        pos_embed, neg_embed = embed[pos_inds], embed[neg_inds]

        # Within-group similarity: average of the pos-pos and neg-neg means.
        pos_cos = get_sim_mean(pos_embed, pos_embed)
        neg_cos = get_sim_mean(neg_embed, neg_embed)
        register(records, "same_cos_mean", tag, (pos_cos + neg_cos) / 2)

        # Across-group similarity (get_sim_mean drops the i == j pairs here too).
        register(records, "diff_cos_mean", tag, get_sim_mean(pos_embed, neg_embed))



In [7]:
# Held-out logistic-regression accuracy for every embedding array.
for tag, embeds in embs.items():
    # map() is lazy, so each fit is still followed immediately by its register call.
    for lr_acc in map(lr_test_cv, embeds):
        register(records, "lr test acc", tag, lr_acc)


In [8]:
# Compare every non-original embedding against the original one via PLSSVD.
ori = embs["original"][0]
for tag, embeds in embs.items():
    if tag != "original":
        for embed in embeds:
            register(records, "plssvd cos mean", tag, plssvd_cos(embed, ori))

In [65]:
# One row per metric type, one column per (statistic, tag).
# `records` is a list of dicts, so the plain constructor is the right entry point.
df = pd.DataFrame(records)
# Use the string aggregations instead of np.mean / np.std: pandas maps the
# NumPy callables onto its own groupby mean/std today (deprecated aliasing),
# and calling np.std directly would switch to ddof=0. "std" keeps the sample
# standard deviation (ddof=1) that the t-test on these stats expects.
pt = pd.pivot_table(df, values="value", index="type", columns="tag", aggfunc=["mean", "std"])
pt

Unnamed: 0_level_0,mean,mean,mean,std,std
tag,cross_blocks,in_blocks,original,cross_blocks,in_blocks
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
diff_cos_mean,0.317654,0.317634,0.317138,2.1e-05,9e-06
lr test acc,0.9316,0.931173,0.93152,0.000445,0.001121
mean_norm,1.194381,1.194383,5.352516,5.5e-05,6.2e-05
plssvd cos mean,0.905803,0.905894,,0.000193,0.000155
same_cos_mean,0.347179,0.347161,0.346806,2.3e-05,1.3e-05


In [66]:
# Gap between the original's mean and each variant's mean, in units of that
# variant's standard deviation across runs.
for t in ("cross_blocks", "in_blocks"):
    gap = pt[("mean", "original")] - pt[("mean", t)]
    pt[f"original-{t}-dif/std"] = gap / pt[("std", t)]

def get_p(row):
    '''
    p-value of a two-sample t-test between the "cross_blocks" and "in_blocks"
    columns of one pivot-table row, computed from the row's mean and std
    entries with 3 observations per group and ``equal_var=False``
    (Welch's test, variances not assumed equal).
    '''
    n_runs = 3
    mean_by_tag, std_by_tag = row["mean"], row["std"]
    result = ttest_ind_from_stats(
        mean_by_tag["cross_blocks"], std_by_tag["cross_blocks"], n_runs,
        mean_by_tag["in_blocks"], std_by_tag["in_blocks"], n_runs,
        equal_var=False,
    )
    return result[1]
    
# Row-wise Welch p-value between the two variants, then show the metrics in
# a fixed reading order.
pt["cross/in-blocks pvalue"] = pt.apply(get_p, axis=1)
row_order = ["mean_norm", "same_cos_mean", "diff_cos_mean", "lr test acc", "plssvd cos mean"]
pt.loc[row_order]

Unnamed: 0_level_0,mean,mean,mean,std,std,original-cross_blocks-dif/std,original-in_blocks-dif/std,cross/in-blocks pvalue
tag,cross_blocks,in_blocks,original,cross_blocks,in_blocks,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
mean_norm,1.194381,1.194383,5.352516,5.5e-05,6.2e-05,75473.240486,66646.847392,0.973565
same_cos_mean,0.347179,0.347161,0.346806,2.3e-05,1.3e-05,-16.385281,-27.513275,0.330115
diff_cos_mean,0.317654,0.317634,0.317138,2.1e-05,9e-06,-24.15747,-58.24058,0.244149
lr test acc,0.9316,0.931173,0.93152,0.000445,0.001121,-0.179605,0.309261,0.589185
plssvd cos mean,0.905803,0.905894,,0.000193,0.000155,,,0.561
