In [2]:
import pandas as pd
import numpy as np
from itertools import combinations
from scipy.linalg import eigvalsh
import math


In [60]:
def effective_rank(R):
    """Return exp(Shannon entropy) of the normalised eigenvalue spectrum of ``R``.

    The eigenvalues of the symmetric matrix ``R`` are rescaled to sum to one,
    their entropy ``H = -sum(p * log(p))`` is computed, and ``exp(H)`` is
    returned. An identity matrix of size ``n`` gives ``n``; a rank-one matrix
    gives ``1``.

    Parameters
    ----------
    R : array-like of shape (n, n)
        Symmetric matrix (e.g. a correlation matrix as a DataFrame or ndarray).

    Returns
    -------
    float
        The exponential of the spectral entropy.

    Raises
    ------
    ValueError
        If the eigenvalues do not have a positive sum, or if ``R`` contains
        non-finite entries (raised by ``eigvalsh``).
    """
    lam = eigvalsh(np.asarray(R, dtype=float))

    # Round-off can make the zero eigenvalues of a singular PSD matrix come out
    # slightly negative; a negative weight would put a negative number inside
    # the logarithm and turn the whole result into NaN.
    lam = np.clip(lam, 0.0, None)

    total = lam.sum()
    if not total > 0:
        raise ValueError("effective_rank requires eigenvalues with a positive sum")

    p = lam / total
    # Convention 0 * log(0) = 0: zero weights contribute nothing to the entropy.
    p = p[p > 0]
    H = -np.sum(p * np.log(p))
    return math.exp(H)


# def variance_of_vars(corr):
#     # Assuming `corr` is your 12x12 correlation matrix (as a pandas DataFrame)
#     # Mask upper triangle and diagonal
#     corr_tril = corr.where(np.tril(np.ones(corr.shape), k=-1).astype(bool))

#     # Define group index ranges
#     group_indices = [
#         range(0, 3),   # Far-right
#         range(3, 6),   # Moderate conservative
#         range(6, 9),   # Progressive left
#         range(9, 12),  # Centrist
#     ]

#     group_variances = []

#     # Step 1: Intra-group variances (3 values from lower triangle of 3x3 block)
#     for group in group_indices:
#         block = corr_tril.iloc[group, group]
#         values = block.where(np.tril(np.ones((3, 3)), k=-1).astype(bool)).stack().values
#         if len(values) > 0:
#             group_variances.append(np.var(values))

#     # Step 2: Inter-group variances (9 values from full 3x3 block, lower triangle only of the global matrix)
#     for g1, g2 in combinations(group_indices, 2):
#         block = corr_tril.iloc[g2, g1]
#         values = block.stack().values  # all 9 values from 3x3 block
#         if len(values) > 0:
#             group_variances.append(np.var(values))

#     # Step 3: Variance of the 10 variances
#     final_mean_of_variances = np.mean(group_variances)

#     # Output
#     print("Block variances (intra + inter):", group_variances)
#     print("Mean of these variances:", final_mean_of_variances)

def mean_of_vars(corr, group_indices=None):
    """Print the variance of each group-by-group block of ``corr`` and their mean.

    The variables are split into groups of positional indices and one
    population variance (``np.var``) is computed per block, in this order:

    1. within-group blocks ``corr.iloc[g, g]``, one per group;
    2. between-group blocks ``corr.iloc[later, earlier]``, one per unordered
       pair of groups.

    Every cell of a block is used. No triangular mask is applied, so a
    within-group block contributes its diagonal and both symmetric halves.

    NOTE(review): for a correlation matrix this means the self-correlations on
    the diagonal enter the within-group variances -- confirm that this is the
    intended statistic.

    NaN cells are dropped before the variance is taken, and a block left with
    no values is skipped.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square matrix, addressed positionally through ``.iloc``.
    group_indices : sequence of sequences of int, optional
        Positional indices of each group. Defaults to four consecutive groups
        of three (positions 0-11), which needs an input of at least 12x12.

    Returns
    -------
    None
        The per-block variances and their mean are printed.
    """
    if group_indices is None:
        group_indices = [
            range(0, 3),   # Far-right
            range(3, 6),   # Moderate conservative
            range(6, 9),   # Progressive left
            range(9, 12),  # Centrist
        ]
    groups = [list(g) for g in group_indices]

    # (rows, cols) of every block: within-group first, then between-group.
    block_pairs = [(g, g) for g in groups]
    block_pairs += [(later, earlier) for earlier, later in combinations(groups, 2)]

    block_variances = []
    for rows, cols in block_pairs:
        # Flatten row-major and drop NaNs explicitly instead of relying on
        # DataFrame.stack(), whose NaN handling differs between pandas versions.
        cells = corr.iloc[rows, cols].to_numpy(dtype=float).ravel()
        cells = cells[~np.isnan(cells)]
        if cells.size > 0:
            block_variances.append(np.var(cells))

    mean_of_block_variances = np.mean(block_variances)

    # Output
    print("Block variances (intra + inter):", block_variances)
    print("Mean of these variances:", mean_of_block_variances)


def var_of_means(corr, group_indices=None):
    """Print the mean of each between-group block of ``corr`` and the variance of those means.

    The variables are split into groups of positional indices. For every
    unordered pair of groups the block ``corr.iloc[later, earlier]`` is
    averaged over all of its cells; the population variance (``np.var``) of
    those block means is then taken. Within-group blocks are not used.

    NaN cells are dropped before averaging, and a block left with no values
    is skipped.

    Parameters
    ----------
    corr : pandas.DataFrame
        Square matrix, addressed positionally through ``.iloc``.
    group_indices : sequence of sequences of int, optional
        Positional indices of each group. Defaults to four consecutive groups
        of three (positions 0-11), which needs an input of at least 12x12.

    Returns
    -------
    None
        The per-block means and their variance are printed.
    """
    if group_indices is None:
        group_indices = [
            range(0, 3),   # Far-right
            range(3, 6),   # Moderate conservative
            range(6, 9),   # Progressive left
            range(9, 12),  # Centrist
        ]
    groups = [list(g) for g in group_indices]

    block_means = []
    for earlier, later in combinations(groups, 2):
        # Flatten row-major and drop NaNs explicitly instead of relying on
        # DataFrame.stack(), whose NaN handling differs between pandas versions.
        cells = corr.iloc[later, earlier].to_numpy(dtype=float).ravel()
        cells = cells[~np.isnan(cells)]
        if cells.size > 0:
            block_means.append(np.mean(cells))

    var_of_block_means = np.var(block_means)

    # Output
    print("Block means (inter):", block_means)
    print("Var of these means:", var_of_block_means)



In [61]:
# Load the table and pick the columns whose label starts with 'Deep'.
data = pd.read_parquet(
    'data/Deepseek/DEEPSEEK_5_tables_final.parquet',
    engine='pyarrow',
)
target_columns_names = [label for label in data.columns if label.startswith('Deep')]

# Keep only rows with Translation_quality == 1.0, restricted to those columns.
data_for_plot = data.loc[data['Translation_quality'] == 1.0, target_columns_names]

# Pairwise Pearson correlation between the selected columns, then the summaries.
corr = data_for_plot.corr(method='pearson')
print(effective_rank(corr))
mean_of_vars(corr)
var_of_means(corr)

4.619697917683003
Block variances (intra + inter): [np.float64(0.0045041463710439825), np.float64(0.00806552261588918), np.float64(0.005318171451362563), np.float64(0.008893564884661358), np.float64(0.003023725833221516), np.float64(0.0016758077773073026), np.float64(0.0012193391323167904), np.float64(0.0019485583374853543), np.float64(0.0032747619318621787), np.float64(0.0013023887169436468)]
Mean of these variances: 0.003922598705209387
Block means (inter): [np.float64(0.10505410340206275), np.float64(-0.30772296515346753), np.float64(0.07528716383362317), np.float64(0.34921642499624506), np.float64(0.7133012890292519), np.float64(0.3857128957010883)]
Var of these means: 0.1000249303310361


In [62]:
# Load the table and pick the columns whose label starts with 'OPEN'.
data = pd.read_parquet(
    'data/Openai/OPENAI_5_tables_final.parquet',
    engine='pyarrow',
)
target_columns_names = [label for label in data.columns if label.startswith('OPEN')]

# Keep only rows with Translation_quality == 1.0, restricted to those columns.
data_for_plot = data.loc[data['Translation_quality'] == 1.0, target_columns_names]

# Pairwise Pearson correlation between the selected columns, then the summaries.
corr = data_for_plot.corr(method='pearson')
print(effective_rank(corr))
mean_of_vars(corr)
var_of_means(corr)

3.781527662670325
Block variances (intra + inter): [np.float64(0.006063278585147936), np.float64(0.0030366903867556755), np.float64(0.005489197173756896), np.float64(0.004360378343601488), np.float64(0.003386866715171129), np.float64(0.0049492023221440785), np.float64(0.005916119354134308), np.float64(0.006638734834912928), np.float64(0.0019521648968761926), np.float64(0.006658157117674996)]
Mean of these variances: 0.004845078973017562
Block means (inter): [np.float64(0.15283147778602524), np.float64(-0.08576847189354718), np.float64(0.23088946625127368), np.float64(0.5910835326051921), np.float64(0.7853471106287695), np.float64(0.5703216576095858)]
Var of these means: 0.08927616730814796


In [63]:
# Load the table and pick the columns whose label starts with 'Deep'.
data = pd.read_parquet(
    'data/Deepseek_v3/data.parquet',
    engine='pyarrow',
)
target_columns_names = [label for label in data.columns if label.startswith('Deep')]

# Keep only rows with Translation_quality == 1.0, restricted to those columns.
data_for_plot = data.loc[data['Translation_quality'] == 1.0, target_columns_names]

# Pairwise Pearson correlation between the selected columns, then the summaries.
corr = data_for_plot.corr(method='pearson')
print(effective_rank(corr))
mean_of_vars(corr)
var_of_means(corr)

4.198599595232317
Block variances (intra + inter): [np.float64(0.04081456987941498), np.float64(0.03737063595015323), np.float64(0.02190637595583179), np.float64(0.016338412950502038), np.float64(0.006155329728126864), np.float64(0.003283811954375435), np.float64(0.008305507698811838), np.float64(0.007321648007769497), np.float64(0.007200318999845124), np.float64(0.004407704399266924)]
Mean of these variances: 0.015310431552409775
Block means (inter): [np.float64(0.5815787388104501), np.float64(0.5526423792300493), np.float64(0.6095555855114195), np.float64(0.5730768541388473), np.float64(0.6740025060469998), np.float64(0.6207362532878036)]
Var of these means: 0.001547036373163279


In [64]:
# Load the table and pick the columns whose label starts with 'Open'.
data = pd.read_parquet(
    'data/Openai_41_mini/data.parquet',
    engine='pyarrow',
)
target_columns_names = [label for label in data.columns if label.startswith('Open')]

# Keep only rows with Translation_quality == 1.0, restricted to those columns.
data_for_plot = data.loc[data['Translation_quality'] == 1.0, target_columns_names]

# Pairwise Pearson correlation between the selected columns, then the summaries.
corr = data_for_plot.corr(method='pearson')
print(effective_rank(corr))
mean_of_vars(corr)
var_of_means(corr)

4.8634489521418764
Block variances (intra + inter): [np.float64(0.02455342298083119), np.float64(0.021249122478977138), np.float64(0.031386277145012534), np.float64(0.02113272211454774), np.float64(0.004928305809625417), np.float64(0.00386380148550139), np.float64(0.004814367413716823), np.float64(0.003308822994275057), np.float64(0.006029141071686891), np.float64(0.002095448508935429)]
Mean of these variances: 0.012336143200310961
Block means (inter): [np.float64(0.5002278949419937), np.float64(0.4117483420253167), np.float64(0.5406254563444305), np.float64(0.4349961751136802), np.float64(0.6865063244933786), np.float64(0.4774527555472486)]
Var of these means: 0.00808571658505823


In [65]:
# Load the CSV table and pick the columns whose label starts with 'Qwen'.
data = pd.read_csv(
    'data/Qwen_reasoning/data.csv',
)
target_columns_names = [label for label in data.columns if label.startswith('Qwen')]

# Keep only rows with Translation_quality == 1.0, restricted to those columns.
data_for_plot = data.loc[data['Translation_quality'] == 1.0, target_columns_names]

# Pairwise Pearson correlation between the selected columns, then the summaries.
corr = data_for_plot.corr(method='pearson')
print(effective_rank(corr))
mean_of_vars(corr)
var_of_means(corr)

4.8195779867942
Block variances (intra + inter): [np.float64(0.0826146115747336), np.float64(0.02901104916004922), np.float64(0.01138505771194881), np.float64(0.012271099862906434), np.float64(0.018994912889286314), np.float64(0.028727385622991222), np.float64(0.013045356381676303), np.float64(0.011432400007482288), np.float64(0.008891611185309893), np.float64(0.0056374040174040725)]
Mean of these variances: 0.022201088841378815
Block means (inter): [np.float64(0.28508938667805733), np.float64(0.24944340421522307), np.float64(0.30169680108457403), np.float64(0.5618918991630047), np.float64(0.739843694156364), np.float64(0.540446863443694)]
Var of these means: 0.0323407372135999


In [66]:
# Load the table and pick the columns whose label starts with 'Qwen'.
data = pd.read_parquet(
    'data/Qwen/temp_0/data_qwen_complete.parquet',
    engine='pyarrow',
)
target_columns_names = [label for label in data.columns if label.startswith('Qwen')]

# Keep only rows with Translation_quality == 1.0, restricted to those columns.
data_for_plot = data.loc[data['Translation_quality'] == 1.0, target_columns_names]

# Pairwise Pearson correlation between the selected columns, then the summaries.
corr = data_for_plot.corr(method='pearson')
print(effective_rank(corr))
mean_of_vars(corr)
var_of_means(corr)

5.303171144736138
Block variances (intra + inter): [np.float64(0.11903392080914138), np.float64(0.051610968341156166), np.float64(0.04197544583597851), np.float64(0.040531604843835486), np.float64(0.01248025646955225), np.float64(0.015019604191630117), np.float64(0.019739922765588475), np.float64(0.010653251423713291), np.float64(0.013723323952516365), np.float64(0.009574095899739656)]
Mean of these variances: 0.03343423945328517
Block means (inter): [np.float64(0.4391535347410742), np.float64(0.42332709209888836), np.float64(0.4611581395504195), np.float64(0.5608715702618586), np.float64(0.6118316403883561), np.float64(0.5385585846588571)]
Var of these means: 0.004764189561922619


In [67]:
# Load the table and pick the columns whose label starts with 'Gemma'.
data = pd.read_parquet(
    'data/Gemma/temp_0/data_gemma_complete.parquet',
    engine='pyarrow',
)
target_columns_names = [label for label in data.columns if label.startswith('Gemma')]

# Keep only rows with Translation_quality == 1.0, restricted to those columns.
data_for_plot = data.loc[data['Translation_quality'] == 1.0, target_columns_names]

# Pairwise Pearson correlation between the selected columns, then the summaries.
corr = data_for_plot.corr(method='pearson')
print(effective_rank(corr))
mean_of_vars(corr)
var_of_means(corr)

6.062225611363859
Block variances (intra + inter): [np.float64(0.08016112028056926), np.float64(0.054958754377956526), np.float64(0.0565388836963944), np.float64(0.03692244642390694), np.float64(0.009015975441511377), np.float64(0.014634522714323377), np.float64(0.00806105663539033), np.float64(0.008105610741685434), np.float64(0.010424470067469228), np.float64(0.004127050260453727)]
Mean of these variances: 0.028294989063966058
Block means (inter): [np.float64(0.4336313672022815), np.float64(0.39458786272360935), np.float64(0.42314240510896955), np.float64(0.4606039133473597), np.float64(0.5744699152819358), np.float64(0.4169036006283602)]
Var of these means: 0.0034597472859023704


In [68]:
# Load the table and pick the columns whose label starts with 'Mistral'.
data = pd.read_parquet(
    'data/Mistral/temp_0/data_mistral_complete.parquet',
    engine='pyarrow',
)
target_columns_names = [label for label in data.columns if label.startswith('Mistral')]

# Keep only rows with Translation_quality == 1.0, restricted to those columns.
data_for_plot = data.loc[data['Translation_quality'] == 1.0, target_columns_names]

# Pairwise Pearson correlation between the selected columns, then the summaries.
corr = data_for_plot.corr(method='pearson')
print(effective_rank(corr))
mean_of_vars(corr)
var_of_means(corr)

8.188778138064986
Block variances (intra + inter): [np.float64(0.1325365934427155), np.float64(0.1317039448248046), np.float64(0.08944321231422951), np.float64(0.15643114344636938), np.float64(0.021463086265674905), np.float64(0.01766932887754554), np.float64(0.028729057283429543), np.float64(0.011913853923964804), np.float64(0.050718668071244886), np.float64(0.015774049332328042)]
Mean of these variances: 0.06563829377823069
Block means (inter): [np.float64(0.30771196770695686), np.float64(0.30520263132082687), np.float64(0.2287451667217596), np.float64(0.29241438468015546), np.float64(0.3120697923849921), np.float64(0.2251915644042865)]
Var of these means: 0.0013673707696543246
