In [3]:
langs_str \
    = 'af,ar,az,bn,cs,de,en,es,et,fa,fi,fr,gl,gu,he,hi,hr,id,it,ja,ka,kk,km,ko,lt,lv,mk,ml,mn,mr,my,ne,nl,pl,ps,pt,ro,ru,si,sl,sv,sw,ta,te,th,tl,tr,uk,ur,vi,xh,zh'

In [8]:
langs = langs_str.split(',')

In [30]:
import lang2vec.lang2vec as l2v
import numpy as np
from tqdm import tqdm

In [18]:
letter_codes = list(map(lambda lang: l2v.LETTER_CODES[lang], langs))

In [19]:
for letter_code in letter_codes:
    print(letter_code)

afr
ara
aze
ben
ces
deu
eng
spa
est
fas
fin
fra
glg
guj
heb
hin
hrv
ind
ita
jpn
kat
kaz
khm
kor
lit
lav
mkd
mal
mon
mar
mya
nep
nld
pol
pus
por
ron
rus
sin
slv
swe
swa
tam
tel
tha
tgl
tur
ukr
urd
vie
xho
zho


### Find Language Families

In [33]:
for letter_code in letter_codes:
    features = l2v.get_features(letter_code, "fam", header=True)
    families_and_subfamilies = np.array(features['CODE'])[np.array(features[letter_code]) == 1]
    family = families_and_subfamilies[0]
    subfamily = ''
    others = []
    if len(families_and_subfamilies) > 1:
        subfamily = families_and_subfamilies[1]
    if len(families_and_subfamilies) > 2:
        others = families_and_subfamilies[2:]
    joined = ' / '.join(others)
    print(f'{family}\t{subfamily}\t{joined}')

F_Indo-European	F_Germanic	F_Northwest_Germanic / F_West_Germanic / F_Franconian / F_Low_Franconian / F_Afrikaansic
F_Afro-Asiatic	F_Semitic	F_West_Semitic / F_Central_Semitic / F_Arabian
F_Turkic	F_Common_Turkic	F_Oghuz-Kipchak-Uyghur
F_Indo-European	F_Indo-Iranian	F_Indo-Aryan / F_Indo-Aryan_Eastern_zone / F_Oriya-Gauda-Kamrupa / F_Gauda-Kamrupa / F_Gauda-Banga
F_Indo-European	F_Balto-Slavic	F_Slavic / F_West_Slavic / F_Czech-Slovak / F_Czech-Lach
F_Indo-European	F_Germanic	F_Northwest_Germanic / F_West_Germanic / F_Franconian / F_High_Franconian
F_Indo-European	F_Germanic	F_Northwest_Germanic / F_West_Germanic / F_North_Sea_Germanic / F_Anglo-Frisian / F_Anglian / F_Mercian / F_Macro-English
F_Indo-European	F_Italic	F_Latino-Faliscan / F_Latinic / F_Imperial_Latin / F_Romance / F_Italo-Western_Romance / F_Western_Romance / F_Shifted_Western_Romance / F_Southwestern_Shifted_Romance / F_West_Ibero-Romance / F_Castilic
F_Uralic	F_Finnic	
F_Indo-European	F_Indo-Iranian	F_Iranian / F_Wes

### Find Syntax WALS

In [46]:
for letter_code in letter_codes:
    features = l2v.get_features(letter_code, "syntax_wals", header=True)
    for k, v in features.items():
        features[k] = v[:5]
    feature_map = np.array(list(map(lambda x: x if x != '--' else 0, features[letter_code]))) == 1
    syntax = np.array(features['CODE'])[feature_map == 1]
    print(syntax)

[]
['S_SVO' 'S_VSO']
['S_SOV']
['S_SOV']
['S_SVO']
['S_SVO' 'S_SOV']
['S_SVO']
['S_SVO']
['S_SVO']
['S_SOV']
['S_SVO']
['S_SVO']
[]
['S_SOV']
['S_SVO']
['S_SOV']
[]
['S_SVO']
['S_SVO']
['S_SOV']
['S_SOV']
[]
['S_SVO']
['S_SOV']
['S_SVO']
['S_SVO']
['S_SVO']
['S_SOV']
['S_SOV']
['S_SOV']
['S_SOV']
['S_SOV']
['S_SVO' 'S_SOV']
['S_SVO']
['S_SOV']
['S_SVO']
['S_SVO']
['S_SVO']
['S_SOV']
['S_SVO']
['S_SVO']
['S_SVO']
['S_SOV']
['S_SOV']
['S_SVO']
['S_VSO']
['S_SOV']
['S_SVO']
['S_SOV']
['S_SVO']
['S_SVO']
['S_SVO']


### Find Syntax SSWL

In [45]:
for letter_code in letter_codes:
    features = l2v.get_features(letter_code, "syntax_sswl", header=True)
    for k, v in features.items():
        features[k] = v[:5]
    feature_map = np.array(list(map(lambda x: x if x != '--' else 0, features[letter_code]))) == 1
    syntax = np.array(features['CODE'])[feature_map == 1]
    print(syntax)

['S_SVO' 'S_SOV']
['S_SVO' 'S_VSO' 'S_VOS' 'S_OVS']
[]
['S_SOV']
['S_SVO' 'S_OVS']
['S_SVO' 'S_SOV']
['S_SVO']
['S_SVO']
[]
[]
['S_SVO']
['S_SVO']
['S_SVO' 'S_VSO' 'S_VOS' 'S_OVS']
[]
['S_SVO']
['S_SOV']
[]
['S_SVO']
['S_SVO' 'S_SOV' 'S_VOS']
['S_SOV']
['S_SVO' 'S_SOV' 'S_VSO' 'S_VOS' 'S_OVS']
[]
[]
['S_SOV']
['S_SVO' 'S_SOV' 'S_VSO' 'S_OVS']
[]
[]
['S_SOV']
[]
[]
['S_SOV']
['S_SOV']
['S_SVO' 'S_SOV']
['S_SVO' 'S_SOV' 'S_OVS']
['S_SOV']
['S_SVO']
['S_SVO' 'S_VSO' 'S_VOS']
['S_SVO']
[]
['S_SVO']
['S_SVO']
['S_SVO']
[]
[]
['S_SVO']
['S_VSO' 'S_VOS']
['S_SVO' 'S_SOV' 'S_VSO' 'S_VOS' 'S_OVS']
['S_SVO']
[]
['S_SVO']
[]
['S_SVO' 'S_SOV']


### SVO with WALS fallback to SSWL if empty

In [56]:
svo_orders = []
for letter_code in letter_codes:
    features = l2v.get_features(letter_code, "syntax_wals", header=True)
    for k, v in features.items():
        features[k] = v[:5]
    features_sswl = l2v.get_features(letter_code, "syntax_sswl", header=True)
    for k, v in features_sswl.items():
        features_sswl[k] = v[:5]
    feature_map = np.array(list(map(lambda x: x if x != '--' else 0, features[letter_code]))) == 1
    feature_map_sswl = np.array(list(map(lambda x: x if x != '--' else 0, features_sswl[letter_code]))) == 1
    syntax = np.array(features['CODE'])[feature_map == 1]
    if len(syntax) == 0:
        syntax = np.array(features_sswl['CODE'])[feature_map_sswl == 1]
    svo_orders.append('/'.join(syntax))


In [58]:
for svo_order in svo_orders:
    print(svo_order)

S_SVO/S_SOV
S_SVO/S_VSO
S_SOV
S_SOV
S_SVO
S_SVO/S_SOV
S_SVO
S_SVO
S_SVO
S_SOV
S_SVO
S_SVO
S_SVO/S_VSO/S_VOS/S_OVS
S_SOV
S_SVO
S_SOV

S_SVO
S_SVO
S_SOV
S_SOV

S_SVO
S_SOV
S_SVO
S_SVO
S_SVO
S_SOV
S_SOV
S_SOV
S_SOV
S_SOV
S_SVO/S_SOV
S_SVO
S_SOV
S_SVO
S_SVO
S_SVO
S_SOV
S_SVO
S_SVO
S_SVO
S_SOV
S_SOV
S_SVO
S_VSO
S_SOV
S_SVO
S_SOV
S_SVO
S_SVO
S_SVO


In [62]:
svo_orders = []
for letter_code in letter_codes:
    features = l2v.get_features(letter_code, "syntax_knn", header=True)
    for k, v in features.items():
        features[k] = v[:5]
    feature_map = np.array(list(map(lambda x: x if x != '--' else 0, features[letter_code]))) == 1
    syntax = np.array(features['CODE'])[feature_map == 1]
    svo_orders.append('/'.join(syntax))
for svo_order in svo_orders:
    print(svo_order)

S_SVO/S_SOV
S_SVO/S_VSO
S_SOV
S_SOV
S_SVO
S_SVO/S_SOV
S_SVO
S_SVO
S_SVO
S_SOV
S_SVO
S_SVO
S_SVO/S_VSO/S_VOS/S_OVS
S_SOV
S_SVO
S_SOV
S_SVO
S_SVO
S_SVO
S_SOV
S_SVO/S_SOV
S_SOV
S_SVO
S_SOV
S_SVO
S_SVO
S_SVO
S_SOV
S_SOV
S_SOV
S_SOV
S_SOV
S_SVO/S_SOV
S_SVO
S_SOV
S_SVO
S_SVO
S_SVO
S_SOV
S_SVO
S_SVO
S_SVO
S_SOV
S_SOV
S_SVO
S_VSO
S_SOV
S_SVO
S_SOV
S_SVO
S_SVO
S_SVO


### GEO - feature Clustering

In [70]:
geo_features = {}
for letter_code in tqdm(letter_codes):
    features = l2v.get_features(letter_code, "geo", header=False)
    geo_features[letter_code] = features[letter_code]

100%|██████████| 52/52 [00:03<00:00, 16.17it/s]


### Overall Typology Clustering 


In [75]:
synt_phono_inventory_features = dict()
feature_set = "syntax_knn+phonology_knn+inventory_knn"
for letter_code in letter_codes:
    feat = l2v.get_features(letter_code, feature_set)[letter_code]
    synt_phono_inventory_features[letter_code] = np.asarray(feat)

In [77]:
synt_phono_inventory_features.keys()

dict_keys(['afr', 'ara', 'aze', 'ben', 'ces', 'deu', 'eng', 'spa', 'est', 'fas', 'fin', 'fra', 'glg', 'guj', 'heb', 'hin', 'hrv', 'ind', 'ita', 'jpn', 'kat', 'kaz', 'khm', 'kor', 'lit', 'lav', 'mkd', 'mal', 'mon', 'mar', 'mya', 'nep', 'nld', 'pol', 'pus', 'por', 'ron', 'rus', 'sin', 'slv', 'swe', 'swa', 'tam', 'tel', 'tha', 'tgl', 'tur', 'ukr', 'urd', 'vie', 'xho', 'zho'])

### Learned Features

In [80]:
synt_phono_inventory_features = dict()
feature_set = "learned"
for letter_code in letter_codes:
    feat = l2v.get_features(letter_code, feature_set)[letter_code]
    synt_phono_inventory_features[letter_code] = np.asarray(feat)

ValueError: Object arrays cannot be loaded when allow_pickle=False

# Language x (Task + Model + Subset) Matrix

In [49]:
import pandas as pd
from itertools import chain

# Define language presence in each task, model, and subset
task_languages = {
    'xnli': ['ar', 'hi', 'es', 'ru', 'sw', 'tr', 'bg', 'de', 'el', 'en', 'fr', 'th', 'ur', 'vi', 'zh'],
    'xcopa': ['tr', 'sw', 'et', 'id', 'it', 'ta', 'th', 'vi', 'ht', 'qu', 'zh'],
    'xstorycloze': ['ar', 'es', 'hi', 'ru', 'sw', 'en', 'eu', 'id', 'my', 'te', 'zh'],
    'xwinograd': ['ja', 'ru', 'en', 'fr', 'pt', 'zh'],
    'pawsx': ['es', 'ja', 'ko', 'de', 'en', 'fr', 'zh']
}

model_languages = {
    'bloom': ['ak', 'ar', 'as', 'bm', 'eu', 'bn', 'ca', 'ny', 'sn', 'tum', 'en', 'fon', 'fr', 'gu', 'hi', 'ig',
              'id', 'xh', 'zu', 'kn', 'ki', 'rw', 'rn', 'ln', 'lg', 'ml', 'mr', 'ne', 'nso', 'or', 'pt', 'pa', 'st',
              'tn', 'zhs', 'es', 'sw', 'ta', 'te', 'zht', 'tw', 'ur', 'vi', 'wo', 'ts', 'yo'],
    'mgpt': ['ar', 'he', 'vi', 'id', 'jv', 'ms', 'tl', 'lv', 'lt', 'eu', 'ml', 'ta', 'te', 'hy', 'bn', 'mr', 'hi',
             'ur', 'af', 'da', 'en', 'de', 'sv', 'fr', 'it', 'pt', 'ro', 'es', 'el', 'os', 'tg', 'fa', 'ja', 'ka',
             'ko', 'th', 'bxr', 'xal', 'mn', 'sw', 'yo', 'be', 'bg', 'ru', 'uk', 'pl', 'my', 'uz', 'ba', 'kk', 'ky',
             'tt', 'az', 'cv', 'tr', 'tk', 'tyv', 'sax', 'et', 'fi', 'hu'],
    'mt5': ['af', 'sq', 'am', 'ar', 'hy', 'az', 'eu', 'be', 'bn', 'bg', 'my', 'ca', 'ceb', 'ny', 'zh', 'co', 'cs',
            'da', 'nl', 'en', 'eo', 'et', 'fil', 'fi', 'fr', 'gl', 'ka', 'de', 'el', 'gu', 'ht', 'ha', 'haw', 'he',
            'hi', 'hmn', 'hu', 'is', 'ig', 'id', 'ga', 'it', 'ja', 'jv', 'kn', 'kk', 'km', 'ko', 'ku', 'ky', 'lo',
            'la', 'lv', 'lt', 'lb', 'mk', 'mg', 'ms', 'ml', 'mt', 'mi', 'mr', 'mn', 'ne', 'no', 'ps', 'fa', 'pl',
            'pt', 'pa', 'ro', 'ru', 'sm', 'gd', 'sr', 'sn', 'sd', 'si', 'sk', 'sl', 'so', 'st', 'es', 'su', 'sw',
            'sv', 'tg', 'ta', 'te', 'th', 'tr', 'uk', 'ur', 'uz', 'vi', 'cy', 'fy', 'xh', 'yi', 'yo', 'zu']
}

# Define subset languages
subsets = {
    'RND-1': ['cs', 'gu', 'hi', 'id', 'ko', 'lv', 'mk', 'ml', 'ps', 'pt', 'si', 'ta', 'vi', 'zh'],
    'RND-2': ['cs', 'en', 'et', 'fr', 'hi', 'hr', 'it', 'ja', 'ml', 'mn', 'mr', 'ne', 'pl', 'pt'],
    'RND-3': ['ar', 'az', 'de', 'en', 'es', 'et', 'he', 'hi', 'id', 'ml', 'ps', 'ru', 'sv', 'uk'],
    'FAM-1': ['az', 'en', 'fi', 'he', 'ja', 'ka', 'vi', 'ko', 'ml', 'mn', 'my', 'th', 'tl', 'xh'],
    'FAM-2': ['ar', 'en', 'fi', 'id', 'ja', 'ka', 'ko', 'mn', 'sw', 'ta', 'th', 'tr', 'vi', 'zh'],
    'FAM-3': ['af', 'az', 'et', 'he', 'ja', 'ka', 'ko', 'ml', 'mn', 'my', 'th', 'tl', 'vi', 'xh'],
    'GEO-1': ['af', 'bn', 'et', 'fr', 'gu', 'he', 'hr', 'id', 'ja', 'kk', 'mn', 'sw', 'ta', 'vi'],
    'GEO-2': ['af', 'bn', 'et', 'fr', 'he', 'hr', 'id', 'ja', 'ka', 'kk', 'mn', 'ta', 'ur', 'vi'],
    'GEO-3': ['af', 'bn', 'et', 'fr', 'he', 'hr', 'id', 'ja', 'kk', 'mn', 'sw', 'ta', 'ur', 'vi'],
    'LEARN-1': ['cs', 'hi', 'id', 'km', 'ko', 'lt', 'lv', 'my', 'nl', 'pt', 'sl', 'ta', 'tl', 'vi'],
    'LEARN-2': ['cs', 'fi', 'hr', 'km', 'ko', 'lt', 'lv', 'my', 'nl', 'pt', 'sl', 'ta', 'uk', 'vi'],
    'LEARN-3': ['cs', 'hi', 'id', 'ja', 'lt', 'lv', 'nl', 'pt', 'sl', 'sv', 'ta', 'tl', 'tr', 'vi'],
    'SEM-1': ['gl', 'gu', 'ka', 'kk', 'ko', 'ml', 'pl', 'ps', 'si', 'sl', 'sv', 'tl', 'uk', 'vi'],
    'SEM-2': ['en', 'fr', 'gl', 'gu', 'ka', 'kk', 'ko', 'pl', 'ps', 'sl', 'sv', 'tl', 'uk', 'vi'],
    'SEM-3': ['en', 'gl', 'gu', 'ka', 'kk', 'ko', 'pl', 'ps', 'sl', 'sv', 'tl', 'uk', 'vi'],
    'TYPO-1': ['az', 'bn', 'de', 'et', 'fa', 'hi', 'it', 'ja', 'mk', 'sw', 'ta', 'tl', 'vi', 'zh'],
    'TYPO-2': ['ar', 'az', 'bn', 'de', 'et', 'fa', 'it', 'ja', 'mk', 'sw', 'ta', 'th', 'ur', 'zh'],
    'TYPO-3': ['ar', 'az', 'bn', 'en', 'fi', 'he', 'hi', 'it', 'ja', 'mk', 'nl', 'sw', 'ta', 'th'],
    'ALL': ['af', 'ar', 'az', 'bn', 'cs', 'de', 'en', 'es', 'et', 'fa', 'fi', 'fr', 'gl', 'gu', 'he', 'hi', 'hr', 'id',
            'it', 'ja', 'ka', 'kk', 'km', 'ko', 'lt', 'lv', 'mk', 'ml', 'mn', 'mr', 'my', 'ne', 'nl', 'pl', 'ps', 'pt',
            'ro', 'ru', 'si', 'sl', 'sv', 'sw', 'ta', 'te', 'th', 'tl', 'tr', 'uk', 'ur', 'vi', 'xh', 'zh']
}

# Define all unique languages across tasks, models, and subsets
all_languages = [*list(task_languages.values()), *list(model_languages.values()), *list(subsets.values())]
all_languages = set(chain.from_iterable(all_languages))

In [50]:
# Create a DataFrame with languages as rows and tasks, models, and subsets as columns
columns = ['xnli', 'xcopa', 'xstorycloze', 'xwinograd', 'pawsx', 'bloom', 'mgpt', 'mt5'] + list(subsets.keys())
df = pd.DataFrame(0, index=sorted(all_languages), columns=columns)

In [51]:
# Mark presence of each language in tasks
for task, langs in task_languages.items():
    df.loc[langs, task] = 1

# Mark presence of each language in models
for model, langs in model_languages.items():
    df.loc[langs, model] = 1

# Mark presence of each language in subsets
for subset, langs in subsets.items():
    df.loc[langs, subset] = 1

In [52]:
df.to_csv('language_matrix.csv')

#### TASK SPECIFIC

In [67]:
from collections import defaultdict

# Group subsets together
subset_groups = {
    'RND': ['RND-1', 'RND-2', 'RND-3'],
    'FAM': ['FAM-1', 'FAM-2', 'FAM-3'],
    'GEO': ['GEO-1', 'GEO-2', 'GEO-3'],
    'LEARN': ['LEARN-1', 'LEARN-2', 'LEARN-3'],
    'SEM': ['SEM-1', 'SEM-2', 'SEM-3'],
    'TYPO': ['TYPO-1', 'TYPO-2', 'TYPO-3'],
    'ALL': ['ALL']
}

In [69]:
def task_x_subsets(task_name: str, ):
    task_languages_set = set(task_languages[task_name])

    # Initialize dictionary to store intersection and difference counts
    group_results = defaultdict(lambda: {'intersection': set(), 'difference': set()})

    # Calculate intersections and differences for each group
    for group_name, subset_names in subset_groups.items():
        group_intersection = set()
        group_difference = set()

        # For each subset in the group, calculate intersection and difference
        for subset_name in subset_names:
            subset_languages_set = set(subsets[subset_name])
            group_intersection.update(task_languages_set.intersection(subset_languages_set))
            group_difference.update(subset_languages_set.difference(task_languages_set))

        # Store the results in the dictionary
        group_results[group_name]['intersection'] = group_intersection
        group_results[group_name]['difference'] = group_difference

    # Sort groups based on the number of intersections
    sorted_groups = sorted(group_results.items(), key=lambda x: len(x[1]['intersection']), reverse=True)

    # Display sorted results
    for group_name, result in sorted_groups:
        print(
            f"{group_name} -> {len(result['intersection'])} languages in common with {task_name.upper()}: {result['intersection']}")
        print(
            f"{group_name} -> {len(result['difference'])} languages not in {task_name.upper()}: {result['difference']}")

In [72]:
def task_x_models(task_name: str, ):
    """
    intersections of TASK (XNLI, XCOPA etc) and model languages
    :param task_name: 
    :return: 
    """
    task_languages_set = set(task_languages[task_name])

    # Intersections with each model's languages
    model_results = {}

    for model_name, model_languages_set in model_languages.items():
        model_languages_set = set(model_languages_set)
        intersection = task_languages_set.intersection(model_languages_set)
        differences = model_languages_set.difference(task_languages_set)

        model_results[model_name] = {
            'intersection': intersection,
            'difference': differences
        }

    # Sort models based on the number of intersections
    sorted_models = sorted(model_results.items(), key=lambda x: len(x[1]['intersection']), reverse=True)

    # Display sorted results
    for model_name, result in sorted_models:
        print(
            f"{model_name} -> {len(result['intersection'])} languages in common with  {task_name.upper()}: {result['intersection']}")
        print(
            f"{model_name} -> {len(result['difference'])} languages not in  {task_name.upper()}: {result['difference']}")


In [74]:
##### XNLI

task_x_subsets(task_name='xnli')
print('\n---------------------\n')
task_x_models(task_name='xnli')

ALL -> 13 languages in common with XNLI: {'ar', 'en', 'es', 'hi', 'ur', 'sw', 'tr', 'de', 'fr', 'ru', 'vi', 'zh', 'th'}
ALL -> 39 languages not in XNLI: {'kk', 'et', 'he', 'pt', 'lt', 'mn', 'ml', 'nl', 'fi', 'gl', 'km', 'mr', 'ka', 'ko', 'az', 'af', 'ta', 'te', 'lv', 'sl', 'si', 'pl', 'mk', 'hr', 'cs', 'fa', 'bn', 'xh', 'ps', 'gu', 'ne', 'it', 'ja', 'my', 'ro', 'tl', 'id', 'uk', 'sv'}
RND -> 9 languages in common with XNLI: {'ar', 'en', 'es', 'hi', 'fr', 'de', 'vi', 'ru', 'zh'}
RND -> 23 languages not in XNLI: {'ta', 'et', 'he', 'pt', 'mn', 'lv', 'si', 'ml', 'pl', 'mk', 'uk', 'hr', 'ps', 'gu', 'ne', 'it', 'ja', 'mr', 'id', 'ko', 'az', 'cs', 'sv'}
TYPO -> 9 languages in common with XNLI: {'ar', 'en', 'hi', 'ur', 'sw', 'de', 'vi', 'zh', 'th'}
TYPO -> 12 languages not in XNLI: {'ta', 'et', 'he', 'fi', 'it', 'ja', 'tl', 'az', 'mk', 'nl', 'fa', 'bn'}
FAM -> 7 languages in common with XNLI: {'ar', 'en', 'sw', 'tr', 'vi', 'zh', 'th'}
FAM -> 15 languages not in XNLI: {'af', 'ta', 'et', 'he', '

In [76]:
##### XCOPA

print('Intersections and Differences between XCOPA Languages and Subsets')
task_x_subsets(task_name='xcopa')
print('\n---------------------\n')
print('Intersections and Differences between XCOPA Languages and Models')
task_x_models(task_name='xcopa')

Intersections and Differences between XCOPA Languages and Subsets
ALL -> 9 languages in common with XCOPA: {'ta', 'et', 'it', 'tr', 'sw', 'id', 'vi', 'zh', 'th'}
ALL -> 43 languages not in XCOPA: {'kk', 'he', 'lt', 'pt', 'mn', 'hi', 'ml', 'ru', 'nl', 'fi', 'en', 'gl', 'km', 'mr', 'ka', 'ur', 'fr', 'de', 'ko', 'az', 'af', 'ar', 'te', 'lv', 'sl', 'si', 'pl', 'mk', 'hr', 'cs', 'fa', 'bn', 'xh', 'ps', 'gu', 'es', 'ne', 'my', 'ja', 'ro', 'tl', 'uk', 'sv'}
FAM -> 8 languages in common with XCOPA: {'ta', 'et', 'tr', 'sw', 'id', 'vi', 'zh', 'th'}
FAM -> 14 languages not in XCOPA: {'af', 'he', 'ar', 'mn', 'ml', 'xh', 'fi', 'en', 'my', 'ja', 'ka', 'tl', 'ko', 'az'}
TYPO -> 7 languages in common with XCOPA: {'ta', 'et', 'it', 'sw', 'vi', 'zh', 'th'}
TYPO -> 14 languages not in XCOPA: {'he', 'ar', 'hi', 'mk', 'nl', 'fa', 'bn', 'fi', 'en', 'ja', 'tl', 'ur', 'de', 'az'}
RND -> 6 languages in common with XCOPA: {'ta', 'et', 'id', 'vi', 'it', 'zh'}
RND -> 26 languages not in XCOPA: {'he', 'ar', 'pt', 

In [61]:
# SUBSETS x MODELS

# Function to group subsets and calculate intersections with models
def collect_grouped_subset_model_intersections(model_languages, subset_groups):
    all_results = []

    # Iterate over all models and subset groups
    for model_name, model_languages_set in model_languages.items():
        model_languages_set = set(model_languages_set)

        for subset_group_name, subset_group in subset_groups.items():
            # Combine all subset languages in the group
            combined_subset_languages = set(chain.from_iterable(subset_group))

            # Calculate intersections and differences
            intersection = combined_subset_languages.intersection(model_languages_set)
            differences = combined_subset_languages.difference(model_languages_set)

            # Store results in a list
            all_results.append({
                'model': model_name,
                'subset_group': subset_group_name,
                'intersection': intersection,
                'difference': differences,
                'num_intersection': len(intersection)
            })

    # Sort the results by the number of intersections in descending order
    sorted_all_results = sorted(all_results, key=lambda x: x['num_intersection'], reverse=True)

    # Return sorted results
    return sorted_all_results


# Subset groups (e.g., group RND-1, RND-2, RND-3 into one RND group)
subset_groups = {
    'RND': [subsets['RND-1'], subsets['RND-2'], subsets['RND-3']],
    'FAM': [subsets['FAM-1'], subsets['FAM-2'], subsets['FAM-3']],
    'GEO': [subsets['GEO-1'], subsets['GEO-2'], subsets['GEO-3']],
    'LEARN': [subsets['LEARN-1'], subsets['LEARN-2'], subsets['LEARN-3']],
    'SEM': [subsets['SEM-1'], subsets['SEM-2'], subsets['SEM-3']],
    'TYPO': [subsets['TYPO-1'], subsets['TYPO-2'], subsets['TYPO-3']],
    'ALL': [subsets['ALL']]
}


# Display the sorted results
def display_sorted_grouped_results(sorted_results):
    for result in sorted_results:
        print(
            f"{result['subset_group']} with {result['model']} -> {result['num_intersection']} languages in common: {result['intersection']}")
        print(
            f"{result['subset_group']} with {result['model']} -> {len(result['difference'])} languages not in model: {result['difference']}")
        print()  # For better readability


# Example of how to use the function
sorted_grouped_results = collect_grouped_subset_model_intersections(model_languages, subset_groups)

# Display the results
display_sorted_grouped_results(sorted_grouped_results)

ALL with mt5 -> 50 languages in common: {'kk', 'et', 'he', 'pt', 'lt', 'mn', 'hi', 'ml', 'tr', 'ru', 'nl', 'fi', 'en', 'gl', 'km', 'mr', 'ka', 'ur', 'fr', 'de', 'ko', 'az', 'af', 'ta', 'ar', 'te', 'lv', 'sl', 'si', 'pl', 'mk', 'zh', 'cs', 'fa', 'bn', 'th', 'xh', 'ps', 'gu', 'es', 'ne', 'it', 'ja', 'my', 'ro', 'sw', 'id', 'vi', 'uk', 'sv'}
ALL with mt5 -> 2 languages not in model: {'hr', 'tl'}

ALL with mgpt -> 39 languages in common: {'kk', 'et', 'he', 'pt', 'lt', 'mn', 'hi', 'ml', 'tr', 'ru', 'fi', 'en', 'mr', 'ka', 'ur', 'fr', 'de', 'ko', 'az', 'af', 'ta', 'ar', 'te', 'lv', 'pl', 'fa', 'bn', 'th', 'es', 'it', 'ja', 'my', 'ro', 'tl', 'sw', 'id', 'vi', 'uk', 'sv'}
ALL with mgpt -> 13 languages not in model: {'xh', 'ps', 'gu', 'gl', 'km', 'ne', 'sl', 'si', 'cs', 'mk', 'zh', 'hr', 'nl'}

RND with mt5 -> 31 languages in common: {'et', 'he', 'pt', 'mn', 'hi', 'ml', 'ru', 'en', 'mr', 'fr', 'de', 'ko', 'az', 'ta', 'ar', 'lv', 'si', 'pl', 'mk', 'zh', 'cs', 'ps', 'gu', 'es', 'ne', 'it', 'ja', 

In [80]:
from collections import defaultdict


# Function to compute intersections of task, model, and subsets with sorting by full intersection
def compute_intersections_with_sorted_full(task_langs, model_langs, subsets):
    all_results = []

    # Iterate over all models and subset groups
    for model_name, model_langs_set in model_langs.items():
        model_langs_set = set(model_langs_set)

        for subset_group_name, subset_group in subsets.items():
            # Combine all subset languages in the group
            combined_subset_languages = set(chain.from_iterable(subset_group))

            # Calculate intersections
            task_model_intersection = task_langs.intersection(model_langs_set)
            subset_task_intersection = task_langs.intersection(combined_subset_languages)
            model_subset_intersection = model_langs_set.intersection(combined_subset_languages)
            full_intersection = task_langs.intersection(model_langs_set).intersection(combined_subset_languages)

            # Calculate the maximum possible intersections (minimum of two sets compared)
            max_task_model_intersection = min(len(task_langs), len(model_langs_set))
            max_subset_task_intersection = min(len(task_langs), len(combined_subset_languages))
            max_model_subset_intersection = min(len(model_langs_set), len(combined_subset_languages))
            max_full_intersection = min(len(task_langs), len(model_langs_set), len(combined_subset_languages))

            # Store results in a list
            all_results.append({
                'model': model_name,
                'subset_group': subset_group_name,
                'task_model_intersection': task_model_intersection,
                'subset_task_intersection': subset_task_intersection,
                'model_subset_intersection': model_subset_intersection,
                'full_intersection': full_intersection,
                'num_task_model_intersection': len(task_model_intersection),
                'num_subset_task_intersection': len(subset_task_intersection),
                'num_model_subset_intersection': len(model_subset_intersection),
                'num_full_intersection': len(full_intersection),
                'max_task_model_intersection': max_task_model_intersection,  # Corrected max value
                'max_subset_task_intersection': max_subset_task_intersection,  # Corrected max value
                'max_model_subset_intersection': max_model_subset_intersection,  # Corrected max value
                'max_full_intersection': max_full_intersection  # Corrected max value
            })

    # Sort the results by the number of full intersections in descending order
    sorted_results = sorted(all_results, key=lambda x: x['num_full_intersection'], reverse=True)

    return sorted_results


# Example for XCOPA
subset_groups = {
    'RND': [subsets['RND-1'], subsets['RND-2'], subsets['RND-3']],
    'FAM': [subsets['FAM-1'], subsets['FAM-2'], subsets['FAM-3']],
    'GEO': [subsets['GEO-1'], subsets['GEO-2'], subsets['GEO-3']],
    'LEARN': [subsets['LEARN-1'], subsets['LEARN-2'], subsets['LEARN-3']],
    'SEM': [subsets['SEM-1'], subsets['SEM-2'], subsets['SEM-3']],
    'TYPO': [subsets['TYPO-1'], subsets['TYPO-2'], subsets['TYPO-3']],
    'ALL': [subsets['ALL']]
}

# 'xnli', 'xcopa', 'xstorycloze', 'xwinograd', 'pawsx',
xnli_languages = set(task_languages['xnli'])
xcopa_languages = set(task_languages['xcopa'])
xstorycloze_languages = set(task_languages['xstorycloze'])
xwinograd_languages = set(task_languages['xwinograd'])
pawsx_languages = set(task_languages['pawsx'])
# Get the breakdown of intersections
xnli_breakdown_results_sorted_by_full = compute_intersections_with_sorted_full(xnli_languages, model_languages,
                                                                               subset_groups)
xcopa_breakdown_results_sorted_by_full = compute_intersections_with_sorted_full(xcopa_languages, model_languages,
                                                                                subset_groups)
xstorycloze_breakdown_results_sorted_by_full = compute_intersections_with_sorted_full(xstorycloze_languages,
                                                                                      model_languages,
                                                                                      subset_groups)
xwinograd_breakdown_results_sorted_by_full = compute_intersections_with_sorted_full(xwinograd_languages,
                                                                                    model_languages,
                                                                                    subset_groups)
pawsx_breakdown_results_sorted_by_full = compute_intersections_with_sorted_full(pawsx_languages, model_languages,
                                                                                subset_groups)


# Display the sorted results
def display_sorted_grouped_results(sorted_results):
    for result in sorted_results:
        print(
            f"{result['subset_group']} with {result['model']} -> {result['num_full_intersection']}/{result['max_full_intersection']} languages in common (Full Intersection): {result['full_intersection']}")
        print(
            f"  Task-Model Intersection: {result['num_task_model_intersection']}/{result['max_task_model_intersection']} languages")
        print(
            f"  Subset-Task Intersection: {result['num_subset_task_intersection']}/{result['max_subset_task_intersection']} languages")
        print(
            f"  Model-Subset Intersection: {result['num_model_subset_intersection']}/{result['max_model_subset_intersection']} languages")
        print()  # For better readability


# Example of how to use the function
print('------ XNLI -------')
display_sorted_grouped_results(xnli_breakdown_results_sorted_by_full)
print('------ XCOPA -------')
display_sorted_grouped_results(xcopa_breakdown_results_sorted_by_full)
print('------ XSTORYCLOZE -------')
display_sorted_grouped_results(xstorycloze_breakdown_results_sorted_by_full)
print('------ XWINOGRAD -------')
display_sorted_grouped_results(xwinograd_breakdown_results_sorted_by_full)
print('------ PAWSX -------')
display_sorted_grouped_results(pawsx_breakdown_results_sorted_by_full)


------ XNLI -------
ALL with mt5 -> 13/15 languages in common (Full Intersection): {'ar', 'en', 'es', 'hi', 'ur', 'sw', 'tr', 'de', 'fr', 'ru', 'vi', 'zh', 'th'}
  Task-Model Intersection: 15/15 languages
  Subset-Task Intersection: 13/15 languages
  Model-Subset Intersection: 50/52 languages

ALL with mgpt -> 12/15 languages in common (Full Intersection): {'ar', 'en', 'es', 'hi', 'ur', 'sw', 'tr', 'de', 'fr', 'ru', 'vi', 'th'}
  Task-Model Intersection: 14/15 languages
  Subset-Task Intersection: 13/15 languages
  Model-Subset Intersection: 39/52 languages

RND with mt5 -> 9/15 languages in common (Full Intersection): {'ar', 'en', 'es', 'hi', 'fr', 'de', 'ru', 'vi', 'zh'}
  Task-Model Intersection: 15/15 languages
  Subset-Task Intersection: 9/15 languages
  Model-Subset Intersection: 31/32 languages

TYPO with mt5 -> 9/15 languages in common (Full Intersection): {'ar', 'en', 'hi', 'ur', 'sw', 'de', 'vi', 'zh', 'th'}
  Task-Model Intersection: 15/15 languages
  Subset-Task Intersectio