In [1]:
import pandas as pd
import numpy as np
import torch
import glob
from tqdm import tqdm
import os
import sys

# Change to parent directory
os.chdir('..')
# Add current directory to path
sys.path.append('.')

# Now import
from src.utils.const import LANGCODE2LANGNAME, LANGNAME2LANGCODE, EXP2_CONFIG

In [2]:
model_name = 'gemma-3-4b-it' # 1152 size for 1b, 2560 for 4b
model_to_num_layers = {
    'gemma-3-1b-it': 26,
	'gemma-3-4b-it': 34,
    'gemma-3-270m-it': 18,
    'gemma-2-9b-it': 42
}
model_to_hidden_size = {
	'gemma-3-1b-it': 1152,
	'gemma-3-4b-it': 2560,
	'gemma-3-270m-it': 640,
	'gemma-2-9b-it': 3584
}
num_layers = model_to_num_layers[model_name]
extraction_mode = 'raw'
token_position = 'last_token'
languages = glob.glob('outputs/topic_classification/gemma-3-4b-it/raw/*')
languages = sorted([lang.split('/')[-1] for lang in languages])
text_ids = glob.glob('outputs/topic_classification/gemma-3-4b-it/raw/ace_Latn/*')
text_ids = [text_id.split('/')[-1].split('.')[0] for text_id in text_ids]

In [3]:
# # Extract language codes from the comment and add eng_Latn
# languages = [
# 	'vie_Latn',  # Vietnamese
# 	'ind_Latn',  # Indonesian
# 	'tha_Thai',  # Thai
# 	'zsm_Latn',  # Malay
# 	'mya_Mymr',  # Burmese -> 0 problem
# 	'tgl_Latn',  # Tagalog
# 	'khm_Khmr',  # Khmer
# 	'ceb_Latn',  # Cebuano
# 	'lao_Laoo',  # Lao
# 	'jav_Latn',  # Javanese
# 	'war_Latn',  # Waray
# 	'sun_Latn',  # Sundanese
# 	'ilo_Latn',  # Ilocano
# 	'tam_Taml',  # Tamil
# 	'zho_Hans',  # Chinese
# 	'eng_Latn'   # English
# ]

languages = []
for family, langs in EXP2_CONFIG['languages'].items():
	languages.extend(langs)

languages = [LANGNAME2LANGCODE[lang] for lang in languages]
languages

['eng_Latn',
 'deu_Latn',
 'spa_Latn',
 'fra_Latn',
 'arb_Arab',
 'heb_Hebr',
 'rus_Cyrl',
 'slk_Latn',
 'tha_Thai',
 'lao_Laoo',
 'khm_Khmr',
 'vie_Latn',
 'swh_Latn',
 'xho_Latn',
 'zul_Latn',
 'urd_Arab',
 'hin_Deva',
 'kan_Knda',
 'tel_Telu',
 'jpn_Jpan',
 'kor_Hang',
 'tur_Latn',
 'azb_Arab',
 'azj_Latn',
 'tgl_Latn',
 'ceb_Latn',
 'ilo_Latn',
 'war_Latn',
 'yue_Hant',
 'zho_Hans',
 'ind_Latn',
 'zsm_Latn',
 'min_Latn',
 'min_Arab',
 'bjn_Latn',
 'bjn_Arab',
 'jav_Latn',
 'sun_Latn']

In [4]:
# Initialize empty torch tensor to hold all activations: [text_id, layer, language, hidden_size]
activation_per_lang = torch.zeros((len(text_ids), num_layers + 1, len(languages), model_to_hidden_size[model_name])).to('cuda')

In [5]:
# Load activations for all languages
text_idx = 0
for lang_idx, lang in tqdm(enumerate(languages), total=len(languages), desc='Loading activations for all languages'):
	for layer_id in range(-1, num_layers):
		if layer_id == -1:
			paths = sorted(glob.glob(f'outputs/topic_classification/{model_name}/{extraction_mode}/{lang}/*/{token_position}/layer_embed_tokens.pt'))
		else:
			paths = sorted(glob.glob(f'outputs/topic_classification/{model_name}/{extraction_mode}/{lang}/*/{token_position}/layer_{layer_id}.pt'))
		if len(paths) != len(text_ids):
			print(f"Warning: Expected {len(text_ids)} files for language '{lang}' at layer {layer_id}, but found {len(paths)} files.")
			break
		for path in paths:
			activation = torch.load(path)
			activation_per_lang[text_idx, layer_id + 1, lang_idx, :] = activation.to('cuda')
			text_idx += 1
		text_idx = 0

Loading activations for all languages: 100%|██████████| 38/38 [02:07<00:00,  3.35s/it]


In [6]:
activation_per_lang.shape

torch.Size([204, 35, 38, 2560])

In [7]:
# Resulting shape: [layer_id, lang_idx, lang_idx]
# Layer 0: [0, lang_idx, lang_idx]
# Layer 1: [1, lang_idx, lang_idx]
# Layer 2: [2, lang_idx, lang_idx]
# ...
# Layer N: [N, lang_idx, lang_idx]

# I want to compare the distances between languages across layers

In [8]:
# Calculate cosine distance first for each pair of text, then average
# This is the proper way: distance first, then averaging
# Resulting shape: [layer_id, lang_idx, lang_idx]

# Initialize tensor to store all pairwise distances for each text
all_distances = torch.zeros((len(text_ids), num_layers + 1, len(languages), len(languages))).to('cuda')
all_similarities = torch.zeros((len(text_ids), num_layers + 1, len(languages), len(languages))).to('cuda')

print(f"Calculating distances for {len(text_ids)} texts...")

# For each text sample, calculate pairwise cosine distances
for text_idx in tqdm(range(len(text_ids)), desc="Processing texts"):
	# Get activations for this text
	text_activations = activation_per_lang[text_idx] # [layer_id, lang_idx, hidden_size]

	# Normalize activations per layer for comparability
	normalized_text_activations_layer = torch.nn.functional.normalize(text_activations, dim=0)
	
	# Cosine similarity
	# Normalize activations for cosine similarity
	normalized_text_activations = torch.nn.functional.normalize(normalized_text_activations_layer, dim=-1)
	# Calculate dot product for this text
	similarity = torch.matmul(normalized_text_activations_layer, normalized_text_activations_layer.transpose(-2, -1))
	# Store similarity matrix for debugging
	all_similarities[text_idx] = similarity
	# Convert to cosine distance (1 - cosine_similarity)
	all_distances[text_idx] = 1 - similarity

	# If similarity close to 0, print the text activations for debugging
	if torch.any(similarity < 0.01):
		print(f"Similarity close to 0 for text index {text_idx}. Debugging info:")
		print(f"Text activations shape: {text_activations.shape}")
		print(f"Text activations (first 5 values): {text_activations[0, 0, :5]}")
		print(f"Similarity matrix (first 5x5 block):\n{similarity[:5, :5]}")
	break

# Average the distances across all texts
# This gives us the mean cosine distance between each language pair for each layer
distance_matrix = all_distances.mean(dim=0)
# For debugging, also average the similarities
similarity_matrix = all_similarities.mean(dim=0)

print(f"Distance matrix shape: {distance_matrix.shape}")
print(f"Distance range: [{distance_matrix.min().item():.4f}, {distance_matrix.max().item():.4f}]")
print("Distance calculation complete using 'distance first, then average' method.")

Calculating distances for 204 texts...


Processing texts:   0%|          | 0/204 [00:00<?, ?it/s]

Similarity close to 0 for text index 0. Debugging info:
Text activations shape: torch.Size([35, 38, 2560])
Text activations (first 5 values): tensor([ 0.3281,  0.2217, -0.1230,  0.1260, -0.4863], device='cuda:0')
Similarity matrix (first 5x5 block):
tensor([[[0.0206, 0.0135, 0.0145, 0.0131, 0.0109, 0.0112, 0.0126, 0.0130,
          0.0150, 0.0149, 0.0138, 0.0116, 0.0119, 0.0123, 0.0124, 0.0129,
          0.0143, 0.0127, 0.0120, 0.0174, 0.0154, 0.0135, 0.0112, 0.0129,
          0.0130, 0.0117, 0.0126, 0.0120, 0.0161, 0.0187, 0.0131, 0.0128,
          0.0143, 0.0133, 0.0144, 0.0138, 0.0127, 0.0126],
         [0.0135, 0.0132, 0.0116, 0.0108, 0.0090, 0.0092, 0.0105, 0.0130,
          0.0119, 0.0121, 0.0115, 0.0096, 0.0109, 0.0100, 0.0100, 0.0102,
          0.0112, 0.0103, 0.0099, 0.0134, 0.0122, 0.0117, 0.0090, 0.0118,
          0.0115, 0.0094, 0.0099, 0.0097, 0.0126, 0.0140, 0.0107, 0.0107,
          0.0113, 0.0117, 0.0122, 0.0116, 0.0108, 0.0111],
         [0.0145, 0.0116, 0.0141, 0.0118




In [9]:
# Get top k minimum distance values in each layer with their language pair identifiers

# Set k as a variable
k = 50  # You can change this value as needed

# Initialize lists to store results
all_min_distances = []
all_min_distance_pairs = []

# For each layer, find the top k minimum distances and corresponding language pairs
for layer_id in range(-1, num_layers):
	layer_distances = distance_matrix[layer_id + 1]
	
	# Create a mask to exclude diagonal elements and upper triangle (to avoid duplicates)
	mask = torch.tril(torch.ones_like(layer_distances, dtype=torch.bool), diagonal=-1)
	
	# Get all distances excluding diagonal and upper triangle, then flatten
	lower_triangle_distances = layer_distances[mask]
	
	# Get top k minimum distances
	topk_min_values, topk_indices = torch.topk(lower_triangle_distances, k=k, largest=False)
	
	# Convert flat indices back to 2D coordinates
	layer_min_distances = []
	layer_min_pairs = []
	
	# Get indices of lower triangle elements
	lower_i, lower_j = torch.where(mask)
	
	for idx in range(k):
		flat_idx = topk_indices[idx].item()
		lang_i_idx = lower_i[flat_idx].item()
		lang_j_idx = lower_j[flat_idx].item()
		
		layer_min_distances.append(topk_min_values[idx].item())
		layer_min_pairs.append((languages[lang_i_idx], languages[lang_j_idx]))
	
	all_min_distances.append(layer_min_distances)
	all_min_distance_pairs.append(layer_min_pairs)

# Create a detailed DataFrame
detailed_results = []
for layer_id in range(-1, num_layers):
	for rank in range(k):
		lang1, lang2 = all_min_distance_pairs[layer_id + 1][rank]
		detailed_results.append({
			'Layer': layer_id,
			'Rank': rank + 1,
			'Min_Distance': all_min_distances[layer_id + 1][rank],
			'Language_Pair': f"{LANGCODE2LANGNAME[lang1]} ({lang1}) - {LANGCODE2LANGNAME[lang2]} ({lang2})"
		})

results_df = pd.DataFrame(detailed_results)

print(f"Top {k} minimum distances per layer:")
print(results_df.to_string(index=False))

# Also print a summary view grouped by layer
print("\nSummary by layer:")
for layer_id in range(-1, num_layers):
	print(f"\nLayer {layer_id:2d}:")
	for rank in range(k):
		lang1, lang2 = all_min_distance_pairs[layer_id + 1][rank]
		distance = all_min_distances[layer_id + 1][rank]
		print(f"  {rank+1}. {distance:.6f} - {LANGCODE2LANGNAME[lang1]} ({lang1}) & {LANGCODE2LANGNAME[lang2]} ({lang2})")


Top 50 minimum distances per layer:
 Layer  Rank  Min_Distance                                                       Language_Pair
    -1     1      0.004803               Chinese (Simplified) (zho_Hans) - Japanese (jpn_Jpan)
    -1     2      0.004810                Chinese (Simplified) (zho_Hans) - English (eng_Latn)
    -1     3      0.004815            Chinese (Simplified) (zho_Hans) - Yue Chinese (yue_Hant)
    -1     4      0.004816                            Japanese (jpn_Jpan) - English (eng_Latn)
    -1     5      0.004819                 Chinese (Simplified) (zho_Hans) - Korean (kor_Hang)
    -1     6      0.004821                   Chinese (Simplified) (zho_Hans) - Thai (tha_Thai)
    -1     7      0.004821                        Yue Chinese (yue_Hant) - Japanese (jpn_Jpan)
    -1     8      0.004823                             Korean (kor_Hang) - Japanese (jpn_Jpan)
    -1     9      0.004823                  Chinese (Simplified) (zho_Hans) - Hindi (hin_Deva)
    -1    10  

In [10]:
# Calculate mean and std deviation of the distances for each layer
layer_means = distance_matrix.mean(dim=(1, 2)).cpu().numpy()
layer_stds = distance_matrix.std(dim=(1, 2)).cpu().numpy()

# Create a DataFrame to display the statistics
layer_stats_df = pd.DataFrame({
	'Layer': range(-1, num_layers),
	'Mean_Distance': layer_means,
	'Std_Distance': layer_stds
})

print("Layer-wise Distance Statistics:")
print(layer_stats_df.to_string(index=False))


Layer-wise Distance Statistics:
 Layer  Mean_Distance  Std_Distance
    -1       0.004848      0.000009
     0      -0.001897      0.000811
     1      -0.001551      0.000763
     2      -0.002786      0.000956
     3      -0.004589      0.001200
     4      -0.007101      0.001398
     5      -0.010764      0.001848
     6      -0.012117      0.002040
     7      -0.012580      0.001946
     8      -0.012378      0.001839
     9      -0.014045      0.002014
    10      -0.015260      0.002237
    11      -0.018091      0.002643
    12      -0.019475      0.002980
    13      -0.014394      0.002607
    14      -0.014553      0.002671
    15      -0.018922      0.003521
    16      -0.031248      0.005760
    17      -0.042230      0.008584
    18      -0.050815      0.010630
    19      -0.065009      0.014673
    20      -0.087202      0.020626
    21      -0.110351      0.026261
    22      -0.128983      0.034340
    23      -0.146534      0.040065
    24      -0.169686      0.054

## Plotting and Cluster Formation

### Heatmap

In [11]:
lengths_per_family = {}
for family, langs in EXP2_CONFIG['languages'].items():
    lengths_per_family[family] = len(langs)

In [12]:
# Heatmap per layer of the silhouette scores between languages, make a subplot for each layer with 4 columns
import seaborn as sns
import matplotlib.pyplot as plt

# Find global min and max across all layers for consistent scale
global_min = float('inf')
global_max = float('-inf')
for layer_id in range(-1, num_layers):
	layer_distances = similarity_matrix[layer_id + 1]
	global_min = min(global_min, layer_distances.min().item())
	global_max = max(global_max, layer_distances.max().item())

fig, axes = plt.subplots(nrows=(num_layers + 1) // 4 + 1, ncols=4, figsize=(20, 5 * ((num_layers + 1) // 4 + 1)))
axes = axes.flatten()
for layer_id in tqdm(range(0, num_layers)):
	layer_distances = similarity_matrix[layer_id + 1]
	sns.heatmap(layer_distances.cpu().numpy(), 
		xticklabels=[LANGCODE2LANGNAME[lang] for lang in languages], 
		yticklabels=[LANGCODE2LANGNAME[lang] for lang in languages], 
		ax=axes[layer_id], 
		cmap='viridis',
		vmin=global_min,
		vmax=global_max,
		# annot=True,
		# fmt='.2f',
	)
	axes[layer_id].set_title(f'Layer {layer_id} Cosine Similarity')
	plt.setp(axes[layer_id].get_xticklabels(), rotation=90, ha='right', rotation_mode='anchor')

	# Setup fontsize of the x and y label
	axes[layer_id].tick_params(axis='x', labelsize=5)
	axes[layer_id].tick_params(axis='y', labelsize=5)
	
	# Add bolded grid lines to separate language families
	# Calculate cumulative positions for family boundaries
	cumulative_langs = 0
	for family, num_langs in lengths_per_family.items():
		cumulative_langs += num_langs
		# Draw horizontal and vertical lines at family boundaries
		axes[layer_id].axhline(cumulative_langs, color='black', linewidth=1.5)
		axes[layer_id].axvline(cumulative_langs, color='black', linewidth=1.5)

	# if layer_id + 1 == 3:
	# 	break
	
plt.tight_layout()
image_path = f'plot/cosine_heatmap_{model_name}.png'
os.makedirs('plot', exist_ok=True)
plt.savefig(image_path, dpi=300, bbox_inches='tight')
plt.close()

100%|██████████| 34/34 [01:20<00:00,  2.36s/it]


In [13]:
import seaborn as sns

# Make 5 bins of the silhouette scores and color the heatmap accordingly
import matplotlib.pyplot as plt

# Find global min and max across all layers for consistent scale
global_min = float('inf')
global_max = float('-inf')
for layer_id in range(-1, num_layers):
	layer_distances = similarity_matrix[layer_id + 1]
	# Exclude diagonal elements (which are 0)
	mask_no_diag = ~torch.eye(len(languages), dtype=torch.bool, device='cuda')
	non_diag_values = layer_distances[mask_no_diag]
	global_min = min(global_min, non_diag_values.min().item())
	global_max = max(global_max, non_diag_values.max().item())

# Create 3 bins
n_bins = 5
bin_edges = np.linspace(global_min, global_max, n_bins + 1)

# Create subplots
fig, axes = plt.subplots(nrows=(num_layers + 1) // 4 + 1, ncols=4, figsize=(20, 5 * ((num_layers + 1) // 4 + 1)))
axes = axes.flatten()

for layer_id in tqdm(range(0, num_layers)):
	layer_distances = similarity_matrix[layer_id + 1].cpu().numpy()
	
	# Discretize the distances into bins
	binned_distances = np.digitize(layer_distances, bin_edges) - 1
	binned_distances = np.clip(binned_distances, 0, n_bins - 1)
	
	sns.heatmap(binned_distances, 
		xticklabels=[LANGCODE2LANGNAME[lang] for lang in languages], 
		yticklabels=[LANGCODE2LANGNAME[lang] for lang in languages], 
		ax=axes[layer_id], 
		cmap='viridis',
		vmin=0,
		vmax=n_bins - 1,
		cbar_kws={'label': 'Bin', 'ticks': range(n_bins)}
	)
	axes[layer_id].set_title(f'Layer {layer_id} Binned Cosine Similarity')
	plt.setp(axes[layer_id].get_xticklabels(), rotation=90, ha='right', rotation_mode='anchor')

	# Setup fontsize of the x and y label
	axes[layer_id].tick_params(axis='x', labelsize=5)
	axes[layer_id].tick_params(axis='y', labelsize=5)
	
	# Add bolded grid lines to separate language families
	cumulative_langs = 0
	for family, num_langs in lengths_per_family.items():
		cumulative_langs += num_langs
		axes[layer_id].axhline(cumulative_langs, color='black', linewidth=1.5)
		axes[layer_id].axvline(cumulative_langs, color='black', linewidth=1.5)
	
plt.tight_layout()
image_path = f'plot/cosine_binned{n_bins}_heatmap_{model_name}.png'
os.makedirs('plot', exist_ok=True)
plt.savefig(image_path, dpi=300, bbox_inches='tight')
plt.close()

print(f"Bin edges: {bin_edges}")

100%|██████████| 34/34 [01:19<00:00,  2.33s/it]


Bin edges: [3.59604892e-05 5.43390575e-01 1.08674519e+00 1.63009980e+00
 2.17345442e+00 2.71680903e+00]


### Clustering

In [14]:
percentiles = np.percentile(range(-1, num_layers), [0, 2, 25, 50, 75, 100])
# Round to nearest integer
percentiles = [int(round(p)) for p in percentiles]
print(f"\nPercentile layers: {percentiles}")



Percentile layers: [-1, 0, 8, 16, 24, 33]


In [None]:
# Plot with heatmap - reversed colormap for high similarity highlighting
import matplotlib.pyplot as plt
import seaborn as sns

# # Calculate global min and max for consistent scale across all layers
# global_min = distance_matrix.min().cpu().numpy()
# global_max = distance_matrix.max().cpu().numpy()

percentiles = np.percentile(range(-1, num_layers), [2, 25, 50, 75, 100])
# Round to nearest integer
percentiles = [int(round(p)) for p in percentiles]

for layer_id in range(-1, num_layers):
	if layer_id not in percentiles:
		continue
	plt.figure(figsize=(10, 8))
	sns.heatmap(distance_matrix[layer_id + 1].cpu().numpy(), 
				xticklabels=languages, 
				yticklabels=languages, 
				cmap='viridis_r',  # Reversed colormap - bright colors for low distances (high similarity)
				# vmin=global_min,
				# vmax=global_max
				)
	plt.title(f'Layer {layer_id + 1} Language Distance Matrix (High Similarity Highlighted)', fontsize=12)
	plt.xlabel('Language', fontsize=10)
	plt.ylabel('Language', fontsize=10)
	plt.xticks(rotation=45, fontsize=4)
	plt.yticks(rotation=45, fontsize=4)
	
	# Adjust colorbar label size
	cbar = plt.gca().collections[0].colorbar
	cbar.ax.tick_params(labelsize=10)
	
	plt.tight_layout()
	plt.show()

In [15]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
# Create subplots for dendrograms across all layers
fig, axes = plt.subplots(nrows=9, ncols=5, figsize=(25, 80))
fig.suptitle('Hierarchical Clustering Dendrograms Across All Layers', fontsize=16)

# Flatten axes for easier indexing
axes = axes.flatten()

for layer_id in range(-1, num_layers):
	ax_idx = layer_id + 1
	
	# Perform clustering for this layer
	linked = linkage(distance_matrix[layer_id + 1].cpu().numpy(), 'complete')
	
	# Create dendrogram in the corresponding subplot
	dendrogram(linked, 
			   labels=[LANGCODE2LANGNAME[lang] for lang in languages], 
			   orientation='right',
			   ax=axes[ax_idx])
	
	axes[ax_idx].set_title(f"Layer {layer_id}", fontsize=12)
	axes[ax_idx].tick_params(axis='y', labelsize=8)
	axes[ax_idx].tick_params(axis='x', labelsize=8)

# Save image
image_path = f'plot/cosine_dendrograms_{model_name}.png'
os.makedirs('plot', exist_ok=True)
plt.savefig(image_path, dpi=300, bbox_inches='tight')
plt.close()


In [None]:
import numpy as np
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

# Assuming 'distance_matrix' is your 90x90 numpy array

# 1. Perform clustering directly on the distance matrix
# No conversion is needed!
layer_id = 0  # Change this to the desired layer index
hac = AgglomerativeClustering(n_clusters=1, metric='precomputed', linkage='complete')
clusters = hac.fit_predict(distance_matrix[layer_id + 1].cpu().numpy())

print("Cluster assignments:", clusters)

# 2. Visualize the dendrogram with language labels - horizontal orientation
# The linkage function also works directly with a distance matrix
linked = linkage(distance_matrix[layer_id + 1].cpu().numpy(), 'complete')

plt.figure(figsize=(10, 15))
dendrogram(linked, labels=[LANGCODE2LANGNAME[lang] for lang in languages], orientation='right')
plt.title(f"Hierarchical Clustering Dendrogram Layer {layer_id}")
plt.xlabel("Distance")
plt.ylabel("Languages")
plt.tight_layout()
plt.show()


In [None]:
from sklearn.cluster import SpectralClustering
import numpy as np

# Assuming 'sim_matrix' is your 90x90 numpy array
# No need to convert to distance here, as it uses the affinity matrix directly!

spectral = SpectralClustering(n_clusters=4, affinity='precomputed', random_state=0)
clusters = spectral.fit_predict(similarity_matrix[1].cpu().numpy())

print("Cluster assignments:", clusters)

# Print cluster information with language names and codes
for cluster_id in np.unique(clusters):
	cluster_languages = [languages[i] for i in range(len(languages)) if clusters[i] == cluster_id]
	print(f"\nCluster {cluster_id} ({len(cluster_languages)} languages):")
	for lang_code in cluster_languages:
		print(f"  - {LANGCODE2LANGNAME[lang_code]} ({lang_code})")

In [None]:
from sklearn.cluster import AffinityPropagation
import numpy as np

# Assuming 'sim_matrix' is your 90x90 numpy array

# The 'damping' factor can be adjusted (0.5 to 1) to avoid oscillations
ap = AffinityPropagation(affinity='precomputed', damping=0.5, random_state=0)
clusters = ap.fit_predict(similarity_matrix[1].cpu().numpy())

# The number of clusters is found automatically
num_clusters = len(np.unique(clusters))
print(f"Found {num_clusters} clusters.")
print("Cluster assignments:", clusters)

# Print cluster information with language names and codes
for cluster_id in np.unique(clusters):
	cluster_languages = [languages[i] for i in range(len(languages)) if clusters[i] == cluster_id]
	print(f"\nCluster {cluster_id} ({len(cluster_languages)} languages):")
	for lang_code in cluster_languages:
		print(f"  - {LANGCODE2LANGNAME[lang_code]} ({lang_code})")
