In [None]:
import os

from huggingface_hub import login

# Authenticate with the Hugging Face Hub without embedding a token in the
# notebook. The token is taken from the HF_TOKEN environment variable; when
# that variable is unset, None is passed and login() asks for the token
# interactively instead.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
#Test

In [None]:
from datasets import load_dataset

# Display name -> dataset configuration name passed to load_dataset.
# Insertion order is the order in which the configurations are loaded.
flores_configs = {
    'Javanese': "jav_Latn",
    'Sundanese': "sun_Latn",
    'Welsh': "cym_Latn",
    'Turkish': "tur_Latn",
    'French': "fra_Latn",
    'English': "eng_Latn",
    'Indonesian': "ind_Latn",
}

# One loaded dataset per language, all sharing the same local cache directory.
data = {
    language_name: load_dataset(
        "openlanguagedata/flores_plus",
        config_name,
        cache_dir='./huggingface/flores',
    )
    for language_name, config_name in flores_configs.items()
}

In [3]:
import pandas as pd
def convert_to_dataframe(data):
    """Combine the 'dev' and 'devtest' splits of one dataset into a single frame.

    Args:
        data: mapping with 'dev' and 'devtest' entries, each exposing a
            ``to_pandas()`` method.

    Returns:
        A pandas DataFrame holding the 'dev' rows followed by the 'devtest'
        rows, re-indexed from 0.
    """
    split_frames = [data[split_name].to_pandas() for split_name in ('dev', 'devtest')]
    return pd.concat(split_frames, ignore_index=True)

# Replace each language's loaded dataset with its combined DataFrame.
# Entries that are already DataFrames are left untouched, so running this cell
# a second time is a no-op instead of failing on the missing 'dev' key.
for key, value in data.items():
    if not isinstance(value, pd.DataFrame):
        data[key] = convert_to_dataframe(value)

In [4]:
# Translation prompt. It has three str.format placeholders: {src_lang},
# {tgt_lang} and {src_sentence}. The text starts and ends with a newline
# because of how the triple-quoted literal is laid out.
prompt_template = """
You are a professional translator. 
Translate the following sentence from {src_lang} into {tgt_lang}.

Sentence:
"{src_sentence}"

Only output the translation, nothing else.
"""

In [None]:
import torch
import os
import pandas as pd
from transformers import AutoModelForCausalLM, AutoProcessor

# Hub repository id; also reused below as part of the local cache path.
folder_name = "Qwen/Qwen3-8B"

# Keyword arguments common to the model and the processor.
shared_load_kwargs = dict(
    cache_dir=f"./huggingface/{folder_name}",
    trust_remote_code=True,
)

# Load the weights in bfloat16, then move the whole model to the 'mps' device.
model = AutoModelForCausalLM.from_pretrained(
    folder_name,
    torch_dtype=torch.bfloat16,
    **shared_load_kwargs,
)
model = model.to('mps')

# Matching processor, used later to turn prompt text into model inputs.
processor = AutoProcessor.from_pretrained(folder_name, **shared_load_kwargs)

def hook_fn(m, i, o, layer_id):
    """Forward-hook callback that writes part of a module's output to disk.

    Stores ``o[0][-1, :]`` (detached and moved to CPU) in
    ``<save_dir>/<layer_id>.pt``, creating ``save_dir`` first if it does not
    exist. ``save_dir`` is not a parameter: it is looked up in module scope
    when the hook fires, so it must be assigned before the model is run.

    Args:
        m: the module the hook is attached to (unused).
        i: the inputs the module received (unused).
        o: the module's output; only ``o[0][-1, :]`` is read.
        layer_id: used as the stem of the output file name.

    NOTE(review): what ``o[0][-1, :]`` selects depends on whether ``o`` is a
    tuple wrapping the hidden states or the tensor itself - confirm against
    the layer implementation in use.
    """
    os.makedirs(save_dir, exist_ok=True)
    captured = o[0][-1, :].detach().cpu()
    destination = os.path.join(save_dir, f"{layer_id}.pt")
    torch.save(captured, destination)

# Attach hook_fn to every module in model.model.layers, binding each hook to
# its position in that list through a default argument (evaluated when the
# lambda is created, so every hook keeps its own layer_id).
# The handles returned by register_forward_hook are kept so the hooks can be
# detached later with `for handle in hook_handles: handle.remove()`.
hook_handles = [
    layer.register_forward_hook(
        lambda module, args, output, layer_id=layer_idx: hook_fn(
            module, args, output, layer_id=layer_id
        )
    )
    for layer_idx, layer in enumerate(model.model.layers)
]

languages=["English", "French", "Indonesian", "Javanese", "Sundanese", "Welsh", "Turkish"]

# Run the model once per (source, target, sentence) combination. The forward
# hooks registered on the model write the activations; this loop only has to
# point save_dir at the right directory before each forward pass.
for source_language in languages:
    # Only the first 50 rows of the source-language frame are used.
    source_rows = data[source_language][:50]

    for target_language in languages:
        if source_language == target_language:
            continue

        for _, row in source_rows.iterrows():
            text_id = str(row['id'])

            # Module-level name read by hook_fn when the hooks fire.
            save_dir = f"./activations/flores/{folder_name}/{source_language}/{target_language}/{text_id}/"

            prompt = prompt_template.format(
                src_lang=source_language,
                tgt_lang=target_language,
                src_sentence=row['text']
            )
            encoded = processor(text=prompt, return_tensors="pt")
            inputs = {name: tensor.to("mps") for name, tensor in encoded.items()}

            # Inference only; no gradients are needed.
            with torch.no_grad():
                outputs = model(**inputs)
            print(text_id)

In [None]:
# One Language to Multilingual
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.manifold import TSNE
from matplotlib.lines import Line2D

# Activation sets to visualise. 'layers' is how many per-layer files
# ("0.pt" .. "<layers-1>.pt") are read from every sentence directory.
models = [{'name': 'meta-llama/Llama-3.2-1B', 'layers': 16}, {'name': 'google/gemma-3-1b-pt', 'layers': 25}]
languages=["English", "French", "Indonesian", "Javanese", "Sundanese", "Welsh", "Turkish"]

# One fixed colour per language, shared by every subplot and figure.
cmap = plt.get_cmap('tab10')
color_map = {language: cmap(i) for i, language in enumerate(languages)}

# For every model and every fixed source language, draw one figure holding one
# t-SNE panel per layer; points are coloured by the target language.
for model_id, model_info in enumerate(models):
    model_name = model_info['name']
    n_layers = model_info['layers']

    for source_language in languages:
        if model_id == 0:
            # First model: 8 panels per row.
            fig, axes = plt.subplots(int(n_layers/8), 8, figsize=(32, int(n_layers/2)))
        else:
            # Remaining models: fixed 5x5 grid (25 panels).
            fig, axes = plt.subplots(5, 5, figsize=(20, 20))
        axes = axes.flatten()

        for layer in range(n_layers):
            label_language = []
            latent = []

            for target_language in languages:
                if target_language == source_language:
                    continue
                base_path = f"./activations/flores/{model_name}/{source_language}/{target_language}/"
                # os.listdir returns entries in arbitrary order; sorting keeps
                # the sample order (and so the t-SNE input) stable across runs.
                for text_id in sorted(os.listdir(base_path)):
                    text_path = os.path.join(base_path, text_id)
                    if not os.path.isdir(text_path):
                        continue
                    activation_values = torch.load(os.path.join(text_path, f"{layer}.pt"))
                    # Files hold either a single 1-D vector or a 2-D matrix of
                    # which only the last row is used (presumably the last
                    # token - confirm against the extraction hook).
                    if activation_values.ndim > 1:
                        activation_values = activation_values[-1, :]
                    latent.append(activation_values.to(torch.float32).numpy())
                    label_language.append(target_language)

            latent = np.array(latent)
            label_language = np.array(label_language)
            tsne = TSNE(n_components=2, random_state=42)
            latent_2d = tsne.fit_transform(latent)

            ax = axes[layer]
            for lang in languages:
                mask = label_language == lang
                ax.scatter(latent_2d[mask, 0], latent_2d[mask, 1],
                           label=lang, color=color_map[lang], alpha=0.6, s=10)

            ax.set_title(f"Layer {layer}", fontsize=10)

        legend_elements = [Line2D([0], [0], marker='o', color='w',
                                  label=lang, markerfacecolor=color_map[lang],
                                  markersize=8, alpha=0.6) for lang in languages]
        # Leave the top 10% of the figure free for the legend and title.
        plt.tight_layout(rect=[0, 0, 1, 0.90])
        fig.legend(handles=legend_elements,
                   loc='upper center', bbox_to_anchor=(0.5, 0.96),
                   ncol=len(languages), title='Languages')
        # Double-quoted f-string: reusing the outer quote character inside the
        # braces is a syntax error before Python 3.12.
        plt.suptitle(f"{model_name} MT Source Language {source_language}")
        plt.show()
        # Release the figure; many large figures are created in this loop.
        plt.close(fig)

In [None]:
# Multilingual to One Language (every source language into one fixed target language)
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.manifold import TSNE
from matplotlib.lines import Line2D

# Activation sets to visualise. 'layers' is how many per-layer files
# ("0.pt" .. "<layers-1>.pt") are read from every sentence directory.
models = [{'name': 'meta-llama/Llama-3.2-1B', 'layers': 16}, {'name': 'google/gemma-3-1b-pt', 'layers': 25}]
languages=["English", "French", "Indonesian", "Javanese", "Sundanese", "Welsh", "Turkish"]

# One fixed colour per language, shared by every subplot and figure.
cmap = plt.get_cmap('tab10')
color_map = {language: cmap(i) for i, language in enumerate(languages)}

# For every model and every fixed target language, draw one figure holding one
# t-SNE panel per layer; points are coloured by the source language.
for model_id, model_info in enumerate(models):
    model_name = model_info['name']
    n_layers = model_info['layers']

    for target_language in languages:
        if model_id == 0:
            # First model: 8 panels per row.
            fig, axes = plt.subplots(int(n_layers/8), 8, figsize=(32, int(n_layers/2)))
        else:
            # Remaining models: fixed 5x5 grid (25 panels).
            fig, axes = plt.subplots(5, 5, figsize=(20, 20))
        axes = axes.flatten()

        for layer in range(n_layers):
            label_language = []
            latent = []

            for source_language in languages:
                if target_language == source_language:
                    continue
                base_path = f"./activations/flores/{model_name}/{source_language}/{target_language}/"
                # os.listdir returns entries in arbitrary order; sorting keeps
                # the sample order (and so the t-SNE input) stable across runs.
                for text_id in sorted(os.listdir(base_path)):
                    text_path = os.path.join(base_path, text_id)
                    if not os.path.isdir(text_path):
                        continue
                    activation_values = torch.load(os.path.join(text_path, f"{layer}.pt"))
                    # Files hold either a single 1-D vector or a 2-D matrix of
                    # which only the last row is used (presumably the last
                    # token - confirm against the extraction hook).
                    if activation_values.ndim > 1:
                        activation_values = activation_values[-1, :]
                    latent.append(activation_values.to(torch.float32).numpy())
                    label_language.append(source_language)

            latent = np.array(latent)
            label_language = np.array(label_language)
            tsne = TSNE(n_components=2, random_state=42)
            latent_2d = tsne.fit_transform(latent)

            ax = axes[layer]
            for lang in languages:
                mask = label_language == lang
                ax.scatter(latent_2d[mask, 0], latent_2d[mask, 1],
                           label=lang, color=color_map[lang], alpha=0.6, s=10)

            ax.set_title(f"Layer {layer}", fontsize=10)

        legend_elements = [Line2D([0], [0], marker='o', color='w',
                                  label=lang, markerfacecolor=color_map[lang],
                                  markersize=8, alpha=0.6) for lang in languages]
        # Leave the top 10% of the figure free for the legend and title.
        plt.tight_layout(rect=[0, 0, 1, 0.90])
        fig.legend(handles=legend_elements,
                   loc='upper center', bbox_to_anchor=(0.5, 0.96),
                   ncol=len(languages), title='Source Languages')
        # Double-quoted f-string: reusing the outer quote character inside the
        # braces is a syntax error before Python 3.12.
        plt.suptitle(f"{model_name} MT Target Language {target_language}")
        plt.show()
        # Release the figure; many large figures are created in this loop.
        plt.close(fig)

In [None]:
# One Language as Source or Target (translations into and out of one fixed language)
import torch
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.manifold import TSNE
from matplotlib.lines import Line2D

# Activation sets to visualise. 'layers' is how many per-layer files
# ("0.pt" .. "<layers-1>.pt") are read from every sentence directory.
models = [{'name': 'meta-llama/Llama-3.2-1B', 'layers': 16}, {'name': 'google/gemma-3-1b-pt', 'layers': 25}]
languages=["English", "French", "Indonesian", "Javanese", "Sundanese", "Welsh", "Turkish"]

# One fixed colour per language, shared by every subplot and figure.
cmap = plt.get_cmap('tab10')
color_map = {language: cmap(i) for i, language in enumerate(languages)}

# For every model and every fixed language, draw one figure holding one t-SNE
# panel per layer. Each panel mixes both translation directions involving that
# language; points are coloured by the *other* language of the pair.
for model_id, model_info in enumerate(models):
    model_name = model_info['name']
    n_layers = model_info['layers']

    for language in languages:
        if model_id == 0:
            # First model: 8 panels per row.
            fig, axes = plt.subplots(int(n_layers/8), 8, figsize=(32, int(n_layers/2)))
        else:
            # Remaining models: fixed 5x5 grid (25 panels).
            fig, axes = plt.subplots(5, 5, figsize=(20, 20))
        axes = axes.flatten()

        # (source, target, label) triples: first every other language
        # translated INTO `language` (labelled by source), then `language`
        # translated into every other language (labelled by target).
        other_languages = [other for other in languages if other != language]
        directions = (
            [(other, language, other) for other in other_languages]
            + [(language, other, other) for other in other_languages]
        )

        for layer in range(n_layers):
            label_language = []
            latent = []

            for source_language, target_language, point_label in directions:
                base_path = f"./activations/flores/{model_name}/{source_language}/{target_language}/"
                # os.listdir returns entries in arbitrary order; sorting keeps
                # the sample order (and so the t-SNE input) stable across runs.
                for text_id in sorted(os.listdir(base_path)):
                    text_path = os.path.join(base_path, text_id)
                    if not os.path.isdir(text_path):
                        continue
                    activation_values = torch.load(os.path.join(text_path, f"{layer}.pt"))
                    # Files hold either a single 1-D vector or a 2-D matrix of
                    # which only the last row is used (presumably the last
                    # token - confirm against the extraction hook).
                    if activation_values.ndim > 1:
                        activation_values = activation_values[-1, :]
                    latent.append(activation_values.to(torch.float32).numpy())
                    label_language.append(point_label)

            latent = np.array(latent)
            label_language = np.array(label_language)
            tsne = TSNE(n_components=2, random_state=42)
            latent_2d = tsne.fit_transform(latent)

            ax = axes[layer]
            for lang in languages:
                mask = label_language == lang
                ax.scatter(latent_2d[mask, 0], latent_2d[mask, 1],
                           label=lang, color=color_map[lang], alpha=0.6, s=10)

            ax.set_title(f"Layer {layer}", fontsize=10)

        legend_elements = [Line2D([0], [0], marker='o', color='w',
                                  label=lang, markerfacecolor=color_map[lang],
                                  markersize=8, alpha=0.6) for lang in languages]
        # Leave the top 10% of the figure free for the legend and title.
        plt.tight_layout(rect=[0, 0, 1, 0.90])
        fig.legend(handles=legend_elements,
                   loc='upper center', bbox_to_anchor=(0.5, 0.96),
                   ncol=len(languages), title='Languages')
        # Double-quoted f-string: reusing the outer quote character inside the
        # braces is a syntax error before Python 3.12.
        plt.suptitle(f"{model_name} MT Language {language}")
        plt.show()
        # Release the figure; many large figures are created in this loop.
        plt.close(fig)