In [None]:
import os
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import pandas as pd
from tqdm import tqdm
import numpy as np
from copy import deepcopy
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import math
import pickle

import sys
# Make the project's helper module (in ../src relative to the notebook directory) importable.
# The path is registered as an absolute path so it keeps resolving correctly after the
# working directory is restored, and it is only added once so re-running the cell does not
# pile up duplicate sys.path entries.
current_dir = os.getcwd()
src_dir = os.path.abspath(os.path.join(current_dir, '..', 'src'))
if src_dir not in sys.path:
    sys.path.append(src_dir)

# The import still runs from the parent directory, as before.
# NOTE(review): presumably the helper module may rely on the working directory at import
# time -- confirm; if it does not, the chdir can be dropped entirely.
os.chdir('..')
try:
    from utils_topic_classification import inference_hooked_model, plot_by_category, plot_by_category_3d
finally:
    # Always return to the notebook directory, even when the import fails.
    os.chdir(current_dir)

In [None]:
# Restrict this process to the GPU with index 7 before the model is placed on "cuda".
# NOTE(review): the index is hard-coded and overwrites any value already set by the shell
# or a job scheduler -- confirm this is intended on the target machine.
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

In [None]:
# SIB-200 subsets used here: Korean (kr), Japanese (jp), English (en), French (fr), Indonesian (id), Sundanese (su), Javanese (jv)

In [None]:
# Short language key -> SIB-200 configuration name on the Hugging Face Hub.
sib200_configs = {
    'kr': 'kor_Hang',
    'jp': 'jpn_Jpan',
    'en': 'eng_Latn',
    'fr': 'fra_Latn',
    'id': 'ind_Latn',
    'su': 'sun_Latn',
    'jv': 'jav_Latn',
}

# One dataset per language, keyed by the short language code (insertion order preserved).
dataset_dict = {
    lang_key: load_dataset('Davlan/sib200', name=config_name)
    for lang_key, config_name in sib200_configs.items()
}

In [None]:
# Hugging Face Hub identifier of the checkpoint to load; the part after the '/' is also
# passed to the helper functions as the model's short name.
model_name = "Qwen/Qwen3-1.7B"

In [None]:
# Fetch the tokenizer and the causal-LM weights for `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# torch_dtype="auto" lets the library pick the dtype; device_map="cuda" places the weights on the GPU.
load_options = {"torch_dtype": "auto", "device_map": "cuda"}
model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)

# Switch to evaluation mode; as the cell's last expression this also displays the model.
model.eval()

In [None]:
def _build_prompt(instruction, topic_names, text_label, answer_label):
    """Assemble a topic-classification prompt template.

    The result is the instruction line, a blank line, one "- <topic>" bullet per entry of
    `topic_names`, a blank line, "<text_label>: {sentence}", a blank line and finally
    "<answer_label>:". The literal "{sentence}" placeholder is left in place for a later
    `str.format(sentence=...)` call.
    """
    bullet_list = "\n".join(f"- {name}" for name in topic_names)
    return f"{instruction}\n\n{bullet_list}\n\n{text_label}: {{sentence}}\n\n{answer_label}:"


# English prompt template.
prompt_en = _build_prompt(
    "Classify the topic of the following text. Choose exactly one of the following topics:",
    ["geography", "science/technology", "entertainment", "politics", "health", "travel", "sports"],
    "Text",
    "Topic",
)

# Indonesian prompt template (same structure, translated wording and topic names).
prompt_id = _build_prompt(
    "Klasifikasikan topik dari teks berikut. Pilih tepat satu dari topik-topik berikut:",
    ["geografi", "sains/teknologi", "hiburan", "politik", "kesehatan", "perjalanan", "olahraga"],
    "Teks",
    "Topik",
)


In [None]:
# Sanity check: show which container type load_dataset returned for the Sundanese subset.
print(type(dataset_dict['su']))

In [None]:
# Run the hooked model over every language twice: first with the English instruction
# prompt, then with no prompt at all (initial_prompt=None, prompt_lang=None).
short_model_name = model_name.split('/')[-1]
prompt_settings = [
    (prompt_en, 'en'),
    (None, None),
]

for lang, initial_dataset in dataset_dict.items():
    print(f"Processing language: {lang}")
    for initial_prompt, prompt_lang in prompt_settings:
        inference_hooked_model(
            initial_dataset=initial_dataset,
            lang=lang,
            model=model,
            tokenizer=tokenizer,
            model_name=short_model_name,
            initial_prompt=initial_prompt,
            prompt_lang=prompt_lang,
            save_results=False,
            is_base_model=True,
        )

# Plot

In [None]:
# Topic labels passed to the plotting helpers.
topics = [
    'science/technology',
    'travel',
    'politics',
    'sports',
    'health',
    'entertainment',
    'geography',
]
# Language keys, in the order they were inserted into dataset_dict.
languages = [*dataset_dict]

In [None]:
from glob import glob

# Recursively collect every saved .pt file under the English-prompt output directory
# of the current model, then show how many were found.
model_dir_name = model_name.split("/")[-1]
activation_pattern = f'../outputs_1token/{model_dir_name}/prompt_en/**/*.pt'
activation_paths = glob(activation_pattern, recursive=True)
len(activation_paths)

## Single Forward Pass

In [None]:
# Switch the analysis to a different checkpoint.
# The plotting cells below take the model's short name from `model_name` but the layer
# count from `model`, so the tokenizer and model are reloaded here to keep all three in
# sync; otherwise the layer count of the previously loaded checkpoint would be paired
# with this model's saved outputs.
model_name = "google/gemma-3-1b-pt"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map="cuda"
)
# Evaluation mode, matching the first model-loading cell; the trailing semicolon keeps the
# notebook from echoing the module here.
model.eval();

In [None]:
# Display the currently loaded model; its layer list is what the plotting cells below count.
model

### Plot by Topics

In [None]:
# 3-D plot of the raw-prompt outputs, grouped by topic.
model_short_name = model_name.split('/')[-1]
# BLOOM-style models keep their blocks under transformer.h; the others under model.layers.
if 'bloom' in model_name:
    num_layers = len(model.transformer.h)
else:
    num_layers = len(model.model.layers)

plot_by_category_3d(
    dataset_dict=dataset_dict,
    model_name=model_short_name,
    num_layers=num_layers,
    labels=topics,
    languages=languages,
    outputs_dir='outputs_1token',
    prompt_lang='raw',
    save_plot=True,
    save_plot_indicator='topics',
    show_plot=False,
    save_tsne=True,
    calculate_tsne=True,
)

In [None]:
# Plot of the raw-prompt outputs, grouped by topic.
model_short_name = model_name.split('/')[-1]
# BLOOM-style models keep their blocks under transformer.h; the others under model.layers.
if 'bloom' in model_name:
    num_layers = len(model.transformer.h)
else:
    num_layers = len(model.model.layers)

plot_by_category(
    dataset_dict=dataset_dict,
    model_name=model_short_name,
    num_layers=num_layers,
    labels=topics,
    languages=languages,
    outputs_dir='outputs_1token',
    prompt_lang='raw',
    save_plot=True,
    save_plot_indicator='topics',
    show_plot=False,
    save_tsne=True,
    calculate_tsne=True,
)

### Plot by Language

In [None]:
# Plot of the raw-prompt outputs, grouped by language (the language keys double as labels).
model_short_name = model_name.split('/')[-1]
# BLOOM-style models keep their blocks under transformer.h; the others under model.layers.
if 'bloom' in model_name:
    num_layers = len(model.transformer.h)
else:
    num_layers = len(model.model.layers)

plot_by_category(
    dataset_dict=dataset_dict,
    model_name=model_short_name,
    num_layers=num_layers,
    labels=languages,
    languages=languages,
    outputs_dir='outputs_1token',
    prompt_lang='raw',
    save_plot=True,
    save_plot_indicator='languages',
    show_plot=False,
    save_tsne=True,
    calculate_tsne=True,
)

## Last Token until generation ends

### Plot by Category

In [None]:
# Per-layer 2-D t-SNE of the activations saved under outputs_last/prompt_en, one panel per
# layer, each point colored by the row's 'category' value.
# NOTE(review): `df_id` is not defined in any cell above; it must provide the 'index_id'
# and 'category' columns before this cell can run -- confirm where it is built.
layers = 28  # NOTE(review): hard-coded; presumably the layer count of the model that produced outputs_last -- confirm
labels = ['science/technology', 'travel', 'politics', 'sports', 'health', 'entertainment', 'geography']
cmap = plt.get_cmap('tab10')
color_map = {category: cmap(i) for i, category in enumerate(labels)}

# Four rows of panels; round the column count up so a layer count that is not a
# multiple of four still gets one panel per layer.
n_rows = 4
n_cols = math.ceil(layers / n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 10))
axes = axes.flatten()

# The (index_id, category) pairs do not depend on the layer, so extract them once
# instead of re-iterating the DataFrame rows for every layer.
samples = list(zip(df_id['index_id'], df_id['category']))

for layer in range(layers):
    # Collect this layer's activation for every sample in both languages.
    activation_np = []
    color_points = []
    for index_id, category in samples:
        for lang in ['id', 'en']:
            # index_id is a plain local here, so the f-string needs no nested quotes
            # (nesting the same quote type is a SyntaxError before Python 3.12).
            activation_path = f'outputs_last/prompt_en/{lang}/{index_id}/{layer}.pt'
            # map_location='cpu' lets the file load even when the device it was saved
            # from is not available in this session.
            activation = torch.load(activation_path, map_location='cpu')
            activation = activation.float()
            activation_np.append(activation.cpu().numpy())
            color_points.append(color_map[category])
    # assumes every saved tensor is 1-D with the same length, so the stacked array is
    # (n_samples, n_features) as TSNE expects -- TODO confirm
    activation_np = np.array(activation_np)

    # Project to 2-D with a fixed seed.
    tsne = TSNE(n_components=2, random_state=42)
    activation_2d = tsne.fit_transform(activation_np)

    # Scatter the projection, colored by category.
    ax = axes[layer]
    ax.set_title(f'Layer {layer + 1}')
    ax.scatter(activation_2d[:, 0], activation_2d[:, 1], c=color_points, s=10, alpha=0.5)
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')

# Hide any surplus panels left over when the grid is larger than the layer count.
for unused_ax in axes[layers:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Per-layer 2-D t-SNE of the activations saved under outputs_last/prompt_id, one panel per
# layer, each point colored by the row's 'category' value.
# NOTE(review): `df_id` is not defined in any cell above; it must provide the 'index_id'
# and 'category' columns before this cell can run -- confirm where it is built.
layers = 28  # NOTE(review): hard-coded; presumably the layer count of the model that produced outputs_last -- confirm
labels = ['science/technology', 'travel', 'politics', 'sports', 'health', 'entertainment', 'geography']
cmap = plt.get_cmap('tab10')
color_map = {category: cmap(i) for i, category in enumerate(labels)}

# Four rows of panels; round the column count up so a layer count that is not a
# multiple of four still gets one panel per layer.
n_rows = 4
n_cols = math.ceil(layers / n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 10))
axes = axes.flatten()

# The (index_id, category) pairs do not depend on the layer, so extract them once
# instead of re-iterating the DataFrame rows for every layer.
samples = list(zip(df_id['index_id'], df_id['category']))

for layer in range(layers):
    # Collect this layer's activation for every sample in both languages.
    activation_np = []
    color_points = []
    for index_id, category in samples:
        for lang in ['id', 'en']:
            # index_id is a plain local here, so the f-string needs no nested quotes
            # (nesting the same quote type is a SyntaxError before Python 3.12).
            activation_path = f'outputs_last/prompt_id/{lang}/{index_id}/{layer}.pt'
            # map_location='cpu' lets the file load even when the device it was saved
            # from is not available in this session.
            activation = torch.load(activation_path, map_location='cpu')
            activation = activation.float()
            activation_np.append(activation.cpu().numpy())
            color_points.append(color_map[category])
    # assumes every saved tensor is 1-D with the same length, so the stacked array is
    # (n_samples, n_features) as TSNE expects -- TODO confirm
    activation_np = np.array(activation_np)

    # Project to 2-D with a fixed seed.
    tsne = TSNE(n_components=2, random_state=42)
    activation_2d = tsne.fit_transform(activation_np)

    # Scatter the projection, colored by category.
    ax = axes[layer]
    ax.set_title(f'Layer {layer + 1}')
    ax.scatter(activation_2d[:, 0], activation_2d[:, 1], c=color_points, s=10, alpha=0.5)
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')

# Hide any surplus panels left over when the grid is larger than the layer count.
for unused_ax in axes[layers:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Per-layer 2-D t-SNE of the activations saved under outputs_last/prompt_raw, one panel per
# layer, each point colored by the row's 'category' value.
# NOTE(review): `df_id` is not defined in any cell above; it must provide the 'index_id'
# and 'category' columns before this cell can run -- confirm where it is built.
layers = 28  # NOTE(review): hard-coded; presumably the layer count of the model that produced outputs_last -- confirm
labels = ['science/technology', 'travel', 'politics', 'sports', 'health', 'entertainment', 'geography']
cmap = plt.get_cmap('tab10')
color_map = {category: cmap(i) for i, category in enumerate(labels)}

# Four rows of panels; round the column count up so a layer count that is not a
# multiple of four still gets one panel per layer.
n_rows = 4
n_cols = math.ceil(layers / n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 10))
axes = axes.flatten()

# The (index_id, category) pairs do not depend on the layer, so extract them once
# instead of re-iterating the DataFrame rows for every layer.
samples = list(zip(df_id['index_id'], df_id['category']))

for layer in range(layers):
    # Collect this layer's activation for every sample in both languages.
    activation_np = []
    color_points = []
    for index_id, category in samples:
        for lang in ['id', 'en']:
            # index_id is a plain local here, so the f-string needs no nested quotes
            # (nesting the same quote type is a SyntaxError before Python 3.12).
            activation_path = f'outputs_last/prompt_raw/{lang}/{index_id}/{layer}.pt'
            # map_location='cpu' lets the file load even when the device it was saved
            # from is not available in this session.
            activation = torch.load(activation_path, map_location='cpu')
            activation = activation.float()
            activation_np.append(activation.cpu().numpy())
            color_points.append(color_map[category])
    # assumes every saved tensor is 1-D with the same length, so the stacked array is
    # (n_samples, n_features) as TSNE expects -- TODO confirm
    activation_np = np.array(activation_np)

    # Project to 2-D with a fixed seed.
    tsne = TSNE(n_components=2, random_state=42)
    activation_2d = tsne.fit_transform(activation_np)

    # Scatter the projection, colored by category.
    ax = axes[layer]
    ax.set_title(f'Layer {layer + 1}')
    ax.scatter(activation_2d[:, 0], activation_2d[:, 1], c=color_points, s=10, alpha=0.5)
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')

# Hide any surplus panels left over when the grid is larger than the layer count.
for unused_ax in axes[layers:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

### Plot by Language

In [None]:
# Per-layer 2-D t-SNE of the activations saved under outputs_last/prompt_en, one panel per
# layer, each point colored by the language it was loaded for.
# NOTE(review): `df_id` is not defined in any cell above; it must provide the 'index_id'
# column before this cell can run -- confirm where it is built.
layers = 28  # NOTE(review): hard-coded; presumably the layer count of the model that produced outputs_last -- confirm
# Kept under its own name so the global `languages` list used by the earlier plotting
# cells is not overwritten by this two-language subset.
plot_languages = ['id', 'en']
cmap = plt.get_cmap('tab10')
color_map = {lang: cmap(i) for i, lang in enumerate(plot_languages)}

# Four rows of panels; round the column count up so a layer count that is not a
# multiple of four still gets one panel per layer.
n_rows = 4
n_cols = math.ceil(layers / n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 10))
axes = axes.flatten()

# The sample ids do not depend on the layer, so extract them once instead of
# re-iterating the DataFrame rows for every layer.
sample_ids = list(df_id['index_id'])

for layer in range(layers):
    # Collect this layer's activation for every sample in both languages.
    activation_np = []
    color_points = []
    for index_id in sample_ids:
        for lang in plot_languages:
            # index_id is a plain local here, so the f-string needs no nested quotes
            # (nesting the same quote type is a SyntaxError before Python 3.12).
            activation_path = f'outputs_last/prompt_en/{lang}/{index_id}/{layer}.pt'
            # map_location='cpu' lets the file load even when the device it was saved
            # from is not available in this session.
            activation = torch.load(activation_path, map_location='cpu')
            activation = activation.float()
            activation_np.append(activation.cpu().numpy())
            color_points.append(color_map[lang])
    # assumes every saved tensor is 1-D with the same length, so the stacked array is
    # (n_samples, n_features) as TSNE expects -- TODO confirm
    activation_np = np.array(activation_np)

    # Project to 2-D with a fixed seed.
    tsne = TSNE(n_components=2, random_state=42)
    activation_2d = tsne.fit_transform(activation_np)

    # Scatter the projection, colored by language.
    ax = axes[layer]
    ax.set_title(f'Layer {layer + 1}')
    ax.scatter(activation_2d[:, 0], activation_2d[:, 1], c=color_points, s=10, alpha=0.5)
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')

# Hide any surplus panels left over when the grid is larger than the layer count.
for unused_ax in axes[layers:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Per-layer 2-D t-SNE of the activations saved under outputs_last/prompt_id, one panel per
# layer, each point colored by the language it was loaded for.
# NOTE(review): `df_id` is not defined in any cell above; it must provide the 'index_id'
# column before this cell can run -- confirm where it is built.
layers = 28  # NOTE(review): hard-coded; presumably the layer count of the model that produced outputs_last -- confirm
# Kept under its own name so the global `languages` list used by the earlier plotting
# cells is not overwritten by this two-language subset.
plot_languages = ['id', 'en']
cmap = plt.get_cmap('tab10')
color_map = {lang: cmap(i) for i, lang in enumerate(plot_languages)}

# Four rows of panels; round the column count up so a layer count that is not a
# multiple of four still gets one panel per layer.
n_rows = 4
n_cols = math.ceil(layers / n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 10))
axes = axes.flatten()

# The sample ids do not depend on the layer, so extract them once instead of
# re-iterating the DataFrame rows for every layer.
sample_ids = list(df_id['index_id'])

for layer in range(layers):
    # Collect this layer's activation for every sample in both languages.
    activation_np = []
    color_points = []
    for index_id in sample_ids:
        for lang in plot_languages:
            # index_id is a plain local here, so the f-string needs no nested quotes
            # (nesting the same quote type is a SyntaxError before Python 3.12).
            activation_path = f'outputs_last/prompt_id/{lang}/{index_id}/{layer}.pt'
            # map_location='cpu' lets the file load even when the device it was saved
            # from is not available in this session.
            activation = torch.load(activation_path, map_location='cpu')
            activation = activation.float()
            activation_np.append(activation.cpu().numpy())
            color_points.append(color_map[lang])
    # assumes every saved tensor is 1-D with the same length, so the stacked array is
    # (n_samples, n_features) as TSNE expects -- TODO confirm
    activation_np = np.array(activation_np)

    # Project to 2-D with a fixed seed.
    tsne = TSNE(n_components=2, random_state=42)
    activation_2d = tsne.fit_transform(activation_np)

    # Scatter the projection, colored by language.
    ax = axes[layer]
    ax.set_title(f'Layer {layer + 1}')
    ax.scatter(activation_2d[:, 0], activation_2d[:, 1], c=color_points, s=10, alpha=0.5)
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')

# Hide any surplus panels left over when the grid is larger than the layer count.
for unused_ax in axes[layers:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Per-layer 2-D t-SNE of the activations saved under outputs_last/prompt_raw, one panel per
# layer, each point colored by the language it was loaded for.
# NOTE(review): `df_id` is not defined in any cell above; it must provide the 'index_id'
# column before this cell can run -- confirm where it is built.
layers = 28  # NOTE(review): hard-coded; presumably the layer count of the model that produced outputs_last -- confirm
# Kept under its own name so the global `languages` list used by the earlier plotting
# cells is not overwritten by this two-language subset.
plot_languages = ['id', 'en']
cmap = plt.get_cmap('tab10')
color_map = {lang: cmap(i) for i, lang in enumerate(plot_languages)}

# Four rows of panels; round the column count up so a layer count that is not a
# multiple of four still gets one panel per layer.
n_rows = 4
n_cols = math.ceil(layers / n_rows)
fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 10))
axes = axes.flatten()

# The sample ids do not depend on the layer, so extract them once instead of
# re-iterating the DataFrame rows for every layer.
sample_ids = list(df_id['index_id'])

for layer in range(layers):
    # Collect this layer's activation for every sample in both languages.
    activation_np = []
    color_points = []
    for index_id in sample_ids:
        for lang in plot_languages:
            # index_id is a plain local here, so the f-string needs no nested quotes
            # (nesting the same quote type is a SyntaxError before Python 3.12).
            activation_path = f'outputs_last/prompt_raw/{lang}/{index_id}/{layer}.pt'
            # map_location='cpu' lets the file load even when the device it was saved
            # from is not available in this session.
            activation = torch.load(activation_path, map_location='cpu')
            activation = activation.float()
            activation_np.append(activation.cpu().numpy())
            color_points.append(color_map[lang])
    # assumes every saved tensor is 1-D with the same length, so the stacked array is
    # (n_samples, n_features) as TSNE expects -- TODO confirm
    activation_np = np.array(activation_np)

    # Project to 2-D with a fixed seed.
    tsne = TSNE(n_components=2, random_state=42)
    activation_2d = tsne.fit_transform(activation_np)

    # Scatter the projection, colored by language.
    ax = axes[layer]
    ax.set_title(f'Layer {layer + 1}')
    ax.scatter(activation_2d[:, 0], activation_2d[:, 1], c=color_points, s=10, alpha=0.5)
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')

# Hide any surplus panels left over when the grid is larger than the layer count.
for unused_ax in axes[layers:]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()