In [None]:
import warnings
import logging
from transformers.utils import logging as hf_logging

# Suppress any warning whose message matches the "Transparent hugepages" pattern.
warnings.filterwarnings(action="ignore", message=".*Transparent hugepages.*")

# Limit the transformers library's own logger to error-level messages.
hf_logging.set_verbosity_error()

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, 
    confusion_matrix, 
    precision_score, 
    recall_score, 
    f1_score,
    classification_report
)
import os
import torch
import torch.nn as nn
import torch_xla.runtime as xr
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.distributed.parallel_loader as pl
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup

# Turn off the Hugging Face tokenizers' internal parallelism before any tokenizer is used.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Remove the variable only if it is present: the `None` default keeps this cell
# re-runnable (a bare pop raises KeyError when the variable is unset, e.g. on a
# second execution of the cell).
# NOTE(review): presumably a Kaggle TPU-runtime workaround — confirm it is still needed.
os.environ.pop('TPU_PROCESS_ADDRESSES', None)

# --- Training configuration ---
MODEL_NAME = "tabularisai/multilingual-sentiment-analysis"
LEARNING_RATE = 1e-5
EPOCHS = 12
BATCH_SIZE = 8  # batch size handed to every DataLoader (per process during training)

# --- Load the raw data (malformed CSV lines are skipped) ---
txt = pd.read_csv("/kaggle/input/mrbeast-youtube-comment-sentiment-analysis/sentiment_analysis_dataset.csv", on_bad_lines='skip')

# A row without a label cannot be used for supervised training; left in place it
# would either break label encoding or show up as a spurious extra class.
txt = txt.dropna(subset=[txt.columns[1]])

# Column 0 is used as the input text, column 1 as the label.
text = list(txt.iloc[:, 0].astype("str"))
# Strip whitespace and normalise capitalisation so that variants such as
# "positive " and "Positive" map to the same class.
labels = list(txt.iloc[:, 1].str.strip().str.capitalize())

# Map label strings to integer ids (LabelEncoder assigns ids in sorted order).
encoder = LabelEncoder()
labels = encoder.fit_transform(labels)

# Inverse-frequency class weights: total / (num_classes * count_per_class),
# so rarer classes contribute more to the weighted loss.
class_counts = np.bincount(labels)
total_samples = len(labels)
num_classes = len(class_counts)
class_weights = total_samples / (num_classes * class_counts)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float)

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    text, labels,
    test_size=0.2,
    random_state=42,
    stratify=labels
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Each split is tokenized in a single call, so padding=True pads every sequence
# to the longest one in that split (train and test widths may differ).
train_encodings = tokenizer(X_train, padding=True, truncation=True, return_tensors='pt')
test_encodings = tokenizer(X_test, padding=True, truncation=True, return_tensors='pt')

# Pack (input_ids, attention_mask, label) triples into tensor datasets.
input_ids = train_encodings['input_ids']
attention_mask = train_encodings['attention_mask']
train_labels = torch.tensor(y_train)
train_dataset = TensorDataset(input_ids, attention_mask, train_labels)

input_ids = test_encodings['input_ids']
attention_mask = test_encodings['attention_mask']
test_labels = torch.tensor(y_test)
test_dataset = TensorDataset(input_ids, attention_mask, test_labels)

def _mp_fn(index):  # 'index' is the process index supplied by xmp.spawn
    """Fine-tune and validate the sentiment classifier on one XLA device.

    Meant to be launched once per device through ``xmp.spawn``. It reads the
    module-level ``train_dataset``, ``test_dataset``, ``class_weights_tensor``
    and ``tokenizer`` objects plus the ``MODEL_NAME``, ``LEARNING_RATE``,
    ``EPOCHS`` and ``BATCH_SIZE`` constants.

    Args:
        index: Process index passed in by ``xmp.spawn``. It is not used
            directly; the replica rank comes from ``xr.global_ordinal()``.

    Side effects:
        Prints per-epoch training/validation metrics from the master process
        and, after the last epoch, writes ``my_trained_model.pt`` and
        ``./my_tokenizer_directory`` to the working directory.
    """
    # Seed every process identically so the re-initialised classification head
    # starts from the same weights on every replica (only gradients are
    # exchanged between replicas during training).
    torch.manual_seed(42)

    # 1. ACQUIRE DEVICE
    device = torch_xla.device()
    weights_tensor = class_weights_tensor.to(device)

    # 2. MODEL LOADING
    # The head size is tied to the class-weight vector: CrossEntropyLoss needs
    # exactly one weight per logit. ignore_mismatched_sizes lets the head be
    # re-initialised when the checkpoint was trained with a different label count.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=class_weights_tensor.numel(),
        ignore_mismatched_sizes=True
    )
    model.to(device)

    # 3. DATALOADERS
    # DistributedSampler gives each process its own shard of the dataset.
    train_sampler = DistributedSampler(
        train_dataset,
        num_replicas=xr.world_size(),
        rank=xr.global_ordinal(),
        shuffle=True
    )

    # BATCH_SIZE is PER PROCESS: the global batch size is
    # BATCH_SIZE * xr.world_size().
    dataloader = DataLoader(
        train_dataset,
        batch_size=BATCH_SIZE,
        sampler=train_sampler
    )

    # Note: with its default drop_last=False the sampler pads shards with
    # repeated samples so they are equal-sized, so the validation numbers below
    # can count a few samples twice.
    test_sampler = DistributedSampler(
        test_dataset,
        num_replicas=xr.world_size(),
        rank=xr.global_ordinal(),
        shuffle=False  # No need to shuffle test data
    )

    test_dataloader = DataLoader(
        test_dataset,
        batch_size=BATCH_SIZE,
        sampler=test_sampler
    )

    # 4. OPTIMIZER & SCHEDULER
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss(weight=weights_tensor)
    num_epochs = EPOCHS
    # len(dataloader) is the number of batches this process sees per epoch,
    # i.e. the number of optimizer steps per epoch.
    total_step = len(dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=600,
        num_training_steps=total_step
    )

    # 5. TRAINING LOOP
    for epoch in range(num_epochs):
        # Without set_epoch the sampler reuses the same permutation every epoch.
        train_sampler.set_epoch(epoch)

        # ParallelLoader yields batches that are already on `device`.
        p_dataloader = pl.ParallelLoader(dataloader, [device]).per_device_loader(device)
        model.train()
        # xm.master_print prints from the master process only.
        xm.master_print(f'Epoch{epoch + 1}/{num_epochs}')
        total_train_loss = torch.tensor(0.0, device=device)
        total_train_correct = torch.tensor(0, device=device, dtype=torch.long)
        total_train_samples = torch.tensor(0, device=device, dtype=torch.long)

        for b_input_ids, b_mask, b_labels in p_dataloader:
            optimizer.zero_grad()
            # Labels are not passed to the model: the loss comes from the
            # class-weighted `criterion`, so the model's built-in loss would
            # be computed and thrown away.
            outputs = model(b_input_ids, attention_mask=b_mask)

            logits = outputs.logits
            loss = criterion(logits, b_labels)
            # detach() keeps the running total from holding on to the
            # autograd graph of every step.
            total_train_loss += loss.detach()
            preds = torch.argmax(logits, dim=1)
            total_train_correct += (preds == b_labels).sum()
            total_train_samples += b_labels.size(0)

            loss.backward()

            # Reduces gradients across replicas, then steps the optimizer.
            xm.optimizer_step(optimizer, barrier=True)

            scheduler.step()

        # Sum the per-process totals across all replicas.
        global_train_loss = xm.all_reduce(xm.REDUCE_SUM, total_train_loss)
        global_train_correct = xm.all_reduce(xm.REDUCE_SUM, total_train_correct)
        global_train_samples = xm.all_reduce(xm.REDUCE_SUM, total_train_samples)

        # len(dataloader) is batches per process, so multiply by world size.
        total_train_batches = len(dataloader) * xr.world_size()

        if total_train_batches > 0 and global_train_samples > 0:
            avg_train_loss = (global_train_loss / total_train_batches).item()
            train_accuracy = (global_train_correct.float() / global_train_samples).item()
            xm.master_print(f"Epoch {epoch + 1} complete. Global Avg Loss: {avg_train_loss:.4f}, Global Train Accuracy: {train_accuracy:.4f}")
        else:
            xm.master_print(f"Epoch {epoch + 1} complete. No training data.")

        # 6. VALIDATION LOOP
        model.eval()
        p_test_dataloader = pl.ParallelLoader(test_dataloader, [device]).per_device_loader(device)
        total_val_loss = torch.tensor(0.0, device=device)
        total_val_correct = torch.tensor(0, device=device, dtype=torch.long)
        total_val_samples = torch.tensor(0, device=device, dtype=torch.long)

        with torch.no_grad():
            for b_input_ids, b_mask, b_labels in p_test_dataloader:
                outputs = model(b_input_ids, attention_mask=b_mask)

                # The accumulation must happen inside the loop: every batch
                # contributes to the totals, not just the last one.
                logits = outputs.logits
                loss = criterion(logits, b_labels)
                total_val_loss += loss
                preds = torch.argmax(logits, dim=1)
                total_val_correct += (preds == b_labels).sum()
                total_val_samples += b_labels.size(0)

        global_val_loss = xm.all_reduce(xm.REDUCE_SUM, total_val_loss)
        global_val_correct = xm.all_reduce(xm.REDUCE_SUM, total_val_correct)
        global_val_samples = xm.all_reduce(xm.REDUCE_SUM, total_val_samples)

        # len(test_dataloader) is batches per process, so multiply by world size.
        total_val_batches = len(test_dataloader) * xr.world_size()

        if total_val_batches > 0 and global_val_samples > 0:
            avg_test_loss = (global_val_loss / total_val_batches).item()
            test_accuracy = (global_val_correct.float() / global_val_samples).item()
            xm.master_print(f"Epoch {epoch + 1} complete. Avg Loss: {avg_test_loss:.4f}, Validation Accuracy: {test_accuracy:.4f}")
        else:
            xm.master_print(f"Epoch {epoch + 1} complete. No validation data.")

    # 7. SAVE ARTIFACTS
    xm.master_print("\n")
    xm.master_print("Saving model and tokenizer...")
    # xm.save is reached by EVERY process: with its default master_only=True
    # only the master writes the file, and calling it everywhere is safe if
    # the function synchronises the replicas internally (calling it from the
    # master alone could then block forever).
    xm.save(model.state_dict(), "my_trained_model.pt")
    if xm.is_master_ordinal():
        # Save the tokenizer next to the weights so both can be reloaded later.
        tokenizer.save_pretrained("./my_tokenizer_directory")
        print("Model and tokenizer saved successfully.")
        
if __name__ == "__main__":
    # Launch one training process per available device (nprocs=None lets
    # torch_xla decide the count). 'fork' makes the children inherit the
    # datasets, tokenizer and constants already built at module level,
    # which _mp_fn reads as globals.
    xmp.spawn(
        _mp_fn,
        args=(),
        nprocs=None,
        start_method='fork',
    )


In [None]:
# --- 1. Define Paths and Model Config ---
# MODEL_NAME, BATCH_SIZE, test_dataset and y_test are defined by the training cell.
SAVED_MODEL_PATH = "/kaggle/working/my_trained_model.pt"
SAVED_TOKENIZER_PATH = "/kaggle/working/my_tokenizer_directory"
NUM_LABELS = 3
# Display names for the integer class ids, in id order.
LABEL_NAMES = ['Negative (0)', 'Neutral (1)', 'Positive (2)']

# --- 2. Load Model and Tokenizer ---
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(SAVED_TOKENIZER_PATH)

print("Loading model architecture...")
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS,
    ignore_mismatched_sizes=True
)

print("Loading trained weights...")
# map_location="cpu" loads the checkpoint regardless of the device it was saved
# from; weights_only=True restricts unpickling to tensors and plain containers,
# which is all a state dict contains.
state_dict = torch.load(SAVED_MODEL_PATH, map_location="cpu", weights_only=True)
model.load_state_dict(state_dict)

# Use GPU if available, otherwise CPU. This does NOT use TPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
print(f"Model loaded successfully on {device}")

# --- 3. Create a SIMPLE DataLoader ---
# Plain sequential loader over the full test set: no DistributedSampler and no
# ParallelLoader, so predictions come out in the same order as y_test.
eval_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE
)

# --- 4. Run Evaluation ---
print("Running evaluation...")
all_predictions = []
with torch.no_grad():
    # Each batch is (input_ids, attention_mask, labels); labels are not needed
    # for inference.
    for b_input_ids, b_mask, _ in eval_dataloader:
        b_input_ids = b_input_ids.to(device)
        b_mask = b_mask.to(device)

        outputs = model(b_input_ids, attention_mask=b_mask)
        logits = outputs.logits

        # Get predictions and move them back to CPU
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        all_predictions.append(preds)

# Flatten all predictions into a single numpy array
y_pred = np.concatenate(all_predictions)

# --- 5. Get Metrics ---
# Every test sample must have exactly one prediction before scoring.
assert len(y_pred) == len(y_test), (
    f"prediction/label count mismatch: {len(y_pred)} != {len(y_test)}"
)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro')
recall = recall_score(y_test, y_pred, average='macro')
f1 = f1_score(y_test, y_pred, average='macro')

print("--- Performance ---")
print(f"Accuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")
print("\n")

print("--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=LABEL_NAMES))
print("\n")


In [None]:
# Single-sentence smoke test; relies on `model`, `tokenizer` and `device`
# already existing in the kernel.
model.eval()

test_text = ["that's how you use power of money, not billion dollars cars. Bless him"]

# Tokenize, then move the encoded batch onto the model's device.
inputs = tokenizer(test_text, padding=True, truncation=True, return_tensors='pt')
inputs = inputs.to(device)

# Forward pass without gradient tracking.
with torch.no_grad():
    outputs = model(inputs['input_ids'], attention_mask=inputs['attention_mask'])
    logits = outputs.logits
print(logits)

# The index of the largest logit is the predicted class id.
prediction = logits.argmax(dim=1)
print(f"Prediction: {prediction.item()} (0=Neg, 1=Neu, 2=Pos)")

In [None]:
import IPython

# Ask the running kernel to shut down and restart itself.
# NOTE(review): presumably done to release the accelerator/memory held by this
# session — confirm; everything in the kernel namespace is lost afterwards.
app = IPython.Application.instance()
kernel = app.kernel
kernel.do_shutdown(restart=True)