# In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from tqdm import tqdm

# Checkpoint identifier handed to both from_pretrained() calls below
model_name = "tabularisai/multilingual-sentiment-analysis"

# Choose the compute device: CUDA when torch reports it as available, else CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the tokenizer and the sequence-classification model for that checkpoint,
# then move the model onto the chosen device
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)

# Batched prediction function with tqdm
def predict_sentiment(texts, batch_size=32, max_length=512):
    """Predict a class index for each text, running the model batch by batch.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: The texts to classify. Any iterable of strings is accepted; a
            single bare string is treated as one text instead of being sliced
            into ``batch_size``-character chunks.
        batch_size: Number of texts tokenized and sent through the model per
            step. Must be at least 1.
        max_length: Token limit passed to the tokenizer; with truncation
            enabled, longer inputs are cut to this length.

    Returns:
        A list with one integer per input text, in input order: the index of
        the largest logit the model produced for that text. An empty input
        yields ``[]`` without invoking the tokenizer or the model.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make range() yield nothing, so every text
        # would be skipped and an empty result returned without any error.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Materialize once so len() and slicing work for generators, Series, etc.
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        return []

    all_preds = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Predicting Sentiment"):
        batch_texts = texts[start:start + batch_size]
        # padding=True pads each batch to its longest member so it stacks into one tensor
        inputs = tokenizer(
            batch_texts,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        ).to(device)
        # Inference only: skip gradient bookkeeping
        with torch.no_grad():
            logits = model(**inputs).logits
        all_preds.extend(logits.argmax(dim=-1).tolist())
    return all_preds

# Load input JSONL file (one JSON object per line)
input_path = "/content/merged_output.jsonl"
df = pd.read_json(input_path, lines=True)

# Fail fast on rows without text: a missing value would otherwise reach the
# tokenizer as NaN/None partway through the run and abort it after earlier
# batches had already been computed.
missing_text = int(df["text"].isna().sum())
if missing_text:
    raise ValueError(
        f"{missing_text} row(s) in {input_path} have no 'text' value; "
        "drop or fill them before labelling"
    )

# Predict and add label column (one predicted class index per row)
df["label"] = predict_sentiment(df["text"].tolist())

# Save to new JSONL file; force_ascii=False keeps non-ASCII text unescaped
output_path = "/content/labelled_output.jsonl"
df.to_json(output_path, orient="records", lines=True, force_ascii=False)

print(f"\nLabeled dataset saved to {output_path}")