# NPPE-1 Multilingual Sentiment Analysis Notebook

This notebook demonstrates how to fine-tune the **LLaMA 3.1-8B-Instruct** model (loaded from Kaggle storage) using a LoRA adapter for multilingual sentiment classification. It includes all the necessary steps from loading the data to creating the final submission file. Progress bars via `tqdm` are used during training and inference.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llama-3.1/transformers/8b-instruct/2/model.safetensors.index.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00003-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/LICENSE
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00001-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/README.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/USE_POLICY.md
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/tokenizer_config.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00004-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/special_tokens_map.json
/kaggle/input/llama-3.1/transformers/8b-instruct/2/.gitattributes
/kaggle/input/llama-3.1/transformers/8b-instruct/2/model-00002-of-00004.safetensors
/kaggle/input/llama-3.1/transformers/8b-instruct/2/gener

In [2]:
%%capture
!pip install bitsandbytes
!pip install accelerate
!pip install peft
!pip install --upgrade transformers
!pip install evaluate

In [3]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
import random

from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    get_linear_schedule_with_warmup,
    DataCollatorWithPadding
)
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig
torch.manual_seed(42)
random.seed(42)

In [4]:
# Adjust the file paths based on your Kaggle dataset folder structure
train_csv_path = "/kaggle/input/multi-lingual-sentiment-analysis/train.csv"
test_csv_path  = "/kaggle/input/multi-lingual-sentiment-analysis/test.csv"

tran_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

print("Training data shape:", tran_df.shape)
print(tran_df.head())

print("\nTest data shape:", test_df.shape)
print(test_df.head())

Training data shape: (1000, 4)
   ID                                           sentence     label language
0   1  কর্মীদের ভাল আচরণ এবং খাবারের পাশাপাশি পানীয় ...  Positive       bn
1   2  ગોદરેજ સેન્ટ્રલ એસીમાં તેના કન્ડેન્સર પર 2 વર્...  Positive       gu
2   3  கதைக்களம் பிடித்திருந்தது, அனைத்து நடிகர்களும்...  Positive       ta
3   4  ਵੌਇਸ-ਓਵਰ ਬਹੁਤ ਵਧੀਆ ਸੀ ਅਤੇ ਕਹਾਣੀ ਦੀ ਸੀਮਾ ਵਿੱਚ ਇ...  Positive       pa
4   5  जुथानि थाखाय जायगा गैया। गुबुन मुवा सोग्रा जाय...  Negative       bd

Test data shape: (100, 3)
   ID                                           sentence language
0   1                    1120 mAh, ਓਵਰਚਾਰਜਿੰਗ ਦੀ ਸੁਰੱਖਿਆ       pa
1   2  તે સઘન મોઇશ્ચરાઇઝિંગ પ્રદાન કરે છે અને સરસ સ્વ...       gu
2   3                      1120 ಎಂಎಎಚ್, ಮಿತಿಮೀರಿದ ರಕ್ಷಣೆ       kn
3   4  ভাৰতত নিৰ্মিত সৰ্বশ্ৰেষ্ঠ পাৰফিউম ব্ৰেণ্ডবোৰৰ ...       as
4   5  میں نے حال ہی میں "انفولڈ" سے ایک ٹیمپلیٹ خرید...       ur


In [None]:
from datasets import Dataset

# Map labels to integers (e.g., 0: Negative, 1: Positive)
label_map = {"Negative": 0, "Positive": 1}
tran_df["label"] = tran_df["label"].map(label_map)

# Create datasets from the DataFrames
full_train_dataset = Dataset.from_pandas(tran_df)
test_dataset = Dataset.from_pandas(test_df)

# Split the training dataset into train and validation sets (e.g., 80% train, 20% validation)
split_datasets = full_train_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_datasets["train"]
val_dataset = split_datasets["test"]

print("Train size:", len(train_dataset))
print("Validation size:", len(val_dataset))
print("Test size:", len(test_dataset))

model_name = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"  # update as needed

# Load the tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(example):
    return tokenizer(
        example["sentence"],
        padding="max_length",
        truncation=True,
        max_length=512  # adjust max_length if needed
    )

# Tokenize the datasets using the Hugging Face datasets library
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the format for PyTorch
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
tokenized_val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
tokenized_test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

print("Tokenization complete!")

Train size: 800
Validation size: 200
Test size: 100


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Tokenization complete!


In [6]:
from transformers import BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,  # computation in float16
    bnb_4bit_quant_type="nf4",             # common quantization type; adjust if needed
    llm_int8_enable_fp32_cpu_offload=True,
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    quantization_config=bnb_config,
    device_map="auto",        # automatically place model on available GPU(s)
    torch_dtype=torch.float16  # use float16 for model parameters
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print("Model loaded with 4-bit quantization and FP16 precision.")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/llama-3.1/transformers/8b-instruct/2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded with 4-bit quantization and FP16 precision.


In [7]:
from peft import prepare_model_for_kbit_training

lora_config = LoraConfig(
    r=16,                # LoRA rank
    lora_alpha=32,      # scaling factor
    target_modules=["q_proj", "v_proj"],  # adjust target modules if necessary
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS"  # task: sequence classification
)
model = prepare_model_for_kbit_training(model) #adapting to 4bit scaling fp16 was giving error in next step
model = get_peft_model(model, lora_config)
print("LoRA configuration applied to the model.")

LoRA configuration applied to the model.


In [8]:
import numpy as np
import torch
from transformers import TrainingArguments, Trainer
import evaluate

# --- Ensure the tokenizer has a pad token ---
if tokenizer.pad_token is None:
    # Use the EOS token as the pad token
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding=True,
    return_tensors="pt"
)

# Update your training arguments to log more frequently and reduce training time.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",        # Evaluate every eval_steps
    eval_steps=50,
    num_train_epochs=2,           
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=True,                    # Enable FP16 training
    half_precision_backend="amp", # Use AMP for half-precision backend
    gradient_accumulation_steps=2,  # Accumulate gradients over 2 steps
    adam_beta1=0.05,
    adam_beta2=0.995,
    learning_rate=2e-4,
    weight_decay=0.002,
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=50,             # Log every 10 steps for more frequent feedback
    logging_first_step=True,      # Log the very first step
    save_steps=50,
    save_total_limit=20,
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none"              # Logging to console; you can change to "wandb" or "tensorboard" if desired
)

# --- Define compute_metrics (weighted F1) ---
metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="weighted")

# Create your Trainer instance as before:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
    data_collator = data_collator,
)

# Start training
trainer.train()


Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss,F1
50,1.104,0.356712,0.859944
100,0.362,0.225614,0.929874
150,0.1895,0.231672,0.93491
200,0.1505,0.250217,0.93491


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=200, training_loss=0.45414862215518953, metrics={'train_runtime': 3332.3081, 'train_samples_per_second': 0.48, 'train_steps_per_second': 0.06, 'total_flos': 3.43396526850048e+16, 'train_loss': 0.45414862215518953, 'epoch': 2.0})

In [9]:
eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

Evaluation results: {'eval_loss': 0.2256142646074295, 'eval_f1': 0.92987354476114, 'eval_runtime': 122.4084, 'eval_samples_per_second': 1.634, 'eval_steps_per_second': 0.408, 'epoch': 2.0}


In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd

# Use the tokenized test dataset
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=25, shuffle=False)

model.eval()
all_preds = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Inference"):
        inputs = {
            "input_ids": batch["input_ids"].to("cuda"),
            "attention_mask": batch["attention_mask"].to("cuda")
        }
        outputs = model(**inputs)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=-1)
        all_preds.extend(preds.cpu().numpy().tolist())

# Map integer predictions back to sentiment labels
inv_label_map = {v: k for k, v in label_map.items()}  # e.g. {0: "Negative", 1: "Positive"}
pred_sentiments = [inv_label_map[pred] for pred in all_preds]

# Create submission DataFrame (ensure IDs match those in test_df)
submission = pd.DataFrame({
    "ID": test_df["ID"],
    "label": pred_sentiments
})

submission.to_csv("submission.csv", index=False)
print("Submission file created as submission.csv")


Inference: 100%|██████████| 4/4 [00:53<00:00, 13.32s/it]

Submission file created as submission.csv



