In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:

# ================================================================
# ‚úÖ AMAZON REVIEWS SENTIMENT CLASSIFIER (ROBERTA-BASE)
# ================================================================

import os
import json
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    TextClassificationPipeline,
)
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# ================================================================
# üß© LOAD AMAZON JSONL DATA
# ================================================================
def load_amazon_jsonl(folder_path, max_samples=5000):
    data = []
    for file in os.listdir(folder_path):
        if file.endswith(".jsonl"):
            path = os.path.join(folder_path, file)
            print(f"üìÇ Loading: {file}")
            with open(path, "r", encoding="utf-8") as f:
                for line in f:
                    try:
                        obj = json.loads(line)
                        text = (
                            obj.get("reviewText")
                            or obj.get("review_text")
                            or obj.get("review_body")
                            or obj.get("text")
                        )
                        rating = obj.get("overall") or obj.get("rating")
                        if text and rating:
                            data.append({
                                "text": text.strip(),
                                "label": int(rating) - 1  # labels 0‚Äì4
                            })
                        if len(data) >= max_samples:
                            break
                    except json.JSONDecodeError:
                        continue
    df = pd.DataFrame(data)
    print(f"‚úÖ Loaded {len(df)} samples.")
    print("Sample rows:\n", df.head(2).to_dict(orient="records"))
    print("Label distribution:\n", df['label'].value_counts().to_dict())
    return df


# ================================================================
# ‚öôÔ∏è TRAINER CLASS
# ================================================================
class SentimentTrainer:
    def __init__(self, model_name="roberta-base"):
        self.model_name = model_name
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"üöÄ Device: {self.device}")

    def prepare(self, num_labels=5):
        print(f"üöÄ Loading tokenizer and model: {self.model_name}")
        tok = AutoTokenizer.from_pretrained(
            self.model_name, use_fast=True, local_files_only=False
        )
        model = AutoModelForSequenceClassification.from_pretrained(
            self.model_name, num_labels=num_labels
        )
        return model.to(self.device), tok

    def tokenize(self, examples, tokenizer):
        return tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=128,
        )

    def train(self, model, tokenizer, train_df, eval_df):
        print("üîÅ Tokenizing train dataset (num_proc=1)...")
        train_ds = Dataset.from_pandas(train_df)
        eval_ds = Dataset.from_pandas(eval_df)

        train_ds = train_ds.map(lambda e: self.tokenize(e, tokenizer), batched=True)
        eval_ds = eval_ds.map(lambda e: self.tokenize(e, tokenizer), batched=True)

        columns = ["input_ids", "attention_mask", "label"]
        train_ds.set_format(type="torch", columns=columns)
        eval_ds.set_format(type="torch", columns=columns)

        args = TrainingArguments(
            output_dir="./results",
            eval_strategy="steps",
            save_strategy="epoch",
            learning_rate=2e-5,
            per_device_train_batch_size=8,
            per_device_eval_batch_size=8,
            num_train_epochs=3,
            weight_decay=0.01,
            logging_dir="./logs",
            logging_steps=50,
            report_to="none",
            disable_tqdm=False,
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=eval_ds,
        )

        print("‚úÖ Pre-training checks passed. Starting Trainer.train() now.")
        trainer.train()
        print("‚úÖ Trainer finished.")
        return trainer


# ================================================================
# üìä EVALUATION FUNCTION
# ================================================================
def evaluate_model(trainer, eval_df, tokenizer, model):
    eval_ds = Dataset.from_pandas(eval_df)
    eval_ds = eval_ds.map(lambda e: tokenizer(e["text"], truncation=True, padding="max_length", max_length=128), batched=True)
    eval_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

    preds = trainer.predict(eval_ds)
    preds_logits = preds.predictions
    preds_labels = np.argmax(preds_logits, axis=1)
    true_labels = preds.label_ids

    acc = accuracy_score(true_labels, preds_labels)
    f1 = f1_score(true_labels, preds_labels, average="weighted")
    print("\nüìà Evaluation Results:")
    print(f"Accuracy: {acc:.4f}")
    print(f"Weighted F1: {f1:.4f}")
    print("\nDetailed Classification Report:\n")
    print(classification_report(true_labels, preds_labels))
    return acc, f1


# ================================================================
# üí¨ PREDICTION FUNCTION
# ================================================================
def predict_sentiment(texts, tokenizer, model):
    pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)
    label_map = {
        0: "‚≠ê Terrible",
        1: "‚≠ê Bad",
        2: "‚≠ê‚≠ê Neutral",
        3: "‚≠ê‚≠ê‚≠ê Good",
        4: "‚≠ê‚≠ê‚≠ê‚≠ê Excellent"
    }
    outputs = pipe(texts, truncation=True)
    print("\nüí¨ Predictions:")
    for t, o in zip(texts, outputs):
        label_id = int(o["label"].split("_")[-1]) if "_" in o["label"] else np.argmax(o["score"])
        sentiment = label_map.get(label_id, f"Label {label_id}")
        print(f"\nüìù Text: {t}\nüîπ Sentiment: {sentiment} ({o['score']:.2f})")


# ================================================================
# üíæ SAVE / LOAD MODEL HELPERS
# ================================================================
def save_model(trainer, tokenizer, save_dir="./roberta_sentiment"):
    os.makedirs(save_dir, exist_ok=True)
    trainer.save_model(save_dir)
    tokenizer.save_pretrained(save_dir)
    print(f"üíæ Model and tokenizer saved to {save_dir}")

def load_model(save_dir="./roberta_sentiment"):
    model = AutoModelForSequenceClassification.from_pretrained(save_dir)
    tokenizer = AutoTokenizer.from_pretrained(save_dir)
    print(f"üì¶ Model loaded from {save_dir}")
    return model, tokenizer


# ================================================================
# üß† MAIN EXECUTION
# ================================================================
def main(data_path, max_samples=5000):
    df = load_amazon_jsonl(data_path, max_samples)
    if len(df) == 0:
        raise ValueError("No data loaded. Check your JSONL keys or folder path.")
    train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)

    trainer_obj = SentimentTrainer("roberta-base")
    model, tokenizer = trainer_obj.prepare(num_labels=5)
    trainer = trainer_obj.train(model, tokenizer, train_df, eval_df)

    # Evaluate
    evaluate_model(trainer, eval_df, tokenizer, model)

    # Predict sample texts
    sample_texts = [
        "This product is amazing, I absolutely love it!",
        "Terrible quality, completely disappointed.",
        "It‚Äôs okay, not too bad but not great either."
    ]
    predict_sentiment(sample_texts, tokenizer, model)

    # Save model
    save_model(trainer, tokenizer)


# ================================================================
# üöÄ RUN
# ================================================================
if __name__ == "__main__":
    main("/kaggle/input/amazonreviews", max_samples=50000)

2025-11-01 13:25:55.617040: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762003555.639591     139 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762003555.646368     139 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


üìÇ Loading: All_Beauty.jsonl
üìÇ Loading: Appliances.jsonl
üìÇ Loading: Health_and_Personal_Care.jsonl
‚úÖ Loaded 50002 samples.
Sample rows:
 [{'text': "This spray is really nice. It smells really good, goes on really fine, and does the trick. I will say it feels like you need a lot of it though to get the texture I want. I have a lot of hair, medium thickness. I am comparing to other brands with yucky chemicals so I'm gonna stick with this. Try it!", 'label': 4}, {'text': 'This product does what I need it to do, I just wish it was odorless or had a soft coconut smell. Having my head smell like an orange coffee is offputting. (granted, I did know the smell was described but I was hoping it would be light)', 'label': 3}]
Label distribution:
 {4: 30161, 3: 6875, 0: 5355, 2: 4598, 1: 3013}
üöÄ Device: cuda
üöÄ Loading tokenizer and model: roberta-base


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


üîÅ Tokenizing train dataset (num_proc=1)...


Map:   0%|          | 0/40001 [00:00<?, ? examples/s]

Map:   0%|          | 0/10001 [00:00<?, ? examples/s]

‚úÖ Pre-training checks passed. Starting Trainer.train() now.


Step,Training Loss,Validation Loss
50,1.2114,0.930951
100,0.9561,0.809791
150,0.8413,0.781279
200,0.8079,0.739547
250,0.7736,0.773079
300,0.7784,0.713349
350,0.7533,0.71299
400,0.7714,0.683276
450,0.7394,0.682257
500,0.7648,0.699761


‚úÖ Trainer finished.


Map:   0%|          | 0/10001 [00:00<?, ? examples/s]


üìà Evaluation Results:
Accuracy: 0.7575
Weighted F1: 0.7474

Detailed Classification Report:

              precision    recall  f1-score   support

           0       0.70      0.75      0.72      1070
           1       0.36      0.31      0.34       597
           2       0.46      0.49      0.47       899
           3       0.51      0.36      0.42      1399
           4       0.88      0.93      0.91      6036

    accuracy                           0.76     10001
   macro avg       0.58      0.57      0.57     10001
weighted avg       0.74      0.76      0.75     10001


üí¨ Predictions:

üìù Text: This product is amazing, I absolutely love it!
üîπ Sentiment: ‚≠ê‚≠ê‚≠ê‚≠ê Excellent (0.99)

üìù Text: Terrible quality, completely disappointed.
üîπ Sentiment: ‚≠ê Terrible (0.96)

üìù Text: It‚Äôs okay, not too bad but not great either.
üîπ Sentiment: ‚≠ê‚≠ê Neutral (0.85)
üíæ Model and tokenizer saved to ./roberta_sentiment


In [None]:
!pip install transformers==4.43.3


In [None]:
import transformers
print(transformers.__version__)


In [None]:
!pip uninstall -y peft


In [None]:
!pip install peft==0.10.0


In [2]:
import transformers, peft, accelerate
print("Transformers:", transformers.__version__)
print("PEFT:", peft.__version__)
print("Accelerate:", accelerate.__version__)


Transformers: 4.43.3
PEFT: 0.10.0
Accelerate: 0.33.0


In [None]:
!pip uninstall -y transformers peft accelerate
!pip cache purge


In [None]:
!pip install transformers==4.43.3 peft==0.10.0 accelerate==0.33.0 --no-deps
