In [2]:
import torch
import random
import numpy as np

seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [3]:
import  pandas as pd
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [4]:
# Directly Downloading dataset from kaggle and loading
import kagglehub
import os

# Download latest version
path = kagglehub.dataset_download("tarkkaanko/amazon")

print("Path to dataset files:", os.listdir(path))

Downloading from https://www.kaggle.com/api/v1/datasets/download/tarkkaanko/amazon?dataset_version_number=1...


100%|██████████| 582k/582k [00:00<00:00, 721kB/s]

Extracting files...
Path to dataset files: ['amazon_reviews.csv']





# 1. Data Preprocessing

In [5]:
df = pd.read_csv(f'{path}/amazon_reviews.csv')
print(df.shape)
df.head(5)

(4915, 12)


Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4.0,No issues.,2014-07-23,138,0,0,0,0,0.0,0.0
1,1,0mie,5.0,"Purchased this for my device, it worked as adv...",2013-10-25,409,0,0,0,0,0.0,0.0
2,2,1K3,4.0,it works as expected. I should have sprung for...,2012-12-23,715,0,0,0,0,0.0,0.0
3,3,1m2,5.0,This think has worked out great.Had a diff. br...,2013-11-21,382,0,0,0,0,0.0,0.0
4,4,2&amp;1/2Men,5.0,"Bought it with Retail Packaging, arrived legit...",2013-07-13,513,0,0,0,0,0.0,0.0


In [6]:
# Removing the other nucessary columns
df = df[['reviewText','overall']]

# Convert ratings to binary labels (-1=negative, 0=neutral, 1=positive)
df['label'] = df['overall'].apply(lambda x: 0 if x <= 2 else 1 if x == 3 else 2 if x>3 else None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['overall'].apply(lambda x: 0 if x <= 2 else 1 if x == 3 else 2 if x>3 else None)


In [7]:
df.head()

Unnamed: 0,reviewText,overall,label
0,No issues.,4.0,2
1,"Purchased this for my device, it worked as adv...",5.0,2
2,it works as expected. I should have sprung for...,4.0,2
3,This think has worked out great.Had a diff. br...,5.0,2
4,"Bought it with Retail Packaging, arrived legit...",5.0,2


In [8]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,4449
0,324
1,142


In [9]:
def clean_text(text):
    # Checks if the text are actully string or not
    if not isinstance(text, str):
        text = str(text)
    # Convet to lowerCase
    text = text.lower()
    # Remove Urls
    #text = re.sub('https?://\S+|www\.\S+','',text)
    # Remove Html tags
    #text = re.sub(r'<.*?>','',text)
    # Remove special characters ann numbers
    text = re.sub(r'[^a-zA-Z\s]','',text)

    # Split the sentences into words
    text= text.split()

    # Applying stopwords and lemmatization
    text = [lemmatizer.lemmatize(word) for word in text if not word in stopwords.words('english')]

    # Remove extra whitespace
    text = ' '.join(text)
    return text

df['cleaned_reviewText'] = df['reviewText'].apply(clean_text)

In [10]:
df.head()

Unnamed: 0,reviewText,overall,label,cleaned_reviewText
0,No issues.,4.0,2,issue
1,"Purchased this for my device, it worked as adv...",5.0,2,purchased device worked advertised never much ...
2,it works as expected. I should have sprung for...,4.0,2,work expected sprung higher capacity think mad...
3,This think has worked out great.Had a diff. br...,5.0,2,think worked greathad diff bran gb card went s...
4,"Bought it with Retail Packaging, arrived legit...",5.0,2,bought retail packaging arrived legit orange e...


In [11]:
df.isnull().sum()

Unnamed: 0,0
reviewText,1
overall,0
label,0
cleaned_reviewText,0


In [12]:
df.dropna(inplace = True)

# 2. Tokenization (No Manual Vectorization Needed for Transformers)

#### Transformers handle vectorization internally through their tokenizers:

In [13]:
! pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [14]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_function(examples):
    return tokenizer(examples['cleaned_reviewText'],padding='max_length',truncation= True)

# Filling the nan valuer with empth string
df['cleaned_reviewText'] = df['cleaned_reviewText'].fillna('').astype(str)

# Converting to HuggingFace Dataset format
from datasets import Dataset
dataset = Dataset.from_pandas(df)
tokenize_dataset = dataset.map(tokenize_function,batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/4914 [00:00<?, ? examples/s]

In [15]:
print(tokenize_dataset)

Dataset({
    features: ['reviewText', 'overall', 'label', 'cleaned_reviewText', '__index_level_0__', 'input_ids', 'attention_mask'],
    num_rows: 4914
})


In [16]:
print(tokenize_dataset.features)

{'reviewText': Value(dtype='string', id=None), 'overall': Value(dtype='float64', id=None), 'label': Value(dtype='int64', id=None), 'cleaned_reviewText': Value(dtype='string', id=None), '__index_level_0__': Value(dtype='int64', id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


# 3. Model Traning using Transformers Models

In [17]:
! pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [22]:
from transformers import AutoModelForSequenceClassification, TrainingArguments , Trainer
import numpy as np
import evaluate

# Load model
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels= 5 # hyperparamerter
)

#split dataset
train_test = tokenize_dataset.train_test_split(test_size=0.2)

# Metrics
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits,labels = eval_pred
    predictions = np.argmax(logits,axis = -1)
    return metric.compute(predictions=predictions,references = labels)

# Training arguments
training_args = TrainingArguments(
    output_dir = './results',
    eval_strategy='epoch',
    save_strategy='epoch',
    learning_rate=2e-5,
    per_device_eval_batch_size=16,
    per_device_train_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to='none' #disable wandb/etc for simplicty

)
# Treiner
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset=train_test['train'],
    eval_dataset=train_test['test'],
    compute_metrics= compute_metrics
)

# Train
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.228509,0.925738
2,No log,0.287567,0.923703
3,0.299700,0.251457,0.93591
4,0.299700,0.273026,0.933876
5,0.147400,0.320956,0.92472
6,0.147400,0.357944,0.917599
7,0.081600,0.422804,0.910478
8,0.081600,0.451519,0.905392
9,0.038200,0.4462,0.91353
10,0.038200,0.458094,0.910478


TrainOutput(global_step=2460, training_loss=0.11851865024101443, metrics={'train_runtime': 2182.3521, 'train_samples_per_second': 18.013, 'train_steps_per_second': 1.127, 'total_flos': 5207572035225600.0, 'train_loss': 0.11851865024101443, 'epoch': 10.0})

In [23]:
from transformers import pipeline
import torch

def predict_sentiment(reviews, model_path='./results/checkpoint-best', batch_size=16):
    """
    Predict sentiment for new Amazon reviews

    Args:
        reviews (str/list): Single review or list of reviews
        model_path (str): Path to saved model checkpoint
        batch_size (int): Batch size for prediction

    Returns:
        dict/list: Predictions with star ratings (1-5) and confidence scores
    """
    # Load trained model and tokenizer
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    # Create pipeline
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
        batch_size=batch_size
    )

    # Single vs batch processing
    is_single = isinstance(reviews, str)
    inputs = [reviews] if is_single else reviews

    # Get predictions
    raw_preds = classifier(inputs)

    # Format results
    def format_pred(pred):
        star = int(pred['label'].split('_')[-1])  # Convert LABEL_1 → 1
        return {
            'stars': star,
            'sentiment': 'positive' if star > 3 else 'negative' if star < 3 else 'neutral',
            'confidence': float(pred['score'])
        }

    results = [format_pred(p) for p in raw_preds]
    return results[0] if is_single else results

In [None]:
# Example usage
test_reviews = [
    "This product changed my life! Worth every penny.",
    "Terrible quality. Broke after 2 days.",
    "It's okay, but the packaging was damaged.",
    "The item works as described, nothing special."
]

predictions = predict_sentiment(test_reviews)

for review, pred in zip(test_reviews, predictions):
    print(f"Review: {review[:50]}...")
    print(f"Predicted: {pred['stars']} stars ({pred['sentiment']}, {pred['confidence']:.2f} confidence)")
    print("-" * 80)