In [30]:
%pip install pandas
import pandas as pd

Note: you may need to restart the kernel to use updated packages.


In [31]:
data_df = pd.read_csv('/kaggle/input/blinkit-vs-zepto-vs-instamart-reviews/reviews.csv')
data_df.head()

Unnamed: 0,rating,date,review,platform
0,2,30 December 2024,"I was using it for long time, but have to stop...",blinkit
1,1,4 November 2024,Loving the fast deliveries and mostly they are...,blinkit
2,1,31 October 2024,The customer support is very disappointing. I ...,blinkit
3,5,29 August 2024,"I've been using Blinkit for a while now, and i...",blinkit
4,2,31 December 2024,Blinkit was my go to app and it was rare that ...,blinkit


In [32]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4620 entries, 0 to 4619
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   rating    4620 non-null   int64 
 1   date      4620 non-null   object
 2   review    4620 non-null   object
 3   platform  4620 non-null   object
dtypes: int64(1), object(3)
memory usage: 144.5+ KB


In [33]:
# Preprocess the data_df
# we are removing special characters and converting everything to lowercase
# after that we defining labels(sentiement) based on review
# negetive = 0 = rating 1-2
# neutral = 1 = rating 3
# positive = 2 = rating 4-5

import re

def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

# Create sentiment labels
data_df['label'] = data_df['rating'].apply(lambda x: 0 if x <= 2 else (1 if x == 3 else 2))

# Preprocess the 'Review' column
data_df['review'] = data_df['review'].astype(str) # ensure review column is string type
data_df['processed_review'] = data_df['review'].apply(preprocess_text)

data_df.head()

Unnamed: 0,rating,date,review,platform,label,processed_review
0,2,30 December 2024,"I was using it for long time, but have to stop...",blinkit,0,i was using it for long time but have to stop ...
1,1,4 November 2024,Loving the fast deliveries and mostly they are...,blinkit,0,loving the fast deliveries and mostly they are...
2,1,31 October 2024,The customer support is very disappointing. I ...,blinkit,0,the customer support is very disappointing i r...
3,5,29 August 2024,"I've been using Blinkit for a while now, and i...",blinkit,2,ive been using blinkit for a while now and it ...
4,2,31 December 2024,Blinkit was my go to app and it was rare that ...,blinkit,0,blinkit was my go to app and it was rare that ...


In [34]:
%pip install transformers datasets torch scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [35]:
# impoorting nessecary modules

import os
os.environ["WANDB_DISABLED"] = "true"  # Disable W&B logging

# import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the dataset
df = data_df

In [36]:
# Map ratings to sentiment labels
def assign_sentiment(rating):
    if rating in [1, 2]:
        return 0  # Negative
    elif rating == 3:
        return 1  # Neutral
    else:
        return 2  # Positive

df['sentiment'] = df['rating'].apply(assign_sentiment)

In [37]:
# Split into train and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['review'], df['sentiment'], test_size=0.2, random_state=42
)

In [38]:
# Tokenization using a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

train_encodings = tokenize_function(list(train_texts))
test_encodings = tokenize_function(list(test_texts))

In [39]:

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_dict({'input_ids': train_encodings['input_ids'], 'attention_mask': train_encodings['attention_mask'], 'labels': train_labels})
test_dataset = Dataset.from_dict({'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask'], 'labels': test_labels})

# Load a pre-trained DistilBERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [40]:

# Define evaluation metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


In [41]:
# Training arguments
# Hyperparameters can be changed
training_args = TrainingArguments(
    report_to=None,  # Ensure W&B is not used
    output_dir="./results",
    eval_strategy="epoch",  # Updated to "eval_strategy"
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    num_train_epochs=4, # change the number of epochs as needed, here keeping it as 3 only (more epoch might result loss)
    weight_decay=0.01,
    logging_dir="./logs",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [42]:
# Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [43]:
# Train the model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.264281,0.924242,0.907808,0.891985,0.924242
2,No log,0.24871,0.925325,0.908633,0.892816,0.925325
3,No log,0.249972,0.924242,0.914476,0.907165,0.924242
4,No log,0.26001,0.927489,0.91872,0.914299,0.927489


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=464, training_loss=0.26068273083917026, metrics={'train_runtime': 444.744, 'train_samples_per_second': 33.242, 'train_steps_per_second': 1.043, 'total_flos': 1958432946978816.0, 'train_loss': 0.26068273083917026, 'epoch': 4.0})

In [44]:
# Evaluate the model
trainer.evaluate()



{'eval_loss': 0.26000964641571045,
 'eval_accuracy': 0.9274891774891775,
 'eval_f1': 0.9187200815107792,
 'eval_precision': 0.9142993432300436,
 'eval_recall': 0.9274891774891775,
 'eval_runtime': 8.8369,
 'eval_samples_per_second': 104.562,
 'eval_steps_per_second': 6.563,
 'epoch': 4.0}

In [45]:
import torch

# Test the model on custom inputs
text = "I love using Blinkit! The deliveries are so fast."
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Move inputs to the same device as the model
inputs = inputs.to(model.device) # Move input tensors to the GPU

outputs = model(**inputs)
predicted_class = outputs.logits.argmax(-1).item()
sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
print(f"Predicted Sentiment: {sentiment_map[predicted_class]}")

Predicted Sentiment: Positive
