In [7]:
# ------------------------------------------------------------
# Intelligent Customer Feedback Analysis System
# Part 2 - Sentiment Classification using DistilBERT
# ------------------------------------------------------------

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the dataset
data = pd.read_csv("sentiment_cleaned.csv")
print("Dataset shape:", data.shape)
print(data.head())

# ------------------------------------------------------------
# Step 1: Identify the correct feedback column
# ------------------------------------------------------------
# Some datasets use 'Review', 'Summary', or 'clean_text'
text_column = None
for col in data.columns:
    if col.lower() in ['review', 'summary', 'clean_text', 'feedback']:
        text_column = col
        break

if text_column is None:
    raise ValueError("No text column found. Please check your dataset columns.")

print(f"\nUsing column '{text_column}' as the text input.")

# ------------------------------------------------------------
# Step 2: Clean and prepare data
# ------------------------------------------------------------
# Drop rows with missing text or sentiment
data.dropna(subset=[text_column, 'Sentiment'], inplace=True)

# Encode labels numerically (Positive=2, Neutral=1, Negative=0)
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['Sentiment'].str.lower())

# ------------------------------------------------------------
# Step 3: Split into training and testing sets
# ------------------------------------------------------------
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data[text_column].tolist(),
    data['label'].tolist(),
    test_size=0.2,
    random_state=42
)

# ------------------------------------------------------------
# Step 4: Tokenization using DistilBERT tokenizer
# ------------------------------------------------------------
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def tokenize_data(example):
    return tokenizer(example['text'], truncation=True, padding='max_length', max_length=128)

train_data = Dataset.from_dict({'text': train_texts, 'label': train_labels})
test_data = Dataset.from_dict({'text': test_texts, 'label': test_labels})

train_dataset = train_data.map(tokenize_data, batched=True)
test_dataset = test_data.map(tokenize_data, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# ------------------------------------------------------------
# Step 5: Load DistilBERT model
# ------------------------------------------------------------
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# ------------------------------------------------------------
# Step 6: Define training parameters
# ------------------------------------------------------------
# Removed 'evaluation_strategy' since it's not supported in older versions
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="no"
)

# ------------------------------------------------------------
# Step 7: Initialize the Trainer
# ------------------------------------------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# ------------------------------------------------------------
# Step 8: Train the model
# ------------------------------------------------------------
trainer.train()

# ------------------------------------------------------------
# Step 9: Save the model and tokenizer
# ------------------------------------------------------------
model.save_pretrained("sentiment_model")
tokenizer.save_pretrained("sentiment_model")

print("\nModel training complete and saved as 'sentiment_model/'.")

# ------------------------------------------------------------
# Step 10: Test a sample prediction
# ------------------------------------------------------------
sample_text = "The product quality was amazing and delivery was fast!"
inputs = tokenizer(sample_text, return_tensors="pt", truncation=True, padding=True)
outputs = model(**inputs)
predicted_label = torch.argmax(outputs.logits).item()
print("\nSample Prediction:", label_encoder.inverse_transform([predicted_label])[0])


Dataset shape: (2000, 6)
                                         ProductName  ProductPrice  Rate  \
0  ZEBRONICS Zeb-Vita Plus 16 W Bluetooth Laptop/...        1349.0   5.0   
1  realme Mobile Game Finger Sleeves  Gaming Acce...         199.0   5.0   
2  Shri Vasunandi Traders chandelier_jhoomar_pend...         505.0   1.0   
3  AutoKraftZ Most Popular Sun UV Protection Arm ...         129.0   3.0   
4                      Bajaj GX1 500 W Mixer Grinder        2299.0   4.0   

            Review         Summary Sentiment  
0        Wonderful            nice  positive  
1         Terrific            best  positive  
2  useless product  very bad items  negative  
3        Just okay           Ok ok   Neutral  
4  Worth the money            nice  positive  

Using column 'Review' as the text input.


Map:   0%|          | 0/1600 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33manjalichebathina[0m ([33manjalichebathina-eluru-college-of-engineering-and-technology[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Step,Training Loss
10,1.1093
20,1.0243
30,0.8456
40,0.8617
50,0.6005
60,0.4898
70,0.3518
80,0.4196
90,0.5455
100,0.2453



Model training complete and saved as 'sentiment_model/'.

Sample Prediction: positive
