In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [2]:
# Step 1: Load the dataset

file_path = r'C:\My data\Machine Learning\LLM\ReviewDatabase.xlsx'
df = pd.read_excel(file_path)

In [3]:
# Step 2: Data Preprocessing
# `reviewText` is the column containing the reviews
df = df[['reviewText', 'overall']]  # Selecting relevant columns

# Converting ratings to sentiment labels
def get_sentiment(rating):
    if rating >= 4:
        return 2  # Positive
    elif rating == 3:
        return 1  # Neutral
    else:
        return 0  # Negative

df['sentiment'] = df['overall'].apply(get_sentiment)

# Splitting the data into training and testing sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['reviewText'].tolist(), df['sentiment'].tolist(), test_size=0.2, random_state=42
)


In [4]:
# Step 3: Tokenize the texts using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the input texts and encode them for the BERT model
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

# Convert the encodings into PyTorch tensors
train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(train_encodings['input_ids']), torch.tensor(train_labels)
)
test_dataset = torch.utils.data.TensorDataset(
    torch.tensor(test_encodings['input_ids']), torch.tensor(test_labels)
)


In [5]:
# Step 4: Load pre-trained BERT model for classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [6]:
# Step 5: Set up training using PyTorch DataLoader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [7]:
# Step 6: Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# Training loop
epochs = 2
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        batch_input_ids, batch_labels = [b.to(device) for b in batch]

        optimizer.zero_grad()
        outputs = model(input_ids=batch_input_ids, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss}")



Epoch 1, Loss: 0.5440545627581221
Epoch 2, Loss: 0.4178771397897175


In [8]:
# Step 7: Evaluate the model on the test set
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        batch_input_ids, batch_labels = [b.to(device) for b in batch]
        outputs = model(input_ids=batch_input_ids)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()
        predictions.extend(preds)
        true_labels.extend(batch_labels.cpu().numpy())

# Calculate accuracy and classification report
accuracy = accuracy_score(true_labels, predictions)
print(f"Test Accuracy: {accuracy}")
print(classification_report(true_labels, predictions, target_names=['Negative', 'Neutral', 'Positive']))


Test Accuracy: 0.9279279279279279
              precision    recall  f1-score   support

    Negative       0.00      0.00      0.00         7
     Neutral       0.00      0.00      0.00         1
    Positive       0.93      1.00      0.96       103

    accuracy                           0.93       111
   macro avg       0.31      0.33      0.32       111
weighted avg       0.86      0.93      0.89       111



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Step 8: Inference on new data
def predict_sentiment(review_text):
    model.eval()
    inputs = tokenizer(review_text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        sentiment = torch.argmax(logits, dim=1).item()
        sentiment_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
        return sentiment_map[sentiment]

In [10]:
# Example prediction
sample_review = "This product is really amazing, I loved it!"
predicted_sentiment = predict_sentiment(sample_review)
print(f"Predicted Sentiment: {predicted_sentiment}")

Predicted Sentiment: Positive


In [11]:
# Step 9: Predict sentiment for all reviews and add a new column to the dataframe
df['Predicted Sentiment'] = df['reviewText'].apply(predict_sentiment)

# Step 10: Save the dataframe with the new sentiment column to a new Excel file
output_file_path = r'C:\My data\Machine Learning\LLM\AmazonReviewSentiments.xlsx'
df.to_excel(output_file_path, index=False)

print(f"Data saved to {output_file_path}")

Data saved to C:\My data\Machine Learning\LLM\AmazonReviewSentiments.xlsx
