In [1]:
# Same loading/labelling code as used in Rating_Reviews_NN_PyTorch.
import pandas as pd

# NOTE(review): machine-specific absolute path; point it at your own copy of the data.
file_path = r"C:\Users\MSI\OneDrive\Desktop\voicecom\reviews.xlsx"
df = pd.read_excel(file_path)

# Quick look at the raw data: sample rows, size, schema, summary, missing values.
print(df.head())
print(df.shape)
print(df.columns)
print(df.dtypes)
print(df.describe())
print(df.isnull().sum())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

# Binary target: 1 for "positive", 0 for "negative".
df["label"] = df["rating"].map({"positive": 1, "negative": 0})

# Series.map yields NaN for any rating outside the mapping. Fail here instead of
# letting NaN labels reach the loss function later on.
unmapped = df["label"].isna()
if unmapped.any():
    bad_values = sorted(df.loc[unmapped, "rating"].astype(str).unique())
    raise ValueError(f"Unexpected rating values (cannot be mapped to a label): {bad_values}")

print(df.head())

import spacy
# Load the small English spaCy pipeline with the parser and NER components
# disabled; the lemmatization code below only reads lemma_, is_stop, is_punct
# and is_space from each token (presumably disabled to save time - TODO confirm).
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
def batch_spacy_lemmas(texts):
    """Lazily yield one space-joined string of lowercase lemmas per input text.

    Texts are streamed through the module-level ``nlp`` pipeline in batches of
    100 with ``n_process=-1``. A token contributes its lemma only if it is not
    a stop word, not punctuation, not whitespace, and its lemma is purely
    alphabetic.
    """

    def _is_content_token(token):
        # Same filter as "not stop and not punct and not space and alpha lemma".
        if token.is_stop or token.is_punct or token.is_space:
            return False
        return token.lemma_.isalpha()

    for doc in nlp.pipe(texts, batch_size=100, n_process=-1):
        yield " ".join(
            token.lemma_.lower() for token in doc if _is_content_token(token)
        )
# Materialise the lemma generator once, then attach the result as a new column.
lemmatized_reviews = list(batch_spacy_lemmas(df["review"]))
df["lemmatized_review"] = lemmatized_reviews
# Spot-check the review stored under index label 3.
print(df.loc[3, "lemmatized_review"])

     rating                                             review
0  negative  terrible place to work for i just heard a stor...
1  negative   hours , minutes total time for an extremely s...
2  negative  my less than stellar review is for service . w...
3  negative  i m granting one star because there s no way t...
4  negative  the food here is mediocre at best . i went aft...
(56000, 2)
Index(['rating', 'review'], dtype='object')
rating    object
review    object
dtype: object
          rating                                             review
count      56000                                              56000
unique         2                                              55993
top     negative  i work a arrowhead in a smaller store so befor...
freq       28000                                                  2
rating    0
review    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56000 entries, 0 to 55999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype

In [2]:
import torch
import numpy as np
from collections import Counter
from torch import nn
from torch.nn.functional import binary_cross_entropy_with_logits
from torch.utils.data import DataLoader, TensorDataset, random_split
from torchmetrics.classification import BinaryAccuracy, BinaryPrecision

In [3]:
# Seed PyTorch's default random number generator with a fixed value (42).
torch.manual_seed(42)

<torch._C.Generator at 0x1a4a3d5cc30>

In [4]:
def build_vocab(texts, min_freq=2, specials=["<pad>", "<unk>"]):
    """Build token <-> index lookup tables from whitespace-tokenised texts.

    Args:
        texts: Iterable of strings; each one is split on whitespace.
        min_freq: Minimum total count a token needs to enter the vocabulary.
        specials: Tokens placed at the front of the vocabulary, in order, so
            the first one gets index 0. Any sequence is accepted; it is never
            mutated.

    Returns:
        A ``(stoi, itos)`` pair: ``stoi`` maps token -> index and ``itos`` is
        the list mapping index -> token. Corpus tokens follow the specials in
        first-seen order.
    """
    counter = Counter()
    for text in texts:
        counter.update(text.split())

    # A corpus token that equals a special (e.g. a literal "<pad>") must not be
    # appended a second time: the duplicate would make stoi point the special
    # at the later index, so "<pad>" would no longer be index 0.
    reserved = set(specials)
    words = [w for w, c in counter.items() if c >= min_freq and w not in reserved]

    itos = list(specials) + words
    stoi = {word: i for i, word in enumerate(itos)}
    return stoi, itos

# Build the token -> id dict (stoi) and id -> token list (itos) from the
# lemmatized reviews, using build_vocab's default min_freq and special tokens.
stoi, itos = build_vocab(df["lemmatized_review"])

def numericalize(text, stoi, max_len=100):
    """Encode ``text`` as a 1-D ``torch.long`` tensor of exactly ``max_len`` ids.

    The text is split on whitespace and cut to the first ``max_len`` tokens.
    Tokens missing from ``stoi`` become the ``"<unk>"`` id, and the tail is
    filled with the ``"<pad>"`` id up to ``max_len``.
    """
    clipped_tokens = text.split()[:max_len]

    ids = []
    for token in clipped_tokens:
        ids.append(stoi.get(token, stoi["<unk>"]))

    # Right-pad so every encoded review has the same length.
    pad_id = stoi["<pad>"]
    ids.extend([pad_id] * (max_len - len(ids)))

    return torch.tensor(ids, dtype=torch.long)

# Encode every lemmatized review as a fixed-length (100) LongTensor of token
# ids; each DataFrame cell holds one tensor.
df["tensor_review"] = df["lemmatized_review"].apply(lambda x: numericalize(x, stoi, max_len=100))

In [5]:
class ReviewClassifier(nn.Module):
    """Averaged-embedding binary classifier that emits one raw logit per input row.

    Token ids are embedded, the embeddings are averaged along dim 1, and the
    average is passed through Linear -> ReLU -> Linear(1). No sigmoid is
    applied, so the output is meant for a logits-based loss.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=32):
        super().__init__()
        # Id 0 is declared as the padding entry of the embedding table.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embed_dim,
            padding_idx=0,
        )
        mlp_layers = [
            nn.Linear(in_features=embed_dim, out_features=hidden_dim),
            nn.ReLU(),
            nn.Linear(in_features=hidden_dim, out_features=1),
        ]
        self.fc = nn.Sequential(*mlp_layers)

    def forward(self, x):
        """Return logits for ``x`` (integer token ids, presumably (batch, seq_len))."""
        token_vectors = self.embedding(x)
        # Plain mean over dim 1: every position, padding included, is in the divisor.
        sequence_vector = token_vectors.mean(dim=1)
        logits = self.fc(sequence_vector)
        return logits.squeeze(dim=1)

# Instantiate the classifier with its default embed_dim/hidden_dim, sized to the vocabulary.
model = ReviewClassifier(vocab_size=len(stoi))
# Adam over all model parameters with a learning rate of 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# Binary cross-entropy computed on raw logits (the model's forward() applies no sigmoid).
criterion = nn.BCEWithLogitsLoss()

In [6]:
# Features: the per-review id tensors stacked into one 2-D tensor.
# Targets: the 0/1 labels as float32, matching the float logits fed to the loss.
X = torch.stack(list(df["tensor_review"]))
y = torch.tensor(df["label"].to_numpy(), dtype=torch.float32)
dataset = TensorDataset(X, y)

# 80/20 train/test split; whatever is left after the 80% goes to the test side.
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# A dedicated generator seeded with 42 drives the split.
g = torch.Generator().manual_seed(42)
train_ds, test_ds = random_split(
    dataset,
    lengths=[train_size, test_size],
    generator=g,
)

# Only the training batches are shuffled.
train_loader = DataLoader(dataset=train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=64)


In [7]:
# Shared torchmetrics objects; run_epoch() resets them at the start of every pass.
accuracy_metric = BinaryAccuracy()
precision_metric = BinaryPrecision()

def run_epoch(model, loader, train=True):
    """Run one full pass over ``loader`` and return ``(avg_loss, accuracy, precision)``.

    Relies on the module-level ``optimizer``, ``criterion``, ``accuracy_metric``
    and ``precision_metric`` objects.

    Args:
        model: Module called as ``model(texts)`` to produce one logit per sample.
        loader: Iterable yielding ``(texts, labels)`` batches.
        train: If True, put the model in training mode and take an optimizer
            step per batch. If False, put it in eval mode and only measure.

    Returns:
        Tuple of Python floats: per-sample average loss, accuracy and precision
        over the whole pass (predictions thresholded at probability 0.5).
    """
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    n_samples = 0
    accuracy_metric.reset()
    precision_metric.reset()

    # Track gradients only while training; an evaluation pass has no backward
    # call, so building the autograd graph there just wastes memory and time.
    with torch.set_grad_enabled(train):
        for texts, labels in loader:
            if train:
                optimizer.zero_grad()

            logits = model(texts)
            loss = criterion(logits, labels)

            # The criterion is mean-reduced per batch (BCEWithLogitsLoss default),
            # so weight by batch size to get a true per-sample average even
            # when the final batch is smaller than the others.
            batch_size = labels.size(0)
            total_loss += loss.item() * batch_size
            n_samples += batch_size

            probs = torch.sigmoid(logits)
            preds = (probs >= 0.5).float()
            labels = labels.float()

            accuracy_metric.update(preds, labels)
            precision_metric.update(preds, labels)

            if train:
                loss.backward()
                optimizer.step()

    avg_loss = total_loss / n_samples
    acc = accuracy_metric.compute().item()
    prec = precision_metric.compute().item()

    return avg_loss, acc, prec

# Report the configuration from the live objects instead of hard-coded
# literals, so the printout cannot drift from what is actually being trained.
print("Model Configuration:")
print(f"  Seed: {torch.initial_seed()}")
print(f"  Embedding Dim: {model.embedding.embedding_dim}")
print(f"  Hidden Dim: {model.fc[0].out_features}")
print(f"  Learning Rate: {optimizer.param_groups[0]['lr']:g}")
print(f"  Batch Size: {train_loader.batch_size}")
print(f"  Max Length: {X.shape[1]}")
print(f"  Vocab Size: {len(stoi)}")

Model Configuration:
  Seed: 42
  Embedding Dim: 64
  Hidden Dim: 32
  Learning Rate: 1e-3
  Batch Size: 64
  Max Length: 100
  Vocab Size: 28460


In [8]:
# Train for a fixed number of epochs, evaluating on the held-out split after
# each one and printing both sets of metrics together.
epochs = 5
for epoch in range(1, epochs + 1):
    train_loss, train_acc, train_prec = run_epoch(model, train_loader, train=True)
    test_loss, test_acc, test_prec = run_epoch(model, test_loader, train=False)

    report_lines = [
        f"Epoch {epoch}:",
        f"  Train  | Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | Precision: {train_prec:.4f}",
        f"  Test   | Loss: {test_loss:.4f} | Acc: {test_acc:.4f} | Precision: {test_prec:.4f}",
    ]
    print("\n".join(report_lines))

Epoch 1:
  Train  | Loss: 0.4634 | Acc: 0.7785 | Precision: 0.8198
  Test   | Loss: 0.3146 | Acc: 0.8741 | Precision: 0.8733
Epoch 2:
  Train  | Loss: 0.2711 | Acc: 0.8964 | Precision: 0.8917
  Test   | Loss: 0.2720 | Acc: 0.8913 | Precision: 0.8761
Epoch 3:
  Train  | Loss: 0.2220 | Acc: 0.9151 | Precision: 0.9101
  Test   | Loss: 0.2587 | Acc: 0.8996 | Precision: 0.8951
Epoch 4:
  Train  | Loss: 0.1915 | Acc: 0.9275 | Precision: 0.9234
  Test   | Loss: 0.2578 | Acc: 0.9007 | Precision: 0.9017
Epoch 5:
  Train  | Loss: 0.1684 | Acc: 0.9375 | Precision: 0.9343
  Test   | Loss: 0.2633 | Acc: 0.9015 | Precision: 0.8966
