In [1]:
import pandas as pd

In [2]:
# NOTE(review): machine-specific absolute path; the notebook cannot run on
# another machine without editing this line.
file_path = r"C:\Users\MSI\OneDrive\Desktop\voicecom\reviews.xlsx"
df = pd.read_excel(file_path)

# Quick structural overview of the raw frame.
print(df.head())
print(df.shape)
print(df.columns)
print(df.dtypes)
print(df.describe())
print(df.isnull().sum())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

     rating                                             review
0  negative  terrible place to work for i just heard a stor...
1  negative   hours , minutes total time for an extremely s...
2  negative  my less than stellar review is for service . w...
3  negative  i m granting one star because there s no way t...
4  negative  the food here is mediocre at best . i went aft...
(56000, 2)
Index(['rating', 'review'], dtype='object')
rating    object
review    object
dtype: object
          rating                                             review
count      56000                                              56000
unique         2                                              55993
top     negative  i work a arrowhead in a smaller store so befor...
freq       28000                                                  2
rating    0
review    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56000 entries, 0 to 55999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype

In [3]:
# Encode the sentiment strings as binary targets: positive -> 1, negative -> 0.
label_map = {"positive": 1, "negative": 0}
df["label"] = df["rating"].map(label_map)

# Series.map yields NaN for any value that is missing from the mapping; fail
# fast here rather than letting NaN targets flow silently into training.
unmapped_ratings = df.loc[df["label"].isna(), "rating"].unique()
if len(unmapped_ratings) > 0:
    raise ValueError(f"Unexpected rating values: {unmapped_ratings.tolist()}")
print(df.head())

     rating                                             review  label
0  negative  terrible place to work for i just heard a stor...      0
1  negative   hours , minutes total time for an extremely s...      0
2  negative  my less than stellar review is for service . w...      0
3  negative  i m granting one star because there s no way t...      0
4  negative  the food here is mediocre at best . i went aft...      0


In [4]:
import spacy

In [5]:
# Load the small English spaCy pipeline with the parser and NER components
# switched off.
nlp = spacy.load(
    "en_core_web_sm",
    disable=["parser", "ner"],
)
def batch_spacy_lemmas(texts, batch_size=100, n_process=-1):
    """Lazily yield one space-joined string of lowercase lemmas per input text.

    Texts are streamed through the module-level ``nlp`` pipeline with
    ``nlp.pipe``. A token contributes its lemma only when it is not a stop
    word, not punctuation, not whitespace, and its lemma is purely alphabetic.

    Args:
        texts: Iterable of raw text strings.
        batch_size: Forwarded to ``nlp.pipe``. Defaults to 100, the value that
            used to be hard-coded.
        n_process: Forwarded to ``nlp.pipe``. Defaults to -1, the value that
            used to be hard-coded.
            NOTE(review): presumably -1 asks spaCy to use every CPU core, and
            multi-process piping can be slow or fragile when launched from a
            notebook on spawn-based platforms; pass ``n_process=1`` to stay
            single-process -- confirm against the spaCy documentation.

    Yields:
        str: The kept lemmas of one document, lowercased and joined by single
        spaces (an empty string when no token passes the filter).
    """
    docs = nlp.pipe(texts, batch_size=batch_size, n_process=n_process)
    for doc in docs:
        kept_lemmas = [
            token.lemma_.lower()
            for token in doc
            if not (token.is_stop or token.is_punct or token.is_space)
            and token.lemma_.isalpha()
        ]
        yield " ".join(kept_lemmas)

In [None]:
# Drain the lemma generator into a list so it can be stored as a column,
# one cleaned string per review.
df["lemmatized_review"] = [*batch_spacy_lemmas(df["review"])]

In [None]:
# Spot-check the lemmatized text of the row labelled 3.
print(df["lemmatized_review"].loc[3])

In [None]:
from collections import Counter

def build_vocab(texts, min_freq=2, specials=("<pad>", "<unk>")):
    """Build token<->index lookup tables from whitespace-tokenized texts.

    Args:
        texts: Iterable of strings; each one is split on whitespace.
        min_freq: Minimum corpus count a token needs in order to be kept.
        specials: Reserved tokens placed, in order, at the front of the
            vocabulary so they occupy indices ``0 .. len(specials) - 1``.
            Any sequence works; it is copied, never mutated.

    Returns:
        tuple: ``(stoi, itos)`` where ``itos`` is the list mapping index to
        token and ``stoi`` is the dict mapping token to index.
    """
    counter = Counter()
    for text in texts:
        counter.update(text.split())

    itos = list(specials)
    reserved = set(itos)
    # Skip corpus tokens that collide with a special token: appending them a
    # second time would overwrite the special's index in ``stoi`` and leave
    # ``len(stoi)`` smaller than the largest index that can be emitted.
    itos.extend(
        word
        for word, count in counter.items()
        if count >= min_freq and word not in reserved
    )
    stoi = {word: index for index, word in enumerate(itos)}

    return stoi, itos

# Derive the token<->index lookup tables from the lemmatized corpus.
stoi, itos = build_vocab(texts=df["lemmatized_review"])

def numericalize(text, stoi, max_len=100):
    """Convert a whitespace-tokenized string into a fixed-length id tensor.

    Args:
        text: String whose whitespace-separated tokens are looked up.
        stoi: Mapping from token to integer id; must contain "<unk>" (used
            for tokens that are not in the mapping) and "<pad>" (used to
            fill the tail).
        max_len: Length of the returned tensor. Longer inputs are truncated,
            shorter ones are right-padded.

    Returns:
        torch.Tensor: 1-D ``torch.long`` tensor of ``max_len`` ids.
    """
    ids = []
    for token in text.split()[:max_len]:
        ids.append(stoi.get(token, stoi["<unk>"]))
    n_missing = max_len - len(ids)
    ids.extend([stoi["<pad>"]] * n_missing)
    return torch.tensor(ids, dtype=torch.long)



In [None]:
import torch
# Turn every lemmatized review into a fixed-length (100) tensor of token ids.
df["tensor_review"] = df["lemmatized_review"].apply(
    lambda lemma_text: numericalize(lemma_text, stoi, max_len=100)
)

In [None]:
# Show the row labelled 3 both as lemmatized text and as its id tensor.
print(df["lemmatized_review"].loc[3])
print(df["tensor_review"].loc[3])

In [None]:
from torch import nn
from torch.nn.functional import binary_cross_entropy_with_logits
from sklearn.metrics import precision_score, accuracy_score
import numpy as np

class ReviewClassifier(nn.Module):
    """Bag-of-embeddings binary classifier over padded token-id sequences.

    Token ids are embedded, averaged over the sequence dimension while
    ignoring padding positions, and passed through a small MLP that emits one
    logit per sequence.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Width of the hidden layer in the MLP head.
        padding_idx: Token id that marks padding. Its embedding is handled by
            ``nn.Embedding(padding_idx=...)`` and its positions are excluded
            from the pooled average.
    """

    def __init__(self, vocab_size, embed_dim=64, hidden_dim=32, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=padding_idx)
        self.fc = nn.Sequential(
            nn.Linear(embed_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1)
        )

    def forward(self, x):
        """Return one logit per sequence for ``x`` of shape (batch, seq_len)."""
        embedded = self.embedding(x)
        # Average over real tokens only. A plain mean over dim=1 divides by the
        # padded length, which shrinks the representation of short sequences.
        mask = (x != self.padding_idx).unsqueeze(-1).to(embedded.dtype)
        # clamp keeps an all-padding sequence from dividing by zero.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled = (embedded * mask).sum(dim=1) / token_counts
        return self.fc(pooled).squeeze(1)

# Model sized to the vocabulary, a BCE-with-logits loss, and an Adam optimizer
# over all model parameters.
model = ReviewClassifier(vocab_size=len(stoi))
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split


# Stack the per-review id tensors into one tensor and pair it with float
# targets, as expected by the BCE-with-logits loss.
X = torch.stack(df["tensor_review"].tolist())
y = torch.tensor(df["label"].values, dtype=torch.float32)

dataset = TensorDataset(X, y)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

# Seed the split so the same rows land in train/test on every run; without a
# generator the membership changes each time and test metrics from different
# runs are not comparable.
SPLIT_SEED = 42
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_ds, test_ds = random_split(
    dataset, [train_size, test_size], generator=split_generator
)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=64)


In [None]:
from torchmetrics.classification import BinaryAccuracy, BinaryPrecision
# Shared metric accumulators; run_epoch resets them at the start of each pass.
accuracy_metric, precision_metric = BinaryAccuracy(), BinaryPrecision()

def run_epoch(model, loader, train=True):
    """Run one full pass over ``loader`` and report loss, accuracy, precision.

    Relies on the module-level ``optimizer``, ``criterion``,
    ``accuracy_metric`` and ``precision_metric``. Both metrics are reset at
    the start of the pass, so the returned values cover this pass only.

    Args:
        model: Module called as ``model(texts)`` to produce one logit per row.
        loader: Sized iterable of ``(texts, labels)`` batches.
        train: When true the model is put in training mode and the optimizer
            steps after every batch; otherwise the model is put in eval mode
            and no gradients are tracked.

    Returns:
        tuple: ``(avg_loss, acc, prec)`` where ``avg_loss`` is the mean of the
        per-batch losses and the other two are the computed metric values as
        Python floats.
    """
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0

    accuracy_metric.reset()
    precision_metric.reset()

    # Track gradients only while training. During evaluation autograd would
    # otherwise build a graph for every batch that is never used.
    with torch.set_grad_enabled(bool(train)):
        for texts, labels in loader:
            if train:
                optimizer.zero_grad()

            logits = model(texts)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            # Threshold the sigmoid probabilities at 0.5 to get hard 0/1 predictions.
            probs = torch.sigmoid(logits)
            preds = (probs >= 0.5).float()
            labels = labels.float()

            # Update metrics
            accuracy_metric.update(preds, labels)
            precision_metric.update(preds, labels)

            if train:
                loss.backward()
                optimizer.step()

    avg_loss = total_loss / len(loader)
    acc = accuracy_metric.compute().item()
    prec = precision_metric.compute().item()

    return avg_loss, acc, prec


In [None]:
# Alternate one training pass and one evaluation pass per epoch, reporting the
# metrics of both after each epoch.
epochs = 100
for epoch in range(1, epochs + 1):
    train_metrics = run_epoch(model, train_loader, train=True)
    test_metrics = run_epoch(model, test_loader, train=False)
    train_loss, train_acc, train_prec = train_metrics
    test_loss, test_acc, test_prec = test_metrics

    print(
        f"Epoch {epoch}:",
        f"  Train  | Loss: {train_loss:.4f} | Acc: {train_acc:.4f} | Precision: {train_prec:.4f}",
        f"  Test   | Loss: {test_loss:.4f} | Acc: {test_acc:.4f} | Precision: {test_prec:.4f}",
        sep="\n",
    )
