In [33]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix,f1_score
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import re
import os

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
df = pd.read_csv("../data/fake reviews dataset.csv")

df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [4]:
print("Shape:", df.shape)
print("\nLabel distribution:")
print(df['label'].value_counts())

Shape: (40432, 4)

Label distribution:
label
CG    20216
OR    20216
Name: count, dtype: int64


In [5]:
for col in df.columns:
    if col == "text_":
        df = df.rename(columns={'text_':"text"})

In [6]:
df.head()

Unnamed: 0,category,rating,label,text
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


In [7]:
df = df[['text', 'label']].dropna()

In [8]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower()) #- any character that is NOT a letter (a‚Äìz, A‚ÄìZ) or whitespace. - Those characters are replaced with '' 
    text = re.sub(r'\s+', ' ', text) #- one or more whitespace characters.- Replaces them with a single space
    return text.strip()

df['cleaned'] = df['text'].apply(clean_text) #- Applies your clean_text function to each row.
df = df[df['cleaned'].str.len() > 10] #removing trivial, noisy, or uninformative samples that could hurt model training.

In [9]:
df.head()

Unnamed: 0,text,label,cleaned
0,"Love this! Well made, sturdy, and very comfor...",CG,love this well made sturdy and very comfortabl...
1,"love it, a great upgrade from the original. I...",CG,love it a great upgrade from the original ive ...
2,This pillow saved my back. I love the look and...,CG,this pillow saved my back i love the look and ...
3,"Missing information on how to use it, but it i...",CG,missing information on how to use it but it is...
4,Very nice set. Good quality. We have had the s...,CG,very nice set good quality we have had the set...


In [10]:
df['label'] = df['label'].map({'CG': 1, 'OR':0})

print(df.shape)

(40430, 3)


In [11]:
X = df['cleaned']
y = df['label']
print(X.shape)
print(y.shape)

(40430,)
(40430,)


In [12]:
X_train, X_temp, y_train, y_temp = train_test_split(X,y,test_size=0.3,random_state=42, stratify=y)

X_val,X_test,y_val,y_test = train_test_split(X_temp,y_temp,test_size=0.5,random_state=42,stratify=y_temp)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")


Train: 28301, Val: 6064, Test: 6065


DistilBERT is a distilled version of BERT: it keeps the same Transformer encoder ideas but with fewer layers so it runs faster and uses less memory. It learns language patterns from large text corpora and can be fine‚Äëtuned for tasks like classification. The Hugging Face DistilBertForSequenceClassification wraps the encoder and adds a small classification head on top to output logits for each class.

Architecture in simple terms
- Transformer encoder: reads the whole sentence and builds contextual word vectors (each token‚Äôs meaning depends on the whole sentence).
- Distillation: a teacher model (BERT) teaches a smaller student (DistilBERT) to mimic its behavior so the student is compact but still strong.
- Classification head: a linear layer on top of the pooled output that maps to num_labels (here 2).


üîπ What is PyTorch actually doing?
- PyTorch is a deep learning framework. Think of it as the ‚Äúengine‚Äù that lets us build, train, and run neural networks.
- It provides:
- Tensors (its basic data structure, like NumPy arrays but with GPU support).
- Automatic differentiation (so gradients for backpropagation are calculated automatically).
- Modules (ready‚Äëmade building blocks like layers, optimizers, losses).
- In your code, PyTorch is handling:
- The dataset (Dataset and DataLoader classes).
- The training loop (forward pass, loss calculation, backward pass, optimizer step).
- Moving data and models to GPU/CPU (.to(device)).

üîπ What are Tensors?
- A tensor is just a multi‚Äëdimensional array (like a matrix, but more general).
- Example:
- Scalar ‚Üí 5 (0D tensor).
- Vector ‚Üí [1,2,3] (1D tensor).
- Matrix ‚Üí [[1,2],[3,4]] (2D tensor).
- Higher dimensions ‚Üí images, batches of text, etc.
- PyTorch tensors can live on GPU, which makes training super fast compared to normal Python arrays.

üîπ What is the Attention Mask?
- When we tokenize text, we pad shorter sentences to a fixed length (say 256 tokens).
- Example: "hello world" ‚Üí [101, 7592, 2088, 102, 0, 0, 0...] (zeros are padding).
- The attention mask tells the model which tokens are real and which are just padding.
- 1 ‚Üí real token.
- 0 ‚Üí padding.
- Without this, the model would waste attention on meaningless padding tokens.

 Why do we need tensors?
- Deep learning models (like DistilBERT) are basically giant math machines.
- They don‚Äôt understand text directly ‚Äî they understand numbers.
- A tensor is the data structure PyTorch uses to hold these numbers.
- Think of a tensor as a container for numbers that can be:
- 1D (like a list of token IDs for one sentence).
- 2D (like a batch of sentences, each with token IDs).
- 3D+ (images, video frames, etc.).
The key advantage: tensors can live on the GPU, so millions of operations can be done in parallel very fast. Without tensors, training would be painfully slow.

üîπ What help are tensors doing?
- They allow PyTorch to:
- Store the tokenized text (numbers instead of words).
- Store the attention mask (1s and 0s for real vs. padded tokens).
- Store the labels (like 0 for "Fake", 1 for "Real").
- During training:
- These tensors are fed into the model.
- The model does matrix multiplications and attention calculations on them.
- Gradients (also tensors) are computed and used to update weights.
So tensors are the bridge between your text data and the math inside the neural network.

üîπ Why is the dataset being sent as a list?
When you create ReviewDataset(X_train.tolist(), y_train.tolist(), tokenizer):
- X_train and y_train are often NumPy arrays or Pandas series.
- .tolist() converts them into Python lists because the custom Dataset class expects normal lists it can index with __getitem__.
- Lists are easier for PyTorch‚Äôs Dataset to handle when fetching samples one by one.
So: lists = simple containers of text and labels ‚Üí dataset wraps them ‚Üí DataLoader batches them.

üîπ What is the DataLoader and what is it doing?
Think of DataLoader as a waiter in a restaurant:
- The Dataset is the kitchen (it has all the food = samples).
- The DataLoader is the waiter who brings food to the table in small batches.
- The Model is the customer who eats the food (learns from the data).
What DataLoader does:
- Batching: Instead of giving the model one sentence at a time, it gives (say) 16 sentences together. This speeds up training and stabilizes learning.
- Shuffling: Randomizes the order of samples each epoch so the model doesn‚Äôt memorize sequence patterns.
- Parallel loading: Can fetch data using multiple workers (threads) to keep GPU busy.
So DataLoader = efficient delivery system for data ‚Üí model.

üîπ What is AdamW and how does it optimize the model?
Optimization = how the model learns by adjusting its weights.
AdamW in simple terms:
- Adam = Adaptive Moment Estimation. It‚Äôs like a smart gradient descent:
- It looks at the gradient (direction of error).
- It keeps track of past gradients (momentum) so it doesn‚Äôt zig‚Äëzag too much.
- It adapts the learning rate for each parameter individually.
- W = Weight Decay. This prevents weights from growing too large (regularization).
What happens during optimization:
- Model makes a prediction ‚Üí compares with true label ‚Üí computes loss.
- PyTorch calculates gradients (how much each weight contributed to the error).
- AdamW uses these gradients to adjust weights slightly in the right direction.
- Example: if a weight made the model predict too high, AdamW nudges it lower.
- Repeat for many batches ‚Üí model gradually learns patterns in the data.

üîπ Putting it all together (flow)
- Dataset ‚Üí holds text + labels.
- DataLoader ‚Üí batches them into tensors.
- Model ‚Üí takes tensors, runs forward pass, outputs predictions.
- Loss ‚Üí measures how wrong predictions are.
- Backpropagation ‚Üí computes gradients.
- AdamW optimizer ‚Üí updates weights using gradients.
- Repeat for many epochs ‚Üí model improves accuracy.

‚ö° Simple analogy:
- Dataset = library of books.
- DataLoader = librarian who brings 16 books at a time.
- Model = student reading books.
- Loss = exam score showing mistakes.
- AdamW = teacher correcting the student‚Äôs notes so they improve next time.


## üîπ Why do we need the loop for each epoch?
Let‚Äôs break it down:

### 1. `model.train()` alone is not enough
- `model.train()` just **switches the model into training mode** (turns on dropout, etc.).  
- It does **not** actually train the model.  
- Training requires:  
  - Feeding data (forward pass).  
  - Calculating loss.  
  - Backpropagation (gradients).  
  - Optimizer step (update weights).  
- That‚Äôs why we need the loop ‚Äî to actually perform these steps repeatedly.

---

### 2. The **epoch loop** ensures repeated exposure
- One epoch = model sees the entire dataset once.  
- But one pass is not enough ‚Äî the model won‚Äôt learn well from just one look.  
- Multiple epochs = repeated practice.  
- Each epoch refines the weights further, reducing loss and improving accuracy.

---

### 3. The **batch loop** inside each epoch
- Datasets are too big to feed all at once (memory issue).  
- So we split into **mini‚Äëbatches** (e.g., 16 samples).  
- For each batch:
  1. Forward pass ‚Üí model predicts.  
  2. Loss ‚Üí compare prediction vs. true labels.  
  3. Backward pass ‚Üí compute gradients.  
  4. Optimizer step ‚Üí update weights.  
- Repeat until all batches are done ‚Üí that completes one epoch.

---

## üîπ Why can‚Äôt we just do one epoch?
- Imagine learning multiplication tables:
  - If you study them once, you‚Äôll forget quickly.  
  - If you repeat them multiple times, you get better and faster.  
- Same with the model:  
  - One epoch = rough first attempt.  
  - More epochs = gradual refinement.  
  - Too many epochs = risk of memorizing (overfitting).  

---

## üîπ Full training loop explained in simple flow
1. **Set training mode** ‚Üí `model.train()`.  
2. **Epoch loop** ‚Üí repeat training passes (e.g., 3 times).  
3. **Batch loop** ‚Üí for each mini‚Äëbatch:  
   - Get tensors (`input_ids`, `attention_mask`, `labels`).  
   - Forward pass ‚Üí predictions.  
   - Loss ‚Üí how wrong predictions are.  
   - Backward pass ‚Üí compute gradients.  
   - Optimizer step ‚Üí update weights.  
4. **Print loss** ‚Üí see how much error remains after each epoch.  

---

‚ö° **Analogy:**  
Think of training like practicing basketball free throws:  
- **Batch loop** = each shot you take.  
- **Epoch loop** = one full practice session (all shots).  
- **Multiple epochs** = multiple practice sessions over days.  
- Just saying ‚ÄúI‚Äôm practicing‚Äù (`model.train()`) doesn‚Äôt make you better ‚Äî you actually need to take the shots (the loops).

---





In [13]:
class ReviewDataset(Dataset): #defines a PyTorch dataset wrapperdefines a PyTorch dataset wrapper
    def __init__(self,texts,labels,tokenizer,max_len=256): #- stores texts, labels, the tokenizer, and a max length for padding/truncation.
        self.texts = texts 
        self.labels = labels
        self.tokenizer = tokenizer 
        self.max_len = max_len
        
    def __len__(self): # tells PyTorch how many samples exist.
        return len(self.texts)
    
    def __getitem__(self ,idx):
        text = str(self.texts[idx]) #fetch raw text and label. #- idx is the index of the sample to retrieve.
        label = self.labels[idx]
        encoding = self.tokenizer( #- tokenizer converts text to input_ids and attention_mask tensors; truncation and padding='max_length' ensure fixed length
            text,
            truncation = True,
            padding='max_length',# Cut off if longer than max_len
            max_length=self.max_len,
            return_tensors ='pt'# Return PyTorch tensors
        )
        return{
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }



In [26]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') #downloads tokenizer vocab and rules
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2) #downloads pre-trained DistilBERT model with a classification head for 2 classes
model.to(device) # moves model to GPU/CPU

#Dataset 
train_dataset = ReviewDataset(X_train.tolist(), y_train.tolist(), tokenizer)
val_dataset = ReviewDataset(X_val.tolist(), y_val.tolist(), tokenizer)

train_loader = DataLoader(train_dataset , batch_size = 16, shuffle= True)
val_loader = DataLoader(val_dataset, batch_size=16)

optimizer = AdamW(model.parameters(), lr=2e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# load tokenizer model    

model.train()

print("Training DistilBERT...")
for epoch in range(3):
    total_loss = 0
    for batch in tqdm(train_loader): # - tqdm is just a progress bar library in Python.
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_loader):.4f}")

print("DistilBERT Training Complete!")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training DistilBERT...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1769/1769 [3:32:42<00:00,  7.21s/it]     


Epoch 1 - Loss: 0.1495


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1769/1769 [3:04:04<00:00,  6.24s/it]  


Epoch 2 - Loss: 0.0510


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1769/1769 [2:27:07<00:00,  4.99s/it]  

Epoch 3 - Loss: 0.0251
DistilBERT Training Complete!






## üîπ First, the big picture
In **machine learning (ML)** you usually do:
- Train on **X_train, y_train**  
- Test on **X_test, y_test**  
- Compare predictions (`y_pred`) with true labels (`y_test`)  

In **PyTorch NLP**, the idea is the same ‚Äî but instead of plain arrays, we use **tensors + DataLoader** to feed the model.  

So evaluation = **feed unseen data into the trained model ‚Üí collect predictions ‚Üí compare with true labels ‚Üí compute accuracy/metrics.**

---

## üîπ Your evaluation code
```python
model.eval()
preds = []
true = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        preds.extend(batch_preds)
        true.extend(batch['labels'].numpy())

print("DistilBERT Accuracy:", accuracy_score(true, preds))
print(classification_report(true, preds, target_names=['Real', 'Fake']))
```

---

## üîπ Step by step explanation

### 1. `model.eval()`
- Switches the model into **evaluation mode**.  
- This turns off things like **dropout** (randomly dropping neurons during training).  
- Ensures predictions are stable and consistent.

---

### 2. `with torch.no_grad():`
- Tells PyTorch: ‚ÄúDon‚Äôt calculate gradients now.‚Äù  
- Why? Because during evaluation we don‚Äôt need backpropagation ‚Äî we‚Äôre just checking performance.  
- Saves memory and speeds things up.

---

### 3. Looping through `val_loader`
- `val_loader` is the DataLoader for your **validation set** (X_val, y_val).  
- It gives batches of tokenized text + labels.  
- For each batch:
  - `input_ids` ‚Üí tokenized text.  
  - `attention_mask` ‚Üí tells model which tokens are padding.  
  - `labels` ‚Üí true class (0 or 1).  

---

### 4. Forward pass (getting predictions)
```python
outputs = model(input_ids, attention_mask=attention_mask)
```
- The model processes the batch and outputs **logits** (raw scores for each class).  
- Example: for binary classification, logits might look like `[2.3, -1.1]` ‚Üí meaning class 0 is more likely.

---

### 5. Converting logits to predictions
```python
batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
```
- `torch.argmax(..., dim=1)` ‚Üí picks the class with the highest score.  
- Converts tensor ‚Üí NumPy array ‚Üí easy to store.  
- Example: `[2.3, -1.1]` ‚Üí prediction = `0`.

---

### 6. Collecting predictions and true labels
```python
preds.extend(batch_preds)
true.extend(batch['labels'].numpy())
```
- `preds` = all predicted labels across batches.  
- `true` = all actual labels across batches.  
- At the end, you have two lists: just like `y_pred` and `y_test` in ML.

---

### 7. Computing metrics
```python
print("DistilBERT Accuracy:", accuracy_score(true, preds))
print(classification_report(true, preds, target_names=['Real', 'Fake']))
```
- **Accuracy** = fraction of correct predictions.  
- **Classification report** = precision, recall, F1‚Äëscore for each class (`Real`, `Fake`).  
- This is exactly like scikit‚Äëlearn evaluation, just with tensors ‚Üí arrays conversion.

---

## üîπ Why 3 splits (train, val, test)?
- **Train set (X_train, y_train)** ‚Üí used to teach the model (update weights).  
- **Validation set (X_val, y_val)** ‚Üí used during training to check performance and tune hyperparameters (like learning rate, batch size).  
- **Test set (X_test, y_test)** ‚Üí final unseen data to measure how well the model generalizes.  

üëâ In your code, they used **train + val**. Often after training, you‚Äôd also evaluate on **test** to report final accuracy.

---

‚ö° **Analogy:**  
- **Train set** = practice questions.  
- **Validation set** = mock exam to check progress.  
- **Test set** = real exam to measure final ability.  
- Evaluation loop = grading the exam: collect answers (preds), compare with correct answers (true), compute score (accuracy/F1).

---

‚úÖ So evaluation in PyTorch NLP is the same idea as ML:  
- Get predictions ‚Üí compare with true labels ‚Üí compute metrics.  
- The only difference is the **DataLoader + tensors** machinery feeding the model.  

In [27]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# distilbert_tokenizer = AutoTokenizer.from_pretrained("../backend/nlp/distilbert_fake_review")
model = AutoModelForSequenceClassification.from_pretrained("../backend/nlp/distilbert_fake_review")
# distilbert_model.eval()

In [28]:
# CELL 6 - Evaluate DistilBERT
model.eval()
preds = []
true = []

with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        batch_preds = torch.argmax(outputs.logits, dim=1).cpu().numpy()
        preds.extend(batch_preds)
        true.extend(batch['labels'].numpy())

print("DistilBERT Accuracy:", accuracy_score(true, preds))
print(classification_report(true, preds, target_names=['Real', 'Fake']))

DistilBERT Accuracy: 0.9736147757255936
              precision    recall  f1-score   support

        Real       0.98      0.96      0.97      3032
        Fake       0.96      0.98      0.97      3032

    accuracy                           0.97      6064
   macro avg       0.97      0.97      0.97      6064
weighted avg       0.97      0.97      0.97      6064



In [42]:
save_path = "../backend/nlp/distilbert_fake_review"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)
print(f"Model saved to {save_path}")

Model saved to ../backend/nlp/distilbert_fake_review


In [15]:
import os
if not os.path.exists("../data/glove.6B.300d.txt"):
    print("Downloading GloVe 300d...")
    !wget -O ../data/glove.6B.zip https://nlp.stanford.edu/data/glove.6B.zip
    !unzip ../data/glove.6B.zip -d ../data/
    print("GloVe downloaded!")
else:
    print("GloVe already exists!")

GloVe already exists!


In [16]:
from collections import Counter
import numpy as np

# Build vocab
all_words = [word for text in df['cleaned'] for word in text.split()]
vocab = Counter(all_words)
vocab = ['<PAD>', '<UNK>'] + [word for word, count in vocab.most_common()]

word_to_idx = {word: idx for idx, word in enumerate(vocab)}
vocab_size = len(vocab)

# Load GloVe
embedding_dim = 300
embedding_matrix = np.zeros((vocab_size, embedding_dim))

glove_path = "../data/glove.6B.300d.txt"
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        if word in word_to_idx:
            idx = word_to_idx[word]
            embedding_matrix[idx] = np.array(values[1:], dtype='float32')

In [17]:
# For unknown words ‚Üí random
for i in range(vocab_size):
    if np.all(embedding_matrix[i] == 0):
        embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))

print(f"Embedding matrix shape: {embedding_matrix.shape}")

Embedding matrix shape: (48456, 300)


In [18]:
class BiLSTMDataset(Dataset):
    def __init__(self, texts, labels, word_to_idx, max_len=100):
        self.texts = texts
        self.labels = labels
        self.word_to_idx = word_to_idx
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        words = text.split()[:self.max_len]
        indices = [self.word_to_idx.get(word, 1) for word in words]  # 1 = <UNK>
        if len(indices) < self.max_len:
            indices += [0] * (self.max_len - len(indices))  # 0 = <PAD>
        
        return {
            'indices': torch.tensor(indices, dtype=torch.long),
            'label': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [19]:
class BiLSTM(nn.Module):
    def __init__(self, embedding_matrix, hidden_dim=128, num_layers=2):
        super().__init__()
        self.embedding = nn.Embedding.from_pretrained(
            torch.FloatTensor(embedding_matrix), padding_idx=0
        )
        self.lstm = nn.LSTM(
            input_size=300,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=0.5
        )
        self.fc = nn.Linear(hidden_dim * 2, 2)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        final_hidden = lstm_out[:, -1, :]  # Last timestep
        out = self.dropout(final_hidden)
        return self.fc(out)

In [20]:
# Datasets
bilstm_train = BiLSTMDataset(X_train.tolist(), y_train.tolist(), word_to_idx)
bilstm_val = BiLSTMDataset(X_val.tolist(), y_val.tolist(), word_to_idx)

train_loader_bilstm = DataLoader(bilstm_train, batch_size=64, shuffle=True)
val_loader_bilstm = DataLoader(bilstm_val, batch_size=64)

# Model
bilstm_model = BiLSTM(embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(bilstm_model.parameters(), lr=0.001)

In [21]:
print("Training BiLSTM + GloVe...")
bilstm_model.train()
for epoch in range(8):
    total_loss = 0
    for batch in tqdm(train_loader_bilstm):
        optimizer.zero_grad()
        indices = batch['indices'].to(device)
        labels = batch['label'].to(device)
        outputs = bilstm_model(indices)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} - Loss: {total_loss/len(train_loader_bilstm):.4f}")

Training BiLSTM + GloVe...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 443/443 [02:42<00:00,  2.72it/s]


Epoch 1 - Loss: 0.6749


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 443/443 [02:13<00:00,  3.31it/s]


Epoch 2 - Loss: 0.5811


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 443/443 [01:43<00:00,  4.28it/s]


Epoch 3 - Loss: 0.3687


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 443/443 [01:45<00:00,  4.20it/s]


Epoch 4 - Loss: 0.2524


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 443/443 [01:45<00:00,  4.19it/s]


Epoch 5 - Loss: 0.2052


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 443/443 [01:43<00:00,  4.26it/s]


Epoch 6 - Loss: 0.1655


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 443/443 [01:41<00:00,  4.38it/s]


Epoch 7 - Loss: 0.1343


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 443/443 [01:41<00:00,  4.35it/s]

Epoch 8 - Loss: 0.1175





In [22]:
# Save
torch.save(bilstm_model.state_dict(), "../backend/nlp/bilstm_fake_review.pth")
print("BiLSTM model saved!")

BiLSTM model saved!


In [29]:
# DistilBERT predictions (from earlier)
distilbert_preds = preds

In [30]:
# BiLSTM predictions
bilstm_model.eval()
bilstm_preds = []
with torch.no_grad():
    for batch in val_loader_bilstm:
        indices = batch['indices'].to(device)
        outputs = bilstm_model(indices)
        pred = torch.argmax(outputs, dim=1).cpu().numpy()
        bilstm_preds.extend(pred)

In [34]:
print("=== MODEL COMPARISON ===")
print(f"DistilBERT Accuracy: {accuracy_score(true, distilbert_preds):.4f} | F1: {f1_score(true, distilbert_preds):.4f}")
print(f"BiLSTM+GloVe Accuracy: {accuracy_score(true, bilstm_preds):.4f} | F1: {f1_score(true, bilstm_preds):.4f}")

=== MODEL COMPARISON ===
DistilBERT Accuracy: 0.9736 | F1: 0.9739
BiLSTM+GloVe Accuracy: 0.9309 | F1: 0.9303


In [35]:
# Ensemble (simple voting)
ensemble_preds = []
for d, b in zip(distilbert_preds, bilstm_preds):
    ensemble_preds.append(1 if (d + b) >= 1 else 0)  # majority vote

print(f"ENSEMBLE Accuracy: {accuracy_score(true, ensemble_preds):.4f} | F1: {f1_score(true, ensemble_preds):.4f}")

ENSEMBLE Accuracy: 0.9535 | F1: 0.9550
