# Step 1: Loading the Dataset

In [None]:
!pip install gdown
import gdown
import pandas as pd

# Google Drive file ID from the shared link
file_id = "1A8mqzrqeTUj8Rbh52w0mru_GxONjHVJv"
gdown.download(f"https://drive.google.com/uc?id={file_id}", "reviews.csv", quiet=False)

# Load CSV into DataFrame
df = pd.read_csv("reviews.csv")
print(df.head())

#Load & Clean Text
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Re-read the CSV into `df`, replacing whatever `df` held before, so the
# cleaning steps below operate on the contents of the file on disk.
df = pd.read_csv('reviews.csv')  # Make sure it's in your working directory

def clean_text(text):
    """Normalise a raw review into lowercase words separated by single spaces.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML tags (e.g. ``<br />``) with a space so they neither
         survive as ``br`` tokens nor glue the surrounding words together;
      3. drop apostrophes so contractions stay one token (``there's`` -> ``theres``);
      4. turn every remaining run of non-letter characters into one space.

    Args:
        text: The raw review. Non-string values (e.g. NaN from a missing
            cell) are treated as an empty review.

    Returns:
        The cleaned string, with no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower()
    # Tags must start with a letter (optionally after '/'), so a literal
    # "< 5" or "> 3" in the review is not mistaken for markup.
    text = re.sub(r'</?[a-z][^>]*>', ' ', text)
    text = text.replace("'", '')
    # Substituting a space (not the empty string) keeps "well-made" or
    # "end.The" from collapsing into a single bogus word.
    text = re.sub(r'[^a-z]+', ' ', text)
    return text.strip()

# Replace each raw review with its cleaned form (element-wise over the column).
df['review'] = df['review'].map(clean_text)

#Tokenize & Build Vocabulary
from collections import Counter

def tokenize(text):
    """Break ``text`` into words on runs of whitespace and return them as a list."""
    words = text.split()
    return words

df['tokens'] = df['review'].map(tokenize)

# Build vocab
# Count every token across all reviews; the counter keeps words in first-seen order.
word_counts = Counter(word for tokens in df['tokens'] for word in tokens)
# Number the words from 1 upward in that order; 0 stays reserved for padding.
vocab = {word: idx for idx, word in enumerate(word_counts, start=1)}

#Convert to Indexed Sequences & Pad
MAX_LEN = 100  # every sequence is truncated / padded to exactly this many ids


def encode(tokens):
    """Translate tokens to vocabulary ids, keeping at most ``MAX_LEN`` of them.

    Words missing from ``vocab`` are encoded as 0.
    """
    ids = []
    for word in tokens:
        if len(ids) == MAX_LEN:
            break
        ids.append(vocab.get(word, 0))
    return ids


df['encoded'] = df['tokens'].map(encode)
# Right-pad with zeros so that every row has length MAX_LEN.
df['padded'] = df['encoded'].map(lambda ids: ids + [0] * (MAX_LEN - len(ids)))

# Train/Val/Test Split
# Features are the padded id sequences. Labels are still the raw sentiment
# strings ('positive' / 'negative'); they are mapped to 1 / 0 before training.
X, y = df['padded'].tolist(), df['sentiment'].tolist()

# Hold out 30% of the data, then halve that hold-out into validation and test.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)



Downloading...
From: https://drive.google.com/uc?id=1A8mqzrqeTUj8Rbh52w0mru_GxONjHVJv
To: /content/reviews.csv
100%|██████████| 66.2M/66.2M [00:01<00:00, 39.7MB/s]


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


# **Step 2: Build the LSTM Model**

In [None]:
import torch
import torch.nn as nn

class SentimentLSTM(nn.Module):
    """Binary sentiment classifier: embedding -> single-layer LSTM -> linear -> sigmoid.

    Args:
        vocab_size: Number of real tokens in the vocabulary. The embedding table
            gets ``vocab_size + 1`` rows because index 0 is reserved for padding.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_dim=100, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size + 1, embed_dim, padding_idx=0)  # word embeddings
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)  # sequence model
        self.fc = nn.Linear(hidden_dim, 1)  # final classification layer
        self.sigmoid = nn.Sigmoid()  # logit -> probability

    def forward(self, x):
        """Return the predicted probability of the positive class per sequence.

        Trailing padding is excluded from the recurrence, so the result does
        not depend on how many pad tokens follow the real text.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``, padded
                on the right with the padding index (0). A single unbatched
                sequence of shape ``(seq_len,)`` is also accepted.

        Returns:
            Float tensor of shape ``(batch, 1)`` (or ``(1,)`` for unbatched
            input) holding sigmoid outputs.
        """
        if x.dim() == 1:
            # Keep supporting unbatched input by routing it through the batched path.
            return self.forward(x.unsqueeze(0)).squeeze(0)

        pad_idx = self.embedding.padding_idx
        # True length of each row = 1-based position of its last non-pad token.
        positions = torch.arange(1, x.size(1) + 1, device=x.device)
        lengths = ((x != pad_idx) * positions).max(dim=1).values
        # Packing rejects zero-length sequences; an all-pad row is run for one step.
        lengths = lengths.clamp(min=1)

        embedded = self.embedding(x)
        # Packing makes the LSTM stop at each row's true length, so `hn` is the
        # state after the last real token rather than after the padding.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hn, _) = self.lstm(packed)
        out = self.fc(hn[-1])  # hn is (num_layers, batch, hidden); take the last layer
        return self.sigmoid(out)


# **Step 3: Train the Model**

In [None]:
from torch.utils.data import DataLoader, TensorDataset
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Start weight initialisation and batch shuffling from a fixed RNG state.
torch.manual_seed(42)

# Convert string labels to numerical labels
label_map = {'positive': 1, 'negative': 0}
y_train_numeric = [label_map[label] for label in y_train]
y_val_numeric = [label_map[label] for label in y_val]

train_data = TensorDataset(torch.tensor(X_train), torch.tensor(y_train_numeric))
val_data = TensorDataset(torch.tensor(X_val), torch.tensor(y_val_numeric))

train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
val_loader = DataLoader(val_data, batch_size=64)

model = SentimentLSTM(vocab_size=len(vocab)).to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Mean per-batch loss for each epoch; plotted later.
train_losses, val_losses = [], []

for epoch in range(10):
    model.train()
    epoch_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.float().to(device)
        optimizer.zero_grad()  # reset gradients left over from the previous step
        # squeeze(1) drops only the trailing size-1 dim, so a final batch holding
        # a single sample stays shape (1,) and still matches `yb`.
        preds = model(xb).squeeze(1)
        loss = criterion(preds, yb)
        loss.backward()  # backpropagation: compute gradients
        optimizer.step()  # update model parameters
        epoch_loss += loss.item()
    train_losses.append(epoch_loss / len(train_loader))

    # Validation
    model.eval()
    with torch.no_grad():
        val_loss = 0
        for xb, yb in val_loader:
            xb, yb = xb.to(device), yb.float().to(device)
            preds = model(xb).squeeze(1)
            loss = criterion(preds, yb)
            val_loss += loss.item()
        val_losses.append(val_loss / len(val_loader))

# **Step 4: Evaluate**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Convert string labels to numerical labels for the test set
label_map = {'positive': 1, 'negative': 0}
y_test_numeric = [label_map[label] for label in y_test]

test_data = TensorDataset(torch.tensor(X_test), torch.tensor(y_test_numeric))
test_loader = DataLoader(test_data, batch_size=64)

model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        # squeeze(1) keeps a 1-D array even when the batch holds one sample;
        # a bare squeeze() would give a 0-d array that cannot be iterated.
        probs = model(xb).squeeze(1).cpu().numpy()
        preds = (probs > 0.5).astype(int)  # threshold probabilities at 0.5
        all_preds.extend(preds.tolist())
        all_labels.extend(yb.tolist())

print("Accuracy:", accuracy_score(all_labels, all_preds))  # overall accuracy of the model
print("Precision:", precision_score(all_labels, all_preds))  # precision for the positive class
print("Recall:", recall_score(all_labels, all_preds))  # recall for the positive class
print("F1 Score:", f1_score(all_labels, all_preds))  # harmonic mean of precision and recall

# **Step 5: Plot Losses**

In [None]:
import matplotlib.pyplot as plt

# Draw both per-epoch loss curves on one set of axes.
fig, ax = plt.subplots()
ax.plot(train_losses, label='Train Loss')  # training loss vs. epoch
ax.plot(val_losses, label='Val Loss')  # validation loss vs. epoch
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.set_title('Loss vs Epochs')
ax.legend()
plt.show()
