In [None]:
# Show the GPU, driver and CUDA version visible to this runtime.
!nvidia-smi

Thu Oct 10 09:42:07 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [2]:
import os
import pandas as pd

# Verify the file path before attempting to parse it, so a missing file
# produces a clear error instead of a parser failure.
file_path = r"Toys_and_Games.json"
if not os.path.exists(file_path):
    raise FileNotFoundError(f"The file at {file_path} does not exist.")

# Load the dataset from the JSON Lines file (one JSON object per line).
try:
    data = pd.read_json(file_path, lines=True)
except ValueError as e:
    # Chain explicitly so the original parser traceback stays attached.
    raise ValueError(f"Error reading JSON Lines file: {e}") from e

# Explore the data (look at the first few rows)
print(data.head())   # Check the first few rows of the dataset
print(data.columns)  # Check the column names
# DataFrame.info() prints its own report and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
data.info()          # Get a summary of the data types and null values

                                    _id      reviewerID        asin  \
0  {'$oid': '5a13282b741a2384e879a620'}  A3C9CSW3TJITGT  0005069491   
1  {'$oid': '5a13282b741a2384e879a621'}  A31POTIYCKSZ9G  0076561046   
2  {'$oid': '5a13282b741a2384e879a622'}  A2GGHHME9B6W4O  0131358936   
3  {'$oid': '5a13282b741a2384e879a61f'}   AMEVO2LY6VEJA  0000191639   
4  {'$oid': '5a13282b741a2384e879a623'}  A1FSLDH43ORWZP  0133642984   

    reviewerName helpful                                         reviewText  \
0          Renee  [0, 0]  I love these felt nursery rhyme characters and...   
1  So CA Teacher  [0, 0]  I see no directions for its use. Therefore I h...   
2     Dalilah G.  [0, 0]  This is a great tool for any teacher using the...   
3  Nicole Soeder  [0, 0]  Great product, thank you! Our son loved the pu...   
4  Dayna English  [0, 0]  Although not as streamlined as the Algebra I m...   

   overall                                     summary  unixReviewTime  \
0        4  Charming cha

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import DataLoader, TensorDataset
import torch

# 1. Preprocess the data: keep only rows that have both the review text and
#    the label. Restricting dropna to the two columns that are actually used
#    avoids discarding reviews merely because an unrelated column is null,
#    and rebinding (instead of inplace=True) leaves the loaded frame object
#    itself unmodified.
data = data.dropna(subset=["reviewText", "class"])

X = data["reviewText"]
y = data['class']  # Assuming 'class' is your target label (fake or real review)

# 2. Prepare the vectorizer (TF-IDF)
vectorizer = TfidfVectorizer()

# 3. Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Vectorize the training and testing sets. The vocabulary is fitted on the
#    training split only and then reused for the test split.
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# 5. Define a generator function to yield mini-batches of data
def batch_generator(X, y, batch_size=32):
    """Yield consecutive ``(X_batch, y_batch)`` mini-batches.

    Args:
        X: 2-D sparse matrix supporting row slicing and ``.toarray()``.
        y: Labels supporting positional slicing, aligned row-for-row with ``X``.
        batch_size: Maximum number of rows per batch; must be positive.

    Yields:
        Tuples of (dense array for the batch rows, matching slice of ``y``).
        The last batch may be shorter than ``batch_size``; no empty batch is
        ever produced.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    n_rows = X.shape[0]
    # Stepping over start offsets (rather than counting floor(n / size) + 1
    # batches) avoids an empty trailing batch when n_rows is an exact
    # multiple of batch_size.
    for start in range(0, n_rows, batch_size):
        stop = start + batch_size
        # Densify only the current slice so the full matrix stays sparse.
        yield X[start:stop].toarray(), y[start:stop]

# 6. Turn both label splits into float32 column tensors of shape (n_samples, 1)
y_train_tensor, y_test_tensor = (
    torch.tensor(split_labels.values, dtype=torch.float32).view(-1, 1)
    for split_labels in (y_train, y_test)
)

# Use the batch_generator inside your training loop later


In [None]:
# Build the torch Logistic Regression model
import torch
import torch.nn as nn
import torch.optim as optim

class LogisticRegression(nn.Module):
    """Single linear layer followed by a sigmoid (binary logistic regression)."""

    def __init__(self, input_dim):
        """Create the model.

        Args:
            input_dim: Number of input features fed to the linear layer.
        """
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        """Return sigmoid probabilities with dimension 1 squeezed out.

        For input of shape (batch, input_dim) the result has shape (batch,);
        for input of shape (batch, 1, input_dim) it has shape (batch, 1).
        """
        logits = self.linear(x)
        probabilities = torch.sigmoid(logits)
        # Drop dimension 1 (only removed when its size is 1).
        return probabilities.squeeze(1)


# Instantiate the model: one input per column of the vectorized training matrix
input_dim = X_train_vectorized.shape[-1]
model = LogisticRegression(input_dim=input_dim)

In [None]:
# RUN MODEL ON GPU

from torch.utils.data import DataLoader, Dataset
from scipy.sparse import csr_matrix

# Custom Dataset to handle sparse data
class SparseTensorDataset(Dataset):
    """Dataset that keeps features sparse and densifies one row per lookup.

    Keeping the matrix in CSR form avoids materialising the whole feature
    matrix as a dense tensor; only the requested row is converted.
    """

    def __init__(self, X_sparse, y):
        """Store the features and labels.

        Args:
            X_sparse: 2-D sparse (or array-like) feature matrix, one row per sample.
            y: Labels exposing ``.values`` (e.g. a pandas Series), one per row.

        Raises:
            ValueError: If ``X_sparse`` and ``y`` do not have the same number
                of samples.
        """
        self.X_sparse = csr_matrix(X_sparse)  # Store sparse matrix
        self.y = torch.tensor(y.values, dtype=torch.float32).view(-1, 1)

        # Fail fast on misaligned inputs instead of surfacing later as an
        # IndexError mid-epoch or as silently ignored trailing labels.
        n_rows, n_labels = self.X_sparse.shape[0], self.y.shape[0]
        if n_rows != n_labels:
            raise ValueError(
                f"X_sparse has {n_rows} rows but y has {n_labels} labels"
            )

    def __len__(self):
        """Return the number of samples (rows of the sparse matrix)."""
        return self.X_sparse.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the sample at ``idx``.

        For an integer ``idx`` the features are a float32 tensor of shape
        (1, n_features) and the label a float32 tensor of shape (1,), so a
        default-collated batch is (batch, 1, n_features) / (batch, 1).
        """
        # Convert each sample on the fly to dense
        X_dense = torch.tensor(self.X_sparse[idx].toarray(), dtype=torch.float32)
        y = self.y[idx]
        return X_dense, y

# 1. Wrap each vectorized split in a SparseTensorDataset (rows are densified
#    lazily, so no dense TensorDataset is built)
train_dataset, test_dataset = (
    SparseTensorDataset(features, labels)
    for features, labels in (
        (X_train_vectorized, y_train),
        (X_test_vectorized, y_test),
    )
)

# 2. Mini-batch loaders: shuffle the training data, keep test order fixed
train_loader = DataLoader(dataset=train_dataset, batch_size=2000, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=2000, shuffle=False)

# 3. Pick the GPU when one is available and build a fresh model on that device
#    (this rebinds `model`)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = LogisticRegression(input_dim).to(device)




In [None]:
# # RUN MODEL ON CPU

# # Convert data to PyTorch tensors
# X_train_tensor = torch.tensor(X_train_vectorized.toarray(), dtype=torch.float32)
# y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)

# X_test_tensor = torch.tensor(X_test_vectorized.toarray(), dtype=torch.float32)
# y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

In [None]:
# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0   # sum of per-sample losses seen this epoch
    samples_seen = 0

    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        # Forward pass. Flatten predictions and targets to 1-D so the loss
        # compares them element-for-element regardless of whether either
        # side carries a trailing singleton dimension.
        outputs = model(X_batch).reshape(-1)
        targets = y_batch.reshape(-1)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by batch size so the
        # epoch figure is a true per-sample average even with a short last batch.
        batch_len = targets.size(0)
        running_loss += loss.item() * batch_len
        samples_seen += batch_len

    # Report the epoch-average loss (not just the final mini-batch), and skip
    # reporting entirely if the loader produced no samples.
    if (epoch + 1) % 10 == 0 and samples_seen:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / samples_seen:.4f}')


In [None]:
# Evaluation loop: accuracy of thresholded predictions on the test loader
model.eval()
with torch.no_grad():  # no gradients needed for evaluation
    correct = 0
    total = 0
    for X_batch, y_batch in test_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        outputs = model(X_batch)
        # Flatten both sides to 1-D before comparing: an (N,) vs (N, 1)
        # comparison would broadcast to (N, N) and silently inflate `correct`.
        predictions = outputs.reshape(-1).round()
        targets = y_batch.reshape(-1)
        total += targets.size(0)
        correct += (predictions == targets).sum().item()

    # Guard against an empty test loader instead of raising ZeroDivisionError.
    accuracy = correct / total if total else float('nan')
    print(f'Test Accuracy: {accuracy:.4f}')