In [1]:
import os
import pandas as pd

# # Verify the file path
# file_path = r"Toys_and_Games.json"
# if not os.path.exists(file_path):
#     raise FileNotFoundError(f"The file at {file_path} does not exist.")

# # Read the file in chunks
# chunk_size = 10000  # Adjust the chunk size based on your memory constraints
# chunks = pd.read_json(file_path, lines=True, chunksize=chunk_size)

# # List to store processed chunks
# processed_chunks = []

# # Process each chunk
# for chunk in chunks:
#     # Drop all columns except for 'reviewText' and 'class'
#     chunk = chunk[['reviewText', 'class']]

#     # Perform necessary operations on each chunk
#     print(chunk.head())   # Check the first few rows of the chunk
#     print(chunk.columns)  # Check the column names
#     print(chunk.info())   # Get a summary of the data types and null values

#     # Example: Convert data types to more memory-efficient types
#     chunk['class'] = chunk['class'].astype('category')

#     # Append the processed chunk to the list
#     processed_chunks.append(chunk)

# # Combine all processed chunks into a single DataFrame
# final_df = pd.concat(processed_chunks, ignore_index=True)

# # Save the final DataFrame to a new file
# final_df.to_csv('processed_reviews.csv', index=False)

# Read the CSV file into a DataFrame
final_df = pd.read_csv('processed_reviews.csv')

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import DataLoader, Dataset
import torch
import numpy as np

# Preprocess the data
final_df.dropna(inplace=True)
X = final_df["reviewText"]
y = final_df['class']  # Assuming 'class' is your target label (fake or real review)

# 1. Use TF-IDF with a limited number of features to reduce memory usage
vectorizer = TfidfVectorizer(max_features=15000)  # Limit to 15000 features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Vectorize the training and testing sets
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Custom dataset class to handle sparse tensors
class SparseDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        X_row = self.X[idx].toarray().squeeze()
        y_row = self.y[idx]
        return torch.tensor(X_row, dtype=torch.float32), torch.tensor(y_row, dtype=torch.float32)

# Create datasets
train_dataset = SparseDataset(X_train_vectorized, y_train.values)
test_dataset = SparseDataset(X_test_vectorized, y_test.values)

# Custom collate function to handle sparse tensors
def sparse_collate_fn(batch):
    X_batch, y_batch = zip(*batch)
    X_batch = torch.stack(X_batch)
    y_batch = torch.stack(y_batch).view(-1, 1)  # Reshape y_batch to have the same shape as y_pred
    return X_batch.cuda(), y_batch.cuda()

# Use DataLoader for efficient batch processing on GPU with custom collate_fn
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=sparse_collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=sparse_collate_fn)

# Model (assuming there's a simple neural network to classify reviews)
class SpamReviewClassifier(torch.nn.Module):
    def __init__(self, input_size):
        super(SpamReviewClassifier, self).__init__()
        self.fc1 = torch.nn.Linear(input_size, 128)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(128, 1)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return self.sigmoid(x)

# Initialize the model and move it to GPU
model = SpamReviewClassifier(input_size=15000).cuda()

# Loss and optimizer
criterion = torch.nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {loss.item()}")

Epoch 1/5, Loss: 0.0431184396147728
Epoch 2/5, Loss: 0.07764448970556259
Epoch 3/5, Loss: 0.2717161774635315
Epoch 4/5, Loss: 0.4241695702075958
Epoch 5/5, Loss: 0.1997235119342804


In [9]:
# Evaluation function
def evaluate(model, test_loader, criterion):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            y_pred = model(X_batch)
            test_loss += criterion(y_pred, y_batch).item()
            pred = (y_pred > 0.5).float()  # Convert probabilities to binary predictions
            correct += pred.eq(y_batch).sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = correct / len(test_loader.dataset)
    return test_loss, accuracy

# Evaluate the model on the test dataset
test_loss, test_accuracy = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

Test Loss: 0.0066, Test Accuracy: 0.9187
