In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
import string

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

# Read the first 200,000 lines of the review file (one JSON object per line),
# derive a binary label from the star rating (<= 2 stars -> 0, otherwise 1)
# and keep only the review text and that label.
data_path = '/kaggle/input/yelp-dataset/yelp_academic_dataset_review.json'
data = (
    pd.read_json(data_path, lines=True, nrows=200000)
    .assign(target=lambda frame: frame['stars'].map(lambda stars: 0 if stars <= 2 else 1))
    [['text', 'target']]
)

# Size of the rarer class; every class is downsampled to this many rows.
min_class_size = int(data['target'].value_counts().min())  # 38,038

# Draw the same number of rows from each class with a fixed seed, then
# renumber the rows of the combined frame from 0.
data_balanced = (
    data
    .groupby('target', group_keys=False)[['text', 'target']]
    .apply(lambda group: group.sample(n=min_class_size, random_state=42))
    .reset_index(drop=True)
)
print(f"Balanced dataset size: {len(data_balanced)}")

# Character vocabulary: 70 symbols mapped to indices 1..70. Index 0 is reserved
# for padding and for characters that are not in the alphabet.
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{} \n"
char_to_idx = {char: idx + 1 for idx, char in enumerate(alphabet)}
vocab_size = len(alphabet) + 1  # 71


def text_to_indices(text, max_len=1014):
    """Encode *text* as a fixed-length list of character indices.

    The input is coerced to ``str`` first, so non-string values (numbers,
    missing values) are encoded from their string form instead of raising.
    The text is lower-cased, truncated to ``max_len`` characters, mapped
    through ``char_to_idx`` (unknown characters become 0) and right-padded
    with 0 up to ``max_len``.

    Args:
        text: Value to encode; anything accepted by ``str()``.
        max_len: Length of the returned list.

    Returns:
        A list of ``max_len`` ints, each in the range 0..69.
    """
    # Coerce before lower-casing: the previous `str(text.lower())` raised
    # AttributeError for any value that was not already a string.
    lowered = str(text).lower()[:max_len]
    # Indices are capped at 69, so the newline character (index 70) shares
    # the index of the space character (69).
    indices = [min(char_to_idx.get(ch, 0), 69) for ch in lowered]
    return indices + [0] * (max_len - len(indices))
# Encode every review as a fixed-length list of character indices.
data_balanced['indices'] = data_balanced['text'].map(text_to_indices)

# Hold out 20% of the balanced reviews (fixed seed) for evaluation.
X = data_balanced['indices'].tolist()
y = data_balanced['target'].tolist()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

class TextDataset(Dataset):
    """Tensor-backed dataset pairing index sequences with their labels.

    ``X`` is converted to a ``torch.long`` tensor and ``y`` to a
    ``torch.float`` tensor once, at construction time.
    """

    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.long)
        targets = torch.tensor(y, dtype=torch.float)
        self.X, self.y = features, targets

    def __len__(self):
        """Return the number of samples (length of the ``X`` tensor)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample = self.X[idx]
        label = self.y[idx]
        return sample, label

# Wrap both splits as tensor datasets and serve them in mini-batches of 64;
# only the training loader reshuffles its samples.
train_dataset, test_dataset = TextDataset(X_train, y_train), TextDataset(X_test, y_test)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)

class ImprovedCharCNN(nn.Module):
    """Character-level CNN that emits one raw logit per input sequence.

    Layout: embedding -> six 1-D convolutions with ReLU (max-pooling after
    the 1st, 2nd and 6th) -> three fully connected layers with dropout
    before each. The output has shape ``(batch, 1)`` and is NOT passed
    through a sigmoid.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each character embedding.
        num_filters: Number of channels produced by every convolution.
        max_len: Length of the index sequences passed to ``forward``. It
            determines the input width of the first fully connected layer;
            the default of 1014 yields the previously hard-coded
            ``num_filters * 37``.

    Raises:
        ValueError: If ``max_len`` is too short to leave at least one
            position after the three pooling stages.
    """

    def __init__(self, vocab_size=71, embed_dim=128, num_filters=1024, max_len=1014):
        super().__init__()

        # The convolutions are padded to preserve length, so only the three
        # pooling stages (kernel 3, stride 3, no padding) shrink the sequence;
        # each one floor-divides its length by 3.
        pooled_len = max_len
        for _ in range(3):
            pooled_len //= 3
        if pooled_len < 1:
            raise ValueError(
                f"max_len must be at least 27 to survive three pooling stages, got {max_len}"
            )

        # Layers are created in the same order as before so that a given
        # torch.manual_seed produces the same initial weights.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv1d(embed_dim, num_filters, kernel_size=7, padding=3)
        self.conv2 = nn.Conv1d(num_filters, num_filters, kernel_size=7, padding=3)
        self.conv3 = nn.Conv1d(num_filters, num_filters, kernel_size=3, padding=1)
        self.conv4 = nn.Conv1d(num_filters, num_filters, kernel_size=3, padding=1)
        self.conv5 = nn.Conv1d(num_filters, num_filters, kernel_size=3, padding=1)
        self.conv6 = nn.Conv1d(num_filters, num_filters, kernel_size=3, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=3, stride=3)
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(num_filters * pooled_len, 2048)
        self.fc2 = nn.Linear(2048, 2048)
        self.fc3 = nn.Linear(2048, 1)

    def forward(self, x):
        """Map a ``(batch, max_len)`` tensor of character indices to logits.

        Returns:
            A ``(batch, 1)`` tensor of raw (pre-sigmoid) scores.
        """
        # Conv1d expects (batch, channels, length), so move the embedding
        # dimension in front of the sequence dimension.
        x = self.embedding(x).transpose(1, 2)
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        for conv in (self.conv3, self.conv4, self.conv5, self.conv6):
            x = torch.relu(conv(x))
        x = self.pool(x)
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(self.dropout(x)))
        x = torch.relu(self.fc2(self.dropout(x)))
        return self.fc3(self.dropout(x))

# Train one ImprovedCharCNN per seed. Each run uses Adam with a step-decayed
# learning rate, checkpoints the weights of its best epoch, stops early after
# `patience` epochs without improvement, and contributes its BEST weights to
# `models`.
#
# NOTE(review): early stopping and checkpoint selection are driven by accuracy
# on the test split, so the reported test accuracy is optimistically biased;
# a separate validation split would remove that leak.
models = []
seeds = [42, 43, 44]
for i, seed in enumerate(seeds):
    print(f"\nTraining Model {i+1} with seed {seed}")
    torch.manual_seed(seed)
    model = ImprovedCharCNN(vocab_size=vocab_size).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    # Halve the learning rate every 3 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.5)

    checkpoint_path = f'model_{i+1}_seed_{seed}.pth'
    has_checkpoint = False
    best_acc = 0.0
    patience = 3
    trigger = 0

    for epoch in range(15):
        model.train()
        total_loss = 0
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            # squeeze(1) drops only the trailing logit dimension. A bare
            # squeeze() would also drop the batch dimension when the last
            # batch holds a single sample, and the loss would then reject
            # the shape mismatch with `labels`.
            outputs = model(inputs).squeeze(1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)

        model.eval()
        test_preds, test_true = [], []
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs).squeeze(1)
                preds = (torch.sigmoid(outputs) > 0.5).long()
                test_preds.extend(preds.cpu().tolist())
                test_true.extend(labels.cpu().tolist())
        test_acc = accuracy_score(test_true, test_preds)
        print(f'Model {i+1}, Epoch {epoch+1}, Loss: {avg_loss:.4f}, Test Acc: {test_acc:.4f}')

        if test_acc > best_acc:
            best_acc = test_acc
            trigger = 0
            torch.save(model.state_dict(), checkpoint_path)
            has_checkpoint = True
        else:
            trigger += 1
            if trigger >= patience:
                print(f'Model {i+1} Early stopping at epoch {epoch+1}, Best Acc: {best_acc:.4f}')
                break

        scheduler.step()

    # The loop ends with the weights of the LAST epoch, which by construction
    # of the early-stopping rule are not the best ones. Reload the best
    # checkpoint so downstream code uses the weights that were selected.
    if has_checkpoint:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    model.eval()
    models.append(model)

# Evaluate the ensemble on the test loader: average the sigmoid probabilities
# of all trained models and threshold the mean at 0.5.
print("\nEvaluating Ensemble...")

# Make sure dropout is disabled in every member instead of relying on
# whatever mode earlier code left the models in.
for member in models:
    member.eval()

ensemble_preds, test_true = [], []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        # squeeze(1) removes only the logit dimension; a bare squeeze() would
        # also drop the batch dimension for a single-sample batch, after
        # which tolist() returns a scalar and extend() fails.
        outputs = [torch.sigmoid(member(inputs).squeeze(1)) for member in models]
        avg_outputs = torch.mean(torch.stack(outputs), dim=0)
        preds = (avg_outputs > 0.5).long()
        ensemble_preds.extend(preds.cpu().tolist())
        # Labels are only collected for scoring, so they never need to leave
        # the CPU.
        test_true.extend(labels.tolist())

ensemble_acc = accuracy_score(test_true, ensemble_preds)
ensemble_precision = precision_score(test_true, ensemble_preds)
ensemble_recall = recall_score(test_true, ensemble_preds)
print(f'Ensemble Test Accuracy: {ensemble_acc:.4f}')
print(f'Ensemble Precision: {ensemble_precision:.4f}, Recall: {ensemble_recall:.4f}')

Using device: cuda
Balanced dataset size: 76076

Training Model 1 with seed 42
Model 1, Epoch 1, Loss: 0.6364, Test Acc: 0.8352
Model 1, Epoch 2, Loss: 0.2985, Test Acc: 0.8993
Model 1, Epoch 3, Loss: 0.2420, Test Acc: 0.9088
Model 1, Epoch 4, Loss: 0.1831, Test Acc: 0.8910
Model 1, Epoch 5, Loss: 0.1436, Test Acc: 0.9055
Model 1, Epoch 6, Loss: 0.1100, Test Acc: 0.8941
Model 1 Early stopping at epoch 6, Best Acc: 0.9088

Training Model 2 with seed 43
Model 2, Epoch 1, Loss: 0.4883, Test Acc: 0.8583
Model 2, Epoch 2, Loss: 0.2679, Test Acc: 0.9050
Model 2, Epoch 3, Loss: 0.2259, Test Acc: 0.9063
Model 2, Epoch 4, Loss: 0.1625, Test Acc: 0.9120
Model 2, Epoch 5, Loss: 0.1200, Test Acc: 0.9040
Model 2, Epoch 6, Loss: 0.0854, Test Acc: 0.9053
Model 2, Epoch 7, Loss: 0.0335, Test Acc: 0.8968
Model 2 Early stopping at epoch 7, Best Acc: 0.9120

Training Model 3 with seed 44
Model 3, Epoch 1, Loss: 0.6434, Test Acc: 0.8557
Model 3, Epoch 2, Loss: 0.2866, Test Acc: 0.9039
Model 3, Epoch 3, Lo