In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datasets import load_dataset
from torch.utils.data import TensorDataset, DataLoader

In [2]:
# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
class CNN(nn.Module):
    def __init__(self, num_labels, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_labels, hidden_dim)
        self.conv = nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, padding=1)
        self.fc = nn.Linear(hidden_dim, num_labels)

    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        embedded = embedded.permute(0, 2, 1)  # Reshape for convolution
        conv_output = self.conv(embedded)
        conv_output = F.relu(conv_output)
        pooled_output, _ = torch.max(conv_output, dim=2)  # Max pooling
        logits = self.fc(pooled_output)
        return logits

In [4]:
# Load the dataset
dataset = load_dataset("amazon_us_reviews", "Apparel_v1_00")
train_data = dataset['train']
train_data = train_data.select(range(100000))
df = train_data.to_pandas()
df = df[['customer_id', 'review_headline', 'review_body', 'star_rating']]
df.columns = ['customer_id', 'review_headline', 'review_body', 'star_rating']
df.set_index('customer_id', inplace=True)

Found cached dataset amazon_us_reviews (/home/z123010/.cache/huggingface/datasets/amazon_us_reviews/Apparel_v1_00/0.1.0/17b2481be59723469538adeb8fd0a68b0ba363bbbdd71090e72c325ee6c7e563)


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# Map sentiment labels to numerical values
df['sentiment'] = df['star_rating'].map({5: 'good', 4: 'good', 3: 'neutral', 2: 'bad', 1: 'bad'})
possible_labels = df.sentiment.unique()
label_dict = {possible_label: index for index, possible_label in enumerate(possible_labels)}
df['label'] = df.sentiment.replace(label_dict)

In [6]:
# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.label.values,
    test_size=0.15,
    random_state=17,
    stratify=df.label.values
)
df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [7]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_data_train_headline = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].review_headline.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='longest',
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

encoded_data_train_body = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].review_body.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='longest',
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

input_ids_train_headline = encoded_data_train_headline['input_ids']
attention_masks_train_headline = encoded_data_train_headline['attention_mask']

input_ids_train_body = encoded_data_train_body['input_ids']
attention_masks_train_body = encoded_data_train_body['attention_mask']

input_ids_train = torch.cat((input_ids_train_headline, input_ids_train_body), dim=1)
attention_masks_train = torch.cat((attention_masks_train_headline, attention_masks_train_body), dim=1)

labels_train = torch.tensor(df[df.data_type=='train'].label.values)


encoded_data_val_headline = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].review_headline.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='longest',
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

encoded_data_val_body = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].review_body.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='longest',
    max_length=256,
    truncation=True,
    return_tensors='pt'
)

input_ids_val_headline = encoded_data_val_headline['input_ids']
attention_masks_val_headline = encoded_data_val_headline['attention_mask']

input_ids_val_body = encoded_data_val_body['input_ids']
attention_masks_val_body = encoded_data_val_body['attention_mask']

input_ids_val = torch.cat((input_ids_val_headline, input_ids_val_body), dim=1)
attention_masks_val = torch.cat((attention_masks_val_headline, attention_masks_val_body), dim=1)

labels_val = torch.tensor(df[df.data_type=='val'].label.values)

In [8]:
# Create data loaders
train_data = data.TensorDataset(input_ids_train, attention_masks_train, labels_train)
val_data = data.TensorDataset(input_ids_val, attention_masks_val, labels_val)

batch_size = 8
train_loader = data.DataLoader(train_data, batch_size=batch_size)
val_loader = data.DataLoader(val_data, batch_size=batch_size)

In [9]:
# Initialize the RCNN model
hidden_dim = 128  # Choose an appropriate hidden dimension
model = CNN(len(possible_labels), hidden_dim)
model.to(device)

CNN(
  (embedding): Embedding(3, 128)
  (conv): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (fc): Linear(in_features=128, out_features=3, bias=True)
)

In [10]:
# Define the optimizer and loss function
optimizer = optim.AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

In [11]:
import torch

torch.cuda.empty_cache()
torch.backends.cuda.max_split_size_mb = 2200

In [12]:
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False
torch.backends.cudnn.allow_tf32 = True
data = torch.randn([1, 3, 470, 446], dtype=torch.float, device='cuda', requires_grad=True)
net = torch.nn.Conv2d(3, 64, kernel_size=[7, 7], padding=[0, 0], stride=[1, 1], dilation=[1, 1], groups=1)
net = net.cuda().float()
out = net(data)
out.backward(torch.randn_like(out))
torch.cuda.synchronize()
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [13]:
# Training loop
epochs = 5
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    train_acc = 0.0

    for batch in train_loader:
        input_ids = batch[0].to(device)
        labels = batch[2].to(device)  # Update the index to access labels

        optimizer.zero_grad()

        outputs = model(input_ids.squeeze(1))
        _, predicted_labels = torch.max(outputs, 1)
        loss = loss_fn(outputs, labels)

        loss.backward()
        optimizer.step()

        train_loss += loss.item() * input_ids.size(0)
        train_acc += accuracy_score(predicted_labels.cpu().numpy(), labels.cpu().numpy()) * input_ids.size(0)

    epoch_train_loss = train_loss / len(X_train)
    epoch_train_acc = train_acc / len(X_train)

    # Validation loop
    model.eval()
    val_loss = 0.0
    val_acc = 0.0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch[0].to(device)
            labels = batch[2].to(device)  # Update the index to access labels

            outputs = model(input_ids.squeeze(1))
            _, predicted_labels = torch.max(outputs, 1)
            loss = loss_fn(outputs, labels)

            val_loss += loss.item() * input_ids.size(0)
            val_acc += accuracy_score(predicted_labels.cpu().numpy(), labels.cpu().numpy()) * input_ids.size(0)

    epoch_val_loss = val_loss / len(X_val)
    epoch_val_acc = val_acc / len(X_val)

    print(f'Epoch {epoch+1}/{epochs}')
    print(f'Train Loss: {epoch_train_loss:.4f} | Train Accuracy: {epoch_train_acc:.4f}')
    print(f'Val Loss: {epoch_val_loss:.4f} | Val Accuracy: {epoch_val_acc:.4f}')
    print('-------------------------------------------')

../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [46,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [46,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [46,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [46,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [46,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [46,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [46,0,0], t

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
