In [2]:
import json
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


KeyboardInterrupt



In [None]:
# Load the review data.
# NOTE(review): the public Amazon review dumps are usually shipped as JSON
# Lines (one JSON object per line), which json.load() rejects with
# "Extra data". Try a single JSON document first and fall back to parsing
# line by line, so both layouts load.
# The file is opened as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open('AMAZON_FASHION.json', 'r', encoding='utf-8') as f:
    try:
        data = json.load(f)
    except json.JSONDecodeError:
        f.seek(0)  # json.load consumed the stream; rewind before re-reading
        data = [json.loads(line) for line in f if line.strip()]

# Create a DataFrame (one row per review record)
df = pd.DataFrame(data)


In [None]:
# Shift the ratings down by one so they become class indices (0 to 4).
# NOTE: this overwrites 'overall' in place, so running the cell a second time
# without reloading df shifts the labels again.
df['overall'] = df['overall'].astype(int).sub(1)


In [None]:
# Hold out 20% of the reviews for validation.
# Rows without review text are dropped first: a missing value is NaN (a
# float), and the tokenizer expects strings. The fixed random_state keeps the
# split identical across kernel restarts.
reviews = df.dropna(subset=['reviewText'])
train_texts, val_texts, train_labels, val_labels = train_test_split(
    reviews['reviewText'],
    reviews['overall'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Instantiate the tokenizer from the pretrained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)

In [None]:
# Tokenize the training split and then the validation split with identical
# settings (truncation and padding both enabled).
train_encodings, val_encodings = (
    tokenizer(list(split_texts), truncation=True, padding=True)
    for split_texts in (train_texts, val_texts)
)

In [None]:
class ReviewsDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: Mapping from feature name to a sequence that holds one
            entry per example (indexed by example position).
        labels: One label per example, in the same order as ``encodings``.
            Any iterable is accepted (list, NumPy array, pandas Series, ...).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Keep labels as a plain list so __getitem__ indexes by POSITION.
        # A pandas Series coming out of a shuffled split keeps its original
        # index, and series[idx] with an integer index is a label lookup:
        # it raises KeyError or returns a different row's label.
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else list(labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, with its label under 'labels'."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings and labels in a ReviewsDataset
train_dataset = ReviewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = ReviewsDataset(encodings=val_encodings, labels=val_labels)

# Pre-trained BERT with a 5-class sequence-classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=5,
)