In [1]:
import torch
from transformers import BertModel, BertTokenizer
import json
import pandas as pd
import gzip
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

In [3]:
# Step 1: Load Data from JSON File
def parse(path):
  """Yield one decoded JSON value per non-blank line of a gzip file.

  Args:
    path: Location of a gzip-compressed file holding one JSON document
      per line.

  Yields:
    Whatever ``json.loads`` returns for each non-blank line, in file order.

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # The context manager closes the handle when the generator is exhausted
  # or closed early, instead of leaving it open as a bare gzip.open() would.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # Blank lines carry no record; json.loads would raise on them.
      if not l.strip():
        continue
      yield json.loads(l)

def getDF(path):
  """Load every record yielded by ``parse(path)`` into a DataFrame.

  Each record is keyed by its 0-based position in the stream, and that
  position becomes the row label of the resulting frame.

  Args:
    path: File path handed straight to ``parse``.

  Returns:
    A ``pandas.DataFrame`` with one row per parsed record.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')


df = getDF('AMAZON_FASHION_5.json.gz')
# Keep only the rows whose reviewText is an actual str; any other value
# (for example a missing review) cannot be tokenised later on.
df = df.loc[df["reviewText"].apply(lambda text: isinstance(text, str))]

In [6]:
# Drop the 'vote' and 'image' columns in one call. errors='ignore' makes the
# cell safe to re-run: once the columns are gone a second execution is a
# no-op instead of raising KeyError.
df = df.drop(columns=['vote', 'image'], errors='ignore')

In [11]:
# Two parallel Python lists taken from the same frame: the review texts and
# the matching "overall" values.
X, labels = (df[column].tolist() for column in ("reviewText", "overall"))


In [12]:
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# Tokenise every review in a single call, with padding and truncation
# enabled, and get PyTorch tensors back.
encoded_inputs = tokenizer(
    X,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

In [13]:
# Build the targets from the DataFrame column rather than from `labels`
# itself, so re-running this cell always yields an (N, 1) float tensor and
# never stacks a second unsqueeze on an already-converted tensor.
labels = torch.tensor(df["overall"].tolist(), dtype=torch.float32).unsqueeze(1)
dataset = TensorDataset(encoded_inputs['input_ids'], encoded_inputs['attention_mask'], labels)

# 80/20 train/eval split; the eval share takes whatever the integer
# truncation of the train share leaves over.
train_size = int(0.8 * len(dataset))
eval_size = len(dataset) - train_size
# A dedicated, seeded generator makes the split identical on every run, so
# evaluation losses are comparable across kernel restarts.
split_generator = torch.Generator().manual_seed(42)
train_dataset, eval_dataset = random_split(dataset, [train_size, eval_size], generator=split_generator)


In [14]:
batch_size = 8
# Only the training loader shuffles; the evaluation loader keeps the
# dataset order (shuffle=False is the DataLoader default, spelled out here).
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(dataset=eval_dataset, batch_size=batch_size, shuffle=False)

In [15]:
# Single-output classification head (num_labels=1) on top of the pretrained
# encoder, moved to the selected device and optimised with AdamW.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
bert_model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)
bert_model = bert_model.to(device)
optimizer = torch.optim.AdamW(params=bert_model.parameters(), lr=2e-5)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
num_epochs = 10


def _run_epoch(model, loader, device, optimizer=None):
    """Make one pass over ``loader`` and return the mean loss per example.

    Args:
        model: Module called as
            ``model(input_ids=..., attention_mask=..., labels=...)`` whose
            result exposes a ``.loss`` tensor.
        loader: Iterable of ``(input_ids, attention_mask, label)`` tensor
            batches.
        device: Device every batch tensor is moved to before the forward pass.
        optimizer: When given, the pass trains the model (zero_grad, backward
            and step on every batch). When ``None``, the pass only evaluates
            and runs the forward pass under ``torch.no_grad()``.

    Returns:
        The batch losses weighted by batch size, divided by the number of
        examples seen.

    Raises:
        ValueError: If ``loader`` yields no examples.
    """
    training = optimizer is not None
    # train(False) is equivalent to eval(), so one call covers both modes.
    model.train(training)
    weighted_loss = 0.0
    examples_seen = 0
    for batch in loader:
        input_ids, attention_mask, label = [t.to(device) for t in batch]
        if training:
            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
            outputs.loss.backward()
            optimizer.step()
        else:
            with torch.no_grad():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=label)
        # Assumes outputs.loss is the mean over the batch (the library's
        # default for a one-label head) -- confirm if the loss is customised.
        # Weighting by batch size stops a short final batch from counting as
        # much as a full one in the epoch average.
        batch_examples = label.size(0)
        weighted_loss += outputs.loss.item() * batch_examples
        examples_seen += batch_examples
    if examples_seen == 0:
        raise ValueError("loader produced no examples; cannot average the loss")
    return weighted_loss / examples_seen


for epoch in range(num_epochs):
    # Training pass (updates weights), then evaluation pass (no gradients).
    avg_train_loss = _run_epoch(bert_model, train_loader, device, optimizer)
    avg_eval_loss = _run_epoch(bert_model, eval_loader, device)

    print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Evaluation Loss: {avg_eval_loss:.4f}")

Epoch [1/10], Training Loss: 0.4821, Evaluation Loss: 0.1159
Epoch [2/10], Training Loss: 0.0727, Evaluation Loss: 0.0747
Epoch [3/10], Training Loss: 0.0542, Evaluation Loss: 0.0643
Epoch [4/10], Training Loss: 0.0431, Evaluation Loss: 0.0421
Epoch [5/10], Training Loss: 0.0395, Evaluation Loss: 0.0393
Epoch [6/10], Training Loss: 0.0409, Evaluation Loss: 0.0461
Epoch [7/10], Training Loss: 0.0335, Evaluation Loss: 0.0388
Epoch [8/10], Training Loss: 0.0313, Evaluation Loss: 0.0400
Epoch [9/10], Training Loss: 0.0299, Evaluation Loss: 0.0634
Epoch [10/10], Training Loss: 0.0282, Evaluation Loss: 0.0270


In [18]:
review_text = "t was normal"

# Encode the single review in the same way as the training texts.
tokenized_review = tokenizer(
    review_text,
    padding=True,
    truncation=True,
    return_tensors='pt',
)

# Put both model inputs on the device the model lives on.
input_ids = tokenized_review['input_ids'].to(device)
attention_mask = tokenized_review['attention_mask'].to(device)

# Inference only: evaluation mode and no gradient tracking.
bert_model.eval()
with torch.no_grad():
    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)

# The model emits a single logit for the single review; read it as a float.
predicted_rating = outputs.logits.item()

print(f"Predicted rating: {predicted_rating:.2f}")

Predicted rating: 4.77
