In [1]:
import pandas as pd
import re

# Read the preprocessed, class-balanced review data from disk.
df_balanced = pd.read_csv(filepath_or_buffer="df_balanced.csv")

# Preview the first five rows to sanity-check the columns and values.
df_balanced.head(n=5)


Unnamed: 0.1,Unnamed: 0,book name,review title,reviewer,reviewer rating,review description,is_verified,date,timestamp,ASIN,Author,title_group,text,clean_text,sentiment,label,label_id
0,832,Rich Dad Poor Dad: What the Rich Teach Their K...,This book changed my life,sherritha,5,groundbreaking book that explores the stark co...,True,20-10-2023,"Reviewed in the United States October 20, 2023",1612681131,Robert Kiyosaki,Other,This book changed my life. groundbreaking book...,this book changed my life groundbreaking book ...,book changed life groundbreaking book explores...,not recommended,
1,764,Paint by Sticker Kids: Christmas: Create 10 Pi...,takes lots of time,Julie,4,"This is a fun activity. However, for small 5 y...",True,26-04-2023,"Reviewed in the United States April 26, 2023",152350675X,Workman Publishing,Other,takes lots of time. This is a fun activity. Ho...,takes lots of time this is a fun activity howe...,takes lots time fun activity however small yr ...,neutral,2.0
2,205,A Little Life,Heartbreaking but beautiful,Amazon Customer,5,I rarely come across a book that lingers with ...,True,02-10-2023,"Reviewed in the United States October 2, 2023",804172706,Hanya Yanagihara,Other,Heartbreaking but beautiful. I rarely come acr...,heartbreaking but beautiful i rarely come acro...,heartbreaking beautiful rarely come across boo...,neutral,2.0
3,363,Hundred Years' War on Palestine,Must read in these times,Tina,5,This book is required reading to get the real ...,True,04-11-2023,"Reviewed in the United States November 4, 2023",1250787653,Rashid Khalidi,Other,Must read in these times. This book is require...,must read in these times this book is required...,must read times book required reading get real...,neutral,2.0
4,193,A Court of Thorns and Roses (A Court of Thorns...,Beautiful Cover,Ohlookabunny,4,"Having heard much raving about this series, I ...",True,19-07-2023,"Reviewed in the United States July 19, 2023",1635575567,Sarah J. Maas,Other,Beautiful Cover. Having heard much raving abou...,beautiful cover having heard much raving about...,beautiful cover heard much raving series pokin...,recommended,0.0


In [2]:
import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
from torch.serialization import safe_globals
from transformers.tokenization_utils_base import BatchEncoding

# Load balanced dataset
df_balanced_loaded = pd.read_csv('df_balanced.csv')

# Column roles, as shown by the dataset preview: 'sentiment' holds the cleaned
# review text, 'label' the class name and 'label_id' the numeric class id.
TEXT_COL = 'sentiment'
LABEL_COL = 'label'
LABEL_ID_COL = 'label_id'

# 'recommended' -> 0 and 'neutral' -> 2 match the ids already stored in the CSV.
# NOTE(review): 'not recommended' rows have an empty label_id in the CSV; 1 is
# assumed here because it is the only unused id -- confirm against the
# preprocessing step that created label_id.
LABEL2ID = {'recommended': 0, 'not recommended': 1, 'neutral': 2}


def _resolve_label_ids(frame):
    """Return an int64 Series of class ids with missing ``label_id`` values filled.

    Ids already present in ``label_id`` are kept. Missing ones are filled by
    mapping the normalised ``label`` text through ``LABEL2ID``.

    Raises:
        ValueError: if a row has neither a ``label_id`` nor a known ``label``.
    """
    ids_from_text = (
        frame[LABEL_COL].astype(str).str.strip().str.lower().map(LABEL2ID)
    )
    label_ids = frame[LABEL_ID_COL].fillna(ids_from_text)
    still_missing = label_ids.isna()
    if still_missing.any():
        unknown = sorted(frame.loc[still_missing, LABEL_COL].astype(str).unique())
        raise ValueError(f"Cannot assign a class id to labels: {unknown}")
    # NaN forces the column to float on load; class indices must be integers.
    return label_ids.astype('int64')


def _encodings_to_frame(encodings, labels):
    """Build a DataFrame with one row per sample from tokenizer output and labels."""
    return pd.DataFrame({
        'input_ids': encodings['input_ids'].tolist(),
        'attention_mask': encodings['attention_mask'].tolist(),
        'token_type_ids': encodings['token_type_ids'].tolist(),
        'label': labels.tolist()
    })


# Rows without text cannot be tokenized (the tokenizer only accepts strings),
# so drop them before splitting instead of failing inside the tokenizer.
df_model = df_balanced_loaded.dropna(subset=[TEXT_COL])
texts = df_model[TEXT_COL].astype(str)
label_ids = _resolve_label_ids(df_model)

# Split data; stratify so both splits keep the dataset's class balance.
X_train, X_test, y_train, y_test = train_test_split(
    texts, label_ids, test_size=0.2, random_state=42, stratify=label_ids
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize with token_type_ids included; both splits share the same settings.
tokenize_kwargs = dict(
    truncation=True,
    padding=True,
    return_tensors="pt",
    return_token_type_ids=True
)
train_encodings = tokenizer(list(X_train), **tokenize_kwargs)
test_encodings = tokenizer(list(X_test), **tokenize_kwargs)

# Convert labels to integer (long) tensors, as expected for class indices.
train_labels = torch.tensor(y_train.values, dtype=torch.long)
test_labels = torch.tensor(y_test.values, dtype=torch.long)

# Convert to DataFrames
train_df = _encodings_to_frame(train_encodings, train_labels)
test_df = _encodings_to_frame(test_encodings, test_labels)

# Save to CSV
train_df.to_csv('train_tokenized.csv', index=False)
test_df.to_csv('test_tokenized.csv', index=False)




In [3]:
# Wrap the tokenized inputs and labels in a Dataset that the BERT model can train on
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: Mapping (anything exposing ``.items()``) from field name to
            a sequence that can be indexed by sample position.
        labels: Sequence of labels, indexable by sample position; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is taken from the labels, not the encodings.
        return len(self.labels)

    def __getitem__(self, idx):
        # Collect the idx-th entry of every encoding field, then attach the
        # label under the "labels" key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample["labels"] = self.labels[idx]
        return sample

In [4]:
# Wrap each split's encodings and label tensor in a ReviewDataset.
train_dataset, test_dataset = (
    ReviewDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (test_encodings, test_labels),
    )
)


In [5]:
# Persist both dataset objects to disk.
# NOTE(review): this serialises the whole ReviewDataset object rather than
# plain tensors, so loading presumably needs the class definition available
# and, on recent PyTorch versions, torch.load(..., weights_only=False) -- confirm.
for dataset_obj, out_path in (
    (train_dataset, "train_dataset.pt"),
    (test_dataset, "test_dataset.pt"),
):
    torch.save(dataset_obj, out_path)
