In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd

In [19]:
# Load your dataset
# Assuming your dataset is in a CSV file
df = pd.read_csv('reviews.csv')
df.head()

Unnamed: 0,text,rating
0,Visited the place with family friends to savou...,2
1,We went there for a friend's bday. The food wa...,2
2,Sindhi Sweets in sector 11 in Panchkula is ver...,2
3,Me and my sister visited there for some snacki...,2
4,"Conveniently located in the sector 11 market, ...",2


In [20]:
# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df['rating']

1950    2
602     1
986     2
596     2
693     2
       ..
1638    0
1095    2
1130    2
1294    2
860     2
Name: rating, Length: 1928, dtype: int64

In [21]:

# Define a custom dataset class for loading data
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [22]:
# Set up BERT model and tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [23]:
# Create DataLoader for training and testing sets
max_len = 128
batch_size = 32

train_dataset = CustomDataset(train_df['text'].values, train_df['rating'].values, tokenizer, max_len)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

test_dataset = CustomDataset(test_df['text'].values, test_df['rating'].values, tokenizer, max_len)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [24]:
# Set up optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()




In [25]:
# Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

In [26]:
# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.8921161825726142


In [27]:
# Save model and tokenizer
#model.save_pretrained('/home/uditrawat/Projects/Projects-Udit/AI-Projects/P-TrueCount/TrueCount/apps/save/model')
#tokenizer.save_pretrained('/home/uditrawat/Projects/Projects-Udit/AI-Projects/P-TrueCount/TrueCount/apps/save/tokenizer')

('/home/uditrawat/Projects/Projects-Udit/AI-Projects/P-TrueCount/TrueCount/apps/save/tokenizer/tokenizer_config.json',
 '/home/uditrawat/Projects/Projects-Udit/AI-Projects/P-TrueCount/TrueCount/apps/save/tokenizer/special_tokens_map.json',
 '/home/uditrawat/Projects/Projects-Udit/AI-Projects/P-TrueCount/TrueCount/apps/save/tokenizer/vocab.txt',
 '/home/uditrawat/Projects/Projects-Udit/AI-Projects/P-TrueCount/TrueCount/apps/save/tokenizer/added_tokens.json')