In [1]:
from sklearn.preprocessing import  LabelEncoder
#from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import pickle
import pandas as pd

In [5]:
# Load the review dataset from the working directory. Later cells read the
# "reviews_lemmatized" and "sentiment_category" columns from this frame.
data = pd.read_csv("data.csv")

In [7]:


# Prepare the data for BERT
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    `encodings` is a mapping from feature name to a per-sample indexable
    sequence; `labels` is an indexable sequence holding one label per sample.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample `idx` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        # The label is forced to int64 regardless of the stored element type.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

In [13]:
# Fine-tune the BERT model
def train_model(
    data,
    text_column="reviews_lemmatized",
    label_column="sentiment_category",
    model_name='bert-base-uncased',
    num_train_epochs=3,
    batch_size=8,
    max_length=512,
    output_dir='./results',
    logging_dir='./logs',
):
    """Fine-tune a BERT sequence classifier on a DataFrame of labelled texts.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain `text_column` and `label_column`.
    text_column : str
        Column holding the input texts.
    label_column : str
        Column holding the class labels (encoded with ``LabelEncoder``).
    model_name : str
        Pretrained checkpoint used for both the tokenizer and the model.
    num_train_epochs : int
        Number of passes over the training set.
    batch_size : int
        Per-device batch size used for both training and evaluation.
    max_length : int
        Token length at which inputs are truncated.
    output_dir, logging_dir : str
        Directories handed to ``TrainingArguments``.

    Returns
    -------
    tuple
        ``(tokenizer, model, label_encoder)``; the model is returned in
        evaluation mode.

    Raises
    ------
    ValueError
        If no row has both a text and a label.
    """
    # A missing text would be turned into the literal string "nan" by
    # astype(str) below and then trained on as if it were a real review; a
    # missing label mixes floats with the other label values. Drop both.
    usable = data.dropna(subset=[text_column, label_column])
    if usable.empty:
        raise ValueError(
            f"No rows with both '{text_column}' and '{label_column}' present"
        )

    tokenizer = BertTokenizer.from_pretrained(model_name)

    # Ensure labels are encoded as consecutive integers starting at 0.
    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(usable[label_column].tolist())

    # One output unit per distinct class seen by the encoder.
    num_labels = len(label_encoder.classes_)

    # Initialize the model with the correct number of labels.
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    texts = usable[text_column].astype(str).tolist()
    encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length)
    dataset = CustomDataset(encodings, labels)

    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        # NOTE(review): 500 warmup steps may exceed the total number of
        # optimizer steps on a small dataset — confirm against the step count.
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir=logging_dir,
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
    )

    trainer.train()

    # Training leaves the module in train mode (dropout active). Switch to
    # eval mode so the returned model gives deterministic predictions.
    model.eval()

    return tokenizer, model, label_encoder

In [15]:
# Run the fine-tuning and keep the tokenizer, the trained model and the
# fitted label encoder for the cells below.
tokenizer, model, label_encoder = train_model(data=data)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,1.2054
20,1.1222
30,0.9366
40,0.789
50,0.5421
60,0.5028
70,0.4025
80,0.3876
90,0.6136
100,0.3792


In [17]:
# Save the model
# Write tokenizer, source DataFrame, model and label encoder into a single
# pickled tuple; the load cell unpacks them in this same order.
# NOTE(review): a pickle of the model is tied to the installed library
# versions — consider save_pretrained() for a portable artifact.
with open('BERT.pkl', mode='wb') as model_file:
    pickle.dump(
        (tokenizer, data, model, label_encoder),
        model_file,
    )

In [19]:
# Restore the four objects from the pickled tuple, rebinding the notebook
# names `tokenizer`, `data`, `model` and `label_encoder`.
# SECURITY: pickle.load executes arbitrary code from the file — only load a
# BERT.pkl that was produced locally and is trusted.
with open('BERT.pkl', mode='rb') as model_file:
    (tokenizer, data, model, label_encoder) = pickle.load(model_file)

In [21]:
def predict(text, tokenizer, model, label_encoder):
    """Classify a single text and return its label plus class probabilities.

    Parameters
    ----------
    text : str
        One input text (the ``.item()`` call below requires exactly one row
        of predictions).
    tokenizer : callable
        Called with ``return_tensors="pt"``; must return a mapping of tensors.
    model : torch.nn.Module
        Called with the tokenizer output as keyword arguments; its result
        must expose ``.logits``.
    label_encoder : object
        Provides ``inverse_transform`` to map a class index back to a label.

    Returns
    -------
    tuple
        ``(label, probabilities)`` where ``probabilities`` is the softmax of
        the logits over the last dimension, returned on the CPU.
    """
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length")

    # Tokenizer output is created on the CPU, while the model may live on an
    # accelerator after training; move the inputs to wherever the model is.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = {name: tensor.to(first_param.device) for name, tensor in inputs.items()}

    # Dropout must be disabled for inference, otherwise repeated calls on the
    # same text return different probabilities. The previous mode is restored
    # afterwards so the caller's model state is left untouched.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    finally:
        model.train(was_training)

    predicted_label = torch.argmax(predictions, dim=1).item()
    return label_encoder.inverse_transform([predicted_label])[0], predictions.cpu()

In [23]:
# Sample review with mixed sentiment, used to try out predict().
user_input = "The shoes was really nice, but the looks are just okay"

In [25]:
# Classify the sample review with the reloaded artifacts.
predicted_label, predictions = predict(
    text=user_input,
    tokenizer=tokenizer,
    model=model,
    label_encoder=label_encoder,
)

In [27]:
# Display the decoded label that predict() returned for the sample input.
predicted_label

'Positive'

In [29]:
# Display the softmax probabilities that predict() returned alongside the label.
predictions

tensor([[0.0052, 0.0011, 0.9937]])