In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m57.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m110.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m79.2 MB/s[0m eta [36m0:00:

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.metrics import accuracy_score
import numpy as np
import random
import time
import datetime
from torch.cuda.amp import autocast, GradScaler



In [3]:
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)


In [4]:
# Load your pandas DataFrame
data = pd.read_csv("/content/sentiment_data.csv")  # Replace with your actual file path

# Define labels
labels = data['Sentiment'].apply(lambda x: 0 if x <= 0.5 else 1)
#print(labels)

# Tokenize and encode the sentences
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
max_length = 80  # You can adjust this based on your specific needs

# Tokenize and encode sentences
input_ids = []
attention_masks = []
# Define a scaler for gradient scaling
scaler = GradScaler()

for sentence in data['Sentence']:
    encoded_dict = tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# Convert lists to tensors
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels.values)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# Split the dataset
train_inputs, test_inputs, train_labels, test_labels, train_masks, test_masks = train_test_split(
    input_ids, labels, attention_masks, random_state=42, test_size=0.1)

# Further split the test set into validation and test sets
val_inputs, test_inputs, val_labels, test_labels, val_masks, test_masks = train_test_split(
    test_inputs, test_labels, test_masks, random_state=42, test_size=0.5)


In [6]:
batch_size = 32

# Create DataLoader for training data
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Create DataLoader for validation data
val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# Create DataLoader for test data
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [7]:
# Load pre-trained BERT model
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,  # Three classes: positive, negative, neutral
    output_attentions=False,
    output_hidden_states=False
)

# Set up GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
epochs = 10
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_dataloader:
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        model.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1} - Average Training Loss: {avg_train_loss:.4f}")

# Evaluation loop
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions, true_labels = [], []

for batch in val_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)
    tmp_eval_accuracy = accuracy_score(label_ids, np.argmax(logits, axis=1))

    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print(f"Validation Accuracy: {eval_accuracy / nb_eval_steps:.4f}")

# Classification report on the validation set
predicted_labels = np.argmax(np.concatenate(predictions, axis=0), axis=1)
true_labels = np.concatenate(true_labels, axis=0)
report = classification_report(true_labels, predicted_labels, target_names=["negative", "positive"])

print(report)


Epoch 1 - Average Training Loss: 0.4209
Epoch 2 - Average Training Loss: 0.2073
Epoch 3 - Average Training Loss: 0.1029
Epoch 4 - Average Training Loss: 0.0622
Epoch 5 - Average Training Loss: 0.0305
Epoch 6 - Average Training Loss: 0.0170
Epoch 7 - Average Training Loss: 0.0109
Epoch 8 - Average Training Loss: 0.0062
Epoch 9 - Average Training Loss: 0.0042
Epoch 10 - Average Training Loss: 0.0024
Validation Accuracy: 0.9281
              precision    recall  f1-score   support

    negative       0.95      0.94      0.94       203
    positive       0.87      0.88      0.87        89

    accuracy                           0.92       292
   macro avg       0.91      0.91      0.91       292
weighted avg       0.92      0.92      0.92       292



In [9]:
# Evaluation on the test set
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
predictions, true_labels = [], []

for batch in test_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

    with torch.no_grad():
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()

    predictions.append(logits)
    true_labels.append(label_ids)
    tmp_eval_accuracy = accuracy_score(label_ids, np.argmax(logits, axis=1))

    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

print(f"Test Accuracy: {eval_accuracy / nb_eval_steps:.4f}")



Test Accuracy: 0.9250


In [13]:
true_labels = np.array(true_labels)

# Classification report on the test set
predicted_labels = np.argmax(np.concatenate(predictions, axis=0), axis=1)
report = classification_report(true_labels, predicted_labels, target_names=["negative", "positive"])

print(report)

# Optionally, you can load the model later using:
# model = BertForSequenceClassification.from_pretrained("bert_sentiment_model")

# You can also use the model for inference on new data
# For example:
# new_sentence = "This is a positive sentence."
# encoded_dict = tokenizer.encode_plus(
#     new_sentence,
#     add_special_tokens=True,
#     max_length=max_length,
#     padding='max_length',
#     return_attention_mask=True,
#     return_tensors='pt',
#     truncation=True
# )
# input_ids = encoded_dict['input_ids'].to(device)
# attention_mask = encoded_dict['attention_mask'].to(device)
# with torch.no_grad():
#     outputs = model(input_ids=input_ids, attention_mask=attention_mask)
# logits = outputs.logits
# predicted_class = torch.argmax(logits, dim=1).cpu().numpy()[0]
# print(f"Predicted Class: {predicted_class}")

# You can adjust the model architecture, hyperparameters, and other settings as needed.


              precision    recall  f1-score   support

    negative       0.94      0.93      0.94       199
    positive       0.86      0.88      0.87        94

    accuracy                           0.92       293
   macro avg       0.90      0.91      0.91       293
weighted avg       0.92      0.92      0.92       293

