In [1]:
import os
# Process-wide switches, set before the ML libraries are imported:
#   WANDB_DISABLED       - keeps the Weights & Biases integration off.
#   CUDA_LAUNCH_BLOCKING - makes CUDA kernel launches synchronous, so a
#                          device-side error is reported at the failing call.
os.environ.update(
    {
        "WANDB_DISABLED": "true",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

In [2]:
import torch
from transformers import (
    BertTokenizer, 
    BertForSequenceClassification, 
    DataCollatorWithPadding,
    TrainingArguments, 
    Trainer
)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Report the CUDA environment. The device queries raise when no GPU is
# available, so they are only issued after the availability check passes;
# this keeps the cell runnable on CPU-only machines.
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    # Describe the device that is actually current instead of assuming index 0.
    current_index = torch.cuda.current_device()
    print(f"CUDA device: {current_index}")
    print(f"Device name: {torch.cuda.get_device_name(current_index)}")

CUDA available: True
CUDA device: 0
Device name: NVIDIA GeForce RTX 3070 Ti Laptop GPU


In [23]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint so
# that the vocabulary matches the embedding table. The classification head is
# created with three output classes.
checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Move the model's parameters to the chosen device and confirm where they live.
model = model.to(device)
print(f"Model device: {next(model.parameters()).device}")

Model device: cuda:0


In [None]:
# Load the training CSV, drop every row that has a missing value in any
# column, and keep the first 1000 remaining rows for a quick experiment.
#
# The chain builds a fresh frame on each run (no in-place mutation), and the
# final .copy() makes `df` independent of the intermediate frame, so adding
# columns to it later does not write into a slice (SettingWithCopyWarning).
df = (
    pd.read_csv("input/train.csv", encoding='ISO-8859-1')
    .dropna()
    .iloc[:1000]
    .copy()
)

In [26]:
# Class distribution of the `sentiment` column in the loaded subset.
df['sentiment'].value_counts()

sentiment
neutral     390
positive    315
negative    295
Name: count, dtype: int64

In [27]:
# Map the string sentiment labels to integer class ids. `le` is kept so the
# ids can be mapped back to label names at prediction time.
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

# Train-test split: 80% train / 20% held out, with a fixed seed.
texts = df['text'].tolist()
encoded_labels = df['sentiment_encoded'].tolist()
x_train, x_test, y_train, y_test = train_test_split(
    texts, encoded_labels, test_size=0.2, random_state=42
)

In [28]:
# Both splits are tokenized with the same options: truncate over-long inputs,
# pad, and return PyTorch tensors.
tokenize_options = dict(truncation=True, padding=True, return_tensors="pt")
train_encodings = tokenizer(x_train, **tokenize_options)
test_encodings = tokenizer(x_test, **tokenize_options)

In [29]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Items are returned exactly as stored (CPU tensors when the encodings are
    on the CPU). Device placement is deliberately left to the training loop:
    moving tensors inside ``__getitem__`` ties the dataset to a notebook-level
    ``device`` global, performs one host-to-device copy per sample, and does
    not work with DataLoader worker processes or memory pinning.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an
            indexable whose first dimension is the example index.
        labels: Sequence of integer class ids, one per example.

    Raises:
        ValueError: If any encoding field has a different number of examples
            than ``labels``.
    """

    def __init__(self, encodings, labels):
        for key, val in encodings.items():
            if len(val) != len(labels):
                raise ValueError(
                    f"Encoding field '{key}' has {len(val)} examples "
                    f"but {len(labels)} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a dict of tensors plus ``labels``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        # Scalar long tensor: the dtype expected for classification targets.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, defined by the label sequence."""
        return len(self.labels)

In [30]:
# Wrap each split's encodings and labels in a dataset object. The held-out
# test split serves as the validation set during training.
train_dataset = SentimentDataset(encodings=train_encodings, labels=y_train)
val_dataset = SentimentDataset(encodings=test_encodings, labels=y_test)

In [31]:
# Sanity check: fetch the first training example and show each field's shape.
sample = train_dataset[0]
print("\nSample batch:")
field_shapes = {}
for field_name, tensor in sample.items():
    field_shapes[field_name] = tensor.shape
print(field_shapes)


Sample batch:
{'input_ids': torch.Size([59]), 'token_type_ids': torch.Size([59]), 'attention_mask': torch.Size([59]), 'labels': torch.Size([])}


In [32]:
def compute_metrics(p):
    """Compute classification accuracy for one evaluation pass.

    Args:
        p: A two-item iterable ``(scores, labels)``. ``scores`` holds one row
            of per-class scores per example; ``labels`` holds the reference
            class ids.

    Returns:
        dict: ``{"accuracy": <fraction of examples predicted correctly>}``.
    """
    scores, references = p
    # The predicted class is the index of the highest score in each row.
    predicted_classes = np.argmax(scores, axis=1)
    accuracy = accuracy_score(references, predicted_classes)
    return {"accuracy": accuracy}


In [42]:
# Training configuration: 5 epochs, batch size 16 for training and 8 for
# evaluation, a loss log line every 10 steps, a checkpoint every 100 steps.
args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=100,
    # NOTE(review): no evaluation strategy is set in this call, so eval_steps
    # presumably has no effect and evaluation only happens when
    # trainer.evaluate() is called explicitly - confirm against the installed
    # transformers version before relying on periodic evaluation.
    eval_steps=100,
    # Turn off external logging integrations explicitly. Transformers warns
    # that the WANDB_DISABLED environment variable is deprecated and points to
    # report_to as the supported switch.
    report_to="none",
)

# The Trainer owns the training/evaluation loops; accuracy is reported through
# compute_metrics whenever an evaluation runs.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [44]:
# Run fine-tuning. On failure, print a short readable message and re-raise so
# the full traceback is still shown and later cells do not run on a
# half-trained model.
try:
    trainer.train()
except Exception as e:
    print(f"Error during training: {e}")
    raise

Step,Training Loss
10,0.0076
20,0.0028
30,0.2761
40,0.0385
50,0.1257
60,0.042
70,0.0703
80,0.0446
90,0.0016
100,0.0757


In [45]:
# Evaluate on the dataset given to the Trainer as eval_dataset; the returned
# dict includes the loss and the accuracy produced by compute_metrics.
trainer.evaluate()

{'eval_loss': 1.8935626745224,
 'eval_accuracy': 0.75,
 'eval_runtime': 0.4392,
 'eval_samples_per_second': 455.372,
 'eval_steps_per_second': 56.922,
 'epoch': 5.0}

In [46]:
# Save the trained model under the "CustomModel" path so it can be reloaded
# later with from_pretrained.
trainer.save_model('CustomModel')

In [47]:
# Reload the fine-tuned classifier from the "CustomModel" directory.
model_2 = BertForSequenceClassification.from_pretrained("CustomModel")
# Place it on the notebook's selected `device` rather than a hard-coded
# 'cuda', so this cell also works when the notebook runs on a CPU-only host.
model_2.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [49]:
# Classify a single example sentence with the reloaded model.
text = "just woke up, no school today, we are free"

# Put the inputs on whatever device the model lives on instead of assuming
# 'cuda', so the cell works on both CPU and GPU.
inputs = tokenizer(text, padding=True, truncation=True, return_tensors='pt').to(model_2.device)

# Inference only: make sure dropout is disabled and skip gradient tracking,
# which saves memory and avoids building an autograd graph.
model_2.eval()
with torch.no_grad():
    outputs = model_2(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)

predictions = predictions.cpu().numpy()
predicted_index = predictions.argmax(axis=1)

# Map the predicted class id back to its original string label.
predicted_label = le.inverse_transform(predicted_index)[0]
print("Predicted sentiment:", predicted_label)

Predicted sentiment: neutral
