In [57]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
from torch.optim import AdamW
from transformers import get_scheduler
from sklearn.model_selection import train_test_split
from tqdm.auto import tqdm

In [8]:
# Load the raw CSV. The file has no header row, so column names are supplied
# here; without them the columns would be the integers 0..N-1 and the later
# cells that index the frame by 'label' and 'text' would raise KeyError.
# NOTE(review): the six names assume the Sentiment140 column layout
# (polarity, tweet id, date, query flag, user, text) — confirm against
# data/data.csv before relying on the unused columns.
df = pd.read_csv(
    "data/data.csv",
    encoding="latin1",
    header=None,
    names=["label", "id", "date", "flag", "user", "text"],
)

In [38]:
# Rewrite label value 4 as 1 and leave every other value untouched.
# Presumably this yields 0/1 class ids for the two-label classifier built
# later — TODO confirm the raw file only contains 0 and 4.
df['label'] = df['label'].replace({4: 1})

In [39]:
# Keep a random subset of 10,000 rows; the fixed seed makes the draw repeatable.
# Note that this rebinds `df`, so the full frame is no longer reachable afterwards.
df = df.sample(random_state=42, n=10000)

In [40]:
# Split texts and labels together: 80% for training, 20% held out for
# validation. The fixed random_state keeps the partition reproducible.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    random_state=42,
    test_size=0.2,
)

In [42]:
# Tokenize both splits with the same settings: sequences longer than 128
# tokens are truncated, and padding is enabled so each split can be batched.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encoding, val_encoding = (
    tokenizer(split_texts, max_length=128, padding=True, truncation=True)
    for split_texts in (train_texts, val_texts)
)

In [44]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    `encodings` is any mapping exposing `.items()` whose values can be indexed
    by sample position; `labels` is an indexable sequence of the same length.
    """

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the label sequence as given."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Each encoding field is converted to a tensor for position `idx`, and
        the matching label is added under the 'labels' key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [45]:
# Wrap each split's encodings and labels in a SentimentDataset.
train_dataset = SentimentDataset(encodings=train_encoding, labels=train_labels)
val_dataset = SentimentDataset(encodings=val_encoding, labels=val_labels)

In [46]:
# Batches of 16 for both splits; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
val_loader = DataLoader(dataset=val_dataset, batch_size=16)

In [51]:
# Load pretrained BERT with a two-class classification head, then move it to
# the GPU when one is available and to the CPU otherwise.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [55]:
# One scheduler step is taken per batch, so the total step count equals the
# number of batches in a single pass over the training loader.
num_training_steps = len(train_loader)
optimizer = AdamW(params=model.parameters(), lr=5e-5)
# Linear learning-rate schedule with no warmup steps.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [56]:
# Single pass over the training loader.
model.train()
progress_bar = tqdm(range(num_training_steps))
for batch in train_loader:
    # Move every tensor in the batch onto the same device as the model.
    batch = {name: tensor.to(device) for name, tensor in batch.items()}
    # The batch includes 'labels', so the model output carries a loss.
    loss = model(**batch).loss
    loss.backward()
    # Apply the update, advance the learning-rate schedule, then clear the
    # gradients so they do not accumulate into the next batch.
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update()

100%|████████████████████████████████████████████████████████████████████████████████| 500/500 [02:14<00:00,  3.04it/s]

In [59]:
# Persist the fine-tuned model and its tokenizer into the same directory so
# the pair can be reloaded together by path.
model.save_pretrained("bert_sentiment_model")
# Left as the cell's last expression: the tuple of written tokenizer file
# paths is what the notebook displays.
tokenizer.save_pretrained("bert_sentiment_model")

('bert_sentiment_model\\tokenizer_config.json',
 'bert_sentiment_model\\special_tokens_map.json',
 'bert_sentiment_model\\vocab.txt',
 'bert_sentiment_model\\added_tokens.json')

In [66]:
# Reload the saved model directory through an inference pipeline and score one
# example. Predictions come back with generic label names such as 'LABEL_0'.
classifier = pipeline(task="sentiment-analysis", model="bert_sentiment_model")
classifier("I hate you!")

Device set to use cuda:0


[{'label': 'LABEL_0', 'score': 0.8926213383674622}]