In [27]:
from transformers import BertTokenizer
import pandas as pd
import matplotlib.pyplot as plt

In [5]:
# Read the cleaned Sentiment140 CSV into a DataFrame and preview it
# (the bare `data` expression renders the frame as the cell output).
data = pd.read_csv(filepath_or_buffer='cleaned_sentiment140.csv')
data

Unnamed: 0,polarity,text
0,0,awww thats a bummer you shoulda got david car...
1,0,is upset that he cant update his facebook by t...
2,0,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire
4,0,no its not behaving at all im mad why am i her...
...,...,...
1599995,1,just woke up having no school is the best feel...
1599996,1,thewdbcom very cool to hear old walt interviews
1599997,1,are you ready for your mojo makeover ask me fo...
1599998,1,happy th birthday to my boo of alll time tupac...


In [13]:
# Make every entry of the tweet column a plain string.
# Missing values are replaced with '' first: a bare astype(str) would turn
# NaN into the literal text 'nan', which would then be treated as a real word.
data['text'] = data['text'].fillna('').astype(str)

In [14]:
# Tokenizer belonging to the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode all tweets in a single call: sequences are padded, truncated to at
# most 128 tokens, and returned as PyTorch tensors.
encoded_inputs = tokenizer(
    data['text'].tolist(),
    max_length=128,
    truncation=True,
    padding=True,
    return_tensors='pt',
)

# Keep the token ids and the padding masks as separate tensors
input_ids, attention_masks = (
    encoded_inputs[key] for key in ('input_ids', 'attention_mask')
)

In [15]:
import torch

# Convert the 'polarity' column into a tensor holding one label per row.
labels = torch.tensor(data['polarity'].to_numpy())

In [16]:
from torch.utils.data import TensorDataset

# Bundle the three tensors so that indexing the dataset yields one
# (input_ids, attention_mask, label) triple per example.
dataset = TensorDataset(
    input_ids,
    attention_masks,
    labels,
)


In [20]:
from torch.utils.data import DataLoader, RandomSampler

# Number of examples per batch
batch_size = 256

# Batched loader over `dataset`; the RandomSampler supplies the example order.
dataloader = DataLoader(dataset, batch_size=batch_size, sampler=RandomSampler(dataset))

In [21]:
from transformers import BertForSequenceClassification

# Pre-trained 'bert-base-uncased' encoder with a two-class classification head.
# The head's weights are newly initialised (see the warning printed below),
# so the model has to be fine-tuned before its predictions are meaningful.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# from_pretrained() returns the model in evaluation mode (dropout disabled),
# so switch to training mode explicitly for fine-tuning. The previous mode is
# remembered and restored after the loop so later cells see the same state
# they would have seen before this change.
was_training = model.training
model.train()

# Send each batch to whatever device the model's weights currently live on,
# so the loop works unchanged whether or not the model was moved to a GPU.
device = next(model.parameters()).device

# Training loop
epochs = 3
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for batch in dataloader:
        # Each batch is (input_ids, attention_mask, labels), in the order the
        # tensors were passed to TensorDataset. Batch-local names are used on
        # purpose: reusing `input_ids` / `labels` here would overwrite the
        # full-dataset tensors defined in earlier cells.
        batch_input_ids, batch_attention_mask, batch_labels = (
            tensor.to(device) for tensor in batch
        )

        # Zero the gradients before each step
        optimizer.zero_grad()

        # Forward pass (get outputs from the model)
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels,
        )

        # Loss is computed internally when labels are provided
        loss = outputs.loss

        # Backward pass (compute gradients)
        loss.backward()

        # Update weights
        optimizer.step()

        # Print loss for tracking
        print(f"Loss: {loss.item()}")

# Restore the mode the model was in before training started
model.train(was_training)