In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import re

# 1. Load the raw tweets. The file has no header row, so the six column
#    names are attached right after reading.
df = pd.read_csv("twitter.csv", encoding="latin-1", header=None).set_axis(
    ["sentiment", "id", "date", "query", "user", "text"], axis=1
)

# Binary target: sentiment code 4 (positive) becomes 1, every other code becomes 0
df["label"] = (df["sentiment"] == 4).astype("int64")

# Define a preprocessing function for tweets
def preprocess_text(text):
    """Normalise a raw tweet before tokenisation.

    Steps, applied in order: remove ``http...`` URLs, remove ``@mentions``,
    strip ``#`` characters (the hashtag word itself is kept), collapse runs
    of whitespace into a single space, and trim both ends.

    Args:
        text: The tweet. Missing values (``None`` or float NaN, which
            ``pd.read_csv`` produces for cells such as ``NA`` or ``null``)
            are treated as an empty tweet instead of raising ``TypeError``;
            any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned tweet. It can be an empty string when the input
        contained nothing but URLs, mentions, ``#`` and whitespace.
    """
    if not isinstance(text, str):
        # NaN is the only value that compares unequal to itself.
        if text is None or text != text:
            return ""
        text = str(text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtag symbols (keep the text)
    text = text.replace('#', '')
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean every tweet and keep only the two columns the model needs
df = df.assign(text=df["text"].apply(preprocess_text))[["text", "label"]]

# A tweet made up solely of URLs / mentions / '#' is an empty string after
# cleaning: it still carries a label but no text, so drop it rather than
# train and validate on empty inputs.
df = df[df["text"].ne("")].reset_index(drop=True)

# Convert to numpy arrays
sentences = df["text"].values
labels = df["label"].values

print(df.head())

# Hold out 20% of the examples for validation; the fixed seed keeps the
# split reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    sentences, labels, test_size=0.2, random_state=42
)

                                                text  label
0  - Awww, that's a bummer. You shoulda got David...      0
1  is upset that he can't update his Facebook by ...      0
2  I dived many times for the ball. Managed to sa...      0
3     my whole body feels itchy and like its on fire      0
4  no, it's not behaving at all. i'm mad. why am ...      0


In [4]:
# 2. Tokenize the Data using BertTokenizer
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint -- the
# same checkpoint name the classifier is initialised from later in the notebook.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(texts, max_len=128, chunk_size=1024):
    """Tokenize texts into input IDs and attention masks.

    Uses the module-level ``tokenizer``. Texts are encoded through the
    tokenizer's ``__call__`` API (``encode_plus`` is deprecated) in chunks of
    ``chunk_size``, so the tokenizer is invoked once per chunk instead of
    once per text.

    Args:
        texts: Iterable of strings (list, NumPy array, ...).
        max_len: Every sequence is truncated or padded to exactly this many
            tokens, special tokens included.
        chunk_size: Number of texts handed to the tokenizer per call.

    Returns:
        tuple: ``(input_ids, attention_masks)``, two tensors of shape
        ``(len(texts), max_len)``. For empty input both are empty ``long``
        tensors of shape ``(0, max_len)``.
    """
    texts = list(texts)
    if not texts:
        # torch.cat cannot concatenate an empty list, so return correctly
        # shaped empty tensors instead.
        return (
            torch.empty((0, max_len), dtype=torch.long),
            torch.empty((0, max_len), dtype=torch.long),
        )

    id_chunks = []
    mask_chunks = []
    for start in range(0, len(texts), chunk_size):
        encoded = tokenizer(
            texts[start:start + chunk_size],
            add_special_tokens=True,
            max_length=max_len,
            padding="max_length",     # ensure fixed sequence length
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )
        id_chunks.append(encoded["input_ids"])
        mask_chunks.append(encoded["attention_mask"])

    # Stitch the per-chunk (n, max_len) tensors back together along the batch axis
    input_ids = torch.cat(id_chunks, dim=0)
    attention_masks = torch.cat(mask_chunks, dim=0)
    return input_ids, attention_masks

# Encode the training split first, then the validation split
train_inputs, train_masks = tokenize_data(texts=train_texts)
val_inputs, val_masks = tokenize_data(texts=val_texts)

# Turn the NumPy label arrays into tensors
train_labels, val_labels = (
    torch.tensor(label_array) for label_array in (train_labels, val_labels)
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
# 3. Wrap the encoded tensors in datasets and serve them in mini-batches
batch_size = 16

train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)
# Validation batches follow dataset order
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(val_dataset),
)

In [6]:
# 4. Initialize the BERT Model for Sequence Classification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up the optimizer and scheduler.
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# weight_decay=0.0 is passed explicitly because the transformers version
# defaulted to 0.0 whereas the PyTorch version defaults to 0.01.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 10
# The training loop steps the scheduler once per training batch, so the
# schedule spans every batch of every epoch; no warmup steps are used.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0, num_training_steps=total_steps
)

# Helper function to calculate accuracy
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of integer class ids; flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).ravel()
    true_classes = labels.ravel()
    hits = (predicted_classes == true_classes).sum()
    return hits / true_classes.size

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# 5. Training and Evaluation Loop
import time, datetime

# NOTE(review): in the recorded run of this notebook validation accuracy
# peaked at epoch 2 while training loss kept falling, which suggests the
# later epochs mostly overfit -- consider fewer epochs or keeping the best
# checkpoint.
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")

    # ---- Training: one optimizer + scheduler step per batch ----
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
        model.zero_grad()
        outputs = model(
            b_input_ids,
            token_type_ids=None,
            attention_mask=b_input_mask,
            labels=b_labels
        )
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        # Cap the global gradient norm at 1.0 before the parameter update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss:.4f}")

    # ---- Evaluation ----
    # Accuracy is accumulated as correct / total over *examples*. Averaging
    # per-batch accuracies would give a smaller final batch the same weight
    # as a full one and would divide by zero on an empty loader.
    model.eval()
    n_correct = 0
    n_examples = 0
    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            outputs = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask
            )
            predictions = outputs.logits.argmax(dim=1)
            n_correct += (predictions == b_labels).sum().item()
            n_examples += b_labels.size(0)
    eval_accuracy = n_correct / n_examples if n_examples else float("nan")
    print(f"Validation Accuracy: {eval_accuracy:.4f}\n")

Epoch 1/10
Average training loss: 0.4437
Validation Accuracy: 0.8297

Epoch 2/10
Average training loss: 0.2876
Validation Accuracy: 0.8350

Epoch 3/10
Average training loss: 0.1766
Validation Accuracy: 0.8235

Epoch 4/10
Average training loss: 0.1018
Validation Accuracy: 0.8290

Epoch 5/10
Average training loss: 0.0646
Validation Accuracy: 0.8190

Epoch 6/10
Average training loss: 0.0384
Validation Accuracy: 0.8225

Epoch 7/10
Average training loss: 0.0253
Validation Accuracy: 0.8247

Epoch 8/10
Average training loss: 0.0211
Validation Accuracy: 0.8257

Epoch 9/10
Average training loss: 0.0162
Validation Accuracy: 0.8253

Epoch 10/10
Average training loss: 0.0097
Validation Accuracy: 0.8265



In [8]:
# 6. Save the Model and Tokenizer
# Both artifacts are written to the same directory so that the next cell can
# reload them together via from_pretrained(save_directory).
save_directory = "./saved_bert_model"
model.save_pretrained(save_directory)
# Keep this as the last expression of the cell: its return value (the paths of
# the tokenizer files that were written) is what the notebook displays.
tokenizer.save_pretrained(save_directory)

('./saved_bert_model/tokenizer_config.json',
 './saved_bert_model/special_tokens_map.json',
 './saved_bert_model/vocab.txt',
 './saved_bert_model/added_tokens.json')

In [11]:
# 7. Restore the saved tokenizer and classifier from disk for custom inference
tokenizer_loaded = BertTokenizer.from_pretrained(save_directory)
# Load the weights and move the model to the active device in one expression
model_loaded = BertForSequenceClassification.from_pretrained(save_directory).to(device)

# Custom inference function
def predict_sentiment(text):
    """Classify a single piece of text with the reloaded model.

    The text is first passed through ``preprocess_text`` so that inference
    sees the same cleaning (URLs, mentions and ``#`` removed, whitespace
    collapsed) that was applied to the training tweets. It is then encoded
    with the tokenizer's ``__call__`` API (``encode_plus`` is deprecated)
    using the same fixed length of 128 tokens as training.

    Args:
        text: Raw input string.

    Returns:
        int: Index of the highest-scoring logit. The notebook reports 1 as
        "Positive" and any other value as "Negative".
    """
    model_loaded.eval()
    cleaned = preprocess_text(text)
    encoded = tokenizer_loaded(
        cleaned,
        add_special_tokens=True,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_attention_mask=True,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"].to(device)
    attention_mask = encoded["attention_mask"].to(device)
    # No gradients are needed for a forward-only prediction
    with torch.no_grad():
        outputs = model_loaded(input_ids, token_type_ids=None, attention_mask=attention_mask)
    return torch.argmax(outputs.logits, dim=1).item()

# A handful of hand-written inputs for a quick sanity check
sample_sentences = [
    "I love using BERT!",
    "I had a bad day at work!",
    "I am excited to meet my family!",
    "The food was awesome.",
]

# Predict each sentence and report it; class 1 is shown as Positive,
# anything else as Negative
for sentence in sample_sentences:
    prediction = predict_sentiment(sentence)
    if prediction == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    print(f"Sentence: {sentence}\nPredicted Sentiment: {sentiment}\n")

Sentence: I love using BERT!
Predicted Sentiment: Positive

Sentence: I had a bad day at work!
Predicted Sentiment: Negative

Sentence: I am excited to meet my family!
Predicted Sentiment: Positive

Sentence: The food was awesome.
Predicted Sentiment: Positive

