In [1]:
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import re
import random
from nltk.corpus import wordnet
from itertools import chain
import pandas as pd

In [2]:
# Clean tweet by removing URLs and special characters
def clean_tweet(text):
    text = re.sub(r'http\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters (keep letters, numbers, and spaces)
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces and strip leading/trailing spaces
    return text

In [3]:
# Synonym replacement for sarcasm augmentation
def get_synonyms(word):
    synonyms = wordnet.synsets(word)
    lemmas = set(chain.from_iterable([word.lemma_names() for word in synonyms]))
    return lemmas

In [4]:
def synonym_replacement(sentence, n=1):
    words = sentence.split()
    new_sentence = words.copy()
    random_words = list(set([word for word in words if word not in ('CLS', 'SEP')]))
    random.shuffle(random_words)

    num_replacements = min(n, len(random_words))
    for random_word in random_words[:num_replacements]:
        synonyms = get_synonyms(random_word)
        if synonyms:
            synonym = random.choice(list(synonyms))
            new_sentence = [synonym if word == random_word else word for word in new_sentence]
    return ' '.join(new_sentence)

In [5]:
# Load and clean the data
file_path = './Tweets.csv'
df = pd.read_csv(file_path)
df['text'] = df['text'].apply(clean_tweet)  # Clean text

In [6]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [7]:
# Map sentiments to numerical labels
sentiment_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
df['label'] = df['sentiment'].map(sentiment_mapping)

In [8]:
class TweetDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len, augment=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.augment = augment

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        # Apply sarcasm-specific augmentation (synonym replacement)
        if self.augment and random.uniform(0, 1) > 0.5:
            text = synonym_replacement(text)

        # Tokenize the text
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [9]:
MAX_LEN = 128
# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(),
    df['label'].tolist(),
    test_size=0.2,
    random_state=42
)

In [10]:
# Create dataset objects with sarcasm augmentation in the training set
train_dataset = TweetDataset(train_texts, train_labels, tokenizer, MAX_LEN, augment=True)
val_dataset = TweetDataset(val_texts, val_labels, tokenizer, MAX_LEN, augment=False)

In [11]:
# Create DataLoaders
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [12]:
# Model setup
from transformers import BertForSequenceClassification, AdamW
import torch.nn as nn

In [13]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [14]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)



In [15]:
def train_epoch(model, data_loader, criterion, optimizer, device, epoch):
    model.train()  # Set model to training mode
    total_loss = 0

    for batch_index, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Accumulate total loss
        total_loss += loss.item()

        # Get the predicted labels
        _, preds = torch.max(outputs.logits, dim=1)

        # Log detailed batch information
        print(f'Epoch {epoch + 1}, Batch {batch_index + 1}/{len(data_loader)}:')
        print(f'Input IDs:\n{input_ids}')
        print(f'Attention Mask:\n{attention_mask}')
        print(f'Labels:\n{labels}')
        print(f'Predicted Labels:\n{preds}')
        print(f'Loss: {loss.item():.4f}')
        print('-' * 50)  # Add a separator for clarity between batches

    avg_loss = total_loss / len(data_loader)
    return avg_loss


In [16]:
def eval_model(model, data_loader, criterion, device):
    model.eval()  # Set model to evaluation mode
    total_loss = 0
    correct_predictions = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            # Get the predictions
            _, preds = torch.max(outputs.logits, dim=1)
            correct_predictions += torch.sum(preds == labels)

    avg_loss = total_loss / len(data_loader)
    accuracy = correct_predictions.double() / len(data_loader.dataset)

    return avg_loss, accuracy


In [17]:
EPOCHS = 3  # Number of epochs
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    
    # Train model with detailed batch information
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device, epoch)
    print(f'Training loss: {train_loss}')

    # Evaluate model
    val_loss, val_accuracy = eval_model(model, val_loader, criterion, device)
    print(f'Validation loss: {val_loss}, Validation accuracy: {val_accuracy}')
    print('=' * 50)  # Add separator between epochs


Epoch 1/3
Epoch 1, Batch 1/1374:
Input IDs:
tensor([[  101,  2003,  2012,  ...,     0,     0,     0],
        [  101,  6457,  6251,  ...,     0,     0,     0],
        [  101,  2006,  1996,  ...,     0,     0,     0],
        ...,
        [  101, 10680, 14215,  ...,     0,     0,     0],
        [  101, 11601,  2239,  ...,     0,     0,     0],
        [  101,  2085,  2008,  ...,     0,     0,     0]], device='cuda:0')
Attention Mask:
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')
Labels:
tensor([1, 1, 1, 1, 2, 2, 1, 1, 0, 1, 2, 0, 2, 2, 2, 2], device='cuda:0')
Predicted Labels:
tensor([2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2], device='cuda:0')
Loss: 0.9900
--------------------------------------------------
Epoch 1, Batch 2/1374:
Input IDs:
tensor([[  101,  8840,  2140,  ...,     0,     0,     

In [18]:
# Save the fine-tuned BERT model
model.save_pretrained("bert_sentiment_model")

In [19]:
def preprocess_text(text, tokenizer, max_len=128):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    
    return encoding['input_ids'], encoding['attention_mask']

In [20]:
def predict_sentiment(text, model, tokenizer, device):
    model.eval()  # Set model to evaluation mode

    # Preprocess the text
    input_ids, attention_mask = preprocess_text(text, tokenizer)
    
    # Move tensors to the same device as the model
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    
    # Get the prediction (the class with the highest logit score)
    _, prediction = torch.max(outputs.logits, dim=1)
    
    # Convert prediction to CPU and return it
    return prediction.cpu().item()

In [21]:
# Load the trained model
model = BertForSequenceClassification.from_pretrained("bert_sentiment_model")
model.to(device)  # Move model to GPU/CPU

# Load the tokenizer (same tokenizer used during training)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Test with a sample tweet
sample_text = "I love using BERT for NLP tasks!"
predicted_label = predict_sentiment(sample_text, model, tokenizer, device)

# Map the predicted label back to sentiment
label_mapping = {2: 'positive', 1: 'neutral', 0: 'negative'}
print(f'Sentiment: {label_mapping[predicted_label]}')

Sentiment: positive


In [22]:
def predict_sentiments(texts, model, tokenizer, device):
    model.eval()  # Set model to evaluation mode
    predictions = []

    for text in texts:
        input_ids, attention_mask = preprocess_text(text, tokenizer)
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, prediction = torch.max(outputs.logits, dim=1)
            predictions.append(prediction.cpu().item())

    return predictions

# Test with multiple sample tweets
sample_texts = [
    "BERT is amazing for sentiment analysis!",
    "I'm not sure about this.",
    "I really hate it when things don't work."
]
predicted_labels = predict_sentiments(sample_texts, model, tokenizer, device)

# Map predicted labels to sentiments
predicted_sentiments = [label_mapping[label] for label in predicted_labels]
for text, sentiment in zip(sample_texts, predicted_sentiments):
    print(f'Tweet: {text}\nSentiment: {sentiment}\n')

Tweet: BERT is amazing for sentiment analysis!
Sentiment: positive

Tweet: I'm not sure about this.
Sentiment: neutral

Tweet: I really hate it when things don't work.
Sentiment: negative



In [23]:
# Load the trained model
model = BertForSequenceClassification.from_pretrained("bert_sentiment_model")
model.to(device)  # Move model to GPU/CPU

# Load the tokenizer (same tokenizer used during training)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Test with a sample tweet
sample_text = "Oh great, another rainy day! Just what I needed to brighten up my week. I was running out of excuses to stay indoors and do absolutely nothing! Who needs sunshine anyway?"
predicted_label = predict_sentiment(sample_text, model, tokenizer, device)

# Map the predicted label back to sentiment
label_mapping = {2: 'positive', 1: 'neutral', 0: 'negative'}
print(f'Sentiment: {label_mapping[predicted_label]}')

Sentiment: neutral


In [24]:
# Load the trained model
model = BertForSequenceClassification.from_pretrained("bert_sentiment_model")
model.to(device)  # Move model to GPU/CPU

# Load the tokenizer (same tokenizer used during training)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Test with a sample tweet
sample_text = "Oh great, another Monday! Just what I needed to brighten my day. Can’t wait to dive into all this work. Sigh."

# Map the predicted label back to sentiment
label_mapping = {2: 'positive', 1: 'neutral', 0: 'negative'}
print(f'Sentiment: {label_mapping[predicted_label]}')

Sentiment: neutral
