# Sentiment Analysis using BERT (IMDb Sample)

In [2]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML

## 1. Setup and Data Preparation

In [3]:
def load_data():
    data = {
        'text': [
            "This movie was fantastic and thrilling!",
            "Absolutely boring and predictable.",
            "A masterpiece of storytelling.",
            "Terrible acting and poor script."
        ],
        'label': [1, 0, 1, 0]  # 1: positive, 0: negative
    }
    df = pd.DataFrame(data)
    return df

## 2. BERT Implementation

In [4]:
def preprocess_data(df, tokenizer, max_len=128):
    input_ids = []
    attention_masks = []
    
    for text in df['text']:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(df['label'].values)
    
    return input_ids, attention_masks, labels

def train_model(model, train_dataloader, epochs=3):
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_dataloader)}")

## 3. Analysis and Evaluation

In [5]:
def evaluate_model(model, test_dataloader):
    model.eval()
    predictions, true_labels = [], []
    
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.tolist())
            true_labels.extend(labels.tolist())
    
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=['Negative', 'Positive'])
    return accuracy, report

## 4. Visualization

In [6]:
def visualize_attention(tokenizer, model, text, max_len=128):
    inputs = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        max_length=max_len,
        truncation=True,
        padding=True
    )
    outputs = model(**inputs, output_attentions=True)
    attention = outputs.attentions[-1][0, 0].detach().numpy()  # Last layer, first head
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    plt.figure(figsize=(10, 8))
    sns.heatmap(attention, xticklabels=tokens, yticklabels=tokens, cmap='viridis')
    plt.title("BERT Attention Weights")
    plt.savefig('attention_heatmap.png')
    plt.close()


## 5. Main Execution

In [7]:
def main():
    # Initialize BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
    
    # Load and preprocess data
    df = load_data()
    input_ids, attention_masks, labels = preprocess_data(df, tokenizer)
    
    # Create DataLoader
    dataset = TensorDataset(input_ids, attention_masks, labels)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
    
    train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
    test_dataloader = DataLoader(test_dataset, batch_size=2)
    
    # Train and evaluate
    print("Training BERT model...")
    train_model(model, train_dataloader)
    accuracy, report = evaluate_model(model, test_dataloader)
    
    # Display results
    display(HTML(f"<h3>Model Accuracy: {accuracy:.4f}</h3>"))
    display(HTML("<h3>Classification Report:</h3><pre>" + report + "</pre>"))
    
    # Visualize attention for a sample text
    sample_text = "This movie was fantastic and thrilling!"
    visualize_attention(tokenizer, model, sample_text)
    display(HTML("<h3>Attention Heatmap Saved as 'attention_heatmap.png'</h3>"))

## 6. Research Questions and Analysis

In [8]:
def research_questions():
    questions = """
    ### Research Questions:
    1. **Contextual Understanding**: How effectively does BERT capture contextual relationships in short versus long text inputs?
    2. **Creativity**: Can BERT generate creative text outputs, or is it primarily suited for classification tasks?
    3. **Domain Adaptability**: How well does BERT adapt to domain-specific tasks with limited fine-tuning?
    4. **Limitations**: What are the computational and data requirements for BERT to perform optimally?
    
    ### Observations:
    - BERT excels in understanding context due to its bidirectional architecture, as seen in the attention heatmap.
    - It is less suited for creative text generation compared to models like GPT-3.
    - Fine-tuning on small datasets can lead to overfitting, requiring careful hyperparameter tuning.
    - High computational cost limits accessibility for low-resource environments.
    """
    display(HTML(questions))

## 7. Conclusion

In [9]:
def conclusion():
    insights = """
    ### Conclusion:
    This project demonstrated BERT's capabilities in sentiment analysis, highlighting its strength in contextual understanding through bidirectional processing. The attention visualization revealed how BERT focuses on relevant tokens, enhancing interpretability. However, its computational demands and limited creative generation capabilities suggest opportunities for improvement in efficiency and versatility. Potential applications include automated sentiment analysis in social media or customer feedback systems. Future work could explore lightweight BERT variants or hybrid models combining BERT's contextual strengths with generative capabilities.
    
    ### Ethical Considerations:
    - **Bias**: BERT may inherit biases from training data, requiring careful monitoring in sensitive applications.
    - **Accessibility**: High resource demands limit its use in low-resource settings, raising equity concerns.
    - **Transparency**: Attention visualizations improve interpretability, aligning with ethical AI practices.
    """
    display(HTML(insights))

In [10]:
if __name__ == "__main__":
    main()
    research_questions()
    conclusion()

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Training BERT model...
Epoch 1, Loss: 0.6280363649129868
Epoch 2, Loss: 0.5016179233789444
Epoch 3, Loss: 0.4960475564002991


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


