In [None]:
# DS776 Environment Setup & Package Update
# Configures storage paths for proper cleanup/sync, then updates introdl if needed
# If this cell fails, see Lessons/Course_Tools/AUTO_UPDATE_SYSTEM.md for help
%run ../../Lessons/Course_Tools/auto_update_introdl.py

# Homework 10: Named Entity Recognition

**Name:** [Your Name Here]  
**Total Points: 40**

## Submission Checklist
- [ ] All code cells executed with output saved
- [ ] All questions answered in markdown cells
- [ ] Used `DATA_PATH` and `MODELS_PATH` variables (no hardcoded paths)
- [ ] Per-entity metrics visualization created
- [ ] Comparison of fine-tuned models vs LLM completed
- [ ] Reflection questions answered
- [ ] Notebook exported to HTML
- [ ] Canvas filename includes `_GRADE_THIS_ONE`
- [ ] Files uploaded to Canvas

---

**Point Breakdown:**
- Part 1 (Named Entity Analysis): 6 pts
- Part 2 (Fine-tune BERT Models): 12 pts
- Part 3 (LLM for NER): 12 pts
- Part 4 (Comparison): 6 pts
- Part 5 (Reflection): 2 pts

# Step 1: Load the dataset
from datasets import load_dataset

# YOUR CODE HERE
# 1. Load the "hobbes99/mit-movie-ner-simplified" dataset
# 2. Examine the 'train' and 'valid' splits
# 3. Print an example to see the structure
# 4. Get the label names from the dataset features

In [None]:
# Step 2: Prepare for fine-tuning - Tokenization
from transformers import AutoTokenizer

# YOUR CODE HERE
# 1. Load two tokenizers:
#    - "distilbert-base-uncased"
#    - "bert-base-uncased"
# 2. Create a tokenization function that:
#    - Tokenizes the tokens (use is_split_into_words=True)
#    - Aligns the NER labels with tokenized output
#    - Handles subword tokens (use -100 for ignored tokens)
# 3. Apply tokenization to both train and valid splits

# Hint: word_ids() method helps align labels with subword tokens

## Storage Guidance

**Always use the path variables** (`MODELS_PATH`, `DATA_PATH`, `CACHE_PATH`) instead of hardcoded paths. The actual locations depend on your environment:

| Variable | CoCalc Home Server | Compute Server |
|----------|-------------------|----------------|
| `MODELS_PATH` | `Homework_10_Models/` | `Homework_10_Models/` *(synced)* |
| `DATA_PATH` | `~/home_workspace/data/` | `~/cs_workspace/data/` *(local)* |
| `CACHE_PATH` | `~/home_workspace/downloads/` | `~/cs_workspace/downloads/` *(local)* |

**Why this matters:**
- On **Compute Servers**: Only `MODELS_PATH` syncs back to CoCalc (~10GB limit). Data and cache stay local (~50GB).
- On **CoCalc Home**: Everything syncs and counts against the ~10GB limit.
- **Storage_Cleanup.ipynb** (in this folder) helps free synced space when needed.

**Tip:** Always write `MODELS_PATH / 'model.pt'` — never hardcode paths like `'Homework_10_Models/model.pt'`.

# Step 3: Set up metrics for evaluation
import evaluate
import numpy as np

# YOUR CODE HERE
# 1. Load the seqeval metric
# 2. Create a compute_metrics function that:
#    - Extracts predictions and labels
#    - Removes ignored tokens (-100)
#    - Converts numeric labels to string labels
#    - Computes precision, recall, F1 overall and per-entity

In [None]:
# Step 4: Fine-tune distilbert-base-uncased
from transformers import (
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)

# YOUR CODE HERE
# 1. Load the model with num_labels matching your dataset
# 2. Create TrainingArguments (output_dir, num_train_epochs, per_device_train_batch_size, etc.)
# 3. Create a DataCollator for token classification
# 4. Create Trainer with model, args, datasets, tokenizer, data_collator, compute_metrics
# 5. Train the model

# Step 5: Evaluate distilbert and get per-entity metrics

# YOUR CODE HERE
# 1. Run evaluation on the valid set
# 2. Get predictions for detailed analysis
# 3. Calculate precision, recall, F1 for each entity type
# 4. Create a visualization (bar chart) comparing metrics by entity

# Hint: Use trainer.predict() and seqeval.compute() for detailed results

In [None]:
# Step 1: Extract entities from training set

# YOUR CODE HERE
# 1. Create empty lists or dictionaries to store:
#    - Actors with positive sentiment
#    - Actors with negative sentiment
#    - Directors with positive sentiment
#    - Directors with negative sentiment
# 2. Loop through the training dataset
# 3. For each example, extract:
#    - The sentiment (0 or 1)
#    - Tokens tagged as B-ACTOR or I-ACTOR
#    - Tokens tagged as B-DIRECTOR or I-DIRECTOR
# 4. Keep track of counts for each actor/director by sentiment

# Hint: You'll need to:
# - Convert token lists to strings (join consecutive I- tags with B- tag)
# - Track how many times each actor appears in positive vs negative reviews
# - Use label_list to convert numeric tags to string labels

In [None]:
# Step 2: Calculate proportions and find top 3

# YOUR CODE HERE
# 1. For each actor/director, calculate:
#    total_appearances = positive_count + negative_count
#    positive_proportion = positive_count / total_appearances
# 2. Sort by positive proportion (descending for positive, ascending for negative)
# 3. Select top 3 for each category
# 4. Display results with counts and proportions

# Step 6: Create visualization of per-entity performance
import matplotlib.pyplot as plt
import pandas as pd

# YOUR CODE HERE
# 1. Extract precision, recall, F1 for each entity type
# 2. Create a DataFrame with entity types and metrics
# 3. Plot a grouped bar chart comparing precision, recall, F1

In [None]:
# Step 1: Load the dataset
from datasets import load_dataset

# Your code here:
# 1. Load the "hobbes99/mit-movie-ner-simplified" dataset
# 2. Examine the 'train' and 'valid' splits
# 3. Print an example to see the structure
# 4. Get the label names from the dataset features


In [None]:
# Step 2: Prepare for fine-tuning - Tokenization
from transformers import AutoTokenizer

# Your code here:
# 1. Load two tokenizers:
#    - "distilbert-base-uncased"
#    - "bert-base-uncased"
# 2. Create a tokenization function that:
#    - Tokenizes the tokens (use is_split_into_words=True)
#    - Aligns the NER labels with tokenized output
#    - Handles subword tokens (use -100 for ignored tokens)
# 3. Apply tokenization to both train and valid splits

# Hint: word_ids() method helps align labels with subword tokens


In [None]:
# Step 3: Set up metrics for evaluation
import evaluate
import numpy as np

# Your code here:
# 1. Load the seqeval metric
# 2. Create a compute_metrics function that:
#    - Extracts predictions and labels
#    - Removes ignored tokens (-100)
#    - Converts numeric labels to string labels
#    - Computes precision, recall, F1 overall and per-entity


In [None]:
# Step 4: Fine-tune distilbert-base-uncased
from transformers import (
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification
)

# Your code here:
# 1. Load the model with num_labels matching your dataset
# 2. Create TrainingArguments (output_dir, num_train_epochs, per_device_train_batch_size, etc.)
# 3. Create a DataCollator for token classification
# 4. Create Trainer with model, args, datasets, tokenizer, data_collator, compute_metrics
# 5. Train the model


In [None]:
# Step 5: Evaluate distilbert and get per-entity metrics

# Your code here:
# 1. Run evaluation on the valid set
# 2. Get predictions for detailed analysis
# 3. Calculate precision, recall, F1 for each entity type
# 4. Create a visualization (bar chart) comparing metrics by entity

# Hint: Use trainer.predict() and seqeval.compute() for detailed results


In [None]:
# Step 6: Create visualization of per-entity performance
import matplotlib.pyplot as plt
import pandas as pd

# Your code here:
# 1. Extract precision, recall, F1 for each entity type
# 2. Create a DataFrame with entity types and metrics
# 3. Plot a grouped bar chart comparing precision, recall, F1


In [None]:
# Step 7: Repeat for bert-base-uncased

# Your code here:
# 1. Tokenize the dataset with bert-base-uncased tokenizer
# 2. Load bert-base-uncased model
# 3. Create new Trainer with BERT model
# 4. Train the BERT model
# 5. Evaluate and compare with DistilBERT

# (Follow same steps as DistilBERT - cells above)


In [None]:
# Step 8: Run inference on movie reviews from internet
from transformers import pipeline

# Your code here:
# 1. Find 2 movie reviews from the internet (IMDB, Rotten Tomatoes, etc.)
# 2. Create a NER pipeline with your fine-tuned model
# 3. Run inference on the reviews
# 4. Display the extracted entities with their types

# Hint: Use pipeline("ner", model=your_model, aggregation_strategy="simple")


In [None]:
# Step 9: Compare DistilBERT vs BERT

# Your code here:
# 1. Create a comparison table with:
#    - Model name
#    - Overall F1 score
#    - Training time (approximate)
#    - Model size (parameters)
#    - Best/worst entity types
# 2. Discuss which model performed better
# 3. Analyze whether BERT's larger size justified better performance


# Step 7: Repeat for bert-base-uncased

# YOUR CODE HERE
# 1. Tokenize the dataset with bert-base-uncased tokenizer
# 2. Load bert-base-uncased model
# 3. Create new Trainer with BERT model
# 4. Train the BERT model
# 5. Evaluate and compare with DistilBERT

# (Follow same steps as DistilBERT - cells above)

In [None]:
# Step 1: Prepare validation subset
from Lesson_10_Helpers import llm_ner_extractor

# Your code here:
# 1. Get first 100 examples from valid split
# 2. Extract texts and true labels
# 3. Start with 2-3 examples to test your prompt


In [None]:
# Step 2: Design prompt for LLM-based NER

# Your code here:
# 1. Create system_prompt explaining the task
# 2. Create prompt_template with:
#    - Instructions to extract entities
#    - Entity types to look for: Actor, Character, Director, Genre, Title, Year
#    - Request structured output (JSON format recommended)
#    - Include the {text} placeholder


In [None]:
# Step 3: Test prompt on small subset with llm_ner_extractor

# YOUR CODE HERE
# 1. Use llm_ner_extractor function (similar to llm_classifier from Lesson 8)
# 2. Start with a small API model or local model
# 3. Test on 3-5 examples
# 4. Refine your prompt based on results
# 5. Parse JSON output and convert to BIO format

# Hint: Use llm_generate() to test prompt formatting before scaling up

In [None]:
# Step 4: Convert LLM outputs to BIO format

# YOUR CODE HERE
# 1. Parse JSON from LLM response
# 2. Match entity spans to token positions
# 3. Convert to BIO format (B-ACTOR, I-ACTOR, etc.)
# 4. Handle errors/malformed JSON gracefully

# Hint: llm_ner_extractor from Lesson_10_Helpers handles conversion for you

In [None]:
# Step 5: Scale up to 100 examples and evaluate

# YOUR CODE HERE
# 1. Once prompt is refined, run on all 100 validation examples
# 2. Convert predictions to BIO format
# 3. Calculate metrics using seqeval (like Part 2)
# 4. Generate classification report

# Step 8: Run inference on movie reviews from internet
from transformers import pipeline

# YOUR CODE HERE
# 1. Find 2 movie reviews from the internet (IMDB, Rotten Tomatoes, etc.)
# 2. Create a NER pipeline with your fine-tuned model
# 3. Run inference on the reviews
# 4. Display the extracted entities with their types

# Hint: Use pipeline("ner", model=your_model, aggregation_strategy="simple")

# Step 9: Compare DistilBERT vs BERT

# YOUR CODE HERE
# 1. Create a comparison table with:
#    - Model name
#    - Overall F1 score
#    - Training time (approximate)
#    - Model size (parameters)
#    - Best/worst entity types
# 2. Discuss which model performed better
# 3. Analyze whether BERT's larger size justified better performance

## Part 5 - Reflection (2 pts)

1. What, if anything, did you find difficult to understand for the lesson? Why?

📝 **YOUR ANSWER HERE:**

2. What resources did you find supported your learning most and least for this lesson? (Be honest - I use your input to shape the course.)

📝 **YOUR ANSWER HERE:**

# Step 1: Prepare validation subset
from Lesson_10_Helpers import llm_ner_extractor

# YOUR CODE HERE
# 1. Get first 100 examples from valid split
# 2. Extract texts and true labels
# 3. Start with 2-3 examples to test your prompt

In [None]:
# Step 2: Design prompt for LLM-based NER

# YOUR CODE HERE
# 1. Create system_prompt explaining the task
# 2. Create prompt_template with:
#    - Instructions to extract entities
#    - Entity types to look for: Actor, Character, Director, Genre, Title, Year
#    - Request structured output (JSON format recommended)
#    - Include the {text} placeholder