In [4]:
# Import required libraries
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import seaborn as sns

# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [5]:
# Hint: Use GPT2Tokenizer.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Test sentences to explore tokenization
test_sentences = [
    "Hello world!",
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is revolutionizing technology.",
    "GPT-2 uses transformer architecture.",
    "Supercalifragilisticexpialidocious"  # Long word to see sub-word tokenization
]

print("🔍 Exploring Tokenization:")
print("=" * 50)

for sentence in test_sentences:
    # TODO: Tokenize the sentence
    # Hint: Use tokenizer.encode() to get token IDs
    # Use tokenizer.tokenize() to see the actual tokens
    tokens = tokenizer.tokenize(sentence)  # Get the actual token strings
    token_ids = tokenizer.encode(sentence)  # Get the numerical IDsa
    
    print(f"\nOriginal: {sentence}")
    print(f"Tokens: {tokens}")
    print(f"Token IDs: {token_ids}")
    print(f"Number of tokens: {len(tokens)}")

# TODO: Print tokenizer vocabulary size
print(f"\n📊 Tokenizer vocabulary size: {len(tokenizer)}")  # Hint: len(tokenizer)

🔍 Exploring Tokenization:

Original: Hello world!
Tokens: ['Hello', 'Ġworld', '!']
Token IDs: [15496, 995, 0]
Number of tokens: 3

Original: The quick brown fox jumps over the lazy dog.
Tokens: ['The', 'Ġquick', 'Ġbrown', 'Ġfox', 'Ġjumps', 'Ġover', 'Ġthe', 'Ġlazy', 'Ġdog', '.']
Token IDs: [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]
Number of tokens: 10

Original: Artificial intelligence is revolutionizing technology.
Tokens: ['Art', 'ificial', 'Ġintelligence', 'Ġis', 'Ġrevolution', 'izing', 'Ġtechnology', '.']
Token IDs: [8001, 9542, 4430, 318, 5854, 2890, 3037, 13]
Number of tokens: 8

Original: GPT-2 uses transformer architecture.
Tokens: ['G', 'PT', '-', '2', 'Ġuses', 'Ġtransformer', 'Ġarchitecture', '.']
Token IDs: [38, 11571, 12, 17, 3544, 47385, 10959, 13]
Number of tokens: 8

Original: Supercalifragilisticexpialidocious
Tokens: ['Super', 'cal', 'if', 'rag', 'il', 'ist', 'ice', 'xp', 'ial', 'id', 'ocious']
Token IDs: [12442, 9948, 361, 22562, 346, 396, 501, 42372, 

In [6]:
analysis_texts = [
    "running",
    "runner",
    "run",
    "unhappiness",
    "ChatGPT",
    "COVID-19",
    "2023",
    "programming",
    "antidisestablishmentarianism"
]

print("🔍 Token Pattern Analysis:")
print("=" * 60)

token_analysis = []

for text in analysis_texts:
    tokens = tokenizer.tokenize(text)              # Tokenize the current text
    token_count = len(tokens)                      # Count the tokens
    chars_per_token = len(text) / token_count      # Average characters per token

    token_analysis.append({
        'text': text,
        'tokens': tokens,
        'token_count': token_count,
        'chars_per_token': chars_per_token
    })

    print(f"{text:30} → {tokens} ({token_count} tokens)")

# Create DataFrame and analyze patterns
df = pd.DataFrame(token_analysis)

avg_chars_per_token = df['chars_per_token'].mean()
longest_word = df.loc[df['token_count'].idxmax(), 'text']

print(f"\n📊 Average characters per token: {avg_chars_per_token:.2f}")
print(f"📊 Longest word in tokens: {longest_word}")


🔍 Token Pattern Analysis:
running                        → ['running'] (1 tokens)
runner                         → ['runner'] (1 tokens)
run                            → ['run'] (1 tokens)
unhappiness                    → ['un', 'h', 'appiness'] (3 tokens)
ChatGPT                        → ['Chat', 'G', 'PT'] (3 tokens)
COVID-19                       → ['CO', 'VID', '-', '19'] (4 tokens)
2023                           → ['20', '23'] (2 tokens)
programming                    → ['program', 'ming'] (2 tokens)
antidisestablishmentarianism   → ['ant', 'idis', 'establishment', 'arian', 'ism'] (5 tokens)

📊 Average characters per token: 4.12
📊 Longest word in tokens: antidisestablishmentarianism


In [7]:
print("🔄 Loading GPT-2 model (this may take a moment)...")
model = GPT2LMHeadModel.from_pretrained('gpt2')

model.eval()

print("✅ GPT-2 model loaded successfully!")
\
print("\n🏗️ Model Architecture:")
print(f"Model type: {type(model).__name__}")

total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")
print(f"Model size: ~{total_params / 1e6:.1f}M parameters")


🔄 Loading GPT-2 model (this may take a moment)...
✅ GPT-2 model loaded successfully!

🏗️ Model Architecture:
Model type: GPT2LMHeadModel
Total parameters: 124,439,808
Model size: ~124.4M parameters


In [8]:
# Explore model structure
print("🔍 Model Structure Analysis:")
print("=" * 50)

# TODO: Print model configuration
# Hint: Use model.config
config = model.config
print(f"Model configuration: {config}")

print(f"Vocabulary size: {config.vocab_size}")
print(f"Maximum sequence length: {config.n_positions}")
print(f"Number of transformer layers: {config.n_layer}")
print(f"Number of attention heads: {config.n_head}")
print(f"Hidden size: {config.n_embd}")

# Compare to your simple network
print("\n🤔 Comparison to Your Neural Network:")
print(f"Your network had: 2 inputs → 4 hidden → 1 output")
print(f"GPT-2 has: {config.vocab_size} inputs → {config.n_embd} hidden → {config.vocab_size} outputs")
print(f"Your network: ~50 parameters")
print(f"GPT-2: {total_params:,} parameters")
print(f"GPT-2 is ~{total_params/50:,.0f}x larger!")

🔍 Model Structure Analysis:
Model configuration: GPT2Config {
  "_attn_implementation_autoset": true,
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.46.0",
  "use_cache": true,
  "vocab_size": 50257
}

Vocabulary size: 502

In [9]:
# Load text-generation pipeline using your model and tokenizer
generator = pipeline('text-generation', model=model, tokenizer=tokenizer)

# Base prompt for experiments
base_prompt = "In the future, artificial intelligence will"

print(f"🤖 Base prompt: '{base_prompt}'")
print("=" * 60)

output = generator(base_prompt, max_length=50, num_return_sequences=1)
print(output[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


🤖 Base prompt: 'In the future, artificial intelligence will'
In the future, artificial intelligence will become a part of people's lives. More than 50,000 applications being developed in Europe, Asia and the United States, totaling more than 5 million developers each year, are already being marketed by large online entities to


In [10]:
# Experiment with different temperatures
temperatures = [0.1, 0.7, 1.0, 1.5]

print("🌡️ Temperature Experiments:")
print("=" * 50)

for temp in temperatures:
    print(f"\n🔥 Temperature: {temp}")
    print("-" * 30)
    
    # Generate text with different temperatures using the base_prompt and generator
    result = generator(
        base_prompt,          # prompt
        max_length=60,        # try 60
        temperature=temp,     # use the current temp variable
        do_sample=True,       # enable sampling
        pad_token_id=tokenizer.eos_token_id  # for padding
    )
    
    # Extract and print the generated text
    generated_text = result[0]['generated_text']
    print(generated_text)
    
print("\n🤔 Discussion Questions:")
print("• Which temperature produced the most coherent text?")
print("• Which was most creative/surprising?")
print("• When might you use each temperature setting?")


🌡️ Temperature Experiments:

🔥 Temperature: 0.1
------------------------------


In the future, artificial intelligence will be able to do things like search for people, find people, find people, find people, find people, find people, find people, find people, find people, find people, find people, find people, find people, find people, find people, find

🔥 Temperature: 0.7
------------------------------
In the future, artificial intelligence will be the main driver of our ability to solve complex problems, including our decision-making. We've already seen this with the AI revolution, but with machine learning, we can apply this to our products.

We've already tried out our new machine learning solution

🔥 Temperature: 1.0
------------------------------
In the future, artificial intelligence will become almost ubiquitous, but it won't be just for computers. Machines will build their own self-driving cars as soon as humans, robots, and the natural world agree on how to make smarter choices.

The problem is the human and machine are inseparable

🔥 Temperature: 1.5
--

In [11]:
# Experiment with top-p sampling
top_p_values = [0.3, 0.7, 0.9, 1.0]

print("🎯 Top-p Sampling Experiments:")
print("=" * 50)

for top_p in top_p_values:
    print(f"\n🎲 Top-p: {top_p}")
    print("-" * 30)
    
    # TODO: Generate text with different top-p values
    result = generator(
        base_prompt,
        max_length=60,
        temperature=0.8,  # Keep temperature constant
        top_p=top_p,  # Use the top_p variable
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    generated_text = result[0]['generated_text']
    print(generated_text)
    
print("\n🤔 Discussion Questions:")
print("• How did the outputs change with different top-p values?")
print("• What's the trade-off between diversity and quality?")

🎯 Top-p Sampling Experiments:

🎲 Top-p: 0.3
------------------------------


In the future, artificial intelligence will be able to learn from our past, and we will be able to learn from our future.

The future is bright.

The future is bright.

The future is bright.

The future is bright.

The future is bright

🎲 Top-p: 0.7
------------------------------
In the future, artificial intelligence will be able to better understand what people are saying, what they're saying, what they're saying, and what they're saying.

"We'll be able to better understand what people are saying, what they're saying, what they're saying, and what

🎲 Top-p: 0.9
------------------------------
In the future, artificial intelligence will make it easier for our children to learn, and easier for our society to understand.

In the long run, we should be able to control the robots. But what about the future?

We can't control the future without artificial intelligence. But we

🎲 Top-p: 1.0
------------------------------
In the future, artificial intelligence will be used to answer the "how

In [12]:
prompts_to_test = {
    "Direct": "Write about artificial intelligence:",
    "Question": "What is artificial intelligence and how will it change the world?",
    "Story_Start": "Once upon a time, in a world where artificial intelligence was everywhere,",
    "List_Format": "Here are 5 ways artificial intelligence will change our lives:\n1.",
    "Expert_Persona": "As a leading AI researcher, I believe that artificial intelligence will",
    "Few_Shot": "Technology predictions:\n• The internet will connect everyone (1990s)\n• Smartphones will be everywhere (2000s)\n• Artificial intelligence will"
}

print("✍️ Prompt Engineering Experiments:")
print("=" * 60)

# TODO: Test each prompt style
for style, prompt in prompts_to_test.items():
    print(f"\n📝 Style: {style}")
    print(f"Prompt: '{prompt}'")
    print("-" * 40)
    
    # TODO: Generate text for each prompt
    result = generator(
        base_prompt,  # use the prompt variable
        max_length=80,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    generated_text = result[0]['generated_text']
    print(generated_text)
    print("\n" + "="*60)

✍️ Prompt Engineering Experiments:

📝 Style: Direct
Prompt: 'Write about artificial intelligence:'
----------------------------------------


In the future, artificial intelligence will have to learn how to use all of the information in your mind and body, and how to use it to better serve your needs.

It's only a matter of time before artificial intelligence will have to learn how to use all of the information in your mind and body, and how to use it to better serve your needs.

The Future of Artificial


📝 Style: Question
Prompt: 'What is artificial intelligence and how will it change the world?'
----------------------------------------
In the future, artificial intelligence will be used for more than just human jobs, but also for more complex projects such as healthcare and finance.

The company is also developing a new technology that will allow it to build artificial intelligence systems on top of existing technology.

The technology is called "deep learning", which is a type of artificial intelligence that can learn from human input.

The


📝 Style: Story_Start
Prompt: 'Once upon a time, in a world where artificial int

In [13]:
print("📊 Prompt Analysis Exercise:")
print("=" * 50)

# TODO: For each prompt style, generate multiple outputs and analyze
analysis_results = []

for style, prompt in list(prompts_to_test.items())[:3]:  # Test first 3 for time
    # Generate 3 outputs for each prompt
    outputs = []
    
    for i in range(3):
        # TODO: Generate text
        result = generator(
            prompt,
            max_length=60,
            temperature=0.8,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
        
        output = result[0]['generated_text']
        outputs.append(output)
    
    # TODO: Analyze the outputs
    lengths = [len(o) for o in outputs]
    avg_length = sum(lengths) / len(lengths) # Calculate average length of outputs
    
    analysis_results.append({
        'style': style,
        'prompt': prompt,
        'avg_length': avg_length,
        'outputs': outputs
    })
    
    print(f"\n{style}:")
    print(f"  Average length: {avg_length:.1f} characters")
    print(f"  Sample output: {outputs[0][:100]}...")

print("\n🤔 Reflection Questions:")
print("• Which prompt style was most consistent?")
print("• Which produced the most relevant outputs?")
print("• How might you improve these prompts?")

📊 Prompt Analysis Exercise:



Direct:
  Average length: 245.3 characters
  Sample output: Write about artificial intelligence: Can a computer be smarter than everyone else?

The answer: It h...

Question:
  Average length: 282.3 characters
  Sample output: What is artificial intelligence and how will it change the world?

The internet is like the internet...

Story_Start:
  Average length: 283.7 characters
  Sample output: Once upon a time, in a world where artificial intelligence was everywhere, the only thing that could...

🤔 Reflection Questions:
• Which prompt style was most consistent?
• Which produced the most relevant outputs?
• How might you improve these prompts?


In [14]:
def custom_text_generator(prompt, style="balanced", length="medium"):
    """
    Create a customizable text generation function.

    Args:
        prompt (str): The input prompt
        style (str): "creative", "balanced", or "conservative"
        length (str): "short", "medium", or "long"
    Returns:
        str: Generated text
    """
    # Set temperature and top_p based on style
    if style == "creative":
        temperature = 1.3     # Higher for creativity
        top_p = 0.95          # Higher for diversity
    elif style == "conservative":
        temperature = 0.5     # Lower for consistency
        top_p = 0.8           # Lower for focus
    else:  # balanced
        temperature = 1.0     # Medium values
        top_p = 0.9

    # Set max_length based on length
    if length == "short":
        max_length = 40       # Shorter output
    elif length == "long":
        max_length = 100      # Longer output
    else:  # medium
        max_length = 70

    # Generate text with the parameters
    result = generator(
        prompt,               # prompt
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )

    return result[0]['generated_text']

# Test your function
test_prompt = "The future of education will be"

print("🧪 Testing Your Text Generator:")
print("=" * 50)

test_combinations = [
    ("creative", "short"),
    ("balanced", "medium"),
    ("conservative", "long")
]

for style, length in test_combinations:
    print(f"\n📝 Style: {style}, Length: {length}")
    print("-" * 30)
    output = custom_text_generator(test_prompt, style=style, length=length)
    print(output)
    print(f"Characters: {len(output)}")


🧪 Testing Your Text Generator:

📝 Style: creative, Length: short
------------------------------


The future of education will be made easier by this decision. And a decision that is legally binding is probably the next step," Prakash said.

In 2014, Indian states like Tamil Nadu
Characters: 182

📝 Style: balanced, Length: medium
------------------------------
The future of education will be in a much greater hands of the people," he said.

In the meantime, he added, the system should "look to the future" rather than create "a state of limbo," noting that "everybody is being evaluated in an ever-increasing amount of ways. Our system is still under evaluation, but
Characters: 308

📝 Style: conservative, Length: long
------------------------------
The future of education will be determined by the success of our children, not by the success of the government," he said.

The government is also expected to introduce a new charter school system, which will be run by the state's public school system.

The state government is also expected to introduce a new charter school system, which wi

In [15]:
# Creative applications to try
creative_prompts = {
    "Poetry": "Roses are red, violets are blue, artificial intelligence",
    "Story": "It was a dark and stormy night when the AI finally",
    "Product Description": "Introducing the revolutionary new smartphone that",
    "Email": "Dear valued customer, we are excited to announce",
    "Recipe": "How to make the perfect AI-inspired cookies:\nIngredients:\n-",
    "News Headline": "Breaking: Scientists discover that artificial intelligence"
}

print("🎨 Creative Applications:")
print("=" * 50)

# TODO: Generate creative content
for app_type, prompt in creative_prompts.items():
    print(f"\n🖼️ {app_type}:")
    print(f"Prompt: '{prompt}'")
    print("-" * 40)
    
    # TODO: Choose appropriate style for each application
    if app_type in ["Poetry", "Story"]:
        style = "creative"  # Should be creative
    elif app_type in ["Product Description", "Email"]:
        style = "conservative"  # Should be conservative
    else:
        style = "balanced"  # Should be balanced
    
    output = custom_text_generator(prompt, style=style, length="medium")
    print(output)
    print("\n" + "="*50)

🎨 Creative Applications:

🖼️ Poetry:
Prompt: 'Roses are red, violets are blue, artificial intelligence'
----------------------------------------


Roses are red, violets are blue, artificial intelligence is orange; the last few of the above species is also very cute. I was surprised to be in the area with these creatures (when we did stop at Roses Cave I found the other two very cool). The most interesting and rare being of these species have both legs as shown


🖼️ Story:
Prompt: 'It was a dark and stormy night when the AI finally'
----------------------------------------
It was a dark and stormy night when the AI finally started to show signs of a recovery. A huge group stood in the middle of a river filled with rain and debris. I kept my eyes closed tightly looking around the world and didn't see something I didn't already expect. I could feel the massive and mysterious power that was slowly growing around


🖼️ Product Description:
Prompt: 'Introducing the revolutionary new smartphone that'
----------------------------------------
Introducing the revolutionary new smartphone that will revolutionize the way you use your smartph

In [16]:
limitation_tests = {
    "Factual Knowledge": "The capital of Fakelandia is",
    "Recent Events": "In 2023, the most important AI breakthrough was",
    "Math": "What is 47 * 83? The answer is",
    "Logic": "If all A are B, and all B are C, then all A are",
    "Consistency": "My favorite color is blue. Later in the conversation, my favorite color is"
}

print("⚠️ Understanding Model Limitations:")
print("=" * 50)

for test_type, prompt in limitation_tests.items():
    print(f"\n🧪 Testing: {test_type}")
    print(f"Prompt: '{prompt}'")
    print("-" * 40)
    
    # TODO: Generate responses to test limitations
    output = custom_text_generator(
        prompt,  # prompt
        style="conservative",  # Use conservative for factual tasks
        length="short"
    )
    
    print(output)
    
    # TODO: Analyze the output
    print(f"🤔 Analysis: Does this look correct/reasonable?")
    print("\n" + "="*50)

print("\n⚠️ Important Reminders:")
print("• Language models can generate plausible-sounding but incorrect information")
print("• Always verify factual claims from AI-generated content")
print("• Be aware of potential biases in training data")
print("• Use AI as a tool to assist, not replace, human judgment")

⚠️ Understanding Model Limitations:

🧪 Testing: Factual Knowledge
Prompt: 'The capital of Fakelandia is'
----------------------------------------


The capital of Fakelandia is the capital of the Kingdom of the West. The Kingdom of the West is the largest of the three kingdoms in the Kingdom of the West. The Kingdom of the West
🤔 Analysis: Does this look correct/reasonable?


🧪 Testing: Recent Events
Prompt: 'In 2023, the most important AI breakthrough was'
----------------------------------------
In 2023, the most important AI breakthrough was the first real-time prediction of how the world would respond to climate change. The first of these was the prediction that humans would be able to prevent
🤔 Analysis: Does this look correct/reasonable?


🧪 Testing: Math
Prompt: 'What is 47 * 83? The answer is'
----------------------------------------
What is 47 * 83? The answer is yes, but it is not the answer to the question of 47 * 83.

(a) The answer to 47 * 83 is that the answer
🤔 Analysis: Does this look correct/reasonable?


🧪 Testing: Logic
Prompt: 'If all A are B, and all B are C, then all A are'
-----------------------------------

In [17]:
print("🎯 FINAL CHALLENGE:")
print("Design your own text generation use case!")
print("=" * 50)
\
your_use_case = "creative story starter"
your_prompt = "A door appears in the middle of a busy city street, glowing with an eerie light. As soon as it opens,"   
your_style = "creative"     
your_length = "medium"      

print(f"📝 Your use case: {your_use_case}")
print(f"📝 Your prompt: '{your_prompt}'")
print(f"📝 Your settings: {your_style}, {your_length}")
print("-" * 50)

# Generate with your custom settings
your_output = custom_text_generator(
    your_prompt,
    style=your_style,
    length=your_length
)
print("🎉 Your generated content:")
print(your_output)

print("\n📈 Next Steps:")
print("• Experiment with different prompt formats")
print("• Try combining multiple generation calls")
print("• Think about how to validate or improve outputs")
print("• Consider user interface design for your application")


🎯 FINAL CHALLENGE:
Design your own text generation use case!
📝 Your use case: creative story starter
📝 Your prompt: 'A door appears in the middle of a busy city street, glowing with an eerie light. As soon as it opens,'
📝 Your settings: creative, medium
--------------------------------------------------
🎉 Your generated content:
A door appears in the middle of a busy city street, glowing with an eerie light. As soon as it opens, nothing could stop the monster. The floor, however, begins to fall under him as fast as his body shakes under the weight of the falling pieces. He jumps out of the way and straight into the sunlight to face the monster.

📈 Next Steps:
• Experiment with different prompt formats
• Try combining multiple generation calls
• Think about how to validate or improve outputs
• Consider user interface design for your application
