<a href="https://colab.research.google.com/github/27abernal/Adv_AI/blob/main/Next_Word_Predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch

# Load model
print("Loading AI model...")
model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
model = AutoModelForCausalLM.from_pretrained("distilgpt2")

Loading AI model...


In [50]:
# Simple example
text = "The weather today is "
print(f"\nStarting text: '{text}'")
print("\nGenerating word by word...\n")

top_k = 20
temperature = 1.2

# Generate 5 words, one at a time
current_text = text
for step in range(30):
    print(f"--- Step {step + 1} ---")
    print(f"Current: '{current_text}'")

    # Encode current text
    input_ids = tokenizer.encode(current_text, return_tensors="pt")

    # Get predictions
    with torch.no_grad():
        outputs = model(input_ids)
        predictions = outputs.logits

    # Get the predictions for the NEXT token
    next_token_logits = predictions[0, -1, :]
    # Divide the raw scores by temperature
    # Lower temp = more focused; Higher temp = more random
    next_token_logits = next_token_logits / temperature
    next_token_probs = torch.softmax(next_token_logits, dim=0)

    # Get top predictions
    top_probs, top_indices = torch.topk(next_token_probs, top_k)

    print(f"Top {top_k} next word predictions (showing first 5):")
    for i, (prob, idx) in enumerate(zip(top_probs[:5], top_indices[:5])):
        word = tokenizer.decode([idx])
        print(f"  {i+1}. '{word}' ({prob.item()*100:.1f}% confident)")

    # --- CHANGE: Sampling logic replaces the 'top_indices[0]' line ---
    # This picks one index from top_indices based on the top_probs weights
    sample_idx = torch.multinomial(top_probs, num_samples=1)
    next_token_id = top_indices[sample_idx].item()
    # Use the most likely word
    # next_token_id = top_indices[0]
    next_word = tokenizer.decode([next_token_id])
    current_text += next_word

    print(f"✓ Chosen: '{next_word}'")
    print(f"New text: '{current_text}'\n")

print(f"\nFinal generated text: '{current_text}'")


Starting text: 'The weather today is '

Generating word by word...

--- Step 1 ---
Current: 'The weather today is '
Top 20 next word predictions (showing first 5):
  1. 'iced' (29.4% confident)
  2. ' ' (13.9% confident)
  3. '!!!' (2.6% confident)
  4. '�' (2.2% confident)
  5. '????' (2.0% confident)
✓ Chosen: 'iced'
New text: 'The weather today is iced'

--- Step 2 ---
Current: 'The weather today is iced'
Top 20 next word predictions (showing first 5):
  1. ' up' (8.0% confident)
  2. ' with' (7.0% confident)
  3. ' and' (5.5% confident)
  4. '.' (5.3% confident)
  5. ',' (5.2% confident)
✓ Chosen: ' up'
New text: 'The weather today is iced up'

--- Step 3 ---
Current: 'The weather today is iced up'
Top 20 next word predictions (showing first 5):
  1. ',' (10.1% confident)
  2. ' and' (9.7% confident)
  3. '.' (7.1% confident)
  4. ' by' (5.1% confident)
  5. ' to' (3.8% confident)
✓ Chosen: ' and'
New text: 'The weather today is iced up and'

--- Step 4 ---
Current: 'The weather t

In [51]:
print(repr(current_text))


'The weather today is iced up and a couple of other factors could help us find out how to get on board! If there is any problem we need to do this quick'
