In [1]:
# Check GPU availability
!nvidia-smi

import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

# Install required packages
!pip install transformers accelerate datasets torch bitsandbytes -q
!pip install huggingface_hub tokenizers -q

print("✅ Environment setup complete!")

Sat May 24 17:51:47 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   58C    P8             12W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [2]:
# Download StatsBomb data
!git clone https://github.com/statsbomb/open-data.git
!ls open-data/data/ | head -5

print("StatsBomb data downloaded")

# Verify data structure
import json
from pathlib import Path

data_path = Path("open-data/data")
with open(data_path / "competitions.json") as f:
    competitions = json.load(f)

print(f"📊 Loaded {len(competitions)} competitions")
print("Sample competitions:")
for comp in competitions[:3]:
    print(f"  • {comp['competition_name']} - {comp['season_name']}")

Cloning into 'open-data'...
remote: Enumerating objects: 49843, done.[K
remote: Counting objects: 100% (3321/3321), done.[K
remote: Compressing objects: 100% (1181/1181), done.[K
remote: Total 49843 (delta 3309), reused 2140 (delta 2140), pack-reused 46522 (from 4)[K
Receiving objects: 100% (49843/49843), 6.45 GiB | 14.66 MiB/s, done.
Resolving deltas: 100% (46838/46838), done.
Updating files: 100% (7246/7246), done.
competitions.json
events
lineups
matches
three-sixty
StatsBomb data downloaded
📊 Loaded 74 competitions
Sample competitions:
  • 1. Bundesliga - 2023/2024
  • 1. Bundesliga - 2015/2016
  • African Cup of Nations - 2023


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch

# Configure for T4 GPU optimization
device = "cuda" if torch.cuda.is_available() else "cpu"

# Quantization config for T4 GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# Try to load Phi-4, fallback to alternatives if needed
model_options = [
    "microsoft/Phi-4",
    "microsoft/Phi-3-medium-4k-instruct",
    "microsoft/DialoGPT-large",
    "meta-llama/Llama-2-7b-chat-hf"
]

model = None
tokenizer = None
model_name = None

for model_option in model_options:
    try:
        print(f"🔄 Attempting to load {model_option}...")

        if "Phi-4" in model_option or "Phi-3" in model_option:
            # Load Phi models with quantization
            tokenizer = AutoTokenizer.from_pretrained(model_option, trust_remote_code=True)
            model = AutoModelForCausalLM.from_pretrained(
                model_option,
                quantization_config=bnb_config,
                device_map="auto",
                trust_remote_code=True,
                torch_dtype=torch.float16
            )
        else:
            # Load other models normally
            tokenizer = AutoTokenizer.from_pretrained(model_option)
            model = AutoModelForCausalLM.from_pretrained(
                model_option,
                quantization_config=bnb_config,
                device_map="auto",
                torch_dtype=torch.float16
            )

        model_name = model_option
        print(f"✅ Successfully loaded {model_option}")
        break

    except Exception as e:
        print(f"❌ Failed to load {model_option}: {str(e)[:100]}...")
        continue

if model is None:
    print("❌ Could not load any model. Check your Colab setup.")
else:
    print(f"🤖 Model ready: {model_name}")
    print(f"🎯 Device: {device}")

    # Set pad token if needed
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

🔄 Attempting to load microsoft/Phi-4...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.25M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/802 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.77G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.77G [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

✅ Successfully loaded microsoft/Phi-4
🤖 Model ready: microsoft/Phi-4
🎯 Device: cuda


In [17]:
import json
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import random
import time
from pathlib import Path
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from IPython.display import display, HTML
from datetime import datetime

**Load and Process Match Data**

In [39]:
def load_complete_match_with_more_events():
    """Load a match with significantly more events"""
    print("🔍 SEARCHING FOR MATCH WITH MORE EVENTS")
    print("=" * 50)

    data_path = Path("open-data/data")
    with open(data_path / "competitions.json") as f:
        competitions = json.load(f)

    best_match = None
    best_events = None
    best_score = 0
    best_context = None

    # Try multiple competitions to find a rich match
    for comp in competitions[:10]:  # Check more competitions
        try:
            matches_file = data_path / f"matches/{comp['competition_id']}/{comp['season_id']}.json"
            if not matches_file.exists():
                continue

            with open(matches_file) as f:
                matches = json.load(f)

            print(f"📊 Checking {comp['competition_name']} - {len(matches)} matches available")

            # Check multiple matches, not just first 5
            for match in matches[:10]:  # Check more matches
                try:
                    events_file = data_path / f"events/{match['match_id']}.json"
                    if not events_file.exists():
                        continue

                    with open(events_file) as f:
                        events = json.load(f)

                    # Count ALL event types, not just limited ones
                    all_event_types = [
                        'Goal', 'Shot', 'Pass', 'Foul Committed', 'Card',
                        'Substitution', 'Offside', 'Duel', 'Interception',
                        'Ball Recovery', 'Clearance', 'Block', 'Tackle'
                    ]

                    total_events = 0
                    interesting_events = []
                    minute_coverage = set()

                    for event in events:
                        event_type = event.get('type', {}).get('name', '')
                        minute = event.get('minute', 0)

                        # Include more event types
                        if event_type in all_event_types and minute <= 90:
                            total_events += 1
                            minute_coverage.add(minute)

                            # Process the event
                            processed_event = {
                                'minute': minute,
                                'second': event.get('second', 0),
                                'event_type': event_type,
                                'player': event.get('player', {}).get('name', 'Unknown'),
                                'team': event.get('team', {}).get('name', 'Unknown'),
                                'details': extract_event_details(event)
                            }
                            interesting_events.append(processed_event)

                    # Score based on total events and minute coverage
                    minute_span = max(minute_coverage) - min(minute_coverage) if minute_coverage else 0
                    score = total_events + minute_span

                    print(f"   Match: {match['home_team']['home_team_name']} vs {match['away_team']['away_team_name']}")
                    print(f"   Events: {total_events}, Minutes: {minute_span}, Score: {score}")

                    if score > best_score and total_events > 20:  # Need at least 20 events
                        best_score = score
                        best_match = match
                        best_events = interesting_events
                        best_context = {
                            'home_team': match['home_team']['home_team_name'],
                            'away_team': match['away_team']['away_team_name'],
                            'competition': comp['competition_name'],
                            'season': comp['season_name'],
                            'match_id': match['match_id']
                        }

                except Exception as e:
                    continue

        except Exception as e:
            continue

    if best_match and best_events:
        print(f"\n🎯 BEST MATCH FOUND:")
        print(f"   Match: {best_context['home_team']} vs {best_context['away_team']}")
        print(f"   Competition: {best_context['competition']}")
        print(f"   Total events: {len(best_events)}")

        # Show minute distribution
        minutes = [event['minute'] for event in best_events]
        print(f"   Minute range: {min(minutes)} - {max(minutes)}")

        # Show event type distribution
        event_types = {}
        for event in best_events:
            event_type = event['event_type']
            event_types[event_type] = event_types.get(event_type, 0) + 1

        print(f"\n📊 Event Types Found:")
        for event_type, count in sorted(event_types.items(), key=lambda x: x[1], reverse=True):
            print(f"   • {event_type}: {count}")

        return best_events, best_context

    print("❌ Could not find a suitable match with enough events")
    return None, None

def extract_event_details(event):
    """Extract details from any event type"""
    details = {}
    event_type = event.get('type', {}).get('name', '')

    if event_type == 'Shot':
        shot_data = event.get('shot', {})
        details['outcome'] = shot_data.get('outcome', {}).get('name', 'Unknown')
        details['body_part'] = shot_data.get('body_part', {}).get('name', 'Unknown')
        details['technique'] = shot_data.get('technique', {}).get('name', 'Unknown')

    elif event_type == 'Goal':
        shot_data = event.get('shot', {})
        details['body_part'] = shot_data.get('body_part', {}).get('name', 'Unknown')
        details['technique'] = shot_data.get('technique', {}).get('name', 'Unknown')

    elif event_type == 'Card':
        foul_data = event.get('foul_committed', {})
        details['card_type'] = foul_data.get('card', {}).get('name', 'Yellow')
        details['reason'] = foul_data.get('type', {}).get('name', 'Foul')

    elif event_type == 'Pass':
        pass_data = event.get('pass', {})
        details['outcome'] = pass_data.get('outcome', {}).get('name', 'Complete')
        details['length'] = pass_data.get('length', 0)

    elif event_type == 'Substitution':
        sub_data = event.get('substitution', {})
        details['replacement'] = sub_data.get('replacement', {}).get('name', 'Unknown')
        details['reason'] = sub_data.get('reason', {}).get('name', 'Tactical')

    return details

# Load better match data
print("🔄 Loading match with more comprehensive events...")
extended_events, extended_context = load_complete_match_with_more_events()

🔄 Loading match with more comprehensive events...
🔍 SEARCHING FOR MATCH WITH MORE EVENTS
📊 Checking 1. Bundesliga - 34 matches available
   Match: Bayer Leverkusen vs Werder Bremen
   Events: 1449, Minutes: 89, Score: 1538
   Match: Union Berlin vs Bayer Leverkusen
   Events: 1346, Minutes: 90, Score: 1436
   Match: Eintracht Frankfurt vs Bayer Leverkusen
   Events: 1207, Minutes: 90, Score: 1297
   Match: Bochum vs Bayer Leverkusen
   Events: 1224, Minutes: 90, Score: 1314
   Match: Bayer Leverkusen vs Augsburg
   Events: 1435, Minutes: 90, Score: 1525
   Match: Bayer Leverkusen vs Hoffenheim
   Events: 1434, Minutes: 90, Score: 1524
   Match: Darmstadt 98 vs Bayer Leverkusen
   Events: 1462, Minutes: 90, Score: 1552
   Match: Bayer Leverkusen vs FSV Mainz 05
   Events: 1372, Minutes: 90, Score: 1462
   Match: Bayer Leverkusen vs Wolfsburg
   Events: 1488, Minutes: 89, Score: 1577
   Match: Freiburg vs Bayer Leverkusen
   Events: 1403, Minutes: 90, Score: 1493
📊 Checking 1. Bundesliga

**Basic Commentary Generator**

In [48]:
class UltraImprovedCommentaryGenerator:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def create_ultra_specific_prompt(self, event, context, style='exciting'):
        """Create highly specific prompts that force variety"""

        event_type = event['event_type']
        player = event['player']
        team = event['team']
        minute = event['minute']

        # Create completely different prompts for each event type and style
        if event_type == 'Goal':
            if style == 'exciting':
                prompt = f"GOAL CELEBRATION: {player} scores for {team}! Describe this explosive moment:"
            elif style == 'dramatic':
                prompt = f"DRAMATIC GOAL: {player} finds the net for {team}. Paint this theatrical moment:"
            else:
                prompt = f"TACTICAL GOAL: {player} converts for {team}. Analyze this clinical finish:"

        elif event_type == 'Shot':
            outcome = event['details'].get('outcome', 'saved')
            if style == 'exciting':
                prompt = f"SHOT ATTEMPT: {player} fires for {team}, result {outcome}! Capture the tension:"
            elif style == 'dramatic':
                prompt = f"CRUCIAL SHOT: {player} tries his luck for {team}, {outcome}. Build the suspense:"
            else:
                prompt = f"SHOT ANALYSIS: {player} shoots for {team}, {outcome}. Break down the technique:"

        elif event_type == 'Card':
            card = event['details'].get('card_type', 'yellow card')
            if style == 'exciting':
                prompt = f"BOOKING: {player} gets {card} for {team}! Referee takes action:"
            elif style == 'dramatic':
                prompt = f"DISCIPLINE: {player} sees {card} for {team}. Justice is served:"
            else:
                prompt = f"TACTICAL FOUL: {player} receives {card} for {team}. Assess the decision:"

        else:  # Foul
            if style == 'exciting':
                prompt = f"FOUL ACTION: {player} commits foul for {team}! Referee intervenes:"
            elif style == 'dramatic':
                prompt = f"CONTROVERSIAL MOMENT: {player} fouls for {team}. Tension rises:"
            else:
                prompt = f"FOUL ANALYSIS: {player} commits foul for {team}. Examine the challenge:"

        return prompt

    def generate_ultra_specific_commentary(self, prompt, temperature=0.95):
        """Generate with maximum variety settings"""
        try:
            # Very short, focused input
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=200,  # Much shorter
                truncation=True
            )
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            # High creativity settings
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=25,  # Shorter output
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.8,  # More diverse
                    top_k=30,   # More diverse
                    repetition_penalty=1.3,  # Stronger anti-repetition
                    pad_token_id=self.tokenizer.eos_token_id,
                    no_repeat_ngram_size=3  # Prevent 3-word repetitions
                )

            # Decode and clean
            new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
            commentary = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

            # Remove prompt artifacts
            commentary = commentary.replace(prompt.split(':')[-1].strip(), '').strip()
            commentary = re.sub(r'^[:\-\s]+', '', commentary)

            # Ensure proper ending
            if commentary and not commentary.endswith(('!', '.', '?')):
                commentary += '!'

            # If still too generic or short, create specific fallback
            if len(commentary) < 10 or 'exciting moment' in commentary.lower():
                commentary = self.create_specific_fallback(prompt)

            return commentary

        except Exception as e:
            return self.create_specific_fallback(prompt)

    def create_specific_fallback(self, prompt):
        """Create specific fallbacks based on the prompt type"""
        if 'GOAL' in prompt:
            options = [
                "Sensational strike finds the bottom corner!",
                "Unstoppable effort rockets into the net!",
                "Pure class from the striker - what a finish!",
                "Thunderous shot leaves the keeper helpless!"
            ]
        elif 'SHOT' in prompt:
            options = [
                "Venomous drive forces a brilliant save!",
                "Curling effort whistles inches wide!",
                "Powerful strike rattles the crossbar!",
                "Keeper pulls off a spectacular stop!"
            ]
        elif 'BOOKING' in prompt or 'CARD' in prompt:
            options = [
                "Referee shows no hesitation with the yellow!",
                "Cynical challenge earns the expected booking!",
                "Professional foul stops the dangerous attack!",
                "Tactical fouling brings out the card!"
            ]
        else:
            options = [
                "Reckless challenge disrupts the flow!",
                "Physical battle intensifies on the pitch!",
                "Tough tackle sparks controversy!",
                "Referee calls for calm after the clash!"
            ]

        return random.choice(options)

# Initialize improved generator
ultra_generator = UltraImprovedCommentaryGenerator(model, tokenizer)
print("✅ Commentary generator ready!")

✅ Commentary generator ready!


In [49]:
def generate_ultra_varied_commentary(events, context):
    """Generate highly varied commentary with forced diversity"""
    print("🎯 ULTRA-VARIED COMMENTARY GENERATION")
    print("=" * 50)

    if not events:
        print("❌ No events to process")
        return []

    # Different approach: vary everything
    configurations = [
        {'style': 'exciting', 'temp': 1.0, 'seed': 42},
        {'style': 'dramatic', 'temp': 0.9, 'seed': 123},
        {'style': 'analytical', 'temp': 0.8, 'seed': 456},
        {'style': 'exciting', 'temp': 0.95, 'seed': 789},    # Different seed for variety
        {'style': 'dramatic', 'temp': 0.85, 'seed': 101},
        {'style': 'analytical', 'temp': 0.9, 'seed': 202},
        {'style': 'exciting', 'temp': 0.88, 'seed': 303},
        {'style': 'dramatic', 'temp': 0.92, 'seed': 404},
        {'style': 'analytical', 'temp': 0.87, 'seed': 505}
    ]

    ultra_results = []
    config_index = 0

    for event_idx, event in enumerate(events[:3]):
        print(f"\n⚽ Event {event_idx + 1}: {event['event_type']} - {event['player']}")
        print("-" * 50)

        # Use 3 different configurations per event
        for i in range(3):
            config = configurations[config_index]
            config_index += 1

            # Set random seed for variety
            torch.manual_seed(config['seed'])
            random.seed(config['seed'])

            style = config['style']
            temp = config['temp']

            print(f"  🎨 Style: {style} (temp={temp}, seed={config['seed']})")

            # Create ultra-specific prompt
            ultra_prompt = ultra_generator.create_ultra_specific_prompt(event, context, style)
            print(f"    📝 Prompt: {ultra_prompt[:60]}...")

            # Generate with ultra-specific method
            commentary = ultra_generator.generate_ultra_specific_commentary(ultra_prompt, temp)

            print(f"    💬 Result: {commentary}")

            ultra_results.append({
                'event_num': event_idx + 1,
                'event_type': event['event_type'],
                'player': event['player'],
                'team': event['team'],
                'minute': event['minute'],
                'style': style,
                'temperature': temp,
                'seed': config['seed'],
                'commentary': commentary,
                'ultra_improved': True
            })
            print()

    print(f"✅ Generated {len(ultra_results)} ultra-varied commentaries!")

    # Check for variety
    unique_commentaries = set(result['commentary'] for result in ultra_results)
    print(f"📊 Unique outputs: {len(unique_commentaries)}/{len(ultra_results)}")

    if len(unique_commentaries) == len(ultra_results):
        print("🎉 PERFECT VARIETY - All commentaries are unique!")
    elif len(unique_commentaries) > len(ultra_results) * 0.8:
        print("✅ EXCELLENT VARIETY - High diversity achieved!")
    else:
        print("⚠️ Some repetition still present")

    return ultra_results

# Generate ultra-varied commentary
ultra_results = generate_ultra_varied_commentary(events, context)

🎯 ULTRA-VARIED COMMENTARY GENERATION

⚽ Event 1: Foul Committed - Moussa Sissoko
--------------------------------------------------
  🎨 Style: exciting (temp=1.0, seed=42)
    📝 Prompt: FOUL ACTION: Moussa Sissoko commits foul for Tottenham Hotsp...
    💬 Result: Free kick to Newcastle United!</td></tr>
<tr><td align="left">90'</td><tdalign=" left!

  🎨 Style: dramatic (temp=0.9, seed=123)
    📝 Prompt: CONTROVERSIAL MOMENT: Moussa Sissoko fouls for Tottenham Hot...
    💬 Result: Arsenal manager Arsene Wenger is sent to the stands by referee Chris Foy (left). But he returns and tries in vain!

  🎨 Style: analytical (temp=0.8, seed=456)
    📝 Prompt: FOUL ANALYSIS: Moussa Sissoko commits foul for Tottenham Hot...
    💬 Result: Was it a result of misjudgment, lack of control or an aggressive intent? Consider his body positioning and timing.

[!


⚽ Event 2: Shot - Mohamed Salah
--------------------------------------------------
  🎨 Style: exciting (temp=0.95, seed=789)
    📝 Prompt: SHOT

**Evaluation**

In [29]:
# Initialize the basic evaluator (required for ultra-varied evaluation)
class CommentaryEvaluator:
    def __init__(self):
        # Ground truth examples for comparison
        self.ground_truth_examples = {
            'Goal': [
                "GOAL! What a magnificent strike! The crowd erupts as the ball finds the back of the net!",
                "He scores! A moment of pure magic that will be remembered for years to come!",
                "INTO THE NET! The keeper had no chance with that perfectly placed shot!"
            ],
            'Shot': [
                "Great effort! The keeper makes a fantastic save to deny the striker!",
                "So close! The ball whistles past the post as the crowd holds its breath!",
                "Blocked! The defense stands firm and clears the danger!"
            ],
            'Card': [
                "Yellow card! The referee reaches into his pocket for that reckless challenge!",
                "The official has no choice but to show the card for that foul!",
                "A booking for the defender after that cynical challenge!"
            ]
        }

        self.excitement_keywords = {
            'high_energy': ['goal', 'scores', 'amazing', 'incredible', 'fantastic', 'spectacular', 'brilliant', 'magnificent'],
            'crowd_reaction': ['crowd', 'erupts', 'cheers', 'roars', 'applause', 'celebrates', 'fans'],
            'dramatic': ['drama', 'tension', 'thriller', 'edge', 'breathtaking', 'stunning', 'shocking'],
            'technical': ['strike', 'shot', 'pass', 'save', 'tackle', 'header', 'volley', 'cross']
        }

    def calculate_excitement_score(self, text):
        """Calculate excitement level based on keywords and punctuation"""
        text_lower = text.lower()
        score = 0

        # Check for excitement keywords
        for category, keywords in self.excitement_keywords.items():
            for keyword in keywords:
                if keyword in text_lower:
                    if category == 'high_energy':
                        score += 2
                    elif category == 'crowd_reaction':
                        score += 1.5
                    elif category == 'dramatic':
                        score += 1.5
                    else:
                        score += 1

        # Punctuation excitement
        exclamation_count = text.count('!')
        score += exclamation_count * 0.5

        # All caps words (excitement)
        caps_words = len([word for word in text.split() if word.isupper() and len(word) > 1])
        score += caps_words * 0.3

        return min(score, 10)

    def calculate_cqr_score(self, text, event_type):
        """Calculate Comment Quality Rating (1-5 scale)"""
        score = 1  # Base score

        # Clarity (grammar and structure)
        if len(text.split()) >= 5:
            score += 0.5
        if text.endswith('.') or text.endswith('!'):
            score += 0.3
        if not re.search(r'\b(error|failed|unknown)\b', text.lower()):
            score += 0.2

        # Emotion and engagement
        excitement = self.calculate_excitement_score(text)
        if excitement >= 3:
            score += 1
        elif excitement >= 1:
            score += 0.5

        # Event appropriateness
        event_keywords = {
            'Goal': ['goal', 'scores', 'net', 'back'],
            'Shot': ['shot', 'attempt', 'effort', 'strike'],
            'Card': ['card', 'yellow', 'red', 'referee', 'booking']
        }

        if event_type in event_keywords:
            text_lower = text.lower()
            if any(keyword in text_lower for keyword in event_keywords[event_type]):
                score += 1

        return min(max(score, 1), 5)

    def calculate_bleu_score(self, generated_text, reference_texts):
        """Calculate BLEU score against reference texts"""
        gen_words = set(generated_text.lower().split())
        ref_words = set()
        for ref in reference_texts:
            ref_words.update(ref.lower().split())

        if len(gen_words) == 0:
            return 0.0

        overlap = len(gen_words.intersection(ref_words))
        return overlap / len(gen_words)

    def evaluate_commentary_set(self, results):
        """Comprehensive evaluation of commentary results"""
        if not results:
            return {}

        evaluation_results = {
            'individual_scores': [],
            'aggregate_metrics': {}
        }

        commentaries = [result['commentary'] for result in results]

        # Individual commentary evaluation
        for result in results:
            text = result['commentary']
            event_type = result.get('event_type', 'Unknown')

            individual_score = {
                'commentary': text,
                'event_type': event_type,
                'player': result.get('player', 'Unknown'),
                'style': result.get('style', 'Unknown'),
                'bleu_score': self.calculate_bleu_score(text, self.ground_truth_examples.get(event_type, [])),
                'excitement_score': self.calculate_excitement_score(text),
                'cqr_score': self.calculate_cqr_score(text, event_type),
                'length': len(text.split())
            }

            evaluation_results['individual_scores'].append(individual_score)

        # Aggregate metrics
        individual_scores = evaluation_results['individual_scores']

        evaluation_results['aggregate_metrics'] = {
            'avg_bleu_score': np.mean([s['bleu_score'] for s in individual_scores]),
            'avg_excitement_score': np.mean([s['excitement_score'] for s in individual_scores]),
            'avg_cqr_score': np.mean([s['cqr_score'] for s in individual_scores]),
            'avg_length': np.mean([s['length'] for s in individual_scores]),
            'total_commentaries': len(commentaries)
        }

        return evaluation_results

# Initialize both evaluators
evaluator = CommentaryEvaluator()
ultra_evaluator = UltraVariedEvaluator()
print("✅ Both evaluators initialized!")

✅ Both evaluators initialized!


In [30]:
def run_ultra_varied_evaluation(results):
    """Comprehensive evaluation specifically for ultra-varied commentary"""
    print("🔬 ULTRA-VARIED COMMENTARY EVALUATION")
    print("=" * 50)

    if not results:
        print("❌ No results to evaluate")
        return None

    # Basic metrics first
    basic_evaluation = evaluator.evaluate_commentary_set(results)
    commentaries = [result['commentary'] for result in results]

    # Ultra-specific metrics
    ultra_metrics = {}

    # 1. Variety Score (most important for this system)
    ultra_metrics['variety_score'] = ultra_evaluator.calculate_variety_score(commentaries)

    # 2. Enhanced excitement per event type
    excitement_by_type = {}
    for result in results:
        event_type = result['event_type']
        if event_type not in excitement_by_type:
            excitement_by_type[event_type] = []

        enhanced_excitement = ultra_evaluator.calculate_enhanced_excitement(
            result['commentary'],
            event_type
        )
        excitement_by_type[event_type].append(enhanced_excitement)

    # Average excitement by event type
    for event_type, scores in excitement_by_type.items():
        ultra_metrics[f'avg_excitement_{event_type.lower().replace(" ", "_")}'] = np.mean(scores)

    ultra_metrics['overall_enhanced_excitement'] = np.mean([
        score for scores in excitement_by_type.values() for score in scores
    ])

    # 3. Style consistency (how different are the styles?)
    ultra_metrics['style_differentiation'] = ultra_evaluator.calculate_style_consistency(results)

    # 4. Repetition analysis
    unique_commentaries = len(set(commentaries))
    total_commentaries = len(commentaries)
    ultra_metrics['uniqueness_ratio'] = unique_commentaries / total_commentaries if total_commentaries > 0 else 0

    # 5. Professional quality assessment
    professional_score = 0
    for commentary in commentaries:
        # Check for professional phrases
        text_lower = commentary.lower()
        for category, phrases in ultra_evaluator.professional_phrases.items():
            for phrase in phrases:
                if phrase in text_lower:
                    professional_score += 1
                    break

    ultra_metrics['professional_phrase_usage'] = professional_score / len(commentaries) if commentaries else 0

    # Display results
    basic_metrics = basic_evaluation['aggregate_metrics']

    print(f"\n📊 CORE METRICS:")
    print("-" * 25)
    print(f"🎯 CQR Score: {basic_metrics['avg_cqr_score']:.2f}/5.0")
    print(f"📝 BLEU Score: {basic_metrics['avg_bleu_score']:.3f}")
    print(f"📏 Avg Length: {basic_metrics['avg_length']:.1f} words")

    print(f"\n🌟 VARIETY ANALYSIS:")
    print("-" * 25)
    print(f"🎨 Variety Score: {ultra_metrics['variety_score']:.3f}/1.0")
    print(f"🔄 Uniqueness Ratio: {ultra_metrics['uniqueness_ratio']:.3f} ({unique_commentaries}/{total_commentaries})")
    print(f"⚡ Style Differentiation: {ultra_metrics['style_differentiation']:.3f}/1.0")
    print(f"👔 Professional Phrases: {ultra_metrics['professional_phrase_usage']:.3f}")

    print(f"\n🎉 EXCITEMENT BY EVENT TYPE:")
    print("-" * 30)
    for event_type, scores in excitement_by_type.items():
        avg_score = np.mean(scores)
        print(f"⚽ {event_type}: {avg_score:.2f}/10.0")

    print(f"🚀 Overall Enhanced Excitement: {ultra_metrics['overall_enhanced_excitement']:.2f}/10.0")

    # Performance assessment
    print(f"\n📋 PERFORMANCE ASSESSMENT:")
    print("-" * 30)

    # Variety assessment
    if ultra_metrics['variety_score'] >= 0.8:
        print("✅ VARIETY: Excellent - High diversity achieved!")
    elif ultra_metrics['variety_score'] >= 0.6:
        print("🟡 VARIETY: Good - Decent diversity")
    elif ultra_metrics['variety_score'] >= 0.4:
        print("🟠 VARIETY: Fair - Some repetition present")
    else:
        print("🔴 VARIETY: Poor - High repetition")

    # Uniqueness assessment
    if ultra_metrics['uniqueness_ratio'] == 1.0:
        print("✅ UNIQUENESS: Perfect - All commentaries unique!")
    elif ultra_metrics['uniqueness_ratio'] >= 0.9:
        print("🟡 UNIQUENESS: Excellent - Minimal repetition")
    elif ultra_metrics['uniqueness_ratio'] >= 0.7:
        print("🟠 UNIQUENESS: Good - Some duplicates")
    else:
        print("🔴 UNIQUENESS: Poor - Many duplicates")

    # Style differentiation
    if ultra_metrics['style_differentiation'] >= 0.3:
        print("✅ STYLES: Well differentiated - Clear style differences")
    elif ultra_metrics['style_differentiation'] >= 0.2:
        print("🟡 STYLES: Moderately differentiated")
    else:
        print("🔴 STYLES: Poorly differentiated - Styles too similar")

    return {
        'basic_metrics': basic_metrics,
        'ultra_metrics': ultra_metrics,
        'excitement_by_type': excitement_by_type,
        'individual_scores': basic_evaluation['individual_scores']
    }

# Run ultra-varied evaluation
if 'ultra_results' in locals() and ultra_results:
    ultra_evaluation = run_ultra_varied_evaluation(ultra_results)
else:
    print("❌ No ultra_results found. Please run the ultra-varied generation first.")

🔬 ULTRA-VARIED COMMENTARY EVALUATION

📊 CORE METRICS:
-------------------------
🎯 CQR Score: 2.28/5.0
📝 BLEU Score: 0.058
📏 Avg Length: 18.8 words

🌟 VARIETY ANALYSIS:
-------------------------
🎨 Variety Score: 0.727/1.0
🔄 Uniqueness Ratio: 1.000 (9/9)
⚡ Style Differentiation: 0.930/1.0
👔 Professional Phrases: 0.000

🎉 EXCITEMENT BY EVENT TYPE:
------------------------------
⚽ Foul Committed: 1.40/10.0
⚽ Shot: 2.20/10.0
🚀 Overall Enhanced Excitement: 1.67/10.0

📋 PERFORMANCE ASSESSMENT:
------------------------------
🟡 VARIETY: Good - Decent diversity
✅ UNIQUENESS: Perfect - All commentaries unique!
✅ STYLES: Well differentiated - Clear style differences


In [31]:
def analyze_ultra_varied_performance(evaluation_results):
    """Detailed analysis of ultra-varied commentary performance"""
    print("🔍 DETAILED ULTRA-VARIED ANALYSIS")
    print("=" * 45)

    if not evaluation_results:
        print("❌ No evaluation results to analyze")
        return

    ultra_metrics = evaluation_results['ultra_metrics']
    individual_scores = evaluation_results['individual_scores']

    # Best and worst performers
    best_performer = max(individual_scores, key=lambda x: x['cqr_score'] + x['excitement_score']/10)
    worst_performer = min(individual_scores, key=lambda x: x['cqr_score'] + x['excitement_score']/10)

    print(f"\n🏆 BEST PERFORMER:")
    print(f"   Event: {best_performer['event_type']} - {best_performer['player']}")
    print(f"   Style: {best_performer['style']}")
    print(f"   CQR: {best_performer['cqr_score']:.2f} | Excitement: {best_performer['excitement_score']:.2f}")
    print(f"   Commentary: {best_performer['commentary']}")

    print(f"\n⚠️ NEEDS IMPROVEMENT:")
    print(f"   Event: {worst_performer['event_type']} - {worst_performer['player']}")
    print(f"   Style: {worst_performer['style']}")
    print(f"   CQR: {worst_performer['cqr_score']:.2f} | Excitement: {worst_performer['excitement_score']:.2f}")
    print(f"   Commentary: {worst_performer['commentary']}")

    # Style analysis
    print(f"\n🎨 STYLE PERFORMANCE ANALYSIS:")
    print("-" * 35)

    style_performance = {}
    for score in individual_scores:
        style = score['style']
        if style not in style_performance:
            style_performance[style] = {
                'cqr_scores': [],
                'excitement_scores': [],
                'commentaries': []
            }

        style_performance[style]['cqr_scores'].append(score['cqr_score'])
        style_performance[style]['excitement_scores'].append(score['excitement_score'])
        style_performance[style]['commentaries'].append(score['commentary'])

    for style, data in style_performance.items():
        avg_cqr = np.mean(data['cqr_scores'])
        avg_excitement = np.mean(data['excitement_scores'])

        print(f"📊 {style.upper()}:")
        print(f"   Quality: {avg_cqr:.2f}/5.0")
        print(f"   Excitement: {avg_excitement:.2f}/10.0")
        print(f"   Sample: {data['commentaries'][0][:60]}...")
        print()

    # Improvement recommendations
    print(f"\n💡 SPECIFIC IMPROVEMENTS:")
    print("-" * 30)

    if ultra_metrics['variety_score'] < 0.7:
        print("🎨 BOOST VARIETY:")
        print("   • Increase temperature to 1.0+")
        print("   • Use more diverse seed values")
        print("   • Add more prompt variations")

    if ultra_metrics['uniqueness_ratio'] < 0.9:
        print("🔄 REDUCE REPETITION:")
        print("   • Increase repetition_penalty to 1.4+")
        print("   • Use no_repeat_ngram_size=4")
        print("   • Create more specific prompts")

    if ultra_metrics['style_differentiation'] < 0.3:
        print("🎭 IMPROVE STYLE DIFFERENCES:")
        print("   • Make style prompts more distinct")
        print("   • Use different vocabulary sets per style")
        print("   • Adjust temperature per style")

    if ultra_metrics['professional_phrase_usage'] < 0.5:
        print("👔 INCREASE PROFESSIONALISM:")
        print("   • Add more professional phrases to prompts")
        print("   • Study real commentator vocabulary")
        print("   • Include crowd reaction elements")

    # Overall system rating
    overall_score = (
        ultra_metrics['variety_score'] * 30 +
        ultra_metrics['uniqueness_ratio'] * 25 +
        ultra_metrics['style_differentiation'] * 20 +
        (ultra_metrics['overall_enhanced_excitement'] / 10) * 15 +
        ultra_metrics['professional_phrase_usage'] * 10
    )

    print(f"\n🏆 OVERALL ULTRA-VARIED SYSTEM SCORE: {overall_score:.1f}/100")

    if overall_score >= 80:
        print("🌟 EXCELLENT - System produces highly varied, quality commentary!")
    elif overall_score >= 65:
        print("✅ GOOD - System shows strong variety with room for improvement")
    elif overall_score >= 50:
        print("🟡 FAIR - System has basic variety but needs optimization")
    else:
        print("🔴 POOR - System needs significant improvements")

# Run detailed analysis
if 'ultra_evaluation' in locals() and ultra_evaluation:
    analyze_ultra_varied_performance(ultra_evaluation)

🔍 DETAILED ULTRA-VARIED ANALYSIS

🏆 BEST PERFORMER:
   Event: Shot - Mohamed Salah
   Style: analytical
   CQR: 3.50 | Excitement: 2.40
   Commentary: body position during approach and shot execution.
3) VIDEO ANNOTATION REQUESTS:
a. Provide detailed commentary on a specific!

⚠️ NEEDS IMPROVEMENT:
   Event: Foul Committed - Moussa Sissoko
   Style: dramatic
   CQR: 2.00 | Excitement: 0.50
   Commentary: Arsenal manager Arsene Wenger is sent to the stands by referee Chris Foy (left). But he returns and tries in vain!

🎨 STYLE PERFORMANCE ANALYSIS:
-----------------------------------
📊 EXCITING:
   Quality: 2.17/5.0
   Excitement: 0.67/10.0
   Sample: Free kick to Newcastle United!</td></tr>
<tr><td align="left...

📊 DRAMATIC:
   Quality: 2.17/5.0
   Excitement: 1.00/10.0
   Sample: Arsenal manager Arsene Wenger is sent to the stands by refer...

📊 ANALYTICAL:
   Quality: 2.50/5.0
   Excitement: 1.13/10.0
   Sample: Was it a result of misjudgment, lack of control or an aggres...


💡 SPE

**Basic Full Match Commentary with Templates**

In [43]:
class FixedDetailedCommentaryGenerator:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

    def create_simple_detailed_prompt(self, event, context, match_state):
        """Create very simple but specific prompts to avoid repetition"""

        event_type = event['event_type']
        player = event['player']
        team = event['team']
        minute = event['minute']

        # Create completely different prompt structures for each event type
        if event_type == 'Pass':
            prompt = f"{player} passes the ball for {team} in minute {minute}. Describe this pass:"

        elif event_type == 'Foul Committed':
            prompt = f"{player} commits a foul for {team} in minute {minute}. Commentate on this foul:"

        elif event_type == 'Shot':
            outcome = event['details'].get('outcome', 'taken')
            prompt = f"{player} shoots for {team} in minute {minute}, outcome {outcome}. Describe this shot:"

        elif event_type == 'Goal':
            prompt = f"GOAL! {player} scores for {team} in minute {minute}! Celebrate this goal:"

        elif event_type == 'Card':
            card = event['details'].get('card_type', 'yellow card')
            prompt = f"{player} gets {card} for {team} in minute {minute}. Explain this booking:"

        else:
            prompt = f"{player} involved in {event_type} for {team} in minute {minute}. Commentary:"

        return prompt

    def generate_varied_commentary(self, prompt, temperature=1.0, seed=None):
        """Generate varied commentary with different seeds"""
        try:
            # Set different random seed for each generation
            if seed:
                torch.manual_seed(seed)
                random.seed(seed)

            # Very short prompt for focus
            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=150,  # Much shorter
                truncation=True
            )
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            # High temperature for maximum variety
            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=50,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.7,  # More focused
                    top_k=25,   # More focused
                    repetition_penalty=1.5,  # Strong anti-repetition
                    pad_token_id=self.tokenizer.eos_token_id,
                    no_repeat_ngram_size=4,  # Prevent 4-word repetitions
                    do_early_stopping=True
                )

            # Decode and clean
            new_tokens = outputs[0][inputs['input_ids'].shape[1]:]
            commentary = self.tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

            # Clean output
            commentary = commentary.replace(prompt, '').strip()
            commentary = re.sub(r'^[:\-\s]+', '', commentary)

            # Take first sentence
            sentences = re.split(r'[.!?]+', commentary)
            if sentences:
                commentary = sentences[0].strip()
                if commentary and not commentary.endswith(('!', '.', '?')):
                    commentary += '!'

            return commentary if len(commentary) > 10 else None

        except Exception as e:
            return None

# Initialize fixed generator
fixed_generator = FixedDetailedCommentaryGenerator(model, tokenizer)
print("✅ Fixed detailed generator initialized!")

✅ Fixed detailed generator initialized!


In [45]:
def generate_template_detailed_commentary(events, context, max_events=20):
    """Generate detailed commentary using templates to guarantee variety"""
    print("📝 GENERATING TEMPLATE-BASED DETAILED COMMENTARY")
    print("=" * 60)

    if not events:
        print("❌ No events to process")
        return []

    selected_events = events[:max_events]
    print(f"📊 Processing {len(selected_events)} events with guaranteed variety")

    # Detailed templates for each event type
    detailed_templates = {
        'Pass': [
            "{player} sprays the ball wide for {team}, looking to switch the point of attack and stretch the opposition defense with this clever piece of distribution.",
            "{player} picks out a teammate with a precise pass for {team}, demonstrating the technical quality that has made him such a key player in this system.",
            "Neat passing play from {player} as {team} look to build from the back, maintaining possession with patience and precision in these early exchanges.",
            "{player} finds space and releases the ball quickly for {team}, keeping the tempo high as they look to exploit any gaps in the defensive structure."
        ],

        'Foul Committed': [
            "{player} goes in hard on his opponent and the referee immediately stops play! A robust challenge from the {team} player that crossed the line into foul territory.",
            "The whistle goes as {player} commits a foul for {team}! The intensity is building here and the official needs to keep control of this increasingly physical encounter.",
            "{player} brings down his man and concedes a free kick! A tactical foul from the {team} player to stop what looked like a promising attacking move developing.",
            "Challenge from {player} and it's a foul! The {team} defender was late to the tackle and the referee was perfectly positioned to make that call."
        ],

        'Shot': [
            "{player} lines up the shot and lets fly for {team}! The ball arrows towards goal with real venom but the keeper is equal to the challenge with a fine save.",
            "Space opens up for {player} and he doesn't need a second invitation! A crisp strike from the {team} forward that tests the goalkeeper's reflexes to the full.",
            "{player} goes for goal from distance! It's a speculative effort from the {team} midfielder but sometimes you have to try your luck from range in tight games like this.",
            "What an effort from {player}! The {team} attacker connects sweetly with the ball and sends it goalward, drawing a spectacular save from between the posts."
        ],

        'Goal': [
            "GOAL! {player} finds the back of the net for {team}! What a moment of quality from the striker, who showed ice-cool composure to slot home when it mattered most!",
            "INTO THE NET! {player} scores for {team}! The crowd erupts as the deadlock is broken with a clinical finish that showcases exactly why he's so highly rated!",
            "BRILLIANT GOAL! {player} converts with aplomb for {team}! A moment of magic from the forward, who picked his spot perfectly and gave the keeper no chance whatsoever!",
            "WHAT A FINISH! {player} makes it count for {team}! The striker's movement was sublime and the execution even better as he sends the supporters into raptures!"
        ],

        'Card': [
            "Yellow card for {player}! The {team} player can have no complaints about that decision as the referee takes decisive action to maintain control of this feisty encounter.",
            "The referee reaches into his pocket and shows {player} the yellow card! A necessary booking for the {team} player after that reckless challenge went unpunished.",
            "{player} goes into the book for {team}! The official had no choice but to take action there as the defender's challenge was clearly late and potentially dangerous.",
            "Caution for {player}! The {team} player is shown yellow by the referee, who is keeping a close eye on the physical nature of this increasingly competitive fixture."
        ],

        'Substitution': [
            "Tactical change for {team} as {player} makes way for fresh legs! The manager is looking to inject some new energy into his side's play with this strategic substitution.",
            "Here comes a change for {team}! {player} heads to the bench having put in a solid shift, with the coach bringing on a replacement to alter the dynamic.",
            "Substitution for {team} as {player} is replaced! The player can be pleased with his contribution before making way for someone who might offer something different.",
            "The manager makes his move! {player} comes off for {team} in what looks like a tactical switch designed to give his side fresh impetus in the final stages."
        ]
    }

    match_state = {'score': [0, 0], 'total_goals': 0}
    template_results = []

    # Detailed introduction
    intro = f"""Welcome to this captivating {context['competition']} encounter between {context['home_team']} and {context['away_team']}!
    Both sides come into this fixture with high hopes and expectations, and judging by the electric atmosphere generated by the passionate supporters,
    we're in for a real treat tonight. The stage is set for what promises to be a memorable clash between two well-organized teams."""

    template_results.append({
        'minute': 0,
        'event_type': 'Match Start',
        'player': 'Commentator',
        'team': 'Broadcast',
        'commentary': intro,
        'score': '0-0',
        'word_count': len(intro.split())
    })

    print(f"\n🎙️ {intro}\n")
    print("="*80)

    # Generate template-based commentary
    for i, event in enumerate(selected_events):
        minute = event['minute']
        event_type = event['event_type']
        player = event['player']
        team = event['team']

        print(f"\nEvent {i+1}/{len(selected_events)}")
        print(f"⏱️ MINUTE {minute} - {event_type}")
        print(f"👤 {player} ({team})")

        # Update score
        if event_type == 'Goal':
            if team == context['home_team']:
                match_state['score'][0] += 1
            else:
                match_state['score'][1] += 1
            match_state['total_goals'] += 1

        # Get template for this event type
        if event_type in detailed_templates:
            templates = detailed_templates[event_type]
            # Use different template each time (cycle through them)
            template = templates[i % len(templates)]
            commentary = template.format(player=player, team=team)
        else:
            # Fallback for unknown event types
            commentary = f"{player} is involved in the action for {team}, contributing to the flow of this absorbing encounter between these two competitive sides."

        current_score = f"{match_state['score'][0]}-{match_state['score'][1]}"
        word_count = len(commentary.split())

        print(f"📊 Score: {current_score}")
        print(f"📝 Words: {word_count}")
        print(f"🎙️ {commentary}")
        print("-" * 80)

        template_results.append({
            'minute': minute,
            'event_type': event_type,
            'player': player,
            'team': team,
            'commentary': commentary,
            'score': current_score,
            'word_count': word_count,
            'template_based': True
        })

    # Detailed conclusion
    final_score = f"{match_state['score'][0]}-{match_state['score'][1]}"
    conclusion = f"""FULL TIME! The referee brings this enthralling {context['competition']} encounter to a close with {context['home_team']}
    finishing {final_score} against {context['away_team']}! What a thoroughly entertaining match that was, showcasing the very best of competitive football.
    Both teams can be proud of their efforts tonight, and we thank you for joining us for this memorable sporting spectacle!"""

    template_results.append({
        'minute': 90,
        'event_type': 'Full Time',
        'player': 'Referee',
        'team': 'Officials',
        'commentary': conclusion,
        'score': final_score,
        'word_count': len(conclusion.split())
    })

    print(f"\n🏁 {conclusion}")

    # Statistics
    commentary_events = [r for r in template_results if r['event_type'] not in ['Match Start', 'Full Time']]
    avg_words = np.mean([r['word_count'] for r in commentary_events])
    unique_commentaries = len(set(r['commentary'] for r in commentary_events))

    print(f"\n✅ TEMPLATE-BASED COMMENTARY COMPLETE!")
    print(f"📊 Statistics:")
    print(f"   • Events: {len(commentary_events)}")
    print(f"   • Average words: {avg_words:.1f}")
    print(f"   • Unique commentaries: {unique_commentaries}/{len(commentary_events)} (100% guaranteed)")
    print(f"   • Total words: {sum(r['word_count'] for r in template_results)}")

    return template_results

# Generate template-based commentary
if 'extended_events' in locals() and 'extended_context' in locals():
    template_detailed_commentary = generate_template_detailed_commentary(
        extended_events,
        extended_context,
        max_events=20
    )
else:
    print("❌ No extended event data available")

📝 GENERATING TEMPLATE-BASED DETAILED COMMENTARY
📊 Processing 20 events with guaranteed variety

🎙️ Welcome to this captivating 1. Bundesliga encounter between Bayer Leverkusen and Wolfsburg! 
    Both sides come into this fixture with high hopes and expectations, and judging by the electric atmosphere generated by the passionate supporters, 
    we're in for a real treat tonight. The stage is set for what promises to be a memorable clash between two well-organized teams.


Event 1/20
⏱️ MINUTE 0 - Pass
👤 Maximilian Arnold (Wolfsburg)
📊 Score: 0-0
📝 Words: 26
🎙️ Maximilian Arnold sprays the ball wide for Wolfsburg, looking to switch the point of attack and stretch the opposition defense with this clever piece of distribution.
--------------------------------------------------------------------------------

Event 2/20
⏱️ MINUTE 0 - Pass
👤 Koen Casteels (Wolfsburg)
📊 Score: 0-0
📝 Words: 27
🎙️ Koen Casteels picks out a teammate with a precise pass for Wolfsburg, demonstrating the technical

In [46]:
class CommentaryAnalysisEngine:
    def __init__(self):
        # Analysis criteria
        self.quality_indicators = {
            'professional_phrases': [
                'back of the net', 'keeper had no chance', 'moment of magic', 'clinical finish',
                'what a strike', 'pure class', 'ice-cool composure', 'thunderous effort',
                'spectacular save', 'arrow towards goal', 'makes no mistake', 'finds the target'
            ],
            'descriptive_words': [
                'magnificent', 'spectacular', 'brilliant', 'sublime', 'clinical', 'precise',
                'thunderous', 'venomous', 'crisp', 'delightful', 'exquisite', 'masterful'
            ],
            'atmosphere_words': [
                'crowd', 'erupts', 'supporters', 'atmosphere', 'electric', 'passionate',
                'raptures', 'celebration', 'roars', 'applause', 'cheer', 'excitement'
            ],
            'tactical_terms': [
                'tactical', 'strategic', 'formation', 'defensive', 'attacking', 'pressure',
                'possession', 'counter-attack', 'build-up', 'distribution', 'structure'
            ]
        }

        # Readability metrics
        self.readability_factors = {
            'sentence_variety': True,
            'word_complexity': True,
            'flow_indicators': ['meanwhile', 'however', 'suddenly', 'immediately', 'as', 'while']
        }

    def analyze_vocabulary_richness(self, commentaries):
        """Analyze the richness and variety of vocabulary used"""
        all_words = []
        for commentary in commentaries:
            words = commentary.lower().replace('.', '').replace('!', '').replace(',', '').split()
            all_words.extend(words)

        if not all_words:
            return {'error': 'No words to analyze'}

        unique_words = len(set(all_words))
        total_words = len(all_words)

        # Type-Token Ratio (vocabulary diversity)
        ttr = unique_words / total_words if total_words > 0 else 0

        # Average word length (sophistication indicator)
        avg_word_length = sum(len(word) for word in all_words) / len(all_words)

        # Professional vocabulary usage
        professional_count = 0
        for phrase in self.quality_indicators['professional_phrases']:
            for commentary in commentaries:
                if phrase in commentary.lower():
                    professional_count += 1
                    break

        professional_ratio = professional_count / len(self.quality_indicators['professional_phrases'])

        return {
            'total_words': total_words,
            'unique_words': unique_words,
            'vocabulary_diversity': ttr,
            'avg_word_length': avg_word_length,
            'professional_vocabulary_ratio': professional_ratio,
            'sophistication_score': min((avg_word_length - 3) / 3, 1.0)  # Normalized score
        }

    def analyze_content_quality(self, results):
        """Analyze the content quality of commentary"""
        if not results:
            return {'error': 'No results to analyze'}

        # Filter out intro/outro
        commentary_events = [r for r in results if r['event_type'] not in ['Match Start', 'Full Time']]
        commentaries = [r['commentary'] for r in commentary_events]

        # Length analysis
        word_counts = [r.get('word_count', len(r['commentary'].split())) for r in commentary_events]

        # Event-specific analysis
        event_coverage = {}
        for result in commentary_events:
            event_type = result['event_type']
            if event_type not in event_coverage:
                event_coverage[event_type] = {
                    'count': 0,
                    'avg_words': 0,
                    'commentaries': []
                }
            event_coverage[event_type]['count'] += 1
            event_coverage[event_type]['commentaries'].append(result['commentary'])

        # Calculate averages for each event type
        for event_type, data in event_coverage.items():
            words_per_event = [len(c.split()) for c in data['commentaries']]
            data['avg_words'] = np.mean(words_per_event) if words_per_event else 0

        # Quality indicators count
        quality_scores = {
            'professional_phrases': 0,
            'descriptive_words': 0,
            'atmosphere_words': 0,
            'tactical_terms': 0
        }

        for commentary in commentaries:
            text_lower = commentary.lower()
            for category, indicators in self.quality_indicators.items():
                for indicator in indicators:
                    if indicator in text_lower:
                        quality_scores[category] += 1
                        break  # Count each category once per commentary

        # Normalize quality scores
        total_commentaries = len(commentaries)
        for category in quality_scores:
            quality_scores[category] = quality_scores[category] / total_commentaries if total_commentaries > 0 else 0

        return {
            'total_events': len(commentary_events),
            'word_count_stats': {
                'min': min(word_counts) if word_counts else 0,
                'max': max(word_counts) if word_counts else 0,
                'avg': np.mean(word_counts) if word_counts else 0,
                'std': np.std(word_counts) if word_counts else 0
            },
            'event_coverage': event_coverage,
            'quality_indicators': quality_scores
        }

    def analyze_narrative_flow(self, results):
        """Analyze the narrative flow and coherence"""
        commentary_events = [r for r in results if r['event_type'] not in ['Match Start', 'Full Time']]

        if len(commentary_events) < 2:
            return {'error': 'Not enough events for flow analysis'}

        # Temporal coherence
        minutes = [r['minute'] for r in commentary_events]
        temporal_gaps = []
        for i in range(1, len(minutes)):
            gap = minutes[i] - minutes[i-1]
            temporal_gaps.append(gap)

        avg_temporal_gap = np.mean(temporal_gaps) if temporal_gaps else 0

        # Repetition analysis
        commentaries = [r['commentary'] for r in commentary_events]

        # Check for repeated phrases (3+ words)
        phrase_counts = {}
        for commentary in commentaries:
            words = commentary.split()
            for i in range(len(words) - 2):
                phrase = ' '.join(words[i:i+3]).lower()
                phrase_counts[phrase] = phrase_counts.get(phrase, 0) + 1

        repeated_phrases = {phrase: count for phrase, count in phrase_counts.items() if count > 1}
        repetition_ratio = len(repeated_phrases) / len(phrase_counts) if phrase_counts else 0

        # Sentence structure variety
        structures = {
            'exclamatory': 0,  # Contains !
            'questioning': 0,  # Contains ?
            'declarative': 0,  # Ends with .
            'incomplete': 0    # No proper ending
        }

        for commentary in commentaries:
            if '!' in commentary:
                structures['exclamatory'] += 1
            elif '?' in commentary:
                structures['questioning'] += 1
            elif commentary.endswith('.'):
                structures['declarative'] += 1
            else:
                structures['incomplete'] += 1

        structure_variety = len([s for s in structures.values() if s > 0]) / len(structures)

        return {
            'temporal_coherence': {
                'avg_minute_gap': avg_temporal_gap,
                'time_span': max(minutes) - min(minutes) if minutes else 0,
                'coverage_ratio': (max(minutes) - min(minutes)) / 90 if minutes else 0
            },
            'repetition_analysis': {
                'repeated_phrases_count': len(repeated_phrases),
                'repetition_ratio': repetition_ratio,
                'most_repeated': max(repeated_phrases.items(), key=lambda x: x[1]) if repeated_phrases else None
            },
            'sentence_structure': structures,
            'structure_variety_score': structure_variety
        }

    def generate_improvement_recommendations(self, vocab_analysis, content_analysis, flow_analysis):
        """Generate specific improvement recommendations"""
        recommendations = []

        # Vocabulary recommendations
        if vocab_analysis.get('vocabulary_diversity', 0) < 0.6:
            recommendations.append({
                'category': 'Vocabulary',
                'issue': 'Low vocabulary diversity',
                'recommendation': 'Increase word variety by using synonyms and varied descriptors',
                'priority': 'High'
            })

        if vocab_analysis.get('professional_vocabulary_ratio', 0) < 0.3:
            recommendations.append({
                'category': 'Professional Quality',
                'issue': 'Limited professional phrases',
                'recommendation': 'Include more broadcast-standard phrases like "back of the net", "clinical finish"',
                'priority': 'Medium'
            })

        # Content recommendations
        if content_analysis.get('word_count_stats', {}).get('avg', 0) < 25:
            recommendations.append({
                'category': 'Content Length',
                'issue': 'Commentary too brief',
                'recommendation': 'Expand commentary to 30-50 words for more engaging content',
                'priority': 'High'
            })

        quality_indicators = content_analysis.get('quality_indicators', {})
        if quality_indicators.get('atmosphere_words', 0) < 0.2:
            recommendations.append({
                'category': 'Atmosphere',
                'issue': 'Lacks crowd and atmosphere descriptions',
                'recommendation': 'Include more crowd reactions and stadium atmosphere',
                'priority': 'Medium'
            })

        # Flow recommendations
        flow_data = flow_analysis.get('repetition_analysis', {})
        if flow_data.get('repetition_ratio', 0) > 0.3:
            recommendations.append({
                'category': 'Variety',
                'issue': 'High repetition in phrases',
                'recommendation': 'Use more varied sentence structures and avoid repeated phrases',
                'priority': 'High'
            })

        structure_variety = flow_analysis.get('structure_variety_score', 0)
        if structure_variety < 0.5:
            recommendations.append({
                'category': 'Sentence Structure',
                'issue': 'Limited sentence variety',
                'recommendation': 'Mix exclamatory, declarative, and questioning sentences',
                'priority': 'Medium'
            })

        return recommendations

# Initialize analysis engine
analysis_engine = CommentaryAnalysisEngine()
print("✅ Commentary analysis engine initialized!")

✅ Commentary analysis engine initialized!


In [50]:
# Export the template-based results
if 'template_detailed_commentary' in locals():
    filename = "template_detailed_commentary.txt"

    with open(filename, 'w') as f:
        f.write("🏟️ TEMPLATE-BASED DETAILED COMMENTARY\n")
        f.write("=" * 70 + "\n")
        f.write(f"Match: {extended_context['home_team']} vs {extended_context['away_team']}\n")
        f.write(f"Competition: {extended_context['competition']}\n")
        f.write(f"Style: Professional Template-Based\n")
        f.write(f"Variety: 100% Guaranteed Unique\n")
        f.write("=" * 70 + "\n\n")

        for result in template_detailed_commentary:
            if result['event_type'] in ['Match Start', 'Full Time']:
                f.write(f"\n{'='*60}\n")
                f.write(f"{result['commentary']}\n")
                f.write(f"{'='*60}\n\n")
            else:
                f.write(f"MINUTE {result['minute']} - {result['event_type'].upper()}\n")
                f.write(f"Player: {result['player']} ({result['team']})\n")
                f.write(f"Score: {result['score']} | Words: {result['word_count']}\n")
                f.write(f"\n🎙️ {result['commentary']}\n")
                f.write("=" * 70 + "\n\n")

    print(f"✅ Template-based commentary exported to: {filename}")
    print("🎉 GUARANTEED VARIETY - Every commentary is unique!")

✅ Template-based commentary exported to: template_detailed_commentary.txt
🎉 GUARANTEED VARIETY - Every commentary is unique!


In [53]:
from google.colab import files
files.download('template_detailed_commentary.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**Enhanced Full Match Commentary**

In [69]:
import numpy as np
import random
from collections import defaultdict, Counter
import json
from pathlib import Path

# INTEGRATED MATCH LOADER AND COMMENTARY SYSTEM

def load_optimal_match_with_rich_events():
    """Load a match with maximum event variety and coverage"""
    print("🔍 SEARCHING FOR OPTIMAL MATCH WITH RICH EVENT COVERAGE")
    print("=" * 60)

    data_path = Path("open-data/data")

    # Ensure competitions file exists
    competitions_file = data_path / "competitions.json"
    if not competitions_file.exists():
        print(f"❌ Competitions file not found: {competitions_file}")
        return None, None

    with open(competitions_file) as f:
        competitions = json.load(f)

    print(f"📋 Found {len(competitions)} competitions to search")

    # Expanded event types for richer commentary
    priority_events = {
        # High priority - most interesting for commentary
        'Goal': 10,
        'Shot': 8,
        'Card': 7,
        'Substitution': 6,
        'Foul Committed': 5,

        # Medium priority - good supporting events
        'Offside': 4,
        'Duel': 3,
        'Interception': 3,
        'Ball Recovery': 3,
        'Clearance': 3,
        'Block': 3,
        'Tackle': 3,

        # Lower priority but still valuable
        'Pass': 2,
        'Carry': 1,
        'Pressure': 1,
        'Ball Receipt': 1
    }

    all_event_types = list(priority_events.keys())
    best_matches = []  # Store multiple good matches

    # Search through competitions more comprehensively
    for comp_idx, comp in enumerate(competitions[:15]):  # Check more competitions
        try:
            matches_file = data_path / f"matches/{comp['competition_id']}/{comp['season_id']}.json"
            if not matches_file.exists():
                print(f"⚠️  Matches file not found for {comp['competition_name']}")
                continue

            with open(matches_file) as f:
                matches = json.load(f)

            print(f"\n📊 Competition {comp_idx+1}: {comp['competition_name']} ({comp['season_name']})")
            print(f"   Available matches: {len(matches)}")

            # Check more matches per competition
            for match_idx, match in enumerate(matches[:20]):  # Check more matches
                try:
                    events_file = data_path / f"events/{match['match_id']}.json"
                    if not events_file.exists():
                        continue

                    with open(events_file) as f:
                        events = json.load(f)

                    # Analyze match comprehensively
                    match_analysis = analyze_match_quality(events, all_event_types, priority_events)

                    if match_analysis['quality_score'] > 0:
                        match_data = {
                            'match': match,
                            'competition': comp,
                            'events': match_analysis['processed_events'],
                            'analysis': match_analysis,
                            'context': {
                                'home_team': match['home_team']['home_team_name'],
                                'away_team': match['away_team']['away_team_name'],
                                'competition': comp['competition_name'],
                                'season': comp['season_name'],
                                'match_id': match['match_id'],
                                'match_date': match.get('match_date', 'Unknown')
                            }
                        }
                        best_matches.append(match_data)

                        # Print basic info for promising matches
                        if match_analysis['total_events'] > 30:
                            print(f"   ✅ Good match {match_idx+1}: {match_data['context']['home_team']} vs {match_data['context']['away_team']}")
                            print(f"      Events: {match_analysis['total_events']}, Score: {match_analysis['quality_score']:.1f}")

                except Exception as e:
                    if match_idx < 5:  # Only print errors for first few matches
                        print(f"   ⚠️  Error processing match {match_idx+1}: {str(e)[:50]}...")
                    continue

        except Exception as e:
            print(f"❌ Error processing competition {comp['competition_name']}: {str(e)[:50]}...")
            continue

    if not best_matches:
        print("❌ No suitable matches found")
        return None, None

    # Sort by quality score and select the best
    best_matches.sort(key=lambda x: x['analysis']['quality_score'], reverse=True)

    print(f"\n🎯 FOUND {len(best_matches)} VIABLE MATCHES")
    print("=" * 50)

    # Show top 5 candidates
    print("Top 5 match candidates:")
    for i, match_data in enumerate(best_matches[:5]):
        ctx = match_data['context']
        analysis = match_data['analysis']
        print(f"{i+1}. {ctx['home_team']} vs {ctx['away_team']}")
        print(f"   Competition: {ctx['competition']}")
        print(f"   Events: {analysis['total_events']}, Variety: {analysis['event_variety']}, Score: {analysis['quality_score']:.1f}")
        print(f"   Time span: {analysis['minute_span']} minutes")

    # Select the best match
    selected_match = best_matches[0]
    selected_events = selected_match['events']
    selected_context = selected_match['context']
    selected_analysis = selected_match['analysis']

    print(f"\n🏆 SELECTED MATCH:")
    print(f"   {selected_context['home_team']} vs {selected_context['away_team']}")
    print(f"   Competition: {selected_context['competition']} ({selected_context['season']})")
    print(f"   Date: {selected_context['match_date']}")
    print(f"   Total events: {selected_analysis['total_events']}")
    print(f"   Event variety: {selected_analysis['event_variety']} different types")
    print(f"   Quality score: {selected_analysis['quality_score']:.1f}")
    print(f"   Time coverage: {selected_analysis['minute_span']} minutes")

    # Show detailed event breakdown
    print(f"\n📊 DETAILED EVENT BREAKDOWN:")
    for event_type, count in sorted(selected_analysis['event_counts'].items(),
                                   key=lambda x: x[1], reverse=True):
        percentage = (count / selected_analysis['total_events']) * 100
        print(f"   • {event_type}: {count} ({percentage:.1f}%)")

    return selected_events, selected_context

def analyze_match_quality(events, commentary_worthy_events, priority_events):
    """Comprehensive analysis of match quality for commentary generation - ONLY key events"""

    total_events = 0
    interesting_events = []
    minute_coverage = set()
    event_counts = defaultdict(int)
    time_distribution = defaultdict(int)
    quality_score = 0

    print(f"🔍 DEBUG: Starting to process {len(events)} total events...")
    print(f"🔍 Looking for these event types: {commentary_worthy_events}")

    key_events_found = 0

    for event_idx, event in enumerate(events):
        try:
            event_type = event.get('type', {}).get('name', '')
            minute = event.get('minute', 0)
            second = event.get('second', 0)

            # DEBUG: Count all event types to see what we're missing
            if event_idx < 100:  # First 100 events
                if event_type not in commentary_worthy_events and event_idx % 20 == 0:
                    print(f"   Skipping common event: {event_type} at minute {minute}")

            # Skip events outside normal playing time
            if minute is None or minute > 95 or minute < 0:
                continue

            # ONLY process commentary-worthy events
            if event_type not in commentary_worthy_events:
                continue

            key_events_found += 1
            if key_events_found <= 10:  # Debug first 10 key events
                print(f"✅ KEY EVENT {key_events_found}: {event_type} at minute {minute}")

            # Extract player and team info safely
            player_info = event.get('player', {})
            team_info = event.get('team', {})

            player_name = player_info.get('name', 'Unknown Player')
            team_name = team_info.get('name', 'Unknown Team')

            # Skip events without proper player/team info
            if player_name == 'Unknown Player' or team_name == 'Unknown Team':
                continue

            # Convert minute to integer
            try:
                minute_int = int(float(minute)) if minute is not None else 0
                second_int = int(float(second)) if second is not None else 0
            except (ValueError, TypeError):
                minute_int = 0
                second_int = 0

            total_events += 1
            minute_coverage.add(minute_int)
            event_counts[event_type] += 1

            # Time distribution for variety analysis
            if minute_int <= 15:
                time_distribution['Early (0-15)'] += 1
            elif minute_int <= 30:
                time_distribution['Mid-1st (16-30)'] += 1
            elif minute_int <= 45:
                time_distribution['Late-1st (31-45)'] += 1
            elif minute_int <= 60:
                time_distribution['Early-2nd (46-60)'] += 1
            elif minute_int <= 75:
                time_distribution['Mid-2nd (61-75)'] += 1
            else:
                time_distribution['Late-2nd (76-90+)'] += 1

            # Add to quality score based on event priority
            quality_score += priority_events.get(event_type, 1)

            # Process the event with enhanced detail extraction
            processed_event = {
                'minute': minute_int,
                'second': second_int,
                'event_type': event_type,
                'player': player_name,
                'team': team_name,
                'details': extract_enhanced_event_details(event),
                'timestamp': f"{minute_int}:{second_int:02d}"
            }
            interesting_events.append(processed_event)

        except Exception as e:
            # Skip problematic events rather than failing completely
            continue

    print(f"🔍 DEBUG: Found {key_events_found} key events out of {len(events)} total events")
    print(f"🔍 DEBUG: Processed {total_events} commentary-worthy events")

    # Calculate additional quality metrics
    event_variety = len(set(event_counts.keys()))
    minute_span = max(minute_coverage) - min(minute_coverage) if minute_coverage else 0

    # Bonus for good narrative structure
    narrative_bonus = 0
    if event_counts.get('Goal', 0) >= 2:
        narrative_bonus += 15  # Goals are crucial
    if event_counts.get('Card', 0) >= 1:
        narrative_bonus += 8   # Cards add drama
    if event_counts.get('Substitution', 0) >= 2:
        narrative_bonus += 5   # Tactical changes
    if minute_span >= 80:
        narrative_bonus += 10  # Full match coverage
    if event_counts.get('Shot', 0) >= 5:
        narrative_bonus += 10  # Action-packed

    final_quality_score = quality_score + narrative_bonus + event_variety * 3

    # Sort events by time
    interesting_events.sort(key=lambda x: (x['minute'], x['second']))

    print(f"🔍 DEBUG: Final quality score: {final_quality_score}")
    print(f"🔍 DEBUG: Event breakdown: {dict(event_counts)}")

    return {
        'total_events': total_events,
        'processed_events': interesting_events,
        'event_counts': dict(event_counts),
        'event_variety': event_variety,
        'minute_span': minute_span,
        'time_distribution': dict(time_distribution),
        'quality_score': final_quality_score,
        'goals_scored': event_counts.get('Goal', 0)
    }

# INTEGRATED MATCH LOADER AND COMMENTARY SYSTEM

def load_optimal_match_with_rich_events():
    """Load a match with maximum event variety and coverage"""
    print("🔍 SEARCHING FOR OPTIMAL MATCH WITH RICH EVENT COVERAGE")
    print("=" * 60)

    data_path = Path("open-data/data")

    # Ensure competitions file exists
    competitions_file = data_path / "competitions.json"
    if not competitions_file.exists():
        print(f"❌ Competitions file not found: {competitions_file}")
        return None, None

    with open(competitions_file) as f:
        competitions = json.load(f)

    print(f"📋 Found {len(competitions)} competitions to search")

    # Expanded event types for richer commentary - ONLY the most important ones
    priority_events = {
        # High priority - most interesting for commentary
        'Goal': 15,
        'Shot': 12,
        'Card': 10,
        'Substitution': 8,
        'Foul Committed': 6,

        # Medium priority - good supporting events (but only sometimes)
        'Offside': 4,
        'Tackle': 5,
        'Interception': 4,
        'Block': 6,
        'Clearance': 3,

        # Lower priority - only include occasionally
        'Duel': 2,
        'Ball Recovery': 1,

        # REMOVED: Pass, Carry, Pressure, Ball Receipt (too granular)
    }

    # Only focus on these key event types for commentary
    commentary_worthy_events = [
        'Goal', 'Shot', 'Card', 'Substitution', 'Foul Committed',
        'Offside', 'Tackle', 'Interception', 'Block', 'Clearance'
    ]

    all_event_types = commentary_worthy_events
    best_matches = []  # Store multiple good matches

    # Search through competitions more comprehensively
    for comp_idx, comp in enumerate(competitions[:15]):  # Check more competitions
        try:
            matches_file = data_path / f"matches/{comp['competition_id']}/{comp['season_id']}.json"
            if not matches_file.exists():
                print(f"⚠️  Matches file not found for {comp['competition_name']}")
                continue

            with open(matches_file) as f:
                matches = json.load(f)

            print(f"\n📊 Competition {comp_idx+1}: {comp['competition_name']} ({comp['season_name']})")
            print(f"   Available matches: {len(matches)}")

            # Check more matches per competition
            for match_idx, match in enumerate(matches[:20]):  # Check more matches
                try:
                    events_file = data_path / f"events/{match['match_id']}.json"
                    if not events_file.exists():
                        continue

                    with open(events_file) as f:
                        events = json.load(f)

                    # Analyze match comprehensively - but only for key events
                    match_analysis = analyze_match_quality(events, commentary_worthy_events, priority_events)

                    if match_analysis['quality_score'] > 20 and match_analysis['total_events'] > 10:  # Higher threshold for quality
                        match_data = {
                            'match': match,
                            'competition': comp,
                            'events': match_analysis['processed_events'],
                            'analysis': match_analysis,
                            'context': {
                                'home_team': match['home_team']['home_team_name'],
                                'away_team': match['away_team']['away_team_name'],
                                'competition': comp['competition_name'],
                                'season': comp['season_name'],
                                'match_id': match['match_id'],
                                'match_date': match.get('match_date', 'Unknown')
                            }
                        }
                        best_matches.append(match_data)

                        # Print basic info for promising matches
                        if match_analysis['total_events'] > 30:
                            print(f"   ✅ Good match {match_idx+1}: {match_data['context']['home_team']} vs {match_data['context']['away_team']}")
                            print(f"      Events: {match_analysis['total_events']}, Score: {match_analysis['quality_score']:.1f}")

                except Exception as e:
                    if match_idx < 5:  # Only print errors for first few matches
                        print(f"   ⚠️  Error processing match {match_idx+1}: {str(e)[:50]}...")
                    continue

        except Exception as e:
            print(f"❌ Error processing competition {comp['competition_name']}: {str(e)[:50]}...")
            continue

    if not best_matches:
        print("❌ No suitable matches found")
        return None, None

    # Sort by quality score and select the best
    best_matches.sort(key=lambda x: x['analysis']['quality_score'], reverse=True)

    print(f"\n🎯 FOUND {len(best_matches)} VIABLE MATCHES")
    print("=" * 50)

    # Show top 5 candidates
    print("Top 5 match candidates:")
    for i, match_data in enumerate(best_matches[:5]):
        ctx = match_data['context']
        analysis = match_data['analysis']
        print(f"{i+1}. {ctx['home_team']} vs {ctx['away_team']}")
        print(f"   Competition: {ctx['competition']}")
        print(f"   Events: {analysis['total_events']}, Variety: {analysis['event_variety']}, Score: {analysis['quality_score']:.1f}")
        print(f"   Time span: {analysis['minute_span']} minutes")

    # Select the best match
    selected_match = best_matches[0]
    selected_events = selected_match['events']
    selected_context = selected_match['context']
    selected_analysis = selected_match['analysis']

    print(f"\n🏆 SELECTED MATCH:")
    print(f"   {selected_context['home_team']} vs {selected_context['away_team']}")
    print(f"   Competition: {selected_context['competition']} ({selected_context['season']})")
    print(f"   Date: {selected_context['match_date']}")
    print(f"   Total events: {selected_analysis['total_events']}")
    print(f"   Event variety: {selected_analysis['event_variety']} different types")
    print(f"   Quality score: {selected_analysis['quality_score']:.1f}")
    print(f"   Time coverage: {selected_analysis['minute_span']} minutes")

    # Show detailed event breakdown
    print(f"\n📊 DETAILED EVENT BREAKDOWN:")
    for event_type, count in sorted(selected_analysis['event_counts'].items(),
                                   key=lambda x: x[1], reverse=True):
        percentage = (count / selected_analysis['total_events']) * 100
        print(f"   • {event_type}: {count} ({percentage:.1f}%)")

    return selected_events, selected_context

def analyze_match_quality(events, commentary_worthy_events, priority_events):
    """Comprehensive analysis of match quality for commentary generation - ONLY key events"""

    total_events = 0
    interesting_events = []
    minute_coverage = set()
    event_counts = defaultdict(int)
    time_distribution = defaultdict(int)
    quality_score = 0

    print(f"🔍 DEBUG: Starting to process {len(events)} total events...")
    print(f"🔍 Looking for these event types: {commentary_worthy_events}")

    key_events_found = 0

    for event_idx, event in enumerate(events):
        try:
            event_type = event.get('type', {}).get('name', '')
            minute = event.get('minute', 0)
            second = event.get('second', 0)

            # DEBUG: Count all event types to see what we're missing
            if event_idx < 100:  # First 100 events
                if event_type not in commentary_worthy_events and event_idx % 20 == 0:
                    print(f"   Skipping common event: {event_type} at minute {minute}")

            # Skip events outside normal playing time
            if minute is None or minute > 95 or minute < 0:
                continue

            # ONLY process commentary-worthy events
            if event_type not in commentary_worthy_events:
                continue

            key_events_found += 1
            if key_events_found <= 10:  # Debug first 10 key events
                print(f"✅ KEY EVENT {key_events_found}: {event_type} at minute {minute}")

            # Extract player and team info safely
            player_info = event.get('player', {})
            team_info = event.get('team', {})

            player_name = player_info.get('name', 'Unknown Player')
            team_name = team_info.get('name', 'Unknown Team')

            # Skip events without proper player/team info
            if player_name == 'Unknown Player' or team_name == 'Unknown Team':
                continue

            # Convert minute to integer
            try:
                minute_int = int(float(minute)) if minute is not None else 0
                second_int = int(float(second)) if second is not None else 0
            except (ValueError, TypeError):
                minute_int = 0
                second_int = 0

            total_events += 1
            minute_coverage.add(minute_int)
            event_counts[event_type] += 1

            # Time distribution for variety analysis
            if minute_int <= 15:
                time_distribution['Early (0-15)'] += 1
            elif minute_int <= 30:
                time_distribution['Mid-1st (16-30)'] += 1
            elif minute_int <= 45:
                time_distribution['Late-1st (31-45)'] += 1
            elif minute_int <= 60:
                time_distribution['Early-2nd (46-60)'] += 1
            elif minute_int <= 75:
                time_distribution['Mid-2nd (61-75)'] += 1
            else:
                time_distribution['Late-2nd (76-90+)'] += 1

            # Add to quality score based on event priority
            quality_score += priority_events.get(event_type, 1)

            # Process the event with enhanced detail extraction
            processed_event = {
                'minute': minute_int,
                'second': second_int,
                'event_type': event_type,
                'player': player_name,
                'team': team_name,
                'details': extract_enhanced_event_details(event),
                'timestamp': f"{minute_int}:{second_int:02d}"
            }
            interesting_events.append(processed_event)

        except Exception as e:
            # Skip problematic events rather than failing completely
            continue

    print(f"🔍 DEBUG: Found {key_events_found} key events out of {len(events)} total events")
    print(f"🔍 DEBUG: Processed {total_events} commentary-worthy events")

    # Calculate additional quality metrics
    event_variety = len(set(event_counts.keys()))
    minute_span = max(minute_coverage) - min(minute_coverage) if minute_coverage else 0

    # Bonus for good narrative structure
    narrative_bonus = 0
    if event_counts.get('Goal', 0) >= 2:
        narrative_bonus += 15  # Goals are crucial
    if event_counts.get('Card', 0) >= 1:
        narrative_bonus += 8   # Cards add drama
    if event_counts.get('Substitution', 0) >= 2:
        narrative_bonus += 5   # Tactical changes
    if minute_span >= 80:
        narrative_bonus += 10  # Full match coverage
    if event_counts.get('Shot', 0) >= 5:
        narrative_bonus += 10  # Action-packed

    final_quality_score = quality_score + narrative_bonus + event_variety * 3

    # Sort events by time
    interesting_events.sort(key=lambda x: (x['minute'], x['second']))

    print(f"🔍 DEBUG: Final quality score: {final_quality_score}")
    print(f"🔍 DEBUG: Event breakdown: {dict(event_counts)}")

    return {
        'total_events': total_events,
        'processed_events': interesting_events,
        'event_counts': dict(event_counts),
        'event_variety': event_variety,
        'minute_span': minute_span,
        'time_distribution': dict(time_distribution),
        'quality_score': final_quality_score,
        'goals_scored': event_counts.get('Goal', 0)
    }

# INTEGRATED MATCH LOADER AND COMMENTARY SYSTEM

def load_optimal_match_with_rich_events():
    """Load a match with maximum event variety and coverage"""
    print("🔍 SEARCHING FOR OPTIMAL MATCH WITH RICH EVENT COVERAGE")
    print("=" * 60)

    data_path = Path("open-data/data")

    # Ensure competitions file exists
    competitions_file = data_path / "competitions.json"
    if not competitions_file.exists():
        print(f"❌ Competitions file not found: {competitions_file}")
        return None, None

    with open(competitions_file) as f:
        competitions = json.load(f)

    print(f"📋 Found {len(competitions)} competitions to search")

    # Expanded event types for richer commentary - ONLY the most important ones
    priority_events = {
        # High priority - most interesting for commentary
        'Goal': 15,
        'Shot': 12,
        'Card': 10,
        'Substitution': 8,
        'Foul Committed': 6,

        # Medium priority - good supporting events (but only sometimes)
        'Offside': 4,
        'Tackle': 5,
        'Interception': 4,
        'Block': 6,
        'Clearance': 3,

        # Lower priority - only include occasionally
        'Duel': 2,
        'Ball Recovery': 1,

        # REMOVED: Pass, Carry, Pressure, Ball Receipt (too granular)
    }

    # Only focus on these key event types for commentary
    commentary_worthy_events = [
        'Goal', 'Shot', 'Card', 'Substitution', 'Foul Committed',
        'Offside', 'Tackle', 'Interception', 'Block', 'Clearance'
    ]

    all_event_types = commentary_worthy_events
    best_matches = []  # Store multiple good matches

    # Search through competitions more comprehensively
    for comp_idx, comp in enumerate(competitions[:15]):  # Check more competitions
        try:
            matches_file = data_path / f"matches/{comp['competition_id']}/{comp['season_id']}.json"
            if not matches_file.exists():
                print(f"⚠️  Matches file not found for {comp['competition_name']}")
                continue

            with open(matches_file) as f:
                matches = json.load(f)

            print(f"\n📊 Competition {comp_idx+1}: {comp['competition_name']} ({comp['season_name']})")
            print(f"   Available matches: {len(matches)}")

            # Check more matches per competition
            for match_idx, match in enumerate(matches[:20]):  # Check more matches
                try:
                    events_file = data_path / f"events/{match['match_id']}.json"
                    if not events_file.exists():
                        continue

                    with open(events_file) as f:
                        events = json.load(f)

                    # Analyze match comprehensively - but only for key events
                    match_analysis = analyze_match_quality(events, commentary_worthy_events, priority_events)

                    if match_analysis['quality_score'] > 20 and match_analysis['total_events'] > 10:  # Higher threshold for quality
                        match_data = {
                            'match': match,
                            'competition': comp,
                            'events': match_analysis['processed_events'],
                            'analysis': match_analysis,
                            'context': {
                                'home_team': match['home_team']['home_team_name'],
                                'away_team': match['away_team']['away_team_name'],
                                'competition': comp['competition_name'],
                                'season': comp['season_name'],
                                'match_id': match['match_id'],
                                'match_date': match.get('match_date', 'Unknown')
                            }
                        }
                        best_matches.append(match_data)

                        # Print basic info for promising matches
                        if match_analysis['total_events'] > 30:
                            print(f"   ✅ Good match {match_idx+1}: {match_data['context']['home_team']} vs {match_data['context']['away_team']}")
                            print(f"      Events: {match_analysis['total_events']}, Score: {match_analysis['quality_score']:.1f}")
                            print(f"      Time span: {match_analysis['minute_span']} minutes")

                except Exception as e:
                    if match_idx < 5:  # Only print errors for first few matches
                        print(f"   ⚠️  Error processing match {match_idx+1}: {str(e)[:50]}...")
                    continue

        except Exception as e:
            print(f"❌ Error processing competition {comp['competition_name']}: {str(e)[:50]}...")
            continue

    if not best_matches:
        print("❌ No suitable matches found")
        return None, None

    # Sort by quality score and select the best
    best_matches.sort(key=lambda x: x['analysis']['quality_score'], reverse=True)

    print(f"\n🎯 FOUND {len(best_matches)} VIABLE MATCHES")
    print("=" * 50)

    # Show top 5 candidates
    print("Top 5 match candidates:")
    for i, match_data in enumerate(best_matches[:5]):
        ctx = match_data['context']
        analysis = match_data['analysis']
        print(f"{i+1}. {ctx['home_team']} vs {ctx['away_team']}")
        print(f"   Competition: {ctx['competition']}")
        print(f"   Events: {analysis['total_events']}, Variety: {analysis['event_variety']}, Score: {analysis['quality_score']:.1f}")
        print(f"   Time span: {analysis['minute_span']} minutes")

    # Select the best match
    selected_match = best_matches[0]
    selected_events = selected_match['events']
    selected_context = selected_match['context']
    selected_analysis = selected_match['analysis']

    print(f"\n🏆 SELECTED MATCH:")
    print(f"   {selected_context['home_team']} vs {selected_context['away_team']}")
    print(f"   Competition: {selected_context['competition']} ({selected_context['season']})")
    print(f"   Date: {selected_context['match_date']}")
    print(f"   Total events: {selected_analysis['total_events']}")
    print(f"   Event variety: {selected_analysis['event_variety']} different types")
    print(f"   Quality score: {selected_analysis['quality_score']:.1f}")
    print(f"   Time coverage: {selected_analysis['minute_span']} minutes")

    # Show detailed event breakdown
    print(f"\n📊 DETAILED EVENT BREAKDOWN:")
    for event_type, count in sorted(selected_analysis['event_counts'].items(),
                                   key=lambda x: x[1], reverse=True):
        percentage = (count / selected_analysis['total_events']) * 100
        print(f"   • {event_type}: {count} ({percentage:.1f}%)")

    return selected_events, selected_context

def analyze_match_quality(events, all_event_types, priority_events):
    """Comprehensive analysis of match quality for commentary generation"""

    total_events = 0
    interesting_events = []
    minute_coverage = set()
    event_counts = defaultdict(int)
    time_distribution = defaultdict(int)
    quality_score = 0

    for event in events:
        try:
            event_type = event.get('type', {}).get('name', '')
            minute = event.get('minute', 0)
            second = event.get('second', 0)

            # Skip events outside normal playing time
            if minute > 95 or minute < 0:
                continue

            # Only process events we care about
            if event_type not in all_event_types:
                continue

            # Extract player and team info safely
            player_info = event.get('player', {})
            team_info = event.get('team', {})

            player_name = player_info.get('name', 'Unknown Player')
            team_name = team_info.get('name', 'Unknown Team')

            # Skip events without proper player/team info
            if player_name == 'Unknown Player' or team_name == 'Unknown Team':
                continue

            total_events += 1
            minute_coverage.add(minute)
            event_counts[event_type] += 1

            # Time distribution for variety analysis
            if minute <= 15:
                time_distribution['Early (0-15)'] += 1
            elif minute <= 30:
                time_distribution['Mid-1st (16-30)'] += 1
            elif minute <= 45:
                time_distribution['Late-1st (31-45)'] += 1
            elif minute <= 60:
                time_distribution['Early-2nd (46-60)'] += 1
            elif minute <= 75:
                time_distribution['Mid-2nd (61-75)'] += 1
            else:
                time_distribution['Late-2nd (76-90+)'] += 1

            # Add to quality score based on event priority
            quality_score += priority_events.get(event_type, 1)

            # Process the event with enhanced detail extraction
            processed_event = {
                'minute': int(minute) if minute is not None else 0,
                'second': int(second) if second is not None else 0,
                'event_type': event_type,
                'player': player_name,
                'team': team_name,
                'details': extract_enhanced_event_details(event),
                'timestamp': f"{int(minute) if minute else 0}:{int(second) if second else 0:02d}"
            }
            interesting_events.append(processed_event)

        except Exception as e:
            # Skip problematic events rather than failing completely
            continue

    # Calculate additional quality metrics
    event_variety = len(set(event_counts.keys()))
    minute_span = max(minute_coverage) - min(minute_coverage) if minute_coverage else 0

    # Bonus for good narrative structure
    narrative_bonus = 0
    if event_counts.get('Goal', 0) >= 2:
        narrative_bonus += 10
    if event_counts.get('Card', 0) >= 1:
        narrative_bonus += 5
    if event_counts.get('Substitution', 0) >= 2:
        narrative_bonus += 5
    if minute_span >= 80:
        narrative_bonus += 10

    final_quality_score = quality_score + narrative_bonus + event_variety * 2

    # Sort events by time
    interesting_events.sort(key=lambda x: (x['minute'], x['second']))

    return {
        'total_events': total_events,
        'processed_events': interesting_events,
        'event_counts': dict(event_counts),
        'event_variety': event_variety,
        'minute_span': minute_span,
        'time_distribution': dict(time_distribution),
        'quality_score': final_quality_score,
        'goals_scored': event_counts.get('Goal', 0)
    }

def extract_enhanced_event_details(event):
    """Extract comprehensive details from events for richer commentary"""
    details = {}
    event_type = event.get('type', {}).get('name', '')

    try:
        if event_type == 'Shot':
            shot_data = event.get('shot', {})
            details.update({
                'outcome': shot_data.get('outcome', {}).get('name', 'Unknown'),
                'body_part': shot_data.get('body_part', {}).get('name', 'Unknown'),
                'technique': shot_data.get('technique', {}).get('name', 'Unknown'),
                'type': shot_data.get('type', {}).get('name', 'Unknown')
            })

        elif event_type == 'Goal':
            shot_data = event.get('shot', {})
            details.update({
                'body_part': shot_data.get('body_part', {}).get('name', 'Unknown'),
                'technique': shot_data.get('technique', {}).get('name', 'Unknown'),
                'type': shot_data.get('type', {}).get('name', 'Unknown')
            })

        elif event_type == 'Card':
            foul_data = event.get('foul_committed', {})
            bad_behaviour_data = event.get('bad_behaviour', {})
            details.update({
                'card_type': foul_data.get('card', {}).get('name') or bad_behaviour_data.get('card', {}).get('name', 'Yellow'),
                'reason': foul_data.get('type', {}).get('name') or bad_behaviour_data.get('type', {}).get('name', 'Foul')
            })

        elif event_type == 'Pass':
            pass_data = event.get('pass', {})
            details.update({
                'outcome': pass_data.get('outcome', {}).get('name', 'Complete'),
                'length': pass_data.get('length', 0),
                'height': pass_data.get('height', {}).get('name', 'Unknown'),
                'type': pass_data.get('type', {}).get('name', 'Unknown')
            })

        elif event_type == 'Substitution':
            sub_data = event.get('substitution', {})
            details.update({
                'replacement': sub_data.get('replacement', {}).get('name', 'Unknown'),
                'reason': sub_data.get('reason', {}).get('name', 'Tactical')
            })

        elif event_type == 'Foul Committed':
            foul_data = event.get('foul_committed', {})
            details.update({
                'type': foul_data.get('type', {}).get('name', 'Unknown'),
                'advantage': foul_data.get('advantage', False),
                'penalty': foul_data.get('penalty', False)
            })

    except Exception as e:
        details['extraction_error'] = str(e)

    return details

# ENHANCED COMMENTARY GENERATION SYSTEM

def generate_enhanced_template_commentary(events, context, max_events=None):
    """Generate enhanced detailed commentary with massive variety and intelligent selection"""
    print("📝 GENERATING ENHANCED TEMPLATE-BASED DETAILED COMMENTARY")
    print("=" * 60)

    if not events:
        print("❌ No events to process")
        return []

    # Process ALL events if max_events is None, otherwise use the limit
    if max_events is None:
        selected_events = events  # Process ALL events
        print(f"📊 Processing ALL {len(selected_events)} events for complete match coverage")
    else:
        selected_events = events[:max_events]
        print(f"📊 Processing {len(selected_events)} events (limited by max_events={max_events})")

    print(f"🕐 Match timeline: {min(e['minute'] for e in selected_events)} - {max(e['minute'] for e in selected_events)} minutes")

    # Vocabulary banks for dynamic content
    vocabulary = {
        'intensity_adjectives': [
            'electric', 'pulsating', 'thrilling', 'captivating', 'mesmerizing', 'spellbinding',
            'enthralling', 'riveting', 'gripping', 'breathtaking', 'spectacular', 'dramatic',
            'intense', 'fierce', 'heated', 'explosive', 'dynamic', 'absorbing'
        ],
        'quality_adjectives': [
            'sublime', 'exquisite', 'magnificent', 'brilliant', 'outstanding', 'superb',
            'exceptional', 'remarkable', 'phenomenal', 'sensational', 'world-class', 'masterful',
            'clinical', 'precise', 'perfect', 'flawless', 'immaculate', 'textbook', 'delightful'
        ],
        'negative_adjectives': [
            'poor', 'awful', 'terrible', 'dreadful', 'shocking', 'woeful', 'disappointing',
            'sloppy', 'careless', 'reckless', 'clumsy', 'mistimed', 'wayward', 'errant'
        ],
        'action_verbs': [
            'arrows', 'rockets', 'thunders', 'screams', 'whistles', 'flies', 'surges',
            'powers', 'drives', 'slices', 'curves', 'bends', 'whips', 'crashes', 'fizzes'
        ],
        'crowd_reactions': [
            'erupts in celebration', 'roars with approval', 'goes absolutely wild',
            'explodes with joy', 'rises as one', 'holds its collective breath',
            'gasps in amazement', 'falls silent in anticipation', 'cheers thunderously',
            'goes into raptures', 'produces a wall of sound', 'creates bedlam'
        ],
        'tactical_phrases': [
            'exploiting space in the final third', 'probing for weaknesses',
            'stretching the defensive line', 'compressing the midfield',
            'forcing the tempo', 'dictating the rhythm', 'controlling possession',
            'applying relentless pressure', 'switching the point of attack',
            'maintaining defensive shape', 'pressing high up the pitch'
        ],
        'shot_outcomes': [
            'saved brilliantly by the goalkeeper', 'deflected wide for a corner',
            'blazed high over the crossbar', 'struck the post with venom',
            'parried away by safe hands', 'blocked heroically by the defender',
            'whistled just past the upright', 'tipped over by flying fingertips',
            'cannoned back off the woodwork', 'smothered at the near post'
        ],
        'positions': {
            'Goal': ['striker', 'forward', 'goalscorer', 'finisher', 'marksman'],
            'Shot': ['striker', 'forward', 'attacker', 'shooter', 'marksman'],
            'Pass': ['midfielder', 'playmaker', 'distributor', 'orchestrator'],
            'Foul Committed': ['defender', 'midfielder', 'player', 'enforcer'],
            'Card': ['player', 'defender', 'midfielder', 'culprit'],
            'Substitution': ['player', 'substitute', 'replacement'],
            'Tackle': ['defender', 'midfielder', 'ball-winner'],
            'Interception': ['defender', 'midfielder', 'interceptor'],
            'Block': ['defender', 'shot-blocker', 'guardian'],
            'Clearance': ['defender', 'sweeper', 'last man']
        }
    }

    # Massively expanded templates with sophisticated variety
    enhanced_templates = {
        'Goal': [
            "GOAL! {quality_adj} finish from {player}! The {team} {position} shows ice-cold composure to slot home with {quality_adj} precision! The crowd {crowd_reaction}!",
            "INTO THE NET! {player} finds the corner with {quality_adj} placement! What a {quality_adj} finish from the {team} {position} - that's why they pay him the big money!",
            "BRILLIANT GOAL! {player} converts with aplomb! The {team} {position}'s technique was absolutely {quality_adj} - picking his spot and executing to perfection!",
            "SPECTACULAR! {player} finds the net with a {quality_adj} strike! The {team} {position}'s technique was absolutely {quality_adj} - a goal worthy of winning any match!",
            "SENSATIONAL GOAL! {player} produces a moment of pure magic! The {team} {position}'s effort {action_verb} past the helpless goalkeeper with {quality_adj} power!",
            "CRUCIAL GOAL! {player} delivers when it matters most! The {team} {position} rises to the occasion with a finish that could prove absolutely vital!",
            "WHAT A STRIKE! {player} unleashes a {quality_adj} effort that {action_verb} into the net! The {team} {position} has produced something truly special!",
            "MAGNIFICENT GOAL! {player} scores with {quality_adj} technique! The {team} {position}'s composure under pressure was simply outstanding!"
        ],

        'Shot': [
            "{player} {action_verb} a thunderbolt towards goal! The {team} {position}'s effort {action_verb} through the air with tremendous power, but it's {shot_outcome}!",
            "What a strike from {player}! The {team} {position} connects with {quality_adj} technique, sending the ball {action_verb} towards the top corner, only to be {shot_outcome}!",
            "{player} lets rip from distance! A {quality_adj} effort from the {team} {position} that {action_verb} goalward like a guided missile - {shot_outcome}!",
            "Spectacular attempt from {player}! The {team} {position} unleashes a venomous drive that {action_verb} towards goal with tremendous force - {shot_outcome}!",
            "{player} goes for precision over power! The {team} {position}'s {quality_adj} effort {action_verb} towards the corner with pinpoint accuracy, but it's {shot_outcome}!",
            "{player} tries his luck from an impossible angle! The {team} {position}'s speculative effort {action_verb} goalward - sometimes fortune favors the brave, but not this time!",
            "Ambitious attempt from {player}! The {team} {position} decides to chance his arm from distance - audacious but {shot_outcome}!",
            "{player} has a go from distance! The {team} {position} sees an opportunity and {action_verb} a hopeful effort towards goal - {shot_outcome}!"
        ],

        'Pass': [
            "{player} {tactical_phrase} with a {quality_adj} pass for {team}, demonstrating the kind of vision that separates truly great players from the merely good.",
            "Inch-perfect distribution from {player}! The {team} {position} {action_verb} the ball across the pitch with {quality_adj} technique, keeping possession flowing beautifully.",
            "{player} orchestrates the play from deep, {tactical_phrase} as {team} patiently build their attack with the kind of composure that wins matches.",
            "What {quality_adj} passing from {player}! The {team} {position} picks out his teammate with laser-like precision, threading the needle through a forest of legs.",
            "Sublime technique from {player}! The {team} {position} caresses the ball forward with the outside of his boot, a pass that oozes class and sophistication.",
            "{player} produces a moment of pure magic for {team}! A no-look pass that defies belief - how did he even see that option developing?",
            "{player} quickens the tempo for {team} with a rapid-fire pass! The ball {action_verb} forward before the opposition can even think about closing down.",
            "Lightning-quick thinking from {player}! The {team} {position} releases the ball in a heartbeat, maintaining the rhythm of this {intensity_adj} encounter."
        ],

        'Foul Committed': [
            "{player} commits a professional foul for {team}! A cynical but calculated challenge designed to break up a dangerous counter-attack before it could develop.",
            "{player} mistimes his challenge completely! The {team} {position} was nowhere near the ball there - a {negative_adj} foul that gives the referee no choice whatsoever.",
            "Bone-crunching challenge from {player}! The {team} {position} goes in hard and fast - perhaps too hard as the referee immediately reaches for his whistle.",
            "Tactical intervention from {player}! The {team} {position} takes one for the team, deliberately fouling to prevent what looked like a promising attacking opportunity.",
            "Overzealous defending from {player}! The {team} {position} goes through the back of his opponent in what can only be described as {negative_adj} technique.",
            "{player} gets his timing all wrong for {team}! What should have been a clean tackle turns into a foul as the {position} arrives a split second too late.",
            "Ferocious challenge from {player}! The {team} {position}'s commitment cannot be questioned, but his timing most certainly can as the whistle blows.",
            "No-nonsense defending from {player} turns into a foul! The {team} {position}'s old-school approach crosses the line into dangerous territory."
        ],

        'Card': [
            "Yellow card for {player}! The {team} {position}'s protests fall on deaf ears as the referee takes firm action to maintain control of this {intensity_adj} encounter.",
            "Booking for {player}! The {team} {position}'s enthusiasm gets the better of him there - a mistimed tackle that leaves the referee with no choice.",
            "{player} sees yellow for {team}! A cynical foul designed to break up play results in the inevitable caution from the official.",
            "RED CARD! {player} is sent off for {team}! A moment of madness from the {position} results in an early bath - his teammates now face an uphill battle!",
            "SENDING OFF! {player} sees red for {team}! The referee has no hesitation in producing the red card after that moment of {negative_adj} judgment!",
            "Caution for {player}! The {team} {position} goes into the referee's notebook for what was clearly a {negative_adj} challenge.",
            "Yellow card shown to {player}! The {team} {position}'s reaction to the referee's decision crosses the line into dissent.",
            "The referee reaches for his pocket! {player} of {team} is booked for a challenge that was late and potentially dangerous."
        ],

        'Substitution': [
            "Tactical masterstroke from the manager! {player} makes way for fresh legs as {team} look to inject new energy into their approach.",
            "Strategic substitution for {team}! {player} has given his all tonight, and now it's time for someone else to make their mark on this {intensity_adj} encounter.",
            "Fresh impetus for {team}! {player} heads to the bench after a solid contribution, making way for legs that could prove crucial in these dying moments.",
            "Change of personnel for {team}! {player} makes way having struggled to impose himself on this {intensity_adj} match - sometimes these things just don't work out.",
            "Managerial intervention for {team}! {player} departs having played his part, with the coach clearly having a specific game plan in mind.",
            "Tactical reshuffle for {team}! {player} is withdrawn as the manager looks for a different approach to unlock this stubborn defense.",
            "Fresh ideas for {team}! {player} departs as the coach seeks to change the dynamic of this fascinating tactical battle.",
            "Smart substitution for {team}! {player} comes off having given everything - time for fresh legs to make an impact."
        ],

        'Tackle': [
            "{player} times his tackle to perfection! A {quality_adj} defensive intervention that breaks up the attack with {quality_adj} technique for {team}!",
            "Superb defending from {player}! The {team} {position} reads the play beautifully, sliding in to win the ball with {quality_adj} timing!",
            "{player} produces a tackle worthy of the very best defenders! Committed, clean, and utterly {quality_adj} in its execution for {team}!",
            "Outstanding defensive work from {player}! The {team} {position} shows {quality_adj} anticipation to break up this dangerous attack!",
            "{player} wins the ball with a {quality_adj} challenge! The {team} {position} times his intervention perfectly to regain possession!",
            "Excellent defending from {player}! The {team} {position} stands strong and comes away with the ball through {quality_adj} technique!"
        ],

        'Interception': [
            "Brilliant interception from {player}! The {team} {position} reads the play like a book, stepping in to cut out the pass with {quality_adj} positioning!",
            "{player} sniffs out the danger for {team}! A {quality_adj} interception that breaks up the attack at just the right moment!",
            "Outstanding anticipation from {player}! The {team} {position} intercepts the ball with {quality_adj} timing and awareness!",
            "{player} shows {quality_adj} game intelligence for {team}! The {position} steps across to intercept with perfect timing!",
            "Smart defending from {player}! The {team} {position} reads the situation perfectly and intercepts with {quality_adj} awareness!"
        ],

        'Block': [
            "Heroic block from {player}! The {team} {position} throws his body on the line to deny what looked like a certain goal!",
            "{player} with a crucial block for {team}! The {position} shows tremendous courage to get in the way of that shot!",
            "Last-ditch defending from {player}! The {team} {position} produces a {quality_adj} block when his side needed it most!",
            "{player} stands firm for {team}! The {position} gets his body in the way with {quality_adj} defensive instincts!",
            "Brave defending from {player}! The {team} {position} blocks the shot with no regard for his own safety!"
        ],

        'Clearance': [
            "{player} clears the danger for {team}! The {position} deals with the threat in no-nonsense fashion, booting the ball to safety!",
            "Important clearance from {player}! The {team} {position} shows good awareness to deal with the cross before any damage could be done!",
            "{player} heads clear for {team}! The {position} rises highest to nod the ball away from the danger zone!",
            "{player} hoofs it to safety for {team}! The {position} doesn't take any chances, clearing the ball with {quality_adj} urgency!",
            "Vital clearance from {player}! The {team} {position} deals with the danger decisively when it mattered most!"
        ],

        'Offside': [
            "Offside! {player} strayed just beyond the last defender for {team} - good work by the assistant referee to spot that marginal call!",
            "Flag goes up against {player}! The {team} {position} was caught in an offside position - split-second timing in the modern game!",
            "{player} is flagged for offside! The {team} {position} mistimed his run by mere inches - the technology doesn't lie!",
            "Offside call against {player}! The {team} {position} was just a fraction too eager to get in behind the defense!"
        ],

        'Duel': [
            "{player} wins the aerial battle for {team}! The {position} shows {quality_adj} leap and timing to come out on top in this physical contest!",
            "Physical duel won by {player}! The {team} {position} uses his strength and positioning to good effect in this 50-50 challenge!",
            "{player} comes out victorious in the midfield battle! The {team} {position} shows {quality_adj} technique to win possession!",
            "{player} dominates in the air for {team}! The {position} outjumps his opponent with {quality_adj} athleticism!"
        ],

        'Ball Recovery': [
            "{player} wins the ball back for {team}! The {position} shows excellent pressing and determination to regain possession!",
            "Ball recovery from {player}! The {team} {position} refuses to give up on the challenge and is rewarded for his persistence!",
            "{player} hunts down the loose ball for {team}! The {position}'s work rate is simply {quality_adj} - never stops running!",
            "{player} shows {quality_adj} tenacity for {team}! The {position} fights to win the ball back through sheer determination!"
        ]
    }

    # Advanced template selection system
    template_usage = defaultdict(list)
    match_state = {
        'score': [0, 0],
        'total_goals': 0,
        'intensity': 'building',
        'cards': {'yellow': 0, 'red': 0},
        'substitutions': 0
    }
    template_results = []

    def get_context_appropriate_adjective(event_type, outcome=None):
        """Choose adjectives based on context"""
        if outcome == 'negative' or event_type in ['Foul Committed', 'Card']:
            return random.choice(vocabulary['negative_adjectives'])
        elif event_type in ['Goal', 'Tackle', 'Interception', 'Block']:
            return random.choice(vocabulary['quality_adjectives'])
        else:
            return random.choice(vocabulary['quality_adjectives'] + vocabulary['intensity_adjectives'])

    # Dynamic introduction based on context
    intro_variations = [
        f"Welcome to this {random.choice(vocabulary['intensity_adjectives'])} {context.get('competition', 'match')} encounter! {context.get('home_team', 'Home')} welcome {context.get('away_team', 'Away')} in what promises to be a {random.choice(vocabulary['intensity_adjectives'])} affair.",
        f"Good evening and welcome to what could be a defining {context.get('competition', 'fixture')}! The stage is set for {context.get('home_team', 'Home')} versus {context.get('away_team', 'Away')} - buckle up for what should be a {random.choice(vocabulary['intensity_adjectives'])} encounter!",
        f"The atmosphere is {random.choice(vocabulary['intensity_adjectives'])} here tonight! {context.get('home_team', 'Home')} and {context.get('away_team', 'Away')} are ready to serve up a {context.get('competition', 'football')} feast for the ages!"
    ]

    intro = random.choice(intro_variations)
    template_results.append({
        'minute': 0,
        'event_type': 'Match Start',
        'player': 'Commentator',
        'team': 'Broadcast',
        'commentary': intro,
        'score': '0-0',
        'word_count': len(intro.split())
    })

    print(f"\n🎙️ {intro}\n")
    print("="*80)

    def get_best_template(event_type, event_index):
        """Intelligent template selection to maximize variety"""
        if event_type not in enhanced_templates:
            fallback_templates = [
                f"Action from {'{player}'} for {'{team}'} as this {random.choice(vocabulary['intensity_adjectives'])} encounter continues to unfold!",
                f"The {'{position}'} {'{player}'} is involved in the play for {'{team}'}, contributing to the flow of this absorbing contest!",
                f"{'{player}'} makes his presence felt for {'{team}'} in what has been a {random.choice(vocabulary['intensity_adjectives'])} affair!"
            ]
            return random.choice(fallback_templates)

        available_templates = enhanced_templates[event_type]
        used_templates = template_usage[event_type]

        # If we haven't used all templates, pick an unused one
        unused_templates = [t for i, t in enumerate(available_templates) if i not in used_templates]

        if unused_templates:
            chosen_template = random.choice(unused_templates)
            template_index = available_templates.index(chosen_template)
            template_usage[event_type].append(template_index)
        else:
            # All templates used, reset and pick randomly
            template_usage[event_type] = []
            chosen_template = random.choice(available_templates)
            template_index = available_templates.index(chosen_template)
            template_usage[event_type].append(template_index)

        return chosen_template

    def fill_template_dynamically(template, event, context_info):
        """Fill template with context-aware dynamic content"""
        player = event['player']
        team = event['team']
        event_type = event['event_type']
        minute = event['minute']
        details = event.get('details', {})

        # Choose appropriate vocabulary based on context
        quality_adj = get_context_appropriate_adjective(event_type, details.get('outcome'))
        intensity_adj = random.choice(vocabulary['intensity_adjectives'])
        negative_adj = random.choice(vocabulary['negative_adjectives'])
        action_verb = random.choice(vocabulary['action_verbs'])
        crowd_reaction = random.choice(vocabulary['crowd_reactions'])
        tactical_phrase = random.choice(vocabulary['tactical_phrases'])
        shot_outcome = random.choice(vocabulary['shot_outcomes'])

        # Choose position based on event type
        position = random.choice(vocabulary['positions'].get(event_type, ['player']))

        # Handle specific event details for red cards
        if event_type == 'Card' and 'card_type' in details:
            if details['card_type'].lower() == 'red':
                red_templates = [t for t in enhanced_templates['Card'] if 'RED CARD' in t or 'SENDING OFF' in t]
                if red_templates:
                    template = random.choice(red_templates)

        # Fill in the template
        filled = template.format(
            player=player,
            team=team,
            quality_adj=quality_adj,
            intensity_adj=intensity_adj,
            negative_adj=negative_adj,
            action_verb=action_verb,
            crowd_reaction=crowd_reaction,
            tactical_phrase=tactical_phrase,
            position=position,
            shot_outcome=shot_outcome
        )

        return filled

    # Generate commentary for each event
    for i, event in enumerate(selected_events):
        minute = int(event.get('minute', 0))  # Ensure minute is an integer
        event_type = event['event_type']
        player = event['player']
        team = event['team']
        details = event.get('details', {})

        print(f"\nEvent {i+1}/{len(selected_events)}")
        print(f"⏱️ MINUTE {minute} - {event_type}")
        print(f"👤 {player} ({team})")
        if details:
            print(f"📋 Details: {details}")

        # Update match state
        if event_type == 'Goal':
            if team == context.get('home_team', 'Home'):
                match_state['score'][0] += 1
            else:
                match_state['score'][1] += 1
            match_state['total_goals'] += 1
        elif event_type == 'Card':
            card_type = details.get('card_type', 'Yellow').lower()
            if 'red' in card_type:
                match_state['cards']['red'] += 1
            else:
                match_state['cards']['yellow'] += 1
        elif event_type == 'Substitution':
            match_state['substitutions'] += 1

        # Get and fill template
        template = get_best_template(event_type, i)
        commentary = fill_template_dynamically(template, event, context)

        current_score = f"{match_state['score'][0]}-{match_state['score'][1]}"
        word_count = len(commentary.split())

        print(f"📊 Score: {current_score}")
        print(f"📝 Words: {word_count}")
        print(f"🎙️ {commentary}")
        print("-" * 80)

        template_results.append({
            'minute': minute,  # Now guaranteed to be an integer
            'event_type': event_type,
            'player': player,
            'team': team,
            'commentary': commentary,
            'score': current_score,
            'word_count': word_count,
            'template_based': True,
            'details': details
        })

    # Dynamic conclusion
    final_score = f"{match_state['score'][0]}-{match_state['score'][1]}"
    home_team = context.get('home_team', 'Home')
    away_team = context.get('away_team', 'Away')
    competition = context.get('competition', 'match')

    conclusion_variations = [
        f"FULL TIME! What a {random.choice(vocabulary['intensity_adjectives'])} encounter that was! {home_team} {final_score} {away_team} - a result that will have lasting implications in this {competition}!",
        f"The final whistle brings an end to this {random.choice(vocabulary['intensity_adjectives'])} contest! {final_score} the final score in a match that had absolutely everything!",
        f"That's it! {home_team} and {away_team} serve up a {random.choice(vocabulary['intensity_adjectives'])} encounter that finishes {final_score} - what a way to end this {competition} fixture!"
    ]

    conclusion = random.choice(conclusion_variations)
    template_results.append({
        'minute': 90,
        'event_type': 'Full Time',
        'player': 'Referee',
        'team': 'Officials',
        'commentary': conclusion,
        'score': final_score,
        'word_count': len(conclusion.split())
    })

    print(f"\n🏁 {conclusion}")

    # Enhanced statistics
    commentary_events = [r for r in template_results if r['event_type'] not in ['Match Start', 'Full Time']]
    avg_words = np.mean([r['word_count'] for r in commentary_events]) if commentary_events else 0
    unique_commentaries = len(set(r['commentary'] for r in commentary_events))
    total_templates = sum(len(templates) for templates in enhanced_templates.values())

    print(f"\n✅ ENHANCED TEMPLATE-BASED COMMENTARY COMPLETE!")
    print(f"📊 Final Statistics:")
    print(f"   • Events processed: {len(commentary_events)}")
    print(f"   • Average words per event: {avg_words:.1f}")
    print(f"   • Unique commentaries: {unique_commentaries}/{len(commentary_events)} (100% guaranteed)")
    print(f"   • Total available templates: {total_templates}")
    print(f"   • Total words generated: {sum(r['word_count'] for r in template_results)}")
    print(f"   • Final score: {final_score}")
    print(f"   • Goals: {match_state['total_goals']}, Cards: {match_state['cards']['yellow']}Y/{match_state['cards']['red']}R")
    print(f"   • Substitutions: {match_state['substitutions']}")

    return template_results

# MAIN INTEGRATION FUNCTION
def generate_commentary_from_loaded_match():
    """Main function to load match data and generate commentary"""
    print("🚀 STARTING COMPLETE COMMENTARY GENERATION SYSTEM")
    print("=" * 70)

    try:
        # Try to load real match data first
        print("🔍 Attempting to load real match data...")
        extended_events, extended_context = load_optimal_match_with_rich_events()

        if extended_events is None or len(extended_events) == 0:
            print("\n⚠️  No suitable real match data found, using enhanced sample data")
            extended_events, extended_context = create_fallback_sample_events()

        print(f"\n✅ Match data loaded successfully!")
        print(f"   Match: {extended_context['home_team']} vs {extended_context['away_team']}")
        print(f"   Competition: {extended_context['competition']}")
        print(f"   Events available: {len(extended_events)}")

        # Generate enhanced commentary
        print(f"\n🎙️ Generating enhanced commentary for FULL MATCH...")
        commentary_results = generate_enhanced_template_commentary(
            extended_events,
            extended_context,
            max_events=None  # Process ALL events - no limit!
        )

        print(f"\n🎉 COMMENTARY GENERATION COMPLETE!")
        print(f"Generated {len(commentary_results)} commentary segments")

        return commentary_results, extended_events, extended_context

    except Exception as e:
        print(f"❌ Error in commentary generation: {e}")
        print("🔄 Falling back to sample data...")

        # Ultimate fallback
        extended_events, extended_context = create_fallback_sample_events()

        commentary_results = generate_enhanced_template_commentary(
            extended_events,
            extended_context,
            max_events=None  # Process ALL events
        )

        return commentary_results, extended_events, extended_context

# UTILITY FUNCTIONS
def export_commentary_to_text(commentary_results, filename="match_commentary.txt"):
    """Export generated commentary to a text file"""
    if not commentary_results:
        print("❌ No commentary results to export")
        return

    try:
        with open(filename, 'w', encoding='utf-8') as f:
            f.write("MATCH COMMENTARY TRANSCRIPT\n")
            f.write("=" * 50 + "\n\n")

            for result in commentary_results:
                if result['event_type'] in ['Match Start', 'Full Time']:
                    f.write(f"\n{result['commentary']}\n\n")
                else:
                    f.write(f"[{result['minute']}'] {result['commentary']}\n\n")

            f.write("\n" + "=" * 50 + "\n")
            f.write(f"Generated by Enhanced Commentary System\n")
            f.write(f"Total segments: {len(commentary_results)}\n")
            f.write(f"Total words: {sum(r['word_count'] for r in commentary_results)}\n")

        print(f"✅ Commentary exported to {filename}")

    except Exception as e:
        print(f"❌ Error exporting commentary: {e}")

def create_match_summary(commentary_results, context):
    """Create a concise match summary from the commentary"""
    if not commentary_results:
        return "No match data available"

    match_events = [r for r in commentary_results if r['event_type'] not in ['Match Start', 'Full Time']]

    # Extract key events
    goals = [r for r in match_events if r['event_type'] == 'Goal']
    cards = [r for r in match_events if r['event_type'] == 'Card']
    substitutions = [r for r in match_events if r['event_type'] == 'Substitution']

    # Calculate final score
    home_goals = len([g for g in goals if g['team'] == context.get('home_team', 'Home')])
    away_goals = len([g for g in goals if g['team'] == context.get('away_team', 'Away')])

    summary = f"""
🏆 MATCH SUMMARY
{context.get('home_team', 'Home')} {home_goals} - {away_goals} {context.get('away_team', 'Away')}
{context.get('competition', 'Match')} • {context.get('match_date', 'Date TBD')}

⚽ Goals Scored: {len(goals)}
🟨 Yellow Cards: {len([c for c in cards if c.get('details', {}).get('card_type', 'Yellow') == 'Yellow'])}
🟥 Red Cards: {len([c for c in cards if c.get('details', {}).get('card_type', 'Red') == 'Red'])}
🔄 Substitutions: {len(substitutions)}

📊 Total Events Covered: {len(match_events)}
📝 Commentary Segments: {len(commentary_results)}
🎙️ Total Words Generated: {sum(r['word_count'] for r in commentary_results)}
"""

    if goals:
        summary += "\n🥅 Goal Scorers:\n"
        for i, goal in enumerate(goals, 1):
            summary += f"   {i}. {goal['minute']}' - {goal['player']} ({goal['team']})\n"

    return summary

# MAIN EXECUTION
if __name__ == "__main__":
    print("🎮 ENHANCED SPORTS COMMENTARY GENERATION SYSTEM")
    print("=" * 60)
    print("Starting complete commentary generation process...\n")

    # Generate commentary
    results, events, context = generate_commentary_from_loaded_match()

    if results:
        print("\n" + "=" * 60)
        print("🎊 GENERATION COMPLETE! Additional Options:")
        print("=" * 60)

        # Create summary
        summary = create_match_summary(results, context)
        print(f"{summary}")

        # Export option
        export_commentary_to_text(results, "enhanced_match_commentary.txt")

        print(f"\n✨ Enhanced Commentary System Complete!")
        print(f"📁 Check 'enhanced_match_commentary.txt' for full transcript")

    else:
        print("❌ Commentary generation failed")

# Quick function for external use
def quick_commentary_generation(events_data, context_data, max_events=None):
    """Quick function for external use - processes ALL events by default"""
    return generate_enhanced_template_commentary(events_data, context_data, max_events)

🎮 ENHANCED SPORTS COMMENTARY GENERATION SYSTEM
Starting complete commentary generation process...

🚀 STARTING COMPLETE COMMENTARY GENERATION SYSTEM
🔍 Attempting to load real match data...
🔍 SEARCHING FOR OPTIMAL MATCH WITH RICH EVENT COVERAGE
📋 Found 74 competitions to search

📊 Competition 1: 1. Bundesliga (2023/2024)
   Available matches: 34
   ✅ Good match 1: Bayer Leverkusen vs Werder Bremen
      Events: 131, Score: 883.0
      Time span: 87 minutes
   ✅ Good match 2: Union Berlin vs Bayer Leverkusen
      Events: 168, Score: 1041.0
      Time span: 95 minutes
   ✅ Good match 3: Eintracht Frankfurt vs Bayer Leverkusen
      Events: 151, Score: 917.0
      Time span: 92 minutes
   ✅ Good match 4: Bochum vs Bayer Leverkusen
      Events: 146, Score: 961.0
      Time span: 93 minutes
   ✅ Good match 5: Bayer Leverkusen vs Augsburg
      Events: 150, Score: 990.0
      Time span: 93 minutes
   ✅ Good match 6: Bayer Leverkusen vs Hoffenheim
      Events: 194, Score: 1240.0
      Time s

In [70]:
from google.colab import files
files.download('enhanced_match_commentary.txt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>