In [None]:
# Check GPU availability
!nvidia-smi

import torch
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"GPU device: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'CPU'}")

# Install required packages
!pip install transformers accelerate datasets torch bitsandbytes -q
!pip install huggingface_hub tokenizers -q

print("✅ Environment setup complete!")

Sun May 25 19:38:54 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   49C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# 4-bit NF4 quantization with double quantization; compute dtype is float16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

print("🔄 Loading Phi-4...")

try:
    # Tokenizer first, then the quantized model; accelerate decides placement
    # because device_map="auto".
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        "microsoft/Phi-4",
        torch_dtype=torch.float16,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=bnb_config,
    )

    # Reuse the EOS token for padding when the tokenizer defines no pad token.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    for status_line in (
        "✅ Successfully loaded Phi-4",
        f"🎯 Device: {device}",
        "🤖 Model ready for inference",
    ):
        print(status_line)

except Exception as load_error:
    # Best effort: report the failure and leave both handles as None so later
    # cells can detect that loading did not succeed.
    print(f"❌ Failed to load Phi-4: {load_error!s}")
    model = None
    tokenizer = None

🔄 Loading Phi-4...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.25M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.50k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/802 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.4k [00:00<?, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.77G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.77G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

✅ Successfully loaded Phi-4
🎯 Device: cuda
🤖 Model ready for inference


**Loading and Preprocessing**

In [None]:
import requests
import json
import pandas as pd

class StatsBombAPI:
    """Small read-only client for the StatsBomb open-data JSON files on GitHub."""

    def __init__(self, timeout=30):
        """Create the client.

        Args:
            timeout: Seconds to wait for each HTTP request. Without a timeout
                `requests.get` can block indefinitely on a stalled connection.
        """
        self.base_url = "https://raw.githubusercontent.com/statsbomb/open-data/master/data"
        self.timeout = timeout

    def _get(self, path):
        """GET `path` (relative to `base_url`) using the configured timeout."""
        return requests.get(f"{self.base_url}/{path}", timeout=self.timeout)

    def get_competitions(self):
        """Get all available competitions.

        Returns:
            The decoded JSON body of `competitions.json`.

        Raises:
            requests.HTTPError: when the server answers with an error status.
                Raised explicitly so a failed download is not mistaken for a
                JSON decoding problem.
        """
        response = self._get("competitions.json")
        response.raise_for_status()
        return response.json()

    def get_matches(self, competition_id, season_id):
        """Get matches for a specific competition/season.

        Returns:
            The decoded JSON body, or an empty list when the status is not 200.
        """
        response = self._get(f"matches/{competition_id}/{season_id}.json")
        if response.status_code == 200:
            return response.json()
        return []

    def get_events(self, match_id):
        """Get events for a specific match.

        Returns:
            The decoded JSON body, or an empty list when the status is not 200.
        """
        response = self._get(f"events/{match_id}.json")
        if response.status_code == 200:
            return response.json()
        return []

# Instantiate the client defined above
sb_api = StatsBombAPI()

print("📊 Getting competitions...")
competitions = sb_api.get_competitions()
print(f"Found {len(competitions)} competitions")

# Tabular preview of the competition list
comp_df = pd.DataFrame(competitions)
print("\nAvailable competitions:")
print(comp_df.head(10))

# The first listed competition/season drives the rest of this cell
comp = competitions[0]
print(f"\n🏟️ Getting matches for {comp['competition_name']}...")
matches = sb_api.get_matches(comp['competition_id'], comp['season_id'])
print(f"Found {len(matches)} matches")

if matches:
    # Preview a few fixtures
    match_df = pd.DataFrame(matches)
    print("\nSample matches:")
    print(match_df[['match_date', 'home_team', 'away_team']].head())

    # Pull the event stream of the first fixture
    match_id = matches[0]['match_id']
    print(f"\n⚽ Getting events for match {match_id}...")
    events = sb_api.get_events(match_id)
    print(f"Found {len(events)} events")

    # A goal is a 'Shot' event whose shot outcome is 'Goal'
    goals = [
        {
            'minute': shot_event.get('minute'),
            'player': shot_event.get('player', {}).get('name'),
            'team': shot_event.get('team', {}).get('name'),
        }
        for shot_event in events
        if shot_event.get('type', {}).get('name') == 'Shot'
        and shot_event.get('shot', {}).get('outcome', {}).get('name') == 'Goal'
    ]

    print(f"\n🥅 Found {len(goals)} goals:")
    for goal in goals:
        print(f"  {goal['minute']}' - {goal['player']} ({goal['team']})")

    # Tally how often each event type occurs (first-seen order is preserved)
    event_types = {}
    for event in events:
        event_type = event.get('type', {}).get('name', 'Unknown')
        if event_type in event_types:
            event_types[event_type] += 1
        else:
            event_types[event_type] = 1

    print("\n📋 Event types in this match:")
    # Ten most frequent types; the stable sort keeps first-seen order among ties
    top_event_types = sorted(event_types.items(), key=lambda item: -item[1])[:10]
    for event_type, count in top_event_types:
        print(f"  {event_type}: {count}")

📊 Getting competitions...
Found 74 competitions

Available competitions:
   competition_id  season_id country_name        competition_name  \
0               9        281      Germany           1. Bundesliga   
1               9         27      Germany           1. Bundesliga   
2            1267        107       Africa  African Cup of Nations   
3              16          4       Europe        Champions League   
4              16          1       Europe        Champions League   
5              16          2       Europe        Champions League   
6              16         27       Europe        Champions League   
7              16         26       Europe        Champions League   
8              16         25       Europe        Champions League   
9              16         24       Europe        Champions League   

  competition_gender  competition_youth  competition_international  \
0               male              False                      False   
1               male       

In [None]:
# Load the specific match from the list
match_id = 3895302

# First entry of `matches` whose id equals the target, or None when absent
specific_match = next((m for m in matches if m['match_id'] == match_id), None)

if specific_match is None:
    print(f"❌ Match {match_id} not found in matches data")
    print(f"Available match IDs: {[m['match_id'] for m in matches[:5]]}")  # Show first 5
else:
    # Setup match context for the specific match.
    # Competition and season are read from the match record instead of being
    # hard-coded: the previous literal '2024/25' contradicted the match date
    # (2024-04-14). The fallbacks only apply when the record lacks those keys.
    match_context = {
        'home_team': specific_match['home_team']['home_team_name'],
        'away_team': specific_match['away_team']['away_team_name'],
        'competition': specific_match.get('competition', {}).get('competition_name', '1. Bundesliga'),
        'season': specific_match.get('season', {}).get('season_name', 'Unknown'),
        'match_date': specific_match['match_date'],
        'match_id': match_id
    }

    print(f"✅ Found match {match_id}")
    print(f"   {match_context['home_team']} vs {match_context['away_team']}")
    print(f"   Date: {match_context['match_date']}")

    print(f"\n📋 Match Context Set:")
    print(f"   {match_context['home_team']} vs {match_context['away_team']}")
    print(f"   {match_context['competition']} - {match_context['match_date']}")
    print(f"   Match ID: {match_context['match_id']}")

✅ Found match 3895302
   Bayer Leverkusen vs Werder Bremen
   Date: 2024-04-14

📋 Match Context Set:
   Bayer Leverkusen vs Werder Bremen
   1. Bundesliga - 2024-04-14
   Match ID: 3895302


In [None]:
def _resolve_goal_side(team, home_team, away_team):
    """Decide whether `team` is the home or the away side.

    Returns:
        A `(side, exact)` tuple. `side` is 'home' or 'away'. `exact` is True when
        `team` equals one of the two names verbatim, False when the fuzzy
        fallback was needed.
    """
    if team == home_team:
        return 'home', True
    if team == away_team:
        return 'away', True
    # Fuzzy fallback: any word of the home name occurring in `team` means home;
    # everything else counts as away. Non-string values (None/NaN) match nothing.
    team_text = team if isinstance(team, str) else ''
    if any(word in team_text for word in home_team.split()):
        return 'home', False
    return 'away', False


def filter_for_full_match_commentary_specific(events_df, match_context, target_events=400):
    """Filter events for comprehensive full match commentary with specific team names.

    Walks the events in row order, keeps a running score, scores every event with
    `calculate_enhanced_importance_specific`, and returns the most important ones
    in chronological order.

    Args:
        events_df: DataFrame with at least the columns 'minute', 'second',
            'player', 'team' and 'commentary_category'.
        match_context: Mapping with 'home_team' and 'away_team' names.
        target_events: Upper bound on the number of events returned.

    Returns:
        DataFrame of the selected events with an added 'importance_score'
        column, sorted by (minute, second). Empty when `events_df` is empty.
    """
    print(f"🎯 Filtering events for {match_context['home_team']} vs {match_context['away_team']}")
    print(f"   Target: ~{target_events} events for full match commentary")

    if events_df.empty:
        print("❌ No events to filter")
        return pd.DataFrame()

    # Lower importance thresholds for full match coverage.
    # NOTE: tiers are cumulative (each one selects every event at or above its
    # min_score), so the per-tier counts printed below overlap; duplicates are
    # removed afterwards.
    importance_tiers = {
        'critical': {'min_score': 60, 'max_events': 500},      # Goals, red cards, penalties
        'high': {'min_score': 40, 'max_events': 500},         # Great shots, yellow cards, key moments
        'medium': {'min_score': 20, 'max_events': 550},       # Good shots, fouls, tackles in key areas
        'low': {'min_score': 10, 'max_events': 500}           # Other shots, defensive actions, build-up play
    }

    # Convert DataFrame to list for processing
    all_events = []
    match_state = {
        'total_goals': 0,
        'home_goals': 0,
        'away_goals': 0,
        'score': [0, 0],
        'score_diff': 0,
        'minute': 0,
        'half': 1
    }

    home_team = match_context['home_team']
    away_team = match_context['away_team']

    print(f"   🏠 Home team: {home_team}")
    print(f"   ✈️ Away team: {away_team}")

    for idx, row in events_df.iterrows():
        event_dict = row.to_dict()

        # Update match state
        minute = event_dict['minute']
        if minute > 45:
            match_state['half'] = 2

        if event_dict['commentary_category'] == 'Goal':
            match_state['total_goals'] += 1

            # Attribute the goal; exact name matches win over the fuzzy fallback
            side, exact = _resolve_goal_side(event_dict['team'], home_team, away_team)
            match_state[f'{side}_goals'] += 1
            match_state['score'][0 if side == 'home' else 1] += 1
            label = f"{side.upper()} GOAL" if exact else f"{side.upper()} GOAL (matched)"
            print(f"   ⚽ {label}: {event_dict['player']} ({event_dict['team']}) - {minute}'")

            match_state['score_diff'] = abs(match_state['score'][0] - match_state['score'][1])

        match_state['minute'] = minute

        # Calculate importance with enhanced scoring (sees the state AFTER this
        # event's goal, if any, has been counted)
        importance = calculate_enhanced_importance_specific(event_dict, match_state, match_context)
        event_dict['importance_score'] = importance

        all_events.append(event_dict)

    print(f"\n📊 Final Score: {home_team} {match_state['score'][0]} - {match_state['score'][1]} {away_team}")

    # Separate events by importance tiers
    selected_events = []

    for tier_name, tier_config in importance_tiers.items():
        tier_events = [e for e in all_events if e['importance_score'] >= tier_config['min_score']]

        # Sort by importance within tier
        tier_events.sort(key=lambda x: x['importance_score'], reverse=True)

        # Take top events from this tier
        selected_tier_events = tier_events[:tier_config['max_events']]
        selected_events.extend(selected_tier_events)

        print(f"   📊 {tier_name.capitalize()} tier: {len(selected_tier_events)} events (min score: {tier_config['min_score']})")

    # Remove duplicates (the same event is picked up by every tier it qualifies for)
    seen_events = set()
    unique_events = []
    for event in selected_events:
        event_key = (event['minute'], event['second'], event['player'], event['commentary_category'])
        if event_key not in seen_events:
            seen_events.add(event_key)
            unique_events.append(event)

    # Sort all selected events by time
    unique_events.sort(key=lambda x: (x['minute'], x['second']))

    # If we have too many events, prioritize the most important
    if len(unique_events) > target_events:
        unique_events.sort(key=lambda x: x['importance_score'], reverse=True)
        unique_events = unique_events[:target_events]
        unique_events.sort(key=lambda x: (x['minute'], x['second']))

    # Convert back to DataFrame
    filtered_df = pd.DataFrame(unique_events)

    print(f"\n✅ Selected {len(filtered_df)} events for full match commentary")

    # Show summary
    if not filtered_df.empty:
        print(f"\n📊 Event Summary:")
        for category, count in filtered_df['commentary_category'].value_counts().items():
            avg_importance = filtered_df[filtered_df['commentary_category'] == category]['importance_score'].mean()
            print(f"   • {category}: {count} events (avg: {avg_importance:.1f})")

        # Show goals by team
        goals_df = filtered_df[filtered_df['commentary_category'] == 'Goal']
        if not goals_df.empty:
            print(f"\n⚽ Goals by Team:")
            # Same attribution rule as the scoreboard above. A first-word
            # substring test would count every goal for both sides whenever the
            # two team names start with the same word.
            goal_sides = [_resolve_goal_side(team, home_team, away_team)[0] for team in goals_df['team']]
            print(f"   • {home_team}: {goal_sides.count('home')} goals")
            print(f"   • {away_team}: {goal_sides.count('away')} goals")

            print(f"\n🥅 Goal Timeline:")
            for _, goal in goals_df.iterrows():
                print(f"   • {goal['minute']}' - {goal['player']} ({goal['team']}) - Importance: {goal['importance_score']:.1f}")

    return filtered_df

def calculate_enhanced_importance_specific(event, match_state, match_context):
    """Enhanced importance calculation with specific match context.

    Args:
        event: Mapping with 'commentary_category' and 'minute'; optional keys
            read via `.get`: 'shot_outcome', 'shot_xg', 'card_type', 'location_x'.
        match_state: Mapping with 'total_goals' and 'score_diff' (only read for
            goal events).
        match_context: Accepted for interface compatibility; not used here.

    Returns:
        Importance score, capped at 100.
    """
    event_type = event['commentary_category']
    minute = event['minute']

    # Enhanced base weights
    base_weights = {
        'Goal': 100,
        'Shot': 55,
        'Card': 60,
        'Substitution': 55,
        'Foul': 30,
        'Tackle': 25,
        'Interception': 20,
        'Block': 35,
        'Clearance': 20,
        'Offside': 40
    }

    base_weight = base_weights.get(event_type, 15)

    # Enhanced shot outcome modifiers
    if event_type == 'Shot':
        outcome = event.get('shot_outcome', 'Unknown')
        # Treat a missing/None xG as 0; NaN fails both comparisons below and so
        # earns no bonus.
        xg = event.get('shot_xg', 0)
        if xg is None:
            xg = 0

        outcome_modifiers = {
            'Saved': 1.4,
            'Post': 1.7,
            'Crossbar': 1.7,
            'Blocked': 1.2,
            'Off T': 0.9,
            'Wide': 0.8,
            'Wayward': 0.6
        }
        base_weight *= outcome_modifiers.get(outcome, 1.0)

        # xG bonus - the higher threshold must be tested first, otherwise the
        # 1.5 multiplier can never be reached.
        if xg > 0.6:
            base_weight *= 1.5
        elif xg > 0.3:
            base_weight *= 1.3

    # Enhanced card modifiers
    elif event_type == 'Card':
        card_type = event.get('card_type', 'Yellow')
        if 'Red' in str(card_type):
            base_weight = 95
        elif 'Second Yellow' in str(card_type):
            base_weight = 85
        else:
            base_weight = 50

    # Enhanced time modifiers for narrative flow.
    # Stoppage time is tested before "late drama" so that it is reachable.
    time_modifier = 1.0
    if minute >= 90:
        time_modifier = 2.2  # Stoppage time
    elif minute >= 85:
        time_modifier = 1.8  # Late drama
    elif minute <= 3:
        time_modifier = 1.4  # Very early action
    elif minute <= 10:
        time_modifier = 1.2  # Early action
    elif 42 <= minute <= 45:
        time_modifier = 1.3  # Before halftime
    elif 45 < minute <= 50:
        time_modifier = 1.2  # Second half start

    # Enhanced score context with specific teams
    score_modifier = 1.0
    if event_type == 'Goal':
        if match_state['total_goals'] == 1:
            score_modifier = 2.0  # Opening goal
        elif match_state['score_diff'] == 0:
            score_modifier = 1.7  # Equalizer
        elif minute >= 75:
            score_modifier = 1.8  # Late goal
        else:
            score_modifier = 1.3

    # Position-based modifiers (a missing/None location falls back to 50)
    location_x = event.get('location_x', 50)
    if location_x is None:
        location_x = 50
    position_modifier = 1.0
    if location_x >= 88:  # Penalty area
        position_modifier = 1.5
    elif location_x >= 70:  # Final third
        position_modifier = 1.3
    elif location_x <= 30:  # Defensive third
        position_modifier = 1.2

    # Calculate final importance
    importance = base_weight * time_modifier * score_modifier * position_modifier
    return min(importance, 100)

In [None]:
# Run the event filter for the selected match.
# NOTE(review): `processed_events` is not defined in any cell visible here -
# confirm the cell that builds it runs before this one.
full_match_events = filter_for_full_match_commentary_specific(
    processed_events,
    match_context,
    target_events=400,
)

print("\n🚀 SPECIFIC MATCH COMMENTARY SETUP COMPLETE!")
print("=" * 60)
print(f"🏟️ Match: {match_context['home_team']} vs {match_context['away_team']}")
print(f"📊 Total Events for Commentary: {len(full_match_events)}")

if full_match_events.empty:
    print("❌ No events found for this match")
else:
    # One line per headline category: number of rows carrying that label
    _category_column = full_match_events['commentary_category']
    for _label, _category in (
        ("⚽ Goals", 'Goal'),
        ("🎯 Shots", 'Shot'),
        ("🟨 Cards", 'Card'),
        ("🔄 Substitutions", 'Substitution'),
    ):
        print(f"{_label}: {(_category_column == _category).sum()}")

    print("\n✅ Ready for Phi-4 commentary generation!")
    print(f"Match ID {match_id} events loaded successfully")

🎯 Filtering events for Bayer Leverkusen vs Werder Bremen
   Target: ~400 events for full match commentary
   🏠 Home team: Bayer Leverkusen
   ✈️ Away team: Werder Bremen
   ⚽ HOME GOAL: Victor Okoh Boniface (Bayer Leverkusen) - 24'
   ⚽ HOME GOAL: Granit Xhaka (Bayer Leverkusen) - 59'
   ⚽ HOME GOAL: Florian Wirtz (Bayer Leverkusen) - 67'
   ⚽ HOME GOAL: Florian Wirtz (Bayer Leverkusen) - 82'
   ⚽ HOME GOAL: Florian Wirtz (Bayer Leverkusen) - 89'

📊 Final Score: Bayer Leverkusen 5 - 0 Werder Bremen
   📊 Critical tier: 30 events (min score: 60)
   📊 High tier: 33 events (min score: 40)
   📊 Medium tier: 46 events (min score: 20)
   📊 Low tier: 46 events (min score: 10)

✅ Selected 46 events for full match commentary

📊 Event Summary:
   • Shot: 22 events (avg: 93.9)
   • Foul: 16 events (avg: 38.0)
   • Goal: 5 events (avg: 100.0)
   • Card: 3 events (avg: 61.7)

⚽ Goals by Team:
   • Bayer Leverkusen: 5 goals
   • Werder Bremen: 0 goals

🥅 Goal Timeline:
   • 24' - Victor Okoh Boniface

In [None]:
class TrainingDataCreator:
    """Turn filtered match events into prompt/commentary training examples.

    `templates` must expose `get_template(category, subtype=None)` returning a
    `str.format`-style template string.
    """

    # Event categories that are turned into training examples
    COMMENTARY_CATEGORIES = [
        'Goal', 'Shot', 'Card', 'Substitution', 'Interception', 'Block', 'Clearance', 'Tackle', 'Offside'
    ]

    def __init__(self, templates):
        self.templates = templates

    @staticmethod
    def _text(value, default=''):
        """Return `value` when it is a non-empty string, else `default`.

        Guards the `.lower()` / `.split()` calls against None and NaN, which is
        what a missing cell looks like when events arrive as DataFrame rows.
        """
        return value if isinstance(value, str) and value else default

    def create_training_data(self, events_df, match_context, max_examples=None):
        """Create training data from events.

        Args:
            events_df: DataFrame with 'commentary_category', 'minute', 'player'
                and 'team' columns (plus the optional detail columns read by the
                prompt/commentary builders).
            match_context: Mapping describing the match; forwarded to the prompt.
            max_examples: Optional cap; when given, only the first
                `max_examples` commentary-worthy rows (in row order) are used.

        Returns:
            List of dicts with keys 'text', 'event_type', 'minute', 'player',
            'team' and 'importance_score' (None when the column is absent).
        """
        print("📝 Creating training data...")

        training_examples = []

        # Get commentary-worthy events
        commentary_events = events_df[
            events_df['commentary_category'].isin(self.COMMENTARY_CATEGORIES)
        ].copy()

        if max_examples is not None:
            commentary_events = commentary_events.iloc[:max(max_examples, 0)]

        # Create training examples
        for idx, event in commentary_events.iterrows():

            # Create input prompt
            prompt = self.create_event_prompt(event, match_context)

            # Create target commentary
            target = self.create_target_commentary(event)

            # Format as training example
            training_text = f"{prompt}\n\nCommentary: {target}<|endoftext|>"

            training_examples.append({
                "text": training_text,
                "event_type": event['commentary_category'],
                "minute": event['minute'],
                "player": event['player'],
                "team": event['team'],
                "importance_score": event.get('importance_score')
            })

        print(f"✅ Created {len(training_examples)} training examples")
        return training_examples

    def create_event_prompt(self, event, match_context):
        """Create structured prompt for event.

        `event` may be a dict or a pandas Series; it is read with both `[...]`
        and `.get(...)`.
        """
        prompt = f"""Generate professional sports commentary for this football event:

Match: {match_context.get('home_team', 'Home')} vs {match_context.get('away_team', 'Away')}
Competition: {match_context.get('competition', 'Football Match')}
Minute: {event['minute']}'
Event Type: {event['commentary_category']}
Player: {event['player']}
Team: {event['team']}
Position: {event.get('position', 'Player')}"""

        # Add event-specific details
        if event['commentary_category'] == 'Goal':
            # A None xG cannot be formatted with ':.2f'; treat it as 0
            xg = event.get('shot_xg', 0)
            if xg is None:
                xg = 0
            prompt += f"""
Shot Type: {event.get('shot_technique', 'Unknown')}
Body Part: {event.get('shot_body_part', 'Unknown')}
xG: {xg:.2f}"""

        elif event['commentary_category'] == 'Shot':
            prompt += f"""
Shot Outcome: {event.get('shot_outcome', 'Unknown')}
Shot Type: {event.get('shot_technique', 'Unknown')}
Body Part: {event.get('shot_body_part', 'Unknown')}"""

        elif event['commentary_category'] == 'Card':
            prompt += f"""
Card Type: {event.get('card_type', 'Yellow')}
Foul Type: {event.get('foul_type', 'General')}"""

        elif event['commentary_category'] == 'Substitution':
            prompt += f"""
Replacement: {event.get('substitution_replacement', 'Unknown')}"""

        return prompt

    def create_target_commentary(self, event):
        """Create target commentary using enhanced templates.

        Picks a template by event category and a heuristic subtype, then fills
        it with the event's player/team/minute/position. If the template cannot
        be formatted it is returned unfilled.
        """

        player = event['player']
        team = event['team']
        minute = event['minute']
        event_type = event['commentary_category'].lower()

        # Last word of the position label, lower-cased; 'player' when missing
        position_words = self._text(event.get('position'), 'player').split()
        position = position_words[-1].lower() if position_words else 'player'

        # Extra fields only some templates use; str.format ignores unused ones
        extra_fields = {}

        # Goal commentary with subtypes
        if event_type == 'goal':
            shot_technique = self._text(event.get('shot_technique')).lower()
            body_part = self._text(event.get('shot_body_part')).lower()

            if 'header' in body_part:
                subtype = 'header'
            elif 'power' in shot_technique or 'driven' in shot_technique:
                subtype = 'power_finish'
            elif minute > 25:  # Assume long range if later in game
                subtype = 'long_range'
            else:
                subtype = 'placed_finish'

            template = self.templates.get_template('goal', subtype)

        # Shot commentary with subtypes
        elif event_type == 'shot':
            shot_outcome = self._text(event.get('shot_outcome'), 'Saved').lower()

            if 'saved' in shot_outcome:
                subtype = 'saved'
            elif 'off t' in shot_outcome or 'wide' in shot_outcome:
                subtype = 'wide'
            elif 'blocked' in shot_outcome:
                subtype = 'blocked'
            elif 'post' in shot_outcome or 'crossbar' in shot_outcome:
                subtype = 'post_crossbar'
            else:
                subtype = 'saved'  # Default

            template = self.templates.get_template('shot', subtype)

        # Card commentary with subtypes
        elif event_type == 'card':
            card_type = self._text(event.get('card_type'), 'Yellow').lower()
            foul_type = self._text(event.get('foul_type')).lower()

            if 'second yellow' in card_type:
                # A second yellow is a dismissal, not an ordinary booking
                subtype = 'red_second_yellow'
            elif 'red' in card_type:
                if 'violent' in foul_type or 'serious' in foul_type:
                    subtype = 'red_violent'
                else:
                    # No dedicated straight-red subtype is used in this class;
                    # keep the original fallback.
                    subtype = 'red_second_yellow'
            else:  # Yellow card
                if 'dissent' in foul_type:
                    subtype = 'yellow_dissent'
                elif minute > 70:  # Late in game, likely tactical
                    subtype = 'yellow_tactical'
                else:
                    subtype = 'yellow_reckless'

            template = self.templates.get_template('card', subtype)

        # Substitution commentary
        elif event_type == 'substitution':
            replacement = self._text(event.get('substitution_replacement'), 'New Player')

            if minute > 75:
                subtype = 'tactical'
            elif minute < 30:
                subtype = 'injury'
            else:
                subtype = 'performance'

            template = self.templates.get_template('substitution', subtype)
            extra_fields = {'player_out': player, 'player_in': replacement}

        # Defensive actions
        elif event_type in ['tackle', 'interception', 'block', 'clearance']:
            template = self.templates.get_template('defensive', event_type)

        # Offside
        elif event_type == 'offside':
            template = self.templates.get_template('offside')

        else:
            template = f"{player} is involved in the action for {team} in the {minute}th minute!"

        # Fill in the template
        try:
            return template.format(
                player=player,
                team=team,
                minute=minute,
                position=position,
                **extra_fields
            )
        except (KeyError, IndexError, ValueError, AttributeError):
            # Unknown placeholder, malformed template or non-string template:
            # return it unfilled rather than failing the whole data set.
            return template

In [None]:
import random

# Create training data for the specific match.
# NOTE(review): `templates` is not defined in any earlier cell visible here; the
# EnhancedCommentaryTemplates class appears further down the notebook - confirm
# an instance named `templates` exists before this cell runs.
data_creator = TrainingDataCreator(templates)
# `create_training_data(events_df, match_context)` takes no `max_examples`
# argument as defined above, so passing one raised a TypeError.
training_data = data_creator.create_training_data(full_match_events, match_context)

print(f"\n🎯 TRAINING DATA FOR MATCH {match_id}:")
print(f"   • Match: {match_context['home_team']} vs {match_context['away_team']}")
print(f"   • {len(training_data)} training examples created")
print(f"   • Ready for LoRA fine-tuning on this specific match")

# Show a sample training example
if training_data:
    sample = training_data[0]
    print(f"\n📋 Sample Training Example:")
    print(f"Event: {sample['event_type']} - {sample['player']} at {sample['minute']}'")
    # Examples only carry an importance score when the creator records one
    sample_importance = sample.get('importance_score')
    if sample_importance is not None:
        print(f"Importance: {sample_importance:.1f}/100")

📝 Creating training data from 46 events...
✅ Created 46 training examples

📋 Sample Training Example:
Event: Foul - Edmond Fayçal Tapsoba
Text preview: Generate professional football commentary for this event:

Match: Bayer Leverkusen vs Werder Bremen
Competition: 1. Bundesliga
Minute: 5'
Event: Foul
Player: Edmond Fayçal Tapsoba
Team: Bayer Leverkus...

🎯 TRAINING DATA FOR MATCH 3895302:
   • Match: Bayer Leverkusen vs Werder Bremen
   • 46 training examples created
   • Ready for LoRA fine-tuning on this specific match

📋 Sample Training Example:
Event: Foul - Edmond Fayçal Tapsoba at 5'
Importance: 24.0/100


**Commentary Templates**

In [None]:
import random

class EnhancedCommentaryTemplates:
    """Library of football commentary format strings grouped by event type.

    ``self.templates`` maps an event type (``'goal'``, ``'shot'``, ``'card'``,
    ``'substitution'``, ``'defensive'``, ``'foul'``, ``'offside'``, ``'pass'``)
    either to a flat list of templates or to a dict of ``subtype -> list``.

    Templates are *unformatted* ``str.format`` strings. Most use the
    placeholders ``{player}``, ``{team}`` and ``{position}``; the
    ``'substitution'`` templates instead use ``{player_in}``, ``{player_out}``
    and ``{team}``, so callers must supply those keys when formatting them.
    """

    # Generic line returned when no template is registered for an event type.
    _FALLBACK_TEMPLATE = "Action from {player} for {team}!"

    def __init__(self):
        self.templates = self._create_comprehensive_templates()

    def _create_comprehensive_templates(self):
        """Create extensive commentary templates for all situations with maximum diversity"""
        return {
            'goal': {
                'power_finish': [
                    # Explosive energy variants
                    "GOAL! {player} absolutely demolishes it for {team}! Pure dynamite from the {position}!",
                    "THUNDERBOLT! {player} unleashes hell for {team}! The net nearly bursts from that missile!",
                    "BOOM! {player} hammers home a screamer for {team}! Goalkeeper didn't even see it coming!",
                    "UNSTOPPABLE! {player} rifles it into the roof for {team}! Sheer brutal force meets perfect technique!",
                    "CARNAGE! {player} obliterates the ball for {team}! That's a strike that could break the sound barrier!",
                    "NUCLEAR! {player} detonates one for {team}! The power on that finish was absolutely frightening!",
                    "DEVASTATING! {player} launches an Exocet missile for {team}! Raw, unadulterated power!",
                    "CANNONBALL! {player} fires a bullet for {team}! The keeper's gloves would have caught fire!",
                    "EXPLOSIVE! {player} rockets it home for {team}! That's a finish with the force of a freight train!",
                    "FEROCIOUS! {player} batters the ball home for {team}! Savage power from close range!"
                ],
                'placed_finish': [
                    # Precision and artistry variants
                    "SUBLIME! {player} caresses it home for {team}! That's footballing silk from the {position}!",
                    "POETRY! {player} guides it to perfection for {team}! Michelangelo couldn't have painted it better!",
                    "EXQUISITE! {player} strokes it into the corner for {team}! Ballet-like grace in that finish!",
                    "CLINICAL! {player} dissects the goal for {team}! Surgical precision meets ice-cold composure!",
                    "DELICATE! {player} places it with a feather's touch for {team}! That's artistry of the highest order!",
                    "MASTERCLASS! {player} picks his spot magnificently for {team}! Chess-like calculation in the finish!",
                    "SILKEN! {player} threads it home for {team}! Smooth as butter, precise as a Swiss watch!",
                    "CULTURED! {player} finds the postage stamp for {team}! That's finishing school perfection!",
                    "GENIUS! {player} slots home with academic precision for {team}! PhD-level composure!",
                    "VELVET! {player} cushions it perfectly for {team}! Like placing a baby in its crib!"
                ],
                'header': [
                    # Aerial prowess variants
                    "TOWERING! {player} climbs like a skyscraper for {team}! Majestic in the air!",
                    "AERIAL MAJESTY! {player} rises like a phoenix for {team}! Commanding the airspace!",
                    "IMPERIAL! {player} reigns supreme for {team}! That's aerial dominance at its peak!",
                    "GRAVITATIONAL! {player} defies physics for {team}! He seemed to hang in the air forever!",
                    "MOUNTAINOUS! {player} peaks magnificently for {team}! Everest-like elevation!",
                    "STRATOSPHERIC! {player} launches skyward for {team}! NASA would be proud of that altitude!",
                    "CATHEDRAL! {player} builds upward for {team}! Gothic architecture in human form!",
                    "SPECTACULAR! {player} orchestrates a symphony in the air for {team}! Beethoven-esque timing!",
                    "REGAL! {player} crowns himself king of the box for {team}! Royal blood in that leap!",
                    "MAGNIFICENT! {player} soars like an eagle for {team}! Predatory instinct meets perfect timing!"
                ],
                'long_range': [
                    # Distance shooting variants
                    "OUTRAGEOUS! {player} tries his luck from the car park for {team}! Absolutely sensational!",
                    "HOLLYWOOD! {player} scripts a blockbuster for {team}! That's pure movie magic from 35 yards!",
                    "AUDACIOUS! {player} attempts the impossible for {team}! The audacity pays off spectacularly!",
                    "TELESCOPIC! {player} finds the target from orbit for {team}! GPS couldn't have been more accurate!",
                    "METEORITE! {player} strikes from another galaxy for {team}! That ball came down with snow on it!",
                    "RIDICULOUS! {player} breaks the laws of physics for {team}! Somebody call the science police!",
                    "PREPOSTEROUS! {player} conjures magic from thin air for {team}! Houdini would tip his hat!",
                    "EXTRAORDINARY! {player} launches from the stratosphere for {team}! That's intergalactic precision!",
                    "UNTHINKABLE! {player} delivers the undeliverable for {team}! Pure science fiction made reality!",
                    "LEGENDARY! {player} writes folklore for {team}! They'll be telling stories about that for decades!"
                ],
                'tap_in': [
                    # Simple finishes with flair
                    "INEVITABLE! {player} applies the coup de grâce for {team}! Right place, right time, right result!",
                    "PREDATORY! {player} pounces like a panther for {team}! Instinctive finishing from the fox in the box!",
                    "SIMPLE! {player} does the necessary for {team}! Sometimes football's beautiful simplicity wins!",
                    "OPPORTUNISTIC! {player} seizes the moment for {team}! That's what separates strikers from defenders!",
                    "RUTHLESS! {player} shows killer instinct for {team}! No mercy, no hesitation, just pure execution!",
                    "TEXTBOOK! {player} follows the manual for {team}! Finishing 101 demonstrated to perfection!"
                ]
            },

            'shot': {
                'saved': [
                    # Goalkeeper heroics variants
                    "{player} unleashes fury for {team}! DENIED! What superhuman reflexes from the last line of defense!",
                    "{player} goes for broke for {team}! SPECTACULAR SAVE! The keeper pulls a rabbit from his hat!",
                    "{player} tries to break the internet for {team}! INCREDIBLE! Flying hands keep dreams alive!",
                    "{player} seeks perfection for {team}! BRILLIANT! The keeper defies gravity with that save!",
                    "{player} summons his best for {team}! PHENOMENAL! Goalkeeping of the highest caliber!",
                    "{player} tests the keeper for {team}! EXTRAORDINARY! What reactions, what positioning!",
                    "{player} goes hunting for {team}! MAGNIFICENT! The keeper turns poacher into prey!",
                    "{player} rolls the dice for {team}! STUNNING! Lady Luck favors the brave between the sticks!",
                    "{player} demands answers for {team}! SENSATIONAL! The keeper provides all the solutions!",
                    "{player} knocks on heaven's door for {team}! DIVINE! The keeper plays the role of St. Peter!"
                ],
                'wide': [
                    # Missing the target variants
                    "{player} searches for perfection for {team}! Agonizingly close but the angle deceives him!",
                    "{player} paints the town for {team}! Just missing the canvas by the finest of margins!",
                    "{player} seeks the sublime for {team}! The geometry wasn't quite in his favor there!",
                    "{player} chases glory for {team}! Sometimes the hunter becomes the hunted by the crosswind!",
                    "{player} courts danger for {team}! The ball refuses to bend to his will on this occasion!",
                    "{player} flirts with history for {team}! Millimeters separate triumph from heartbreak!",
                    "{player} dances with destiny for {team}! The target plays hard to get in the crucial moment!",
                    "{player} reaches for the stars for {team}! Gravity pulls his hopes earthward!",
                    "{player} chases perfection for {team}! The gods of geometry deny him this time!",
                    "{player} hunts for magic for {team}! The enchantment slips through his fingers!"
                ],
                'blocked': [
                    # Defensive heroics variants
                    "{player} seeks passage for {team}! BLOCKED! Human barricade stands firm in the storm!",
                    "{player} tests resolve for {team}! DENIED! Bodies on the line, hearts on sleeves!",
                    "{player} demands entry for {team}! REJECTED! The defensive wall refuses to crumble!",
                    "{player} probes for weakness for {team}! REBUFFED! Steel-like determination in defense!",
                    "{player} searches for cracks for {team}! THWARTED! Defensive solidarity at its absolute finest!",
                    "{player} hunts for space for {team}! NULLIFIED! The rearguard action holds strong!",
                    "{player} seeks destruction for {team}! PREVENTED! Heroic defending saves the day!",
                    "{player} courts chaos for {team}! CONTAINED! Organization triumphs over individual brilliance!",
                    "{player} chases breakthrough for {team}! FRUSTRATED! The defensive unit stands as one!",
                    "{player} demands breakthrough for {team}! STONEWALLED! Defensive resilience personified!"
                ],
                'post_crossbar': [
                    # Woodwork variants
                    "{player} seeks perfection for {team}! WOODWORK! The frame of the goal becomes the 12th man!",
                    "{player} courts glory for {team}! CROSSBAR! Sometimes luck favors the architectural!",
                    "{player} chases immortality for {team}! POST! The metalwork plays spoiler in this tale!",
                    "{player} demands entry for {team}! DENIED BY INCHES! The woodwork stands sentinel!",
                    "{player} flirts with greatness for {team}! SO CLOSE! The frame mocks his best intentions!",
                    "{player} seeks vindication for {team}! RATTLES THE BAR! Physics conspires against perfection!",
                    "{player} hunts for magic for {team}! STRIKES IRON! The goal frame plays cruel tricks!",
                    "{player} courts destiny for {team}! THUNDERS OFF THE POST! Millimeters from eternal glory!",
                    "{player} chases dreams for {team}! CANNONS OFF THE CROSSBAR! The frame plays judge and jury!",
                    "{player} demands satisfaction for {team}! DENIED BY GEOMETRY! The angles refuse to cooperate!"
                ]
            },

            'card': {
                'yellow_tactical': [
                    "Calculated cynicism from {player}! The {team} {position} takes one for the collective good!",
                    "Professional foul executed by {player}! Tactical nous from the experienced {team} campaigner!",
                    "Strategic intervention from {player}! The {team} defender reads the manual perfectly!",
                    "Deliberate disruption from {player}! Chess-like thinking from the {team} tactician!",
                    "Cynical but clever from {player}! The {team} {position} sacrifices himself for the greater good!",
                    "Textbook game management from {player}! The {team} veteran knows exactly what he's doing!",
                    "Academic fouling from {player}! PhD-level tactical awareness from the {team} schemer!",
                    "Machiavellian from {player}! The dark arts practiced to perfection by the {team} master!"
                ],
                'yellow_reckless': [
                    "Reckless abandon from {player}! The {team} {position} lets passion override precision!",
                    "Wild and woolly from {player}! Testosterone trumps technique for the {team} warrior!",
                    "Uncontrolled aggression from {player}! The red mist descends on the {team} hothead!",
                    "Dangerous play from {player}! The {team} {position} crosses the line between brave and stupid!",
                    "Rash decision-making from {player}! The {team} player lets emotions rule over logic!",
                    "Ill-judged challenge from {player}! The {team} {position} picks the wrong moment for heroics!",
                    "Overzealous from {player}! The {team} player's enthusiasm exceeds his execution!",
                    "Poorly timed aggression from {player}! The {team} {position} misjudges the moment badly!"
                ],
                'red_violent': [
                    "MADNESS from {player}! Complete mental meltdown costs {team} dearly!",
                    "VOLCANIC ERUPTION from {player}! The {team} {position} explodes spectacularly!",
                    "BRAIN EXPLOSION from {player}! Ten seconds of stupidity ruins {team}'s afternoon!",
                    "SELF-DESTRUCTION from {player}! The {team} player becomes his own worst enemy!",
                    "MOMENT OF LUNACY from {player}! Temporary insanity leaves {team} down to ten!",
                    "CATASTROPHIC JUDGMENT from {player}! The {team} {position} throws his team under the bus!",
                    "MELTDOWN COMPLETE from {player}! The {team} player loses his mind at the worst possible time!",
                    "PROFESSIONAL SUICIDE from {player}! Career-threatening stupidity from the {team} man!"
                ]
            },

            'substitution': [
                "Tactical alchemy from the sideline! {player_in} replaces {player_out} as {team} shuffles the deck!",
                "Fresh blood for {team}! {player_in} enters the theater as {player_out} takes his bow!",
                "Strategic masterstroke brewing! {player_in} comes on for {player_out} as {team} changes the script!",
                "Personnel adjustment for {team}! {player_out} hands over the baton to {player_in} in this relay race!",
                "Managerial intervention! {player_in} replaces {player_out} as {team} seeks a different formula!",
                "Chess move from the touchline! {player_in} for {player_out} as {team} repositions their pieces!",
                "Fresh legs, fresh ideas! {player_in} takes over from {player_out} as {team} injects new energy!",
                "Calculated gamble! {player_in} enters for {player_out} as {team} rolls the tactical dice!",
                "Change of personnel for {team}! {player_out} makes way for {player_in} in this human chess match!",
                "Tactical tweak incoming! {player_in} replaces {player_out} as {team} fine-tunes their approach!"
            ],

            'defensive': {
                'tackle': [
                    "Surgical precision from {player}! The {team} {position} operates with scalpel-like accuracy!",
                    "Defensive artistry from {player}! The {team} maestro conducts a masterclass in timing!",
                    "Clinical intervention from {player}! The {team} {position} performs defensive surgery!",
                    "Textbook technique from {player}! The {team} professor gives a lesson in proper defending!",
                    "Perfectly calibrated from {player}! The {team} {position} times it to millisecond precision!",
                    "Academic defending from {player}! The {team} scholar demonstrates PhD-level technique!",
                    "Defensive poetry from {player}! The {team} artist paints a masterpiece with his feet!",
                    "Surgical strike from {player}! The {team} {position} operates with medical precision!"
                ],
                'interception': [
                    "Telepathic reading from {player}! The {team} {position} seems to predict the future!",
                    "Psychic powers from {player}! The {team} mind-reader anticipates everything!",
                    "Crystal ball defending from {player}! The {team} fortune-teller sees all!",
                    "Nostradamus-like from {player}! The {team} prophet predicts and prevents!",
                    "ESP defending from {player}! The {team} {position} reads minds with supernatural ability!",
                    "Clairvoyant skills from {player}! The {team} mystic sees the pass before it's played!",
                    "Oracle-like wisdom from {player}! The {team} sage knows all and sees all!",
                    "Mind-reading mastery from {player}! The {team} mentalist intercepts thoughts and passes!"
                ],
                'block': [
                    "Kamikaze courage from {player}! The {team} {position} sacrifices body for cause!",
                    "Gladiatorial bravery from {player}! The {team} warrior throws himself into the colosseum!",
                    "Heroic sacrifice from {player}! The {team} martyr pays the physical price!",
                    "Spartan resolve from {player}! The {team} defender channels ancient warrior spirit!",
                    "Lionheart courage from {player}! The {team} {position} fears no physical consequences!",
                    "Medieval valor from {player}! The {team} knight charges into battle fearlessly!",
                    "Battlefield bravery from {player}! The {team} soldier takes one for the regiment!",
                    "Samurai spirit from {player}! The {team} warrior embodies bushido principles!"
                ],
                'clearance': [
                    "Nuclear option from {player}! The {team} {position} launches it into orbit!",
                    "Sledgehammer approach from {player}! The {team} defender chooses power over precision!",
                    "No-nonsense from {player}! The {team} pragmatist picks function over form!",
                    "Agricultural approach from {player}! The {team} farmer hoofs it to the next county!",
                    "Caveman defending from {player}! The {team} prehistoric specialist goes basic!",
                    "Industrial-strength from {player}! The {team} bulldozer clears everything in sight!",
                    "Old-school from {player}! The {team} traditionalist keeps it simple and effective!",
                    "Brutalist architecture from {player}! The {team} {position} favors function over beauty!"
                ]
            },

            'foul': [
                "Transgression from {player}! The {team} {position} crosses the line of acceptable conduct!",
                "Infringement by {player}! The {team} player steps outside the boundaries of fair play!",
                "Misdemeanor from {player}! The {team} {position} commits a footballing crime!",
                "Violation by {player}! The {team} player breaks the sacred laws of the game!",
                "Breach of conduct from {player}! The {team} {position} oversteps the mark!",
                "Indiscretion from {player}! The {team} player makes an error of judgment!",
                "Transgression in the rulebook from {player}! The {team} {position} requires referee intervention!",
                "Footballing felony from {player}! The {team} perpetrator faces judicial review!"
            ],

            'offside': [
                "Premature celebration from {player}! The {team} {position} jumps the gun by milliseconds!",
                "Temporal miscalculation from {player}! The {team} {position} mistimes the space-time continuum!",
                "Eagerness exceeds legality for {player}! The {team} hunter becomes the hunted by geometry!",
                "Physics denies {player}! The {team} {position} falls victim to the laws of positioning!",
                "Marginal but fatal for {player}! The {team} {position} pays for split-second impatience!",
                "Geometric injustice for {player}! The {team} {position} becomes a victim of angles!",
                "Millimeter madness for {player}! The {team} {position} loses the game of inches!",
                "Mathematical precision denies {player}! The {team} {position} falls to pure calculation!"
            ],

            'pass': {
                'successful_short': [
                    "Silk-smooth from {player}! The {team} {position} threads it beautifully!",
                    "Velvet touch from {player}! The {team} maestro caresses it to his teammate!",
                    "Poetry in motion from {player}! The {team} artist paints with his feet!",
                    "Perfection from {player}! The {team} {position} finds his man with GPS precision!"
                ],
                'successful_long': [
                    "Telescopic vision from {player}! The {team} {position} picks out his target from distance!",
                    "Satellite navigation from {player}! The {team} quarterback delivers a 40-yard strike!",
                    "Intercontinental ballistic from {player}! The {team} {position} launches it across the field!",
                    "GPS-guided from {player}! The {team} {position} finds his man from another postal code!"
                ],
                'failed': [
                    "Communication breakdown from {player}! The {team} {position} and his teammate speak different languages!",
                    "Misfiring from {player}! The {team} {position} picks out the wrong address!",
                    "Technical malfunction from {player}! The {team} {position}'s radar seems to be offline!",
                    "System error from {player}! The {team} {position} needs a software update!"
                ]
            }
        }

    def get_template(self, event_type, subtype=None, intensity='medium'):
        """Return one randomly chosen, unformatted template for an event.

        Args:
            event_type: Key into ``self.templates`` (e.g. ``'goal'``, ``'foul'``).
            subtype: Optional sub-category for event types stored as a dict
                (e.g. ``'header'`` for ``'goal'``). Ignored for flat lists.
            intensity: Accepted for interface compatibility; it does not
                influence the selection.

        Returns:
            A ``str.format`` template. When the event type is unknown or has
            no templates, a generic ``{player}``/``{team}`` line is returned.
        """
        candidates = self.templates.get(event_type)

        if isinstance(candidates, dict):
            pool = candidates.get(subtype) if subtype else None
            if not pool:
                # Unknown, missing or empty subtype: draw uniformly from every
                # template of the event type (so larger subtypes are simply
                # more likely; there is no explicit weighting).
                pool = [line for group in candidates.values() for line in group]
        else:
            pool = candidates

        return random.choice(pool) if pool else self._FALLBACK_TEMPLATE

    @staticmethod
    def _minute_as_number(raw_minute):
        """Convert a minute value to float, or None when it is not numeric."""
        try:
            return float(raw_minute)
        except (TypeError, ValueError):
            return None

    def get_contextual_template(self, event_type, subtype=None, context=None):
        """Return a template, optionally prefixed according to the match minute.

        Args:
            event_type: Forwarded to :meth:`get_template`.
            subtype: Forwarded to :meth:`get_template`.
            context: Optional mapping. Only its ``'minute'`` entry is read.

        Returns:
            The base template, prefixed with an urgency word when the minute
            is above 85 or with an early-game phrase when it is below 10.
            A missing ``'minute'`` key keeps the historical default of 0 (and
            therefore gets an early-game phrase); a minute that is present
            but ``None`` or otherwise non-numeric adds no prefix instead of
            raising ``TypeError``.
        """
        base_template = self.get_template(event_type, subtype)

        # Add contextual modifiers
        if context:
            minute = self._minute_as_number(context.get('minute', 0))
            if minute is None:
                return base_template

            if minute > 85:
                intensity_words = ['DRAMATIC!', 'CRUCIAL!', 'VITAL!', 'DECISIVE!']
                base_template = f"{random.choice(intensity_words)} {base_template}"
            elif minute < 10:
                early_words = ['Early doors,', 'Straight from kickoff,', 'Within minutes,']
                base_template = f"{random.choice(early_words)} {base_template}"

        return base_template

# Initialize the shared template library once at module level.
# Other cells in this notebook reference this name directly
# (e.g. `TrainingDataCreator(templates)`), so this cell must have run first.
templates = EnhancedCommentaryTemplates()

**Phi-4 with LoRA**

In [None]:
def setup_lora_training(model, tokenizer):
    """Prepare a k-bit quantized model for training and attach LoRA adapters.

    Args:
        model: The loaded base causal-LM (expected to be quantized, since it is
            passed through ``prepare_model_for_kbit_training``).
        tokenizer: Accepted for interface compatibility; not used here.

    Returns:
        tuple: ``(peft_model, lora_config)`` — the LoRA-wrapped model and the
        ``LoraConfig`` that was applied.
    """
    from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

    print("🔧 Setting up LoRA training configuration...")

    # Prepare model for k-bit training
    model = prepare_model_for_kbit_training(model)

    # PEFT matches target_modules against module-name suffixes and ignores
    # names that do not occur, so listing both layouts is safe.
    # Phi-3/Phi-4 style checkpoints fuse the projections into `qkv_proj` and
    # `gate_up_proj`; with only the split names below them, just `o_proj` and
    # `down_proj` would be adapted on such a model.
    # NOTE(review): the 21,299,200 trainable parameters logged by the original
    # run are consistent with exactly that (o_proj + down_proj only) — confirm
    # with `model.named_modules()`. Expect a larger trainable count after this fix.
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=16,  # Rank - balance between performance and efficiency
        lora_alpha=32,  # Scaling factor
        lora_dropout=0.1,
        target_modules=[
            "qkv_proj", "o_proj",                    # Fused attention (Phi-3 / Phi-4)
            "gate_up_proj", "down_proj",             # Fused MLP (Phi-3 / Phi-4)
            "q_proj", "k_proj", "v_proj",            # Split attention (LLaMA-style)
            "gate_proj", "up_proj"                   # Split MLP (LLaMA-style)
        ],
        bias="none"
    )

    # Apply LoRA to the model
    peft_model = get_peft_model(model, lora_config)

    # Print trainable parameters
    print("📊 LoRA Configuration Applied:")
    peft_model.print_trainable_parameters()

    return peft_model, lora_config

# Apply LoRA setup to the base model. `model` and `tokenizer` must already
# exist in the kernel (they are created by the Phi-4 loading cell near the top
# of the notebook); this cell fails with NameError on a fresh kernel otherwise.
peft_model, lora_config = setup_lora_training(model, tokenizer)

🔧 Setting up LoRA training configuration...




📊 LoRA Configuration Applied:
trainable params: 21,299,200 || all params: 14,680,806,400 || trainable%: 0.1451


In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import Dataset
import json

def setup_peft_model(model, target_modules=None):
    """Set up PEFT model with LoRA configuration.

    Args:
        model: Base causal-LM to wrap. When it reports ``is_loaded_in_4bit`` or
            ``is_loaded_in_8bit`` it is first passed through
            ``peft.prepare_model_for_kbit_training``.
        target_modules: Optional list of module-name suffixes to adapt. When
            ``None`` a default list covering both fused (Phi-3/Phi-4) and
            split (LLaMA-style) projection names is used.

    Returns:
        The LoRA-wrapped model returned by ``get_peft_model``.
    """

    # Default target modules for common model architectures
    if target_modules is None:
        # PEFT ignores names that do not occur in the model, so both naming
        # schemes can be listed. The fused names matter for Phi-3/Phi-4, where
        # the split names below match nothing and only o_proj/down_proj would
        # otherwise be adapted.
        target_modules = [
            "qkv_proj", "o_proj",                    # Fused attention (Phi-3 / Phi-4)
            "gate_up_proj", "down_proj",             # Fused MLP (Phi-3 / Phi-4)
            "q_proj", "k_proj", "v_proj",            # Split attention (LLaMA-style)
            "gate_proj", "up_proj",                  # Split MLP (LLaMA-style)
        ]

    # Quantized (bitsandbytes) models need the k-bit preparation step before
    # adapters are attached.
    if getattr(model, "is_loaded_in_4bit", False) or getattr(model, "is_loaded_in_8bit", False):
        from peft import prepare_model_for_kbit_training
        model = prepare_model_for_kbit_training(model)

    # LoRA configuration
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=16,                    # LoRA rank
        lora_alpha=32,           # LoRA scaling parameter
        lora_dropout=0.1,        # LoRA dropout
        target_modules=target_modules,
        bias="none",             # Don't adapt bias parameters
    )

    # Apply PEFT to the model
    peft_model = get_peft_model(model, peft_config)

    print("🔧 PEFT Model Configuration:")
    peft_model.print_trainable_parameters()

    return peft_model

def _prepare_causal_lm_dataset(training_data, tokenizer, text_field, max_length):
    """Build a Dataset that holds only token columns the LM collator can pad."""
    dataset = Dataset.from_list(training_data)
    columns = dataset.column_names

    if "input_ids" in columns:
        # Already tokenized: drop metadata columns (strings, scores, ...) that
        # cannot be turned into tensors by the collator.
        extra = [name for name in columns if name not in ("input_ids", "attention_mask")]
        return dataset.remove_columns(extra) if extra else dataset

    if text_field not in columns:
        raise ValueError(
            f"training_data has no '{text_field}' field to tokenize and no "
            f"'input_ids'; available fields: {columns}"
        )

    def _encode(batch):
        return tokenizer(batch[text_field], truncation=True, max_length=max_length)

    return dataset.map(_encode, batched=True, remove_columns=columns)


def train_commentary_model_with_peft(model, tokenizer, training_data, output_dir="./peft-commentary-lora",
                                     text_field="text", max_length=512):
    """Train model using PEFT/LoRA and save the resulting adapter.

    Args:
        model: Base causal-LM, or a model that is already a ``peft.PeftModel``
            (in which case it is trained as-is instead of being wrapped again).
        tokenizer: Tokenizer matching ``model``. If it has no pad token, its
            EOS token is assigned as pad token so batches can be padded.
        training_data: Non-empty list of example dicts. Each example must
            either carry raw text under ``text_field`` or be pre-tokenized
            with ``input_ids``. All other fields are dropped before training.
        output_dir: Directory for checkpoints, the final adapter and tokenizer.
        text_field: Name of the raw-text field to tokenize.
            NOTE(review): the default ``"text"`` is assumed from the notebook's
            "Text preview" output — confirm against TrainingDataCreator.
        max_length: Truncation length (in tokens) used when tokenizing.

    Returns:
        tuple: ``(trainer, peft_model)``.

    Raises:
        ValueError: If ``training_data`` is empty, or has neither
            ``text_field`` nor ``input_ids``.
    """
    import inspect
    from peft import PeftModel

    if not training_data:
        raise ValueError("training_data is empty; nothing to train on")

    print(f"🏈 Setting up PEFT/LoRA training...")
    print(f"📊 Training on {len(training_data)} examples")

    # Set up PEFT model (do not stack a second adapter on an existing PeftModel)
    peft_model = model if isinstance(model, PeftModel) else setup_peft_model(model)

    # The collator pads batches, which requires a pad token.
    if getattr(tokenizer, "pad_token", None) is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Create dataset: tokenize raw text and keep only model inputs. Without
    # this, string columns such as 'event_type'/'player' reach the collator
    # (remove_unused_columns=False) and batching fails.
    dataset = _prepare_causal_lm_dataset(training_data, tokenizer, text_field, max_length)

    # Training arguments optimized for PEFT
    # NOTE: with dataloader_drop_last=True a dataset smaller than the batch
    # size yields no batches at all.
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,      # Can use larger batch with PEFT
        gradient_accumulation_steps=2,       # Effective batch size = 8
        warmup_steps=50,
        max_steps=300,
        learning_rate=2e-4,                 # Higher LR works well with LoRA
        fp16=torch.cuda.is_available(),     # fp16 mixed precision needs a GPU
        logging_steps=25,
        save_steps=100,
        save_total_limit=3,
        remove_unused_columns=False,
        dataloader_pin_memory=False,
        dataloader_num_workers=0,
        report_to="none",                   # None does not disable integrations; "none" does
        dataloader_drop_last=True,
        eval_strategy="no",
        push_to_hub=False,
    )

    # Data collator
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        pad_to_multiple_of=8 if training_args.fp16 else None,
    )

    # Initialize trainer. Newer transformers releases take the tokenizer as
    # `processing_class`; older ones only know `tokenizer`.
    trainer_kwargs = dict(
        model=peft_model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )
    trainer_params = inspect.signature(Trainer.__init__).parameters
    tokenizer_arg = "processing_class" if "processing_class" in trainer_params else "tokenizer"
    trainer_kwargs[tokenizer_arg] = tokenizer
    trainer = Trainer(**trainer_kwargs)

    # Training info
    base_name = getattr(getattr(model, "config", None), "name_or_path", "Unknown")
    print("🚀 PEFT Training started...")
    print(f"   • Base model: {base_name}")
    print(f"   • Dataset size: {len(training_data)} examples")
    print(f"   • Max steps: {training_args.max_steps}")
    print(f"   • Learning rate: {training_args.learning_rate}")

    # Train
    trainer.train()

    # Save the PEFT adapter
    peft_model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    print("✅ PEFT Training completed!")
    print(f"📁 LoRA adapter saved to: {output_dir}")
    print("💡 To use: load base model + adapter with peft.PeftModel.from_pretrained()")

    return trainer, peft_model

def load_trained_peft_model(base_model_path, adapter_path, tokenizer_path=None, **model_kwargs):
    """Load a base model, attach a saved LoRA adapter and load the tokenizer.

    Args:
        base_model_path: Name or path passed to
            ``AutoModelForCausalLM.from_pretrained``.
        adapter_path: Directory (or hub id) of the saved PEFT adapter.
        tokenizer_path: Optional tokenizer location; defaults to
            ``adapter_path``.
        **model_kwargs: Extra keyword arguments forwarded to
            ``AutoModelForCausalLM.from_pretrained`` — e.g.
            ``quantization_config=...`` or ``device_map="auto"`` so a large
            base model can be loaded the same way it was loaded for training
            instead of in default full precision.

    Returns:
        tuple: ``(peft_model, tokenizer)``.
    """
    from peft import PeftModel

    # Load base model and tokenizer
    base_model = AutoModelForCausalLM.from_pretrained(base_model_path, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path or adapter_path)

    # Load PEFT model
    peft_model = PeftModel.from_pretrained(base_model, adapter_path)

    return peft_model, tokenizer

In [None]:
import torch
import random

class FootballCommentaryGenerator:
    """Turn match events into commentary lines: templates first, language model second.

    The generator keeps a running tally of the match (score, half, cards,
    substitutions) that is updated from every event passed to
    ``generate_commentary`` and is embedded in the prompt used for AI
    generation.

    Events may be any mapping-like object supporting ``event[key]`` and
    ``event.get(key, default)`` (a ``dict`` or a pandas ``Series`` row).

    Args:
        model: Object exposing ``generate(**inputs)``; its ``device`` attribute
            is used when present.
        tokenizer: Callable tokenizer exposing ``decode`` and ``eos_token_id``.
        templates: Object exposing ``get_template(category, subtype)`` and,
            optionally, ``get_contextual_template(category, subtype, context)``.
        match_context: Mapping with at least ``home_team``, ``away_team`` and
            ``competition``.
    """

    # Tail of every prompt built by create_enhanced_prompt; if the model echoes
    # it, only the text after the marker is kept.
    _PROMPT_MARKER = "Generate exciting, professional commentary:"

    # Lower-case name particles that get_last_name keeps attached to the surname.
    _NAME_PARTICLES = frozenset(
        ['van', 'der', 'de', 'del', 'da', 'dos', 'von', 'di', 'du', 'le', 'la', 'el']
    )

    # Event categories that are all served by the 'defensive' template family.
    _DEFENSIVE_ACTIONS = ('tackle', 'interception', 'block', 'clearance')

    def __init__(self, model, tokenizer, templates, match_context):
        self.model = model
        self.tokenizer = tokenizer
        self.templates = templates
        self.match_context = match_context
        self.device = model.device if hasattr(model, 'device') else ('cuda' if torch.cuda.is_available() else 'cpu')

        # Match state tracking
        self.match_state = {
            'score': [0, 0],
            'minute': 0,
            'half': 1,
            'goals_home': 0,
            'goals_away': 0,
            'cards_shown': 0,
            'substitutions_made': 0
        }

    @staticmethod
    def _is_missing(value):
        """Return True for ``None`` and float NaN (``NaN != NaN``)."""
        return value is None or (isinstance(value, float) and value != value)

    @classmethod
    def _as_float(cls, value, default=0.0):
        """Convert ``value`` to float; return ``default`` if missing, NaN or unparseable."""
        if cls._is_missing(value):
            return default
        try:
            number = float(value)
        except (ValueError, TypeError):
            return default
        # Strings such as 'nan' parse successfully but are still "no value".
        return default if number != number else number

    def _fallback_line(self, event):
        """Build the generic last-resort line without assuming any key exists."""
        player = event.get('player')
        team = event.get('team')
        if self._is_missing(player):
            player = 'Unknown Player'
        if self._is_missing(team):
            team = 'Unknown Team'
        return f"{player} in action for {team}!"

    def update_match_state(self, event):
        """Update the running match tally from one event.

        Reads ``event['minute']`` and ``event['commentary_category']`` (and
        ``event['team']`` for goals). The category is compared
        case-insensitively, matching ``map_event_to_template``.

        Raises:
            KeyError: if a required key is absent from ``event``.
        """
        self.match_state['minute'] = event['minute']

        # The half only ever moves forward: once past minute 45 it stays at 2.
        if event['minute'] > 45:
            self.match_state['half'] = 2

        category = self.safe_lower(event['commentary_category'])

        if category == 'goal':
            # Any team other than the configured home team is counted as away.
            if event['team'] == self.match_context['home_team']:
                self.match_state['goals_home'] += 1
                self.match_state['score'][0] += 1
            else:
                self.match_state['goals_away'] += 1
                self.match_state['score'][1] += 1
        elif category == 'card':
            self.match_state['cards_shown'] += 1
        elif category == 'substitution':
            self.match_state['substitutions_made'] += 1

    def safe_lower(self, value, default=''):
        """Return ``str(value).lower()``, or ``default`` when the value is missing.

        ``None`` and float NaN both count as missing, so an empty cell does
        not become the literal text ``'nan'``.
        """
        if self._is_missing(value):
            return default
        return str(value).lower()

    def get_last_name(self, full_name):
        """Extract the surname from a full name.

        * one word -> that word (``"Pelé"``)
        * two words -> the second word (``"Lionel Messi"`` -> ``"Messi"``)
        * three or more words -> the last word plus any run of particles
          (van, de, del, ...) immediately before it
          (``"Kevin De Bruyne"`` -> ``"De Bruyne"``)

        Returns ``'Unknown'`` for empty, whitespace-only or non-string input.
        """
        if not isinstance(full_name, str):
            return 'Unknown'

        # split() with no argument already ignores leading/trailing whitespace.
        name_parts = full_name.split()
        if not name_parts:
            return 'Unknown'

        last_name_start = len(name_parts) - 1

        # Particles are only absorbed for names of three or more words; a
        # two-word name always yields its second word.
        if len(name_parts) >= 3:
            for i in range(len(name_parts) - 2, -1, -1):
                if name_parts[i].lower() not in self._NAME_PARTICLES:
                    break
                last_name_start = i

        return ' '.join(name_parts[last_name_start:])

    def map_event_to_template(self, event):
        """Map an event to a ``(template_category, subtype)`` pair.

        The category is the lower-cased ``commentary_category``, except that a
        shot whose outcome contains "goal" becomes ``'goal'`` and tackles,
        interceptions, blocks and clearances become ``'defensive'`` (with the
        original action as subtype). ``subtype`` is ``None`` for categories
        without sub-classification.
        """
        event_type = self.safe_lower(event.get('commentary_category', ''))
        subtype = None

        if event_type == 'shot':
            outcome = self.safe_lower(event.get('shot_outcome', ''))
            if 'goal' in outcome:
                event_type = 'goal'
                subtype = self._determine_goal_subtype(event)
            elif 'saved' in outcome or 'keeper' in outcome:
                subtype = 'saved'
            elif 'wide' in outcome or 'miss' in outcome:
                subtype = 'wide'
            elif 'blocked' in outcome:
                subtype = 'blocked'
            elif 'post' in outcome or 'crossbar' in outcome or 'woodwork' in outcome:
                subtype = 'post_crossbar'
            else:
                subtype = 'saved'  # Default for shots

        elif event_type == 'goal':
            subtype = self._determine_goal_subtype(event)

        elif event_type == 'card':
            card_type = self.safe_lower(event.get('card_type', 'yellow'))
            reason = self.safe_lower(event.get('foul_type', ''))

            if 'yellow' in card_type:
                if 'tactical' in reason or 'professional' in reason:
                    subtype = 'yellow_tactical'
                else:
                    subtype = 'yellow_reckless'
            elif 'red' in card_type:
                subtype = 'red_violent'
            else:
                # Unrecognised or missing card type: treat as an ordinary yellow.
                subtype = 'yellow_reckless'

        elif event_type in self._DEFENSIVE_ACTIONS:
            subtype = event_type
            event_type = 'defensive'

        # Every other category maps to itself.
        return event_type, subtype

    def _determine_goal_subtype(self, event):
        """Classify a goal as header / long_range / power_finish / placed_finish / tap_in.

        Rules are applied in that priority order. A missing or unparseable
        ``shot_distance`` is treated as unknown, so distance rules are skipped
        rather than assuming a distance of 0.
        """
        body_part = self.safe_lower(event.get('shot_body_part', ''))
        technique = self.safe_lower(event.get('shot_technique', ''))
        power = self.safe_lower(event.get('shot_power', 'medium'))
        distance = self._as_float(event.get('shot_distance'), default=None)

        # Header goals
        if 'head' in body_part:
            return 'header'

        # Long range goals
        if (distance is not None and distance > 25) or 'long' in technique:
            return 'long_range'

        # Power vs placed finish
        if power in ('high', 'very_high') or 'power' in technique:
            return 'power_finish'
        if 'placed' in technique or 'precise' in technique:
            return 'placed_finish'

        # Tap-ins and simple finishes
        if (distance is not None and distance < 6) or 'tap' in technique or 'simple' in technique:
            return 'tap_in'

        # Default to placed finish
        return 'placed_finish'

    def _render_template(self, event):
        """Pick a template for ``event`` and fill it in; errors propagate to the caller."""
        template_category, subtype = self.map_event_to_template(event)

        # Context offered to providers that support contextual templates. The
        # score is copied so a provider cannot mutate the live match state.
        context = {
            'minute': event.get('minute', 0),
            'score_situation': list(self.match_state['score']),
            'importance': event.get('importance_score', 50)
        }

        if hasattr(self.templates, 'get_contextual_template'):
            template = self.templates.get_contextual_template(template_category, subtype, context)
        else:
            template = self.templates.get_template(template_category, subtype)

        # A missing or generic position is replaced by a guess from the event type.
        position = self.safe_lower(event.get('position', 'player'))
        if position in ('', 'unknown', 'player'):
            position = self._guess_position_from_event(event)

        player = event.get('player')
        if self._is_missing(player):
            player = 'Unknown Player'
        team = event.get('team')
        if self._is_missing(team):
            team = 'Unknown Team'

        return template.format(
            player=player,
            team=team,
            position=position,
            minute=event.get('minute', 0)
        )

    def generate_template_commentary(self, event):
        """Generate commentary from templates, falling back to AI generation on any error."""
        try:
            return self._render_template(event)
        except Exception as e:
            print(f"⚠️ Template generation error for {event.get('player', 'Unknown')}: {e}")
            # Fallback to AI generation
            return self.generate_ai_commentary(event)

    def _guess_position_from_event(self, event):
        """Guess a player position word from the event category (default ``'player'``)."""
        event_type = self.safe_lower(event.get('commentary_category', ''))

        position_mapping = {
            'goal': 'striker',
            'shot': 'forward',
            'tackle': 'defender',
            'clearance': 'defender',
            'block': 'defender',
            'interception': 'midfielder',
            'pass': 'midfielder',
            'foul': 'player',
            'card': 'player'
        }

        return position_mapping.get(event_type, 'player')

    def generate_ai_commentary(self, event, max_length=100, temperature=0.8):
        """Generate one line of commentary with the language model.

        Args:
            event: Event mapping (see ``create_enhanced_prompt`` for the keys read).
            max_length: Passed to ``generate`` as ``max_new_tokens``.
            temperature: Sampling temperature.

        Returns:
            The first non-empty line produced by the model. If prompt building,
            tokenisation or generation fails, or the model produces nothing,
            the generic "<player> in action for <team>!" line is returned.
            This method does not raise for such failures.
        """
        try:
            prompt = self.create_enhanced_prompt(event)

            inputs = self.tokenizer(
                prompt,
                return_tensors="pt",
                truncation=True,
                max_length=450,
                padding=False
            )
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            prompt_token_count = inputs['input_ids'].shape[1]

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    temperature=temperature,
                    do_sample=True,
                    top_p=0.9,
                    top_k=50,
                    repetition_penalty=1.1,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    no_repeat_ngram_size=3,
                )

            # Decode only the continuation. Slicing the decoded text by
            # len(prompt) breaks when the prompt was truncated or does not
            # round-trip through the tokenizer character-for-character.
            # NOTE(review): assumes a decoder-only model whose generate()
            # output starts with the prompt tokens - confirm for other models.
            new_tokens = outputs[0][prompt_token_count:]
            generated_text = self.tokenizer.decode(new_tokens, skip_special_tokens=True)

            # If the model echoed the instruction, keep only what follows it.
            if self._PROMPT_MARKER in generated_text:
                generated_text = generated_text.split(self._PROMPT_MARKER)[-1]

            commentary = generated_text.strip().split('\n')[0].strip()
            if commentary:
                return commentary

        except Exception as e:
            print(f"⚠️ Error generating AI commentary: {e}")

        return self._fallback_line(event)

    def create_enhanced_prompt(self, event):
        """Build the LLM prompt for ``event`` including the current match state.

        Reads ``minute``, ``commentary_category``, ``player`` and ``team`` from
        the event by key, plus optional detail fields depending on the
        category. Non-numeric or missing ``importance_score`` / ``shot_xg``
        fall back to 50 and 0 respectively.

        Raises:
            KeyError: if one of the four required event keys is absent.
        """
        importance = self._as_float(event.get('importance_score'), 50)
        current_score = f"{self.match_state['score'][0]}-{self.match_state['score'][1]}"

        prompt = f"""Generate professional football commentary for this live match event:
MATCH: {self.match_context['home_team']} vs {self.match_context['away_team']}
COMPETITION: {self.match_context['competition']}
CURRENT SCORE: {current_score}
MINUTE: {event['minute']}'
HALF: {self.match_state['half']}
EVENT DETAILS:
- Type: {event['commentary_category']}
- Player: {event['player']}
- Team: {event['team']}
- Importance: {importance:.0f}/100"""

        # Category-specific details; compared case-insensitively like the
        # template mapping.
        category = self.safe_lower(event['commentary_category'])
        xg = self._as_float(event.get('shot_xg'), 0)

        if category == 'goal':
            goals_so_far = self.match_state['goals_home'] + self.match_state['goals_away']
            prompt += f"""
- GOAL DETAILS: {event.get('shot_technique', 'Unknown')} finish with {event.get('shot_body_part', 'foot')}
- Expected Goals (xG): {xg:.2f}
- Goal #{goals_so_far} of the match"""
        elif category == 'shot':
            prompt += f"""
- Shot outcome: {event.get('shot_outcome', 'Unknown')}
- Technique: {event.get('shot_technique', 'Unknown')}
- xG: {xg:.2f}"""
        elif category == 'card':
            prompt += f"""
- Card type: {event.get('card_type', 'Yellow')}
- Reason: {event.get('foul_type', 'Foul')}
- Cards shown this match: {self.match_state['cards_shown']}"""
        elif category == 'substitution':
            prompt += f"""
- Player coming on: {event.get('substitution_replacement', 'Unknown')}
- Substitutions made: {self.match_state['substitutions_made']}"""

        prompt += "\n\n" + self._PROMPT_MARKER
        return prompt

    def generate_commentary(self, event, use_templates=True, use_ai_fallback=True):
        """Main entry point: update match state, then produce one commentary line.

        Args:
            event: Event mapping.
            use_templates: When True, render from templates; when False, use
                the language model directly.
            use_ai_fallback: When template rendering fails, True retries with
                the language model and False returns the generic
                "<player> in action for <team>!" line without touching the model.

        Returns:
            The commentary string.
        """
        # State is updated first so the score and counters include this event.
        self.update_match_state(event)

        if not use_templates:
            # Pure AI generation
            return self.generate_ai_commentary(event)

        try:
            # The template line is always what gets returned on success, so no
            # model inference is run on this path.
            return self._render_template(event)
        except Exception as e:
            print(f"⚠️ Template generation failed: {e}")
            if use_ai_fallback:
                return self.generate_ai_commentary(event)
            return self._fallback_line(event)

# Build the template provider, then wire it into the commentary generator
# together with the already-loaded model, tokenizer and match context.
templates = EnhancedCommentaryTemplates()
commentary_generator = FootballCommentaryGenerator(
    model=model,
    tokenizer=tokenizer,
    templates=templates,
    match_context=match_context,
)

In [None]:
def generate_full_match_commentary(generator, events_df, save_results=True):
    """Generate commentary for every event of a match, from kick-off to full time.

    The generator's match state is reset before the first event so the run
    always starts at 0-0, regardless of earlier calls on the same generator
    or the cell being re-executed.

    Args:
        generator: Object exposing ``match_context`` (mapping with
            ``home_team``, ``away_team``, ``competition``; plus ``match_id``
            and ``match_date`` when saving), ``match_state`` (dict) and
            ``generate_commentary(event)``.
        events_df: DataFrame with at least the columns ``minute``, ``second``,
            ``commentary_category``, ``player`` and ``team``.
        save_results: When True, write ``match_commentary_<match_id>.csv`` and
            ``match_commentary_<match_id>.txt`` to the working directory.

    Returns:
        list[dict]: One entry per commentary segment (intro, each successfully
        processed event, full-time summary) with the keys ``minute``,
        ``event_type``, ``player``, ``team``, ``commentary``, ``score`` and
        ``importance_score``.
    """
    match_context = generator.match_context
    home_team = match_context['home_team']
    away_team = match_context['away_team']
    competition = match_context['competition']

    # Start from a clean scoreboard. Without this, state left over from a
    # previous run would be added to this match's score and counters while the
    # intro below still announces 0-0.
    generator.match_state.update({
        'score': [0, 0],
        'minute': 0,
        'half': 1,
        'goals_home': 0,
        'goals_away': 0,
        'cards_shown': 0,
        'substitutions_made': 0,
    })

    print("🎙️ GENERATING FULL MATCH COMMENTARY")
    print("=" * 60)
    print(f"🏟️ {home_team} vs {away_team}")
    print(f"📊 Processing {len(events_df)} events...")

    # Sort events chronologically; the fresh 0..n-1 index drives the progress counter.
    events_sorted = events_df.sort_values(['minute', 'second']).reset_index(drop=True)

    commentary_results = []

    # Match introduction
    intro = f"Welcome to this {competition} encounter between {home_team} and {away_team}! We're live from the stadium for what promises to be an exciting match!"

    commentary_results.append({
        'minute': 0,
        'event_type': 'Match Start',
        'player': 'Commentator',
        'team': 'Broadcast',
        'commentary': intro,
        'score': '0-0',
        'importance_score': 100
    })

    print(f"🎙️ {intro}\n")

    # Process each event. A failure on one event is reported and skipped so a
    # single bad row cannot abort the whole match.
    for idx, event in events_sorted.iterrows():
        try:
            print(f"Generating {idx+1}/{len(events_sorted)}: {event['minute']}' - {event['commentary_category']} ({event['player']})")

            commentary = generator.generate_commentary(event)

            # Score is read after generation, so it already includes this event.
            current_score = f"{generator.match_state['score'][0]}-{generator.match_state['score'][1]}"

            commentary_results.append({
                'minute': event['minute'],
                'event_type': event['commentary_category'],
                'player': event['player'],
                'team': event['team'],
                'commentary': commentary,
                'score': current_score,
                'importance_score': event.get('importance_score', 50)
            })

            # Print as we go
            print(f"   [{event['minute']}'] ({current_score}) {commentary}")
            print()

        except Exception as e:
            print(f"⚠️ Error processing event {idx}: {e}")
            continue

    # Match conclusion
    final_score = f"{generator.match_state['score'][0]}-{generator.match_state['score'][1]}"
    conclusion = f"FULL TIME! {home_team} {final_score} {away_team}! What an entertaining match that was! Thank you for joining us for this {competition} encounter!"

    commentary_results.append({
        'minute': 90,
        'event_type': 'Full Time',
        'player': 'Referee',
        'team': 'Officials',
        'commentary': conclusion,
        'score': final_score,
        'importance_score': 100
    })

    print(f"🏁 {conclusion}")

    # Save results if requested
    if save_results:
        results_df = pd.DataFrame(commentary_results)
        filename = f"match_commentary_{match_context['match_id']}.csv"
        results_df.to_csv(filename, index=False)
        print(f"📁 Commentary saved to: {filename}")

        # Also save as a plain-text transcript
        text_filename = f"match_commentary_{match_context['match_id']}.txt"
        with open(text_filename, 'w', encoding='utf-8') as f:
            f.write(f"MATCH COMMENTARY: {home_team} vs {away_team}\n")
            f.write(f"Competition: {competition}\n")
            f.write(f"Date: {match_context['match_date']}\n")
            f.write("=" * 80 + "\n\n")

            for result in commentary_results:
                # Intro and full-time lines carry no minute prefix.
                if result['event_type'] in ['Match Start', 'Full Time']:
                    f.write(f"{result['commentary']}\n\n")
                else:
                    f.write(f"[{result['minute']}'] {result['commentary']}\n\n")

        print(f"📁 Text commentary saved to: {text_filename}")

    # Generate summary statistics
    print_match_summary(commentary_results, match_context)

    return commentary_results

def print_match_summary(results, match_context):
    """Print a match summary: fixture, final score, event counts, goals and totals.

    Args:
        results: List of commentary dicts with the keys ``event_type``,
            ``score``, ``minute``, ``player``, ``team`` and ``commentary``.
            Entries whose ``event_type`` is ``'Match Start'`` or
            ``'Full Time'`` are treated as framing segments, not match events.
        match_context: Mapping with ``home_team``, ``away_team``,
            ``competition`` and ``match_date``.

    Nothing beyond the fixture header is printed when ``results`` contains no
    match events.
    """
    print("\n📊 MATCH SUMMARY")
    print("=" * 40)

    match_events = [r for r in results if r['event_type'] not in ['Match Start', 'Full Time']]

    print(f"🏟️ {match_context['home_team']} vs {match_context['away_team']}")
    print(f"🏆 {match_context['competition']}")
    print(f"📅 {match_context['match_date']}")

    if match_events:
        # Take the score from the last real event. Indexing results[-2] only
        # works when a 'Full Time' row is the final entry, and fails outright
        # for a single-entry list.
        final_score = match_events[-1]['score']
        print(f"⚽ Final Score: {final_score}")

        # Event breakdown
        event_counts = {}
        for result in match_events:
            event_type = result['event_type']
            event_counts[event_type] = event_counts.get(event_type, 0) + 1

        print("\n📋 Events Covered:")
        # Most frequent first; ties keep first-seen order (sorted() is stable).
        for event_type, count in sorted(event_counts.items(), key=lambda x: x[1], reverse=True):
            print(f"   • {event_type}: {count}")

        # Goals timeline
        goals = [r for r in match_events if r['event_type'] == 'Goal']
        if goals:
            print("\n🥅 Goals Timeline:")
            for goal in goals:
                print(f"   • {goal['minute']}' - {goal['player']} ({goal['team']})")

        # Totals cover every segment, including intro and full-time lines.
        print(f"\n📝 Total Commentary Segments: {len(results)}")
        print(f"💬 Total Words Generated: {sum(len(r['commentary'].split()) for r in results)}")

# Run the complete pipeline: commentate every event of the match and save the
# CSV and text transcripts.
commentary_results = generate_full_match_commentary(
    generator=commentary_generator,
    events_df=full_match_events,
    save_results=True,
)

🎙️ GENERATING FULL MATCH COMMENTARY
🏟️ Bayer Leverkusen vs Werder Bremen
📊 Processing 46 events...
🎙️ Welcome to this 1. Bundesliga encounter between Bayer Leverkusen and Werder Bremen! We're live from the stadium for what promises to be an exciting match!

Generating 1/46: 5' - Foul (Edmond Fayçal Tapsoba)
   [5'] (0-0) Straight from kickoff, Infringement by Edmond Fayçal Tapsoba! The Bayer Leverkusen player steps outside the boundaries of fair play!

Generating 2/46: 6' - Shot (Leonardo Bittencourt)
   [6'] (0-0) Within minutes, Leonardo Bittencourt chases breakthrough for Werder Bremen! FRUSTRATED! The defensive unit stands as one!

Generating 3/46: 7' - Shot (Piero Martín Hincapié Reyna)
   [7'] (0-0) Straight from kickoff, Piero Martín Hincapié Reyna seeks perfection for Bayer Leverkusen! BRILLIANT! The keeper defies gravity with that save!

Generating 4/46: 8' - Foul (Robert Andrich)
   [8'] (0-0) Early doors, Infringement by Robert Andrich! The Bayer Leverkusen player steps outs

In [None]:
from google.colab import files

# Download the text transcript written by generate_full_match_commentary.
# The name is derived from the generator's match context, the same source the
# writer uses, so this cell keeps working when a different match is processed.
files.download(f"match_commentary_{commentary_generator.match_context['match_id']}.txt")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>