##CS2 Match Prediction System - Step 1: Environment Setup and Data Loading

**This script**:
1. Installs required packages
2. Imports necessary libraries
3. Loads and validates the data files
4. Performs initial data exploration

In [1]:
# ============================================================================
# SECTION 1: Package Installation
# ============================================================================
print("=" * 80)
print("STEP 1.1: Installing Required Packages")
print("=" * 80)

import subprocess
import sys

def install_package(package):
    """Install a package via pip (quiet mode disabled so errors are visible)."""
    subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# IMPORTANT:
# Do NOT install numpy/pandas on Colab → They are already compatible with Python 3.12
packages = [
    "scikit-learn",
    "lightgbm",
    "xgboost",
    "matplotlib",
    "seaborn",
    "pyyaml",
    "joblib",
    "openpyxl"
]

print("\nInstalling required packages (no strict versions)...")
for package in packages:
    try:
        print(f"Installing {package}...", end=" ")
        install_package(package)
        print("")
    except Exception as e:
        print(f" (Error: {e})")

print("\n Package installation complete!")

# ============================================================================
# SECTION 2: Import Libraries
# ============================================================================
print("\n" + "=" * 80)
print("STEP 1.2: Importing Libraries")
print("=" * 80)

import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print(" Libraries imported successfully!")

# ============================================================================
# SECTION 3: Configuration Setup
# ============================================================================
print("\n" + "=" * 80)
print("STEP 1.3: Setting Up Configuration")
print("=" * 80)

config = {
    'random_seed': 42,
    'test_size': 0.2,
    'n_folds': 5,
    'elo_k_factor': 32,
    'elo_default': 1500,
    'calibration_bins': 15,
    'lgbm_params': {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'random_state': 42
    },
    'xgb_params': {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': 6,
        'learning_rate': 0.05,
        'subsample': 0.8,
        'colsample_bytree': 0.9,
        'random_state': 42
    }
}

print("Configuration created:")
for k, v in config.items():
    if isinstance(v, dict):
        print(f"  - {k}: (dict with {len(v)} entries)")
    else:
        print(f"  - {k}: {v}")

# ============================================================================
# SECTION 4: File Upload Instructions
# ============================================================================
print("\n" + "=" * 80)
print("STEP 1.4: File Upload")
print("=" * 80)

FREEZE_TIME_FILE = '/content/freeze_time_features.csv'
MATCHES_FILE = '/content/matches.xlsx'

print("Assuming files will be at /content/... once uploaded.")

# ============================================================================
# SECTION 5: Load Data Files
# ============================================================================
print("\n" + "=" * 80)
print("STEP 1.5: Loading Data Files")
print("=" * 80)

print(f"Loading rounds data from: {FREEZE_TIME_FILE}")
try:
    df_rounds = pd.read_csv(FREEZE_TIME_FILE)
    print(f" Loaded {len(df_rounds):,} rows from freeze_time_features.csv")
except Exception as e:
    print(f" Error: {e}")
    df_rounds = None

print(f"\nLoading matches metadata from: {MATCHES_FILE}")
try:
    df_matches = pd.read_excel(MATCHES_FILE)
    print(f" Loaded {len(df_matches):,} rows from matches.xlsx")
except Exception as e:
    print(f" Error: {e}")
    df_matches = None

# ============================================================================
# SECTION 6: Data Validation and Exploration
# ============================================================================
print("\n" + "=" * 80)
print("STEP 1.6: Data Validation and Exploration")
print("=" * 80)

if df_rounds is not None:
    print("\n--- FREEZE TIME FEATURES DATA ---")
    print(df_rounds.head(3))
    print("\nShape:", df_rounds.shape)
    print("\nColumns:", list(df_rounds.columns))

    required_features = [
        'map','side','is_pistol','is_ot','score_diff','start_cash','loss_bonus',
        'consec_losses','equip_value','rifle_cnt','smg_cnt','shotgun_cnt',
        'awp_cnt','helmets','kevlar','kits','flash_cnt','smoke_cnt','he_cnt',
        'molotov_cnt','timeout_flag','round_win'
    ]

    missing = [f for f in required_features if f not in df_rounds.columns]
    if missing:
        print("\n Missing required columns:", missing)
    else:
        print("\n All required features present")

if df_matches is not None:
    print("\n--- MATCHES METADATA ---")
    print(df_matches.head(5))
    print("\nShape:", df_matches.shape)

    required_cols = ['Event Name', 'Time', 'Team 1', 'Team 2', 'Match ID']
    missing = [c for c in required_cols if c not in df_matches.columns]
    if missing:
        print("\n Missing required match columns:", missing)
    else:
        print("\n All required match columns present")

    # Convert time column
    try:
        df_matches['Time'] = pd.to_datetime(df_matches['Time'])
        print("\n Converted Time column to datetime")
    except Exception as e:
        print(" Error converting Time:", e)

# ============================================================================
# SECTION 7: Data Integration Check
# ============================================================================
print("\n" + "=" * 80)
print("STEP 1.7: Data Integration Check")
print("=" * 80)

if df_rounds is not None and df_matches is not None:
    r_ids = set(df_rounds['match_id'])
    m_ids = set(df_matches['Match ID'])

    common = r_ids.intersection(m_ids)

    print(f"Freeze-time match IDs: {len(r_ids)}")
    print(f"Match metadata IDs:    {len(m_ids)}")
    print(f"Common match IDs:      {len(common)}")

    if common:
        print(" Files can be linked via Match ID")
    else:
        print(" No common match IDs found")

# ============================================================================
# SECTION 8: Summary
# ============================================================================
print("\n" + "=" * 80)
print("STEP 1 COMPLETE: Summary")
print("=" * 80)

print("""
 Environment ready
 Data loaded
 Validation completed

""")


STEP 1.1: Installing Required Packages

Installing required packages (no strict versions)...
Installing scikit-learn... 
Installing lightgbm... 
Installing xgboost... 
Installing matplotlib... 
Installing seaborn... 
Installing pyyaml... 
Installing joblib... 
Installing openpyxl... 

 Package installation complete!

STEP 1.2: Importing Libraries
 Libraries imported successfully!

STEP 1.3: Setting Up Configuration
Configuration created:
  - random_seed: 42
  - test_size: 0.2
  - n_folds: 5
  - elo_k_factor: 32
  - elo_default: 1500
  - calibration_bins: 15
  - lgbm_params: (dict with 10 entries)
  - xgb_params: (dict with 7 entries)

STEP 1.4: File Upload
Assuming files will be at /content/... once uploaded.

STEP 1.5: Loading Data Files
Loading rounds data from: /content/freeze_time_features.csv
 Loaded 10,000 rows from freeze_time_features.csv

Loading matches metadata from: /content/matches.xlsx
 Loaded 101 rows from matches.xlsx

STEP 1.6: Data Validation and Exploration

--- FREE


CS2 Match Prediction System - Step 2: Feature Engineering & Elo Rating Construction

**This script**:
1. Extracts series_type from matches.xlsx
2. Implements Elo rating system with event-based freezing
3. Calculates pre-event Elo ratings chronologically
4. Merges Elo ratings into round-level data
5. Validates all features

**Prerequisites:** Step 1 completed with synchronized files loaded


In [2]:


import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print("STEP 2: FEATURE ENGINEERING & ELO RATING CONSTRUCTION")
print("=" * 80)

# Ensure data is loaded from Step 1
# If running as separate script, load the data:
# df_rounds = pd.read_csv('/content/freeze_time_features.csv')
# df_matches = pd.read_excel('/content/matches.xlsx')
# df_matches['Time'] = pd.to_datetime(df_matches['Time'])

# ============================================================================
# SECTION 1: Extract Series Type from Matches
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2.1: Extracting Series Type from Matches")
print("=" * 80)

def extract_series_type(maps_str):
    """
    Extract series type from Maps column
    Examples: 'bo3' -> 3, 'bo5' -> 5, 'bo1' -> 1
    """
    if pd.isna(maps_str):
        return 1

    maps_str = str(maps_str).lower().strip()

    if 'bo5' in maps_str:
        return 5
    elif 'bo3' in maps_str:
        return 3
    elif 'bo2' in maps_str:
        return 2
    elif 'bo1' in maps_str:
        return 1
    else:
        # Try to extract number
        import re
        match = re.search(r'bo(\d+)', maps_str)
        if match:
            return int(match.group(1))
        return 3  # Default to bo3

print("Extracting series_type from Maps column...")
df_matches['series_type'] = df_matches['Maps'].apply(extract_series_type)

print("\nSeries Type Distribution:")
print(df_matches['series_type'].value_counts().sort_index())
print(f"\n Series type extracted for {len(df_matches)} matches")

# ============================================================================
# SECTION 2: Prepare Match-Level Data for Elo
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2.2: Preparing Match-Level Data for Elo")
print("=" * 80)

# Create clean match dataframe
df_matches_elo = df_matches[['Match ID', 'Time', 'Event Name', 'Team 1', 'Team 2',
                              'Result 1', 'Result 2', 'series_type']].copy()
df_matches_elo.columns = ['match_id', 'match_time', 'event_name', 'team1', 'team2',
                          'score1', 'score2', 'series_type']

# Sort by time for chronological Elo calculation
df_matches_elo = df_matches_elo.sort_values('match_time').reset_index(drop=True)

print(f" Prepared {len(df_matches_elo)} matches sorted chronologically")
print(f"  Date range: {df_matches_elo['match_time'].min()} to {df_matches_elo['match_time'].max()}")
print(f"  Unique teams: {df_matches_elo['team1'].nunique() + df_matches_elo['team2'].nunique()}")
print(f"  Unique events: {df_matches_elo['event_name'].nunique()}")

# ============================================================================
# SECTION 3: Elo Rating System Implementation
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2.3: Implementing Elo Rating System")
print("=" * 80)

class EloRatingSystem:
    """
    Elo rating system for CS2 teams with event-based freezing
    """
    def __init__(self, k_factor=32, default_rating=1500):
        self.k_factor = k_factor
        self.default_rating = default_rating
        self.ratings = {}  # team -> current rating
        self.event_ratings = {}  # (team, event) -> pre-event rating
        self.rating_history = []

    def get_rating(self, team):
        """Get current rating for a team"""
        if team not in self.ratings:
            self.ratings[team] = self.default_rating
        return self.ratings[team]

    def expected_score(self, rating_a, rating_b):
        """Calculate expected score for team A"""
        return 1 / (1 + 10 ** ((rating_b - rating_a) / 400))

    def update_ratings(self, team_a, team_b, score_a, score_b, match_id, match_time):
        """
        Update ratings based on match result
        Score is rounds won (e.g., 27-23)
        """
        rating_a = self.get_rating(team_a)
        rating_b = self.get_rating(team_b)

        # Calculate actual score (normalize to 0-1)
        total_rounds = score_a + score_b
        if total_rounds == 0:
            actual_a = 0.5
        else:
            actual_a = score_a / total_rounds

        # Calculate expected scores
        expected_a = self.expected_score(rating_a, rating_b)
        expected_b = 1 - expected_a

        # Update ratings
        new_rating_a = rating_a + self.k_factor * (actual_a - expected_a)
        new_rating_b = rating_b + self.k_factor * ((1 - actual_a) - expected_b)

        # Store history
        self.rating_history.append({
            'match_id': match_id,
            'match_time': match_time,
            'team_a': team_a,
            'team_b': team_b,
            'rating_a_before': rating_a,
            'rating_b_before': rating_b,
            'rating_a_after': new_rating_a,
            'rating_b_after': new_rating_b,
            'score_a': score_a,
            'score_b': score_b
        })

        # Update current ratings
        self.ratings[team_a] = new_rating_a
        self.ratings[team_b] = new_rating_b

        return new_rating_a, new_rating_b

    def freeze_rating_for_event(self, team, event):
        """Freeze team's rating at the start of an event"""
        key = (team, event)
        if key not in self.event_ratings:
            self.event_ratings[key] = self.get_rating(team)
        return self.event_ratings[key]

    def get_pre_event_rating(self, team, event):
        """
        Get the rating a team had before an event started
        Returns (rating, is_missing_flag)
        """
        key = (team, event)

        # Check if we have a frozen rating for this team-event combo
        if key in self.event_ratings:
            # Check if team had any history before this event
            has_history = len([h for h in self.rating_history if h['team_a'] == team or h['team_b'] == team]) > 0
            return self.event_ratings[key], 0 if has_history else 1

        # If not frozen yet, freeze it now
        rating = self.get_rating(team)
        has_history = len([h for h in self.rating_history if h['team_a'] == team or h['team_b'] == team]) > 0

        self.event_ratings[key] = rating
        return rating, 0 if has_history else 1

# Initialize Elo system
print("Initializing Elo rating system...")
print(f"  K-factor: 32")
print(f"  Default rating: 1500")

elo_system = EloRatingSystem(k_factor=32, default_rating=1500)

# ============================================================================
# SECTION 4: Calculate Elo Ratings Chronologically
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2.4: Calculating Elo Ratings Chronologically")
print("=" * 80)

# Track events we've seen
processed_events = set()
match_elo_data = []

print(f"\nProcessing {len(df_matches_elo)} matches chronologically...\n")

for idx, row in df_matches_elo.iterrows():
    match_id = row['match_id']
    event = row['event_name']
    team1 = row['team1']
    team2 = row['team2']
    score1 = row['score1']
    score2 = row['score2']
    match_time = row['match_time']

    # Freeze ratings for this event if first time seeing it
    if event not in processed_events:
        # Get all teams in this event
        event_matches = df_matches_elo[df_matches_elo['event_name'] == event]
        event_teams = set(event_matches['team1'].tolist() + event_matches['team2'].tolist())

        # Freeze ratings for all teams in this event
        for team in event_teams:
            elo_system.freeze_rating_for_event(team, event)

        processed_events.add(event)
        print(f"  Event {len(processed_events)}: '{event}' - Froze ratings for {len(event_teams)} teams")

    # Get pre-event ratings for both teams
    team1_elo, team1_missing = elo_system.get_pre_event_rating(team1, event)
    team2_elo, team2_missing = elo_system.get_pre_event_rating(team2, event)

    # Store match-level Elo data
    match_elo_data.append({
        'match_id': match_id,
        'event_name': event,
        'team1': team1,
        'team2': team2,
        'team1_elo_pre_event': team1_elo,
        'team2_elo_pre_event': team2_elo,
        'team1_elo_missing': team1_missing,
        'team2_elo_missing': team2_missing,
        'elo_diff_team1': team1_elo - team2_elo
    })

    # Update ratings after the match (for future matches)
    elo_system.update_ratings(team1, team2, score1, score2, match_id, match_time)

print(f"\n Processed all matches and calculated Elo ratings")
print(f" Tracked {len(processed_events)} unique events")
print(f" Rated {len(elo_system.ratings)} unique teams")

# Create match Elo dataframe
df_match_elo = pd.DataFrame(match_elo_data)

print("\nElo Rating Summary:")
ratings_list = list(elo_system.ratings.values())
print(f"  Mean rating: {np.mean(ratings_list):.1f}")
print(f"  Std rating: {np.std(ratings_list):.1f}")
print(f"  Min rating: {np.min(ratings_list):.1f}")
print(f"  Max rating: {np.max(ratings_list):.1f}")

# Show teams with missing ratings at start
missing_count = df_match_elo['team1_elo_missing'].sum() + df_match_elo['team2_elo_missing'].sum()
total_team_appearances = len(df_match_elo) * 2
print(f"\nTeams with no prior history: {missing_count}/{total_team_appearances} ({missing_count/total_team_appearances*100:.1f}%)")

print("\nTop 10 teams by final Elo:")
top_teams = sorted(elo_system.ratings.items(), key=lambda x: x[1], reverse=True)[:10]
for i, (team, rating) in enumerate(top_teams, 1):
    print(f"  {i:2d}. {team:30s} {rating:.1f}")

# ============================================================================
# SECTION 5: Merge Elo Ratings into Round Data
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2.5: Merging Elo Ratings into Round Data")
print("=" * 80)

print(f"\nOriginal rounds data: {len(df_rounds):,} rounds")

def merge_elo_to_rounds(df_rounds, df_match_elo):
    """
    Merge Elo ratings into rounds dataframe
    """
    df_rounds_with_elo = df_rounds.copy()

    # Initialize Elo columns
    df_rounds_with_elo['team_elo_pre_event'] = np.nan
    df_rounds_with_elo['opp_elo_pre_event'] = np.nan
    df_rounds_with_elo['elo_diff'] = np.nan
    df_rounds_with_elo['elo_missing'] = 0

    # Create lookup dictionary for faster access
    elo_lookup = {}
    for _, row in df_match_elo.iterrows():
        elo_lookup[row['match_id']] = row

    print("Merging Elo ratings into rounds...")
    merge_count = 0

    for idx in df_rounds_with_elo.index:
        match_id = df_rounds_with_elo.loc[idx, 'match_id']
        team_name = df_rounds_with_elo.loc[idx, 'team_name']

        if match_id not in elo_lookup:
            continue

        match_elo = elo_lookup[match_id]

        # Determine if this team is team1 or team2
        if team_name == match_elo['team1']:
            df_rounds_with_elo.loc[idx, 'team_elo_pre_event'] = match_elo['team1_elo_pre_event']
            df_rounds_with_elo.loc[idx, 'opp_elo_pre_event'] = match_elo['team2_elo_pre_event']
            df_rounds_with_elo.loc[idx, 'elo_diff'] = match_elo['elo_diff_team1']
            df_rounds_with_elo.loc[idx, 'elo_missing'] = match_elo['team1_elo_missing']
            merge_count += 1
        elif team_name == match_elo['team2']:
            df_rounds_with_elo.loc[idx, 'team_elo_pre_event'] = match_elo['team2_elo_pre_event']
            df_rounds_with_elo.loc[idx, 'opp_elo_pre_event'] = match_elo['team1_elo_pre_event']
            df_rounds_with_elo.loc[idx, 'elo_diff'] = -match_elo['elo_diff_team1']
            df_rounds_with_elo.loc[idx, 'elo_missing'] = match_elo['team2_elo_missing']
            merge_count += 1

    print(f"   Merged Elo for {merge_count:,} rounds")
    return df_rounds_with_elo

df_rounds_with_elo = merge_elo_to_rounds(df_rounds, df_match_elo)

# Check merge success
elo_populated = df_rounds_with_elo['team_elo_pre_event'].notna().sum()
print(f"\n Elo ratings merged for {elo_populated:,}/{len(df_rounds_with_elo):,} rounds ({elo_populated/len(df_rounds_with_elo)*100:.1f}%)")

if elo_populated < len(df_rounds_with_elo):
    print(f"  {len(df_rounds_with_elo) - elo_populated:,} rounds missing Elo (should not happen with synchronized files)")

# ============================================================================
# SECTION 6: Add Series Type and Event to Rounds
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2.6: Adding Series Type and Event to Rounds")
print("=" * 80)

# Merge series_type from matches
df_matches_meta = df_matches[['Match ID', 'series_type', 'Event Name']].copy()
df_matches_meta.columns = ['match_id', 'series_type', 'event_name']

df_rounds_final = df_rounds_with_elo.merge(df_matches_meta, on='match_id', how='left')

print(f" Series type and event added to all rounds")
print(f"\nSeries type distribution in rounds:")
print(df_rounds_final['series_type'].value_counts().sort_index())

print(f"\nEvent distribution in rounds:")
print(df_rounds_final['event_name'].value_counts())

# ============================================================================
# SECTION 7: Feature Summary and Validation
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2.7: Feature Summary and Validation")
print("=" * 80)

# List all features
print("\nFinal Feature List:")
feature_categories = {
    'Match Context': ['map', 'side', 'series_type', 'is_pistol', 'is_ot', 'event_name'],
    'Score & Economy': ['score_diff', 'start_cash', 'loss_bonus', 'consec_losses', 'equip_value'],
    'Weapons': ['rifle_cnt', 'smg_cnt', 'shotgun_cnt', 'awp_cnt'],
    'Armor & Utility': ['helmets', 'kevlar', 'kits'],
    'Grenades': ['flash_cnt', 'smoke_cnt', 'he_cnt', 'molotov_cnt'],
    'Opponent Info': ['opp_rifle_cnt', 'opp_smg_cnt', 'opp_shotgun_cnt', 'opp_awp_cnt',
                      'opp_flash_cnt', 'opp_smoke_cnt', 'opp_he_cnt', 'opp_molotov_cnt'],
    'Other': ['timeout_flag'],
    'Elo Ratings': ['team_elo_pre_event', 'opp_elo_pre_event', 'elo_diff', 'elo_missing'],
    'Target': ['round_win']
}

total_features = 0
for category, features in feature_categories.items():
    present = [f for f in features if f in df_rounds_final.columns]
    total_features += len(present)
    print(f"\n{category} ({len(present)}/{len(features)}):")
    for f in present:
        print(f"   {f}")
    missing = [f for f in features if f not in df_rounds_final.columns]
    if missing:
        for f in missing:
            print(f"   {f} (MISSING)")

print(f"\n{'='*80}")
print(f"Total features available: {total_features}")

# Check for missing values
print(f"\nMissing Values Check:")
missing_summary = df_rounds_final.isnull().sum()
missing_summary = missing_summary[missing_summary > 0]
if len(missing_summary) > 0:
    print(missing_summary)
else:
    print("   No missing values!")

# ============================================================================
# SECTION 8: Save Processed Data
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2.8: Saving Processed Data")
print("=" * 80)

# Save to CSV
output_file = '/content/rounds_with_elo.csv'
df_rounds_final.to_csv(output_file, index=False)
print(f" Saved processed data to: {output_file}")
print(f"  Shape: {df_rounds_final.shape}")

# Save match-level Elo
match_elo_file = '/content/match_elo.csv'
df_match_elo.to_csv(match_elo_file, index=False)
print(f" Saved match Elo data to: {match_elo_file}")

# Save Elo history
elo_history_file = '/content/elo_history.csv'
df_elo_history = pd.DataFrame(elo_system.rating_history)
df_elo_history.to_csv(elo_history_file, index=False)
print(f" Saved Elo history to: {elo_history_file}")

# ============================================================================
# SECTION 9: Data Quality Report
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2.9: Data Quality Report")
print("=" * 80)

print(f"\nDataset Overview:")
print(f"  Total rounds: {len(df_rounds_final):,}")
print(f"  Total matches: {df_rounds_final['match_id'].nunique()}")
print(f"  Total teams: {df_rounds_final['team_name'].nunique()}")
print(f"  Total maps: {df_rounds_final['map'].nunique()}")
print(f"  Total events: {df_rounds_final['event_name'].nunique()}")

print(f"\nElo Statistics:")
print(f"  Mean team Elo: {df_rounds_final['team_elo_pre_event'].mean():.1f}")
print(f"  Std team Elo: {df_rounds_final['team_elo_pre_event'].std():.1f}")
print(f"  Min team Elo: {df_rounds_final['team_elo_pre_event'].min():.1f}")
print(f"  Max team Elo: {df_rounds_final['team_elo_pre_event'].max():.1f}")
print(f"  Mean Elo diff: {df_rounds_final['elo_diff'].mean():.1f}")
print(f"  Std Elo diff: {df_rounds_final['elo_diff'].std():.1f}")
print(f"  Rounds with missing Elo flag: {df_rounds_final['elo_missing'].sum()} ({df_rounds_final['elo_missing'].mean()*100:.1f}%)")

print(f"\nTarget Distribution:")
print(df_rounds_final['round_win'].value_counts(normalize=True))

print(f"\nMap Distribution:")
print(df_rounds_final['map'].value_counts())

print(f"\nSide Distribution:")
print(df_rounds_final['side'].value_counts())

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "=" * 80)
print("STEP 2 COMPLETE: Summary")
print("=" * 80)

print("""
 Feature engineering completed successfully!

Accomplishment:
----------------------
1. Extracted series_type from matches.xlsx
2. Implemented Elo rating system with event-based freezing
3. Calculated pre-event Elo ratings for all teams chronologically
4. Successfully merged Elo ratings into round-level data (100% coverage)
5. Validated all required features are present
6. Saved processed dataset for model training

Key Statistics:
---------------
""")
print(f"  - Total rounds: {len(df_rounds_final):,}")
print(f"  - Rounds with Elo: {(df_rounds_final['team_elo_pre_event'].notna()).sum():,}")
print(f"  - Unique teams rated: {len(elo_system.ratings)}")
print(f"  - Elo rating range: {df_rounds_final['team_elo_pre_event'].min():.0f} - {df_rounds_final['team_elo_pre_event'].max():.0f}")
print(f"  - Features available: {total_features}")

print("""
Files saved:
------------
  - rounds_with_elo.csv (main dataset with all features)
  - match_elo.csv (match-level Elo ratings)
  - elo_history.csv (rating evolution over time)

""")

print("=" * 80)
print("END OF STEP 2")
print("=" * 80)

STEP 2: FEATURE ENGINEERING & ELO RATING CONSTRUCTION

STEP 2.1: Extracting Series Type from Matches
Extracting series_type from Maps column...

Series Type Distribution:
series_type
3    101
Name: count, dtype: int64

 Series type extracted for 101 matches

STEP 2.2: Preparing Match-Level Data for Elo
 Prepared 101 matches sorted chronologically
  Date range: 2022-10-01 11:00:00 to 2022-11-03 17:00:00
  Unique teams: 64
  Unique events: 7

STEP 2.3: Implementing Elo Rating System
Initializing Elo rating system...
  K-factor: 32
  Default rating: 1500

STEP 2.4: Calculating Elo Ratings Chronologically

Processing 101 matches chronologically...

  Event 1: 'CS2 Tournament Series - Week 1' - Froze ratings for 19 teams
  Event 2: 'CS2 Tournament Series - Week 2' - Froze ratings for 21 teams
  Event 3: 'CS2 Tournament Series - Week 3' - Froze ratings for 24 teams
  Event 4: 'CS2 Tournament Series - Week 4' - Froze ratings for 18 teams
  Event 5: 'CS2 Tournament Series - Week 5' - Froze rat

**CS2 Match Prediction System - Step 3: Data Preparation & Train/Test Split**

**This script:**
1. Loads processed data from Step 2
2. Encodes categorical features
3. Creates event-based train/test splits
4. Prepares feature matrices for modeling
5. Validates data quality

**Prerequisites:** Step 2 completed with rounds_with_elo.csv saved

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
import json
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print("STEP 3: DATA PREPARATION & TRAIN/TEST SPLIT")
print("=" * 80)

# ============================================================================
# SECTION 1: Load Processed Data
# ============================================================================
print("\n" + "=" * 80)
print("STEP 3.1: Loading Processed Data from Step 2")
print("=" * 80)

df_rounds = pd.read_csv('/content/rounds_with_elo.csv')
print(f" Loaded rounds_with_elo.csv: {len(df_rounds):,} rows")
print(f"  Columns: {len(df_rounds.columns)}")
print(f"  Matches: {df_rounds['match_id'].nunique()}")
print(f"  Events: {df_rounds['event_name'].nunique()}")

# Verify Elo columns are present
elo_cols = ['team_elo_pre_event', 'opp_elo_pre_event', 'elo_diff', 'elo_missing']
missing_elo = [col for col in elo_cols if col not in df_rounds.columns]
if missing_elo:
    print(f"   Missing Elo columns: {missing_elo}")
else:
    print(f"   All Elo columns present")

# ============================================================================
# SECTION 2: Define Feature Sets
# ============================================================================
print("\n" + "=" * 80)
print("STEP 3.2: Defining Feature Sets")
print("=" * 80)

# Base freeze-time features (no Elo)
freeze_time_features = [
    # Context (exclude map and side - will be encoded)
    'series_type', 'is_pistol', 'is_ot',
    # Economy & Score
    'score_diff', 'start_cash', 'loss_bonus', 'consec_losses', 'equip_value',
    # Weapons
    'rifle_cnt', 'smg_cnt', 'shotgun_cnt', 'awp_cnt',
    # Armor & Utility
    'helmets', 'kevlar', 'kits',
    # Grenades
    'flash_cnt', 'smoke_cnt', 'he_cnt', 'molotov_cnt',
    # Opponent info
    'opp_rifle_cnt', 'opp_smg_cnt', 'opp_shotgun_cnt', 'opp_awp_cnt',
    'opp_flash_cnt', 'opp_smoke_cnt', 'opp_he_cnt', 'opp_molotov_cnt',
    # Other
    'timeout_flag'
]

# Elo features
elo_features = [
    'team_elo_pre_event', 'opp_elo_pre_event', 'elo_diff', 'elo_missing'
]

# Categorical features for encoding
categorical_features = ['map', 'side']

# Target
target = 'round_win'

print(f"Feature Sets Defined:")
print(f"  Freeze-time features: {len(freeze_time_features)}")
print(f"  Elo features: {len(elo_features)}")
print(f"  Categorical features: {len(categorical_features)}")
print(f"  Total numeric features: {len(freeze_time_features) + len(elo_features)}")

# Verify all features exist
all_numeric_features = freeze_time_features + elo_features
missing_features = [f for f in all_numeric_features if f not in df_rounds.columns]
missing_cat = [f for f in categorical_features if f not in df_rounds.columns]

if missing_features or missing_cat:
    print(f"\n Missing features: {missing_features + missing_cat}")
else:
    print(f"\n All features present in dataset")

# ============================================================================
# SECTION 3: Encode Categorical Features
# ============================================================================
print("\n" + "=" * 80)
print("STEP 3.3: Encoding Categorical Features")
print("=" * 80)

df_encoded = df_rounds.copy()

print("\nEncoding categorical features using one-hot encoding...")

# Store mapping for later use
categorical_mappings = {}

for cat_feat in categorical_features:
    print(f"\n  Encoding '{cat_feat}':")
    print(f"    Unique values: {df_encoded[cat_feat].nunique()}")
    print(f"    Values: {sorted(df_encoded[cat_feat].unique())}")

    # One-hot encode with drop_first=True to avoid multicollinearity
    dummies = pd.get_dummies(df_encoded[cat_feat], prefix=cat_feat, drop_first=True)

    print(f"    Created {len(dummies.columns)} dummy variables")

    # Add to dataframe
    df_encoded = pd.concat([df_encoded, dummies], axis=1)

    # Store mapping
    categorical_mappings[cat_feat] = {
        'original_values': sorted(df_encoded[cat_feat].unique().tolist()),
        'dummy_columns': dummies.columns.tolist()
    }

print(f"\n Categorical encoding complete")
print(f"Dataset shape: {df_rounds.shape} → {df_encoded.shape}")

# Create final feature list (numeric + encoded categoricals)
encoded_categorical_features = []
for cat_feat in categorical_features:
    encoded_categorical_features.extend(categorical_mappings[cat_feat]['dummy_columns'])

# All features for models
features_freeze_only = freeze_time_features + encoded_categorical_features
features_with_elo = freeze_time_features + encoded_categorical_features + elo_features

print(f"\nFinal Feature Counts:")
print(f"  Freeze-time only: {len(features_freeze_only)}")
print(f"  With Elo: {len(features_with_elo)}")

# ============================================================================
# SECTION 4: Create Event-Based Groups for Cross-Validation
# ============================================================================
print("\n" + "=" * 80)
print("STEP 3.4: Creating Event-Based Groups for Cross-Validation")
print("=" * 80)

# Create event group IDs for CV
event_to_id = {event: i for i, event in enumerate(sorted(df_encoded['event_name'].unique()))}
df_encoded['event_group'] = df_encoded['event_name'].map(event_to_id)

print(f" Created {df_encoded['event_group'].nunique()} event groups")
print(f"\nEvent Group Mapping:")
for event, group_id in sorted(event_to_id.items(), key=lambda x: x[1]):
    count = (df_encoded['event_group'] == group_id).sum()
    print(f"  Group {group_id}: {event:40s} ({count:,} rounds)")

# ============================================================================
# SECTION 5: Train/Test Split (Event-Based)
# ============================================================================
print("\n" + "=" * 80)
print("STEP 3.5: Creating Train/Test Split (Event-Based)")
print("=" * 80)

# Use last 2 event groups as test set (~20-25% of data)
test_event_groups = [5, 6]  # Last 2 events
train_mask = ~df_encoded['event_group'].isin(test_event_groups)

df_train = df_encoded[train_mask].copy()
df_test = df_encoded[~train_mask].copy()

print(f"\nTrain/Test Split:")
print(f"  Train set: {len(df_train):,} rounds ({len(df_train)/len(df_encoded)*100:.1f}%)")
print(f"  Test set: {len(df_test):,} rounds ({len(df_test)/len(df_encoded)*100:.1f}%)")

print(f"\nTrain set:")
print(f"  Matches: {df_train['match_id'].nunique()}")
print(f"  Events: {df_train['event_name'].nunique()}")
print(f"  Event groups: {sorted(df_train['event_group'].unique())}")
print(f"  Win rate: {df_train[target].mean():.4f}")

print(f"\nTest set:")
print(f"  Matches: {df_test['match_id'].nunique()}")
print(f"  Events: {df_test['event_name'].nunique()}")
print(f"  Event groups: {sorted(df_test['event_group'].unique())}")
print(f"  Win rate: {df_test[target].mean():.4f}")

# ============================================================================
# SECTION 6: Prepare Feature Matrices
# ============================================================================
print("\n" + "=" * 80)
print("STEP 3.6: Preparing Feature Matrices")
print("=" * 80)

# Create feature matrices
print("\nCreating feature matrices...")

# Freeze-time only (for Baseline B without Elo)
X_train_freeze = df_train[features_freeze_only].copy()
X_test_freeze = df_test[features_freeze_only].copy()

# With Elo (for Baseline B with Elo and Main Model)
X_train_full = df_train[features_with_elo].copy()
X_test_full = df_test[features_with_elo].copy()

# Target
y_train = df_train[target].copy()
y_test = df_test[target].copy()

# Groups for CV (event groups)
groups_train = df_train['event_group'].copy()
groups_test = df_test['event_group'].copy()

print(f"\nFeature Matrix Dimensions:")
print(f"  X_train (freeze-only): {X_train_freeze.shape}")
print(f"  X_test (freeze-only): {X_test_freeze.shape}")
print(f"  X_train (with Elo): {X_train_full.shape}")
print(f"  X_test (with Elo): {X_test_full.shape}")
print(f"  y_train: {y_train.shape}")
print(f"  y_test: {y_test.shape}")

# Verify no missing values
print(f"\nMissing Values Check:")
missing_train_freeze = X_train_freeze.isnull().sum().sum()
missing_train_full = X_train_full.isnull().sum().sum()
missing_test_freeze = X_test_freeze.isnull().sum().sum()
missing_test_full = X_test_full.isnull().sum().sum()

if missing_train_freeze + missing_train_full + missing_test_freeze + missing_test_full == 0:
    print(f"   No missing values in any feature matrix!")
else:
    print(f"   Missing values detected:")
    print(f"    Train (freeze): {missing_train_freeze}")
    print(f"    Train (full): {missing_train_full}")
    print(f"    Test (freeze): {missing_test_freeze}")
    print(f"    Test (full): {missing_test_full}")

# ============================================================================
# SECTION 7: Baseline A Preparation (Map + Side Win Rates)
# ============================================================================
print("\n" + "=" * 80)
print("STEP 3.7: Preparing Data for Baseline A (Map + Side Win Rates)")
print("=" * 80)

# Calculate map+side win rates from training data
baseline_a_stats = df_train.groupby(['map', 'side'])[target].agg(['mean', 'count']).reset_index()
baseline_a_stats.columns = ['map', 'side', 'win_rate', 'count']

print(f"\nMap + Side Win Rates (from training data):")
print(baseline_a_stats.to_string(index=False))

# Save for Baseline A model
baseline_a_stats.to_csv('/content/baseline_a_stats.csv', index=False)
print(f"\n Saved baseline A statistics to: /content/baseline_a_stats.csv")

# ============================================================================
# SECTION 8: Save Prepared Data
# ============================================================================
print("\n" + "=" * 80)
print("STEP 3.8: Saving Prepared Data")
print("=" * 80)

# Save train/test splits
df_train.to_csv('/content/train_data.csv', index=False)
df_test.to_csv('/content/test_data.csv', index=False)
print(f" Saved train_data.csv and test_data.csv")

# Save feature matrices as numpy arrays for faster loading
np.save('/content/X_train_freeze.npy', X_train_freeze.values)
np.save('/content/X_test_freeze.npy', X_test_freeze.values)
np.save('/content/X_train_full.npy', X_train_full.values)
np.save('/content/X_test_full.npy', X_test_full.values)
np.save('/content/y_train.npy', y_train.values)
np.save('/content/y_test.npy', y_test.values)
np.save('/content/groups_train.npy', groups_train.values)
print(f" Saved feature matrices as .npy files")

# Save feature information
feature_info = {
    'features_freeze_only': features_freeze_only,
    'features_with_elo': features_with_elo,
    'freeze_time_features': freeze_time_features,
    'elo_features': elo_features,
    'categorical_features': categorical_features,
    'categorical_mappings': categorical_mappings,
    'encoded_categorical_features': encoded_categorical_features,
    'target': target,
    'n_train': len(df_train),
    'n_test': len(df_test),
    'n_features_freeze': len(features_freeze_only),
    'n_features_full': len(features_with_elo),
    'test_event_groups': test_event_groups,
    'event_to_id_mapping': event_to_id
}

with open('/content/feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)

print(f" Saved feature_info.json")

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "=" * 80)
print("STEP 3 COMPLETE: Summary")
print("=" * 80)

print("""
 Data preparation completed successfully!

Accomplishment:
----------------------
1. Loaded processed data with Elo ratings
2. Encoded categorical features (map, side) using one-hot encoding
3. Created event-based train/test split (no match overlap)
4. Prepared feature matrices for all model variants
5. Prepared baseline A statistics (map + side win rates)
6. Validated data quality (no missing values)
7. Saved all processed data and metadata

Files Saved:
------------
  - train_data.csv, test_data.csv (full datasets)
  - X_train_freeze.npy, X_test_freeze.npy (freeze-time features)
  - X_train_full.npy, X_test_full.npy (all features with Elo)
  - y_train.npy, y_test.npy (targets)
  - groups_train.npy (event groups for CV)
  - feature_info.json (feature metadata)
  - baseline_a_stats.csv (map+side win rates)

Key Statistics:
---------------
""")
print(f"  - Training samples: {len(df_train):,}")
print(f"  - Test samples: {len(df_test):,}")
print(f"  - Features (freeze-only): {len(features_freeze_only)}")
print(f"  - Features (with Elo): {len(features_with_elo)}")
print(f"  - Event groups (train): {len(df_train['event_group'].unique())}")
print(f"  - Event groups (test): {len(df_test['event_group'].unique())}")

print("""
Next Step
""")

print("=" * 80)
print("END OF STEP 3")
print("=" * 80)

STEP 3: DATA PREPARATION & TRAIN/TEST SPLIT

STEP 3.1: Loading Processed Data from Step 2
 Loaded rounds_with_elo.csv: 10,000 rows
  Columns: 42
  Matches: 101
  Events: 7
   All Elo columns present

STEP 3.2: Defining Feature Sets
Feature Sets Defined:
  Freeze-time features: 28
  Elo features: 4
  Categorical features: 2
  Total numeric features: 32

 All features present in dataset

STEP 3.3: Encoding Categorical Features

Encoding categorical features using one-hot encoding...

  Encoding 'map':
    Unique values: 7
    Values: ['de_ancient', 'de_dust2', 'de_inferno', 'de_mirage', 'de_nuke', 'de_overpass', 'de_vertigo']
    Created 6 dummy variables

  Encoding 'side':
    Unique values: 2
    Values: ['CT', 'T']
    Created 1 dummy variables

 Categorical encoding complete
Dataset shape: (10000, 42) → (10000, 49)

Final Feature Counts:
  Freeze-time only: 35
  With Elo: 39

STEP 3.4: Creating Event-Based Groups for Cross-Validation
 Created 7 event groups

Event Group Mapping:
  G

**CS2 Match Prediction System - Step 4: Baseline Models (A & B)**


**This script**:
1. Implements Baseline A (Map + Side average win rates)
2. Implements Baseline B (Logistic Regression without Elo)
3. Implements Baseline B+ (Logistic Regression with Elo)
4. Evaluates all baselines on train and test sets
5. Compares performance metrics

**Prerequisites**: Step 3 completed with prepared data saved


In [4]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score, accuracy_score
from sklearn.model_selection import GroupKFold
import json
import joblib
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print("STEP 4: BASELINE MODELS (A & B)")
print("=" * 80)

# ============================================================================
# SECTION 1: Load Prepared Data
# ============================================================================
print("\n" + "=" * 80)
print("STEP 4.1: Loading Prepared Data from Step 3")
print("=" * 80)

# Load feature matrices (allow_pickle=True for object arrays)
X_train_freeze = np.load('/content/X_train_freeze.npy', allow_pickle=True)
X_test_freeze = np.load('/content/X_test_freeze.npy', allow_pickle=True)
X_train_full = np.load('/content/X_train_full.npy', allow_pickle=True)
X_test_full = np.load('/content/X_test_full.npy', allow_pickle=True)
y_train = np.load('/content/y_train.npy', allow_pickle=True)
y_test = np.load('/content/y_test.npy', allow_pickle=True)
groups_train = np.load('/content/groups_train.npy', allow_pickle=True)

print(f" Loaded feature matrices")
print(f"  X_train (freeze): {X_train_freeze.shape}")
print(f"  X_train (with Elo): {X_train_full.shape}")
print(f"  y_train: {y_train.shape}")
print(f"  y_test: {y_test.shape}")

# Load metadata
with open('/content/feature_info.json', 'r') as f:
    feature_info = json.load(f)

print(f"\n Loaded feature info")
print(f"  Freeze-time features: {feature_info['n_features_freeze']}")
print(f"  Full features (with Elo): {feature_info['n_features_full']}")

# Load train/test dataframes for Baseline A
df_train = pd.read_csv('/content/train_data.csv')
df_test = pd.read_csv('/content/test_data.csv')
baseline_a_stats = pd.read_csv('/content/baseline_a_stats.csv')

print(f"\n Loaded train/test dataframes")

# ============================================================================
# SECTION 2: Baseline A - Map + Side Win Rates
# ============================================================================
print("\n" + "=" * 80)
print("STEP 4.2: Baseline A - Map + Side Win Rates")
print("=" * 80)

print("\nBaseline A uses empirical win rates for each (map, side) combination.")
print("This is the simplest baseline that captures only contextual bias.\n")

print("Win rate lookup table (from training data):")
print(baseline_a_stats.to_string(index=False))

def predict_baseline_a(df, stats_table):
    """
    Predict using map + side win rates
    """
    predictions = []

    for _, row in df.iterrows():
        map_name = row['map']
        side = row['side']

        # Look up win rate
        match = stats_table[(stats_table['map'] == map_name) & (stats_table['side'] == side)]

        if len(match) > 0:
            win_rate = match['win_rate'].values[0]
        else:
            # Fallback to overall average if combination not seen
            win_rate = 0.5

        predictions.append(win_rate)

    return np.array(predictions)

# Make predictions
print("\nGenerating Baseline A predictions...")
y_pred_train_a = predict_baseline_a(df_train, baseline_a_stats)
y_pred_test_a = predict_baseline_a(df_test, baseline_a_stats)

print(f" Generated predictions")
print(f"  Train predictions: {len(y_pred_train_a)}")
print(f"  Test predictions: {len(y_pred_test_a)}")
print(f"  Mean prediction (train): {y_pred_train_a.mean():.4f}")
print(f"  Mean prediction (test): {y_pred_test_a.mean():.4f}")

# Evaluate Baseline A
print("\nBaseline A Performance:")
print("-" * 40)

train_logloss_a = log_loss(y_train, y_pred_train_a)
train_brier_a = brier_score_loss(y_train, y_pred_train_a)
train_auc_a = roc_auc_score(y_train, y_pred_train_a)
train_acc_a = accuracy_score(y_train, (y_pred_train_a > 0.5).astype(int))

test_logloss_a = log_loss(y_test, y_pred_test_a)
test_brier_a = brier_score_loss(y_test, y_pred_test_a)
test_auc_a = roc_auc_score(y_test, y_pred_test_a)
test_acc_a = accuracy_score(y_test, (y_pred_test_a > 0.5).astype(int))

print(f"Training Set:")
print(f"  Log Loss:    {train_logloss_a:.4f}")
print(f"  Brier Score: {train_brier_a:.4f}")
print(f"  ROC AUC:     {train_auc_a:.4f}")
print(f"  Accuracy:    {train_acc_a:.4f}")

print(f"\nTest Set:")
print(f"  Log Loss:    {test_logloss_a:.4f}")
print(f"  Brier Score: {test_brier_a:.4f}")
print(f"  ROC AUC:     {test_auc_a:.4f}")
print(f"  Accuracy:    {test_acc_a:.4f}")

# ============================================================================
# SECTION 3: Baseline B - Logistic Regression (No Elo)
# ============================================================================
print("\n" + "=" * 80)
print("STEP 4.3: Baseline B - Logistic Regression (Freeze-Time Only)")
print("=" * 80)

print("\nBaseline B uses regularized logistic regression on freeze-time features.")
print("This establishes how well we can predict using equipment, economy, etc.\n")

# Train logistic regression
print("Training Logistic Regression (no Elo)...")
lr_freeze = LogisticRegression(
    penalty='l2',
    C=1.0,
    max_iter=1000,
    random_state=42,
    solver='lbfgs'
)

lr_freeze.fit(X_train_freeze, y_train)
print(f" Model trained")

# Make predictions
y_pred_train_b = lr_freeze.predict_proba(X_train_freeze)[:, 1]
y_pred_test_b = lr_freeze.predict_proba(X_test_freeze)[:, 1]

print(f"\nPrediction statistics:")
print(f"  Mean prediction (train): {y_pred_train_b.mean():.4f}")
print(f"  Mean prediction (test): {y_pred_test_b.mean():.4f}")
print(f"  Std prediction (train): {y_pred_train_b.std():.4f}")
print(f"  Std prediction (test): {y_pred_test_b.std():.4f}")

# Evaluate Baseline B
print("\nBaseline B Performance:")
print("-" * 40)

train_logloss_b = log_loss(y_train, y_pred_train_b)
train_brier_b = brier_score_loss(y_train, y_pred_train_b)
train_auc_b = roc_auc_score(y_train, y_pred_train_b)
train_acc_b = accuracy_score(y_train, (y_pred_train_b > 0.5).astype(int))

test_logloss_b = log_loss(y_test, y_pred_test_b)
test_brier_b = brier_score_loss(y_test, y_pred_test_b)
test_auc_b = roc_auc_score(y_test, y_pred_test_b)
test_acc_b = accuracy_score(y_test, (y_pred_test_b > 0.5).astype(int))

print(f"Training Set:")
print(f"  Log Loss:    {train_logloss_b:.4f}")
print(f"  Brier Score: {train_brier_b:.4f}")
print(f"  ROC AUC:     {train_auc_b:.4f}")
print(f"  Accuracy:    {train_acc_b:.4f}")

print(f"\nTest Set:")
print(f"  Log Loss:    {test_logloss_b:.4f}")
print(f"  Brier Score: {test_brier_b:.4f}")
print(f"  ROC AUC:     {test_auc_b:.4f}")
print(f"  Accuracy:    {test_acc_b:.4f}")

# Feature importance
print("\nTop 10 Most Important Features (by coefficient magnitude):")
feature_names = feature_info['features_freeze_only']
coefficients = lr_freeze.coef_[0]
feature_importance = list(zip(feature_names, coefficients))
feature_importance_sorted = sorted(feature_importance, key=lambda x: abs(x[1]), reverse=True)

for i, (feat, coef) in enumerate(feature_importance_sorted[:10], 1):
    print(f"  {i:2d}. {feat:30s} {coef:+.4f}")

# ============================================================================
# SECTION 4: Baseline B+ - Logistic Regression (With Elo)
# ============================================================================
print("\n" + "=" * 80)
print("STEP 4.4: Baseline B+ - Logistic Regression (With Elo)")
print("=" * 80)

print("\nBaseline B+ adds Elo ratings to understand their incremental value.\n")

# Train logistic regression with Elo
print("Training Logistic Regression (with Elo)...")
lr_full = LogisticRegression(
    penalty='l2',
    C=1.0,
    max_iter=1000,
    random_state=42,
    solver='lbfgs'
)

lr_full.fit(X_train_full, y_train)
print(f" Model trained")

# Make predictions
y_pred_train_b_plus = lr_full.predict_proba(X_train_full)[:, 1]
y_pred_test_b_plus = lr_full.predict_proba(X_test_full)[:, 1]

print(f"\nPrediction statistics:")
print(f"  Mean prediction (train): {y_pred_train_b_plus.mean():.4f}")
print(f"  Mean prediction (test): {y_pred_test_b_plus.mean():.4f}")
print(f"  Std prediction (train): {y_pred_train_b_plus.std():.4f}")
print(f"  Std prediction (test): {y_pred_test_b_plus.std():.4f}")

# Evaluate Baseline B+
print("\nBaseline B+ Performance:")
print("-" * 40)

train_logloss_b_plus = log_loss(y_train, y_pred_train_b_plus)
train_brier_b_plus = brier_score_loss(y_train, y_pred_train_b_plus)
train_auc_b_plus = roc_auc_score(y_train, y_pred_train_b_plus)
train_acc_b_plus = accuracy_score(y_train, (y_pred_train_b_plus > 0.5).astype(int))

test_logloss_b_plus = log_loss(y_test, y_pred_test_b_plus)
test_brier_b_plus = brier_score_loss(y_test, y_pred_test_b_plus)
test_auc_b_plus = roc_auc_score(y_test, y_pred_test_b_plus)
test_acc_b_plus = accuracy_score(y_test, (y_pred_test_b_plus > 0.5).astype(int))

print(f"Training Set:")
print(f"  Log Loss:    {train_logloss_b_plus:.4f}")
print(f"  Brier Score: {train_brier_b_plus:.4f}")
print(f"  ROC AUC:     {train_auc_b_plus:.4f}")
print(f"  Accuracy:    {train_acc_b_plus:.4f}")

print(f"\nTest Set:")
print(f"  Log Loss:    {test_logloss_b_plus:.4f}")
print(f"  Brier Score: {test_brier_b_plus:.4f}")
print(f"  ROC AUC:     {test_auc_b_plus:.4f}")
print(f"  Accuracy:    {test_acc_b_plus:.4f}")

# Feature importance for Elo features
print("\nElo Feature Coefficients:")
feature_names_full = feature_info['features_with_elo']
coefficients_full = lr_full.coef_[0]
elo_feature_indices = [i for i, name in enumerate(feature_names_full) if 'elo' in name.lower()]

for idx in elo_feature_indices:
    feat_name = feature_names_full[idx]
    coef = coefficients_full[idx]
    print(f"  {feat_name:30s} {coef:+.4f}")

# ============================================================================
# SECTION 5: Model Comparison
# ============================================================================
print("\n" + "=" * 80)
print("STEP 4.5: Baseline Model Comparison")
print("=" * 80)

print("\nTest Set Performance Comparison:")
print("=" * 80)
print(f"{'Model':<25} {'Log Loss':<12} {'Brier':<12} {'AUC':<12} {'Accuracy':<12}")
print("-" * 80)
print(f"{'Baseline A (Map+Side)':<25} {test_logloss_a:<12.4f} {test_brier_a:<12.4f} {test_auc_a:<12.4f} {test_acc_a:<12.4f}")
print(f"{'Baseline B (No Elo)':<25} {test_logloss_b:<12.4f} {test_brier_b:<12.4f} {test_auc_b:<12.4f} {test_acc_b:<12.4f}")
print(f"{'Baseline B+ (With Elo)':<25} {test_logloss_b_plus:<12.4f} {test_brier_b_plus:<12.4f} {test_auc_b_plus:<12.4f} {test_acc_b_plus:<12.4f}")
print("=" * 80)

# Calculate improvements
print("\nImprovements over Baseline A:")
print("-" * 40)
print(f"Baseline B (No Elo):")
print(f"  Log Loss improvement: {(test_logloss_a - test_logloss_b)/test_logloss_a*100:+.2f}%")
print(f"  Brier improvement: {(test_brier_a - test_brier_b)/test_brier_a*100:+.2f}%")
print(f"  AUC improvement: {(test_auc_b - test_auc_a)/test_auc_a*100:+.2f}%")

print(f"\nBaseline B+ (With Elo):")
print(f"  Log Loss improvement: {(test_logloss_a - test_logloss_b_plus)/test_logloss_a*100:+.2f}%")
print(f"  Brier improvement: {(test_brier_a - test_brier_b_plus)/test_brier_a*100:+.2f}%")
print(f"  AUC improvement: {(test_auc_b_plus - test_auc_a)/test_auc_a*100:+.2f}%")

print("\nValue of Elo (B+ vs B):")
print("-" * 40)
print(f"  Log Loss improvement: {(test_logloss_b - test_logloss_b_plus)/test_logloss_b*100:+.2f}%")
print(f"  Brier improvement: {(test_brier_b - test_brier_b_plus)/test_brier_b*100:+.2f}%")
print(f"  AUC improvement: {(test_auc_b_plus - test_auc_b)/test_auc_b*100:+.2f}%")

# ============================================================================
# SECTION 6: Save Models and Results
# ============================================================================
print("\n" + "=" * 80)
print("STEP 4.6: Saving Models and Results")
print("=" * 80)

# Save models
joblib.dump(lr_freeze, '/content/baseline_b_model.pkl')
joblib.dump(lr_full, '/content/baseline_b_plus_model.pkl')
print(f" Saved models:")
print(f"  - baseline_b_model.pkl (no Elo)")
print(f"  - baseline_b_plus_model.pkl (with Elo)")

# Save predictions
np.save('/content/baseline_a_train_pred.npy', y_pred_train_a)
np.save('/content/baseline_a_test_pred.npy', y_pred_test_a)
np.save('/content/baseline_b_train_pred.npy', y_pred_train_b)
np.save('/content/baseline_b_test_pred.npy', y_pred_test_b)
np.save('/content/baseline_b_plus_train_pred.npy', y_pred_train_b_plus)
np.save('/content/baseline_b_plus_test_pred.npy', y_pred_test_b_plus)
print(f"\n Saved predictions for all baseline models")

# Save results summary
results = {
    'baseline_a': {
        'train': {'log_loss': train_logloss_a, 'brier': train_brier_a, 'auc': train_auc_a, 'accuracy': train_acc_a},
        'test': {'log_loss': test_logloss_a, 'brier': test_brier_a, 'auc': test_auc_a, 'accuracy': test_acc_a}
    },
    'baseline_b': {
        'train': {'log_loss': train_logloss_b, 'brier': train_brier_b, 'auc': train_auc_b, 'accuracy': train_acc_b},
        'test': {'log_loss': test_logloss_b, 'brier': test_brier_b, 'auc': test_auc_b, 'accuracy': test_acc_b}
    },
    'baseline_b_plus': {
        'train': {'log_loss': train_logloss_b_plus, 'brier': train_brier_b_plus, 'auc': train_auc_b_plus, 'accuracy': train_acc_b_plus},
        'test': {'log_loss': test_logloss_b_plus, 'brier': test_brier_b_plus, 'auc': test_auc_b_plus, 'accuracy': test_acc_b_plus}
    }
}

with open('/content/baseline_results.json', 'w') as f:
    json.dump(results, f, indent=2)

print(f" Saved baseline_results.json")

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "=" * 80)
print("STEP 4 COMPLETE: Summary")
print("=" * 80)

print("""
 All baseline models trained and evaluated!

Accomplishment:
----------------------
1. Baseline A: Map + Side win rates (simplest baseline)
2. Baseline B: Logistic regression on freeze-time features
3. Baseline B+: Logistic regression with Elo ratings
4. Comprehensive evaluation on train and test sets
5. Model comparison and analysis of Elo value

Key Findings:
-------------
""")
print(f"  Best Model: {'Baseline B+' if test_logloss_b_plus < min(test_logloss_a, test_logloss_b) else 'Baseline B' if test_logloss_b < test_logloss_a else 'Baseline A'}")
print(f"  Test Log Loss:")
print(f"    - Baseline A: {test_logloss_a:.4f}")
print(f"    - Baseline B: {test_logloss_b:.4f}")
print(f"    - Baseline B+: {test_logloss_b_plus:.4f}")
print(f"  Elo provides: {(test_logloss_b - test_logloss_b_plus)/test_logloss_b*100:.2f}% improvement")

print("""
Files Saved:
------------
  - baseline_b_model.pkl (logistic regression without Elo)
  - baseline_b_plus_model.pkl (logistic regression with Elo)
  - baseline_a_train_pred.npy, baseline_a_test_pred.npy
  - baseline_b_train_pred.npy, baseline_b_test_pred.npy
  - baseline_b_plus_train_pred.npy, baseline_b_plus_test_pred.npy
  - baseline_results.json (all metrics)

""")

print("=" * 80)
print("END OF STEP 4")
print("=" * 80)

STEP 4: BASELINE MODELS (A & B)

STEP 4.1: Loading Prepared Data from Step 3
 Loaded feature matrices
  X_train (freeze): (7200, 35)
  X_train (with Elo): (7200, 39)
  y_train: (7200,)
  y_test: (2800,)

 Loaded feature info
  Freeze-time features: 35
  Full features (with Elo): 39

 Loaded train/test dataframes

STEP 4.2: Baseline A - Map + Side Win Rates

Baseline A uses empirical win rates for each (map, side) combination.
This is the simplest baseline that captures only contextual bias.

Win rate lookup table (from training data):
        map side  win_rate  count
 de_ancient   CT  0.686667    600
 de_ancient    T  0.426667    600
   de_dust2   CT  0.624286    700
   de_dust2    T  0.481429    700
 de_inferno   CT  0.705128    546
 de_inferno    T  0.454212    546
  de_mirage   CT  0.638079    583
  de_mirage    T  0.456261    583
    de_nuke   CT  0.658333    600
    de_nuke    T  0.486667    600
de_overpass   CT  0.694704    321
de_overpass    T  0.436137    321
 de_vertigo   CT 

**CS2 Match Prediction System - Step 5: Main Model Training (LightGBM)**

**This script**:
1. Trains LightGBM classifier on full feature set
2. Uses event-based cross-validation with early stopping
3. Applies isotonic calibration for well-calibrated probabilities
4. Evaluates on test set
5. Compares to baseline models

**Prerequisites**: Step 4 completed with baselines established



In [13]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score, accuracy_score
from sklearn.model_selection import GroupKFold
import json
import joblib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print("STEP 5: MAIN MODEL TRAINING (LightGBM)")
print("=" * 80)

# ============================================================================
# SECTION 1: Load Prepared Data
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5.1: Loading Prepared Data")
print("=" * 80)

# Load feature matrices
X_train_full = np.load('/content/X_train_full.npy', allow_pickle=True)
X_test_full = np.load('/content/X_test_full.npy', allow_pickle=True)
y_train = np.load('/content/y_train.npy', allow_pickle=True)
y_test = np.load('/content/y_test.npy', allow_pickle=True)
groups_train = np.load('/content/groups_train.npy', allow_pickle=True)

print(f" Loaded feature matrices")
print(f"  X_train: {X_train_full.shape}")
print(f"  X_test: {X_test_full.shape}")
print(f"  y_train: {y_train.shape}")
print(f"  y_test: {y_test.shape}")

# Load metadata
with open('/content/feature_info.json', 'r') as f:
    feature_info = json.load(f)

feature_names = feature_info['features_with_elo']
print(f"\n Loaded feature names ({len(feature_names)} features)")

# Load baseline results for comparison
with open('/content/baseline_results.json', 'r') as f:
    baseline_results = json.load(f)

print(f" Loaded baseline results for comparison")

# ============================================================================
# SECTION 2: Create Validation Split for Calibration
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5.2: Creating Validation Split for Calibration")
print("=" * 80)

# Use one event group for validation/calibration
val_event_group = 4  # Use event group 4 for validation
train_mask = groups_train != val_event_group
val_mask = groups_train == val_event_group

X_train_lgb = X_train_full[train_mask]
y_train_lgb = y_train[train_mask]
X_val_lgb = X_train_full[val_mask]
y_val_lgb = y_train[val_mask]

print(f"Train/Validation Split:")
print(f"  Training: {len(X_train_lgb):,} rounds")
print(f"  Validation: {len(X_val_lgb):,} rounds")
print(f"  Test: {len(X_test_full):,} rounds")

# ============================================================================
# SECTION 3: Train LightGBM Model
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5.3: Training LightGBM Model")
print("=" * 80)

print("\nLightGBM Hyperparameters:")
lgb_params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'max_depth': -1,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42,
    'n_jobs': -1
}

for param, value in lgb_params.items():
    print(f"  {param}: {value}")

# Create LightGBM datasets
print("\nCreating LightGBM datasets...")
train_data = lgb.Dataset(X_train_lgb, label=y_train_lgb, feature_name=feature_names)
val_data = lgb.Dataset(X_val_lgb, label=y_val_lgb, feature_name=feature_names, reference=train_data)

print(" Datasets created")

# Train model with early stopping
print("\nTraining LightGBM with early stopping...")
print("(This may take 1-2 minutes...)")

evals_result = {}
lgb_model = lgb.train(
    lgb_params,
    train_data,
    num_boost_round=1000,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'valid'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        lgb.record_evaluation(evals_result)
    ]
)

print(f"\n Training complete!")
print(f"  Best iteration: {lgb_model.best_iteration}")
print(f"  Training log loss: {evals_result['train']['binary_logloss'][lgb_model.best_iteration-1]:.4f}")
print(f"  Validation log loss: {evals_result['valid']['binary_logloss'][lgb_model.best_iteration-1]:.4f}")

# ============================================================================
# SECTION 4: Evaluate Uncalibrated LightGBM
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5.4: Evaluating Uncalibrated LightGBM")
print("=" * 80)

# Make predictions
y_pred_train_lgb_uncal = lgb_model.predict(X_train_full, num_iteration=lgb_model.best_iteration)
y_pred_test_lgb_uncal = lgb_model.predict(X_test_full, num_iteration=lgb_model.best_iteration)

print(f"\nPrediction statistics (uncalibrated):")
print(f"  Mean prediction (train): {y_pred_train_lgb_uncal.mean():.4f}")
print(f"  Mean prediction (test): {y_pred_test_lgb_uncal.mean():.4f}")
print(f"  Std prediction (train): {y_pred_train_lgb_uncal.std():.4f}")
print(f"  Std prediction (test): {y_pred_test_lgb_uncal.std():.4f}")

# Evaluate
train_logloss_lgb_uncal = log_loss(y_train, y_pred_train_lgb_uncal)
train_brier_lgb_uncal = brier_score_loss(y_train, y_pred_train_lgb_uncal)
train_auc_lgb_uncal = roc_auc_score(y_train, y_pred_train_lgb_uncal)
train_acc_lgb_uncal = accuracy_score(y_train, (y_pred_train_lgb_uncal > 0.5).astype(int))

test_logloss_lgb_uncal = log_loss(y_test, y_pred_test_lgb_uncal)
test_brier_lgb_uncal = brier_score_loss(y_test, y_pred_test_lgb_uncal)
test_auc_lgb_uncal = roc_auc_score(y_test, y_pred_test_lgb_uncal)
test_acc_lgb_uncal = accuracy_score(y_test, (y_pred_test_lgb_uncal > 0.5).astype(int))

print(f"\nLightGBM Uncalibrated Performance:")
print("-" * 40)
print(f"Training Set:")
print(f"  Log Loss:    {train_logloss_lgb_uncal:.4f}")
print(f"  Brier Score: {train_brier_lgb_uncal:.4f}")
print(f"  ROC AUC:     {train_auc_lgb_uncal:.4f}")
print(f"  Accuracy:    {train_acc_lgb_uncal:.4f}")

print(f"\nTest Set:")
print(f"  Log Loss:    {test_logloss_lgb_uncal:.4f}")
print(f"  Brier Score: {test_brier_lgb_uncal:.4f}")
print(f"  ROC AUC:     {test_auc_lgb_uncal:.4f}")
print(f"  Accuracy:    {test_acc_lgb_uncal:.4f}")

# ============================================================================
# SECTION 5: Apply Isotonic Calibration
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5.5: Applying Isotonic Calibration")
print("=" * 80)

print("\nIsotonic calibration maps predictions to better-calibrated probabilities.")
print("Using validation set for calibration...\n")

# Use sklearn's isotonic regression directly
from sklearn.isotonic import IsotonicRegression

# Get predictions on validation set
y_pred_val_lgb = lgb_model.predict(X_val_lgb, num_iteration=lgb_model.best_iteration)

# Fit isotonic regression
print("Applying isotonic calibration...")
isotonic_regressor = IsotonicRegression(out_of_bounds='clip')
isotonic_regressor.fit(y_pred_val_lgb, y_val_lgb)
print(" Calibration complete")

# ============================================================================
# SECTION 6: Evaluate Calibrated LightGBM
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5.6: Evaluating Calibrated LightGBM")
print("=" * 80)

# Make calibrated predictions
# First get uncalibrated predictions, then calibrate them
y_pred_train_lgb_uncal_for_cal = lgb_model.predict(X_train_full, num_iteration=lgb_model.best_iteration)
y_pred_test_lgb_uncal_for_cal = lgb_model.predict(X_test_full, num_iteration=lgb_model.best_iteration)

# Apply calibration
y_pred_train_lgb_cal = isotonic_regressor.transform(y_pred_train_lgb_uncal_for_cal)
y_pred_test_lgb_cal = isotonic_regressor.transform(y_pred_test_lgb_uncal_for_cal)

print(f"\nPrediction statistics (calibrated):")
print(f"  Mean prediction (train): {y_pred_train_lgb_cal.mean():.4f}")
print(f"  Mean prediction (test): {y_pred_test_lgb_cal.mean():.4f}")
print(f"  Std prediction (train): {y_pred_train_lgb_cal.std():.4f}")
print(f"  Std prediction (test): {y_pred_test_lgb_cal.std():.4f}")

# Evaluate
train_logloss_lgb_cal = log_loss(y_train, y_pred_train_lgb_cal)
train_brier_lgb_cal = brier_score_loss(y_train, y_pred_train_lgb_cal)
train_auc_lgb_cal = roc_auc_score(y_train, y_pred_train_lgb_cal)
train_acc_lgb_cal = accuracy_score(y_train, (y_pred_train_lgb_cal > 0.5).astype(int))

test_logloss_lgb_cal = log_loss(y_test, y_pred_test_lgb_cal)
test_brier_lgb_cal = brier_score_loss(y_test, y_pred_test_lgb_cal)
test_auc_lgb_cal = roc_auc_score(y_test, y_pred_test_lgb_cal)
test_acc_lgb_cal = accuracy_score(y_test, (y_pred_test_lgb_cal > 0.5).astype(int))

print(f"\nLightGBM Calibrated Performance:")
print("-" * 40)
print(f"Training Set:")
print(f"  Log Loss:    {train_logloss_lgb_cal:.4f}")
print(f"  Brier Score: {train_brier_lgb_cal:.4f}")
print(f"  ROC AUC:     {train_auc_lgb_cal:.4f}")
print(f"  Accuracy:    {train_acc_lgb_cal:.4f}")

print(f"\nTest Set:")
print(f"  Log Loss:    {test_logloss_lgb_cal:.4f}")
print(f"  Brier Score: {test_brier_lgb_cal:.4f}")
print(f"  ROC AUC:     {test_auc_lgb_cal:.4f}")
print(f"  Accuracy:    {test_acc_lgb_cal:.4f}")

print(f"\nCalibration Impact:")
print(f"  Test log loss change: {test_logloss_lgb_cal - test_logloss_lgb_uncal:+.4f}")
print(f"  Test Brier change: {test_brier_lgb_cal - test_brier_lgb_uncal:+.4f}")

# ============================================================================
# SECTION 7: Feature Importance Analysis
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5.7: Feature Importance Analysis")
print("=" * 80)

# Get feature importance
feature_importance = lgb_model.feature_importance(importance_type='gain')
feature_importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("\nTop 20 Most Important Features (by gain):")
print("-" * 60)
for i, row in feature_importance_df.head(20).iterrows():
    print(f"  {row['feature']:30s} {row['importance']:8.1f}")

# ============================================================================
# SECTION 8: Model Comparison
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5.8: Complete Model Comparison")
print("=" * 80)

# Get baseline results
baseline_a_test = baseline_results['baseline_a']['test']
baseline_b_test = baseline_results['baseline_b']['test']
baseline_b_plus_test = baseline_results['baseline_b_plus']['test']

print("\nTest Set Performance Comparison:")
print("=" * 90)
print(f"{'Model':<30} {'Log Loss':<12} {'Brier':<12} {'AUC':<12} {'Accuracy':<12}")
print("-" * 90)
print(f"{'Baseline A (Map+Side)':<30} {baseline_a_test['log_loss']:<12.4f} {baseline_a_test['brier']:<12.4f} {baseline_a_test['auc']:<12.4f} {baseline_a_test['accuracy']:<12.4f}")
print(f"{'Baseline B (No Elo)':<30} {baseline_b_test['log_loss']:<12.4f} {baseline_b_test['brier']:<12.4f} {baseline_b_test['auc']:<12.4f} {baseline_b_test['accuracy']:<12.4f}")
print(f"{'Baseline B+ (With Elo)':<30} {baseline_b_plus_test['log_loss']:<12.4f} {baseline_b_plus_test['brier']:<12.4f} {baseline_b_plus_test['auc']:<12.4f} {baseline_b_plus_test['accuracy']:<12.4f}")
print(f"{'LightGBM (Uncalibrated)':<30} {test_logloss_lgb_uncal:<12.4f} {test_brier_lgb_uncal:<12.4f} {test_auc_lgb_uncal:<12.4f} {test_acc_lgb_uncal:<12.4f}")
print(f"{'LightGBM (Calibrated)':<30} {test_logloss_lgb_cal:<12.4f} {test_brier_lgb_cal:<12.4f} {test_auc_lgb_cal:<12.4f} {test_acc_lgb_cal:<12.4f}")
print("=" * 90)

# Calculate improvements
best_baseline_logloss = min(baseline_a_test['log_loss'], baseline_b_test['log_loss'], baseline_b_plus_test['log_loss'])
improvement_uncal = (best_baseline_logloss - test_logloss_lgb_uncal) / best_baseline_logloss * 100
improvement_cal = (best_baseline_logloss - test_logloss_lgb_cal) / best_baseline_logloss * 100

print(f"\nImprovement over Best Baseline:")
print(f"  Best baseline log loss: {best_baseline_logloss:.4f}")
print(f"  LightGBM uncalibrated: {improvement_uncal:+.2f}%")
print(f"  LightGBM calibrated: {improvement_cal:+.2f}%")

# ============================================================================
# SECTION 9: Save Models and Results
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5.9: Saving Models and Results")
print("=" * 80)

# Save LightGBM model
lgb_model.save_model('/content/lightgbm_model.txt')
print(" Saved LightGBM model to: lightgbm_model.txt")

# Save isotonic calibrator
joblib.dump(isotonic_regressor, '/content/isotonic_calibrator.pkl')
print(" Saved isotonic calibrator to: isotonic_calibrator.pkl")

# Save predictions
np.save('/content/lgbm_uncal_train_pred.npy', y_pred_train_lgb_uncal)
np.save('/content/lgbm_uncal_test_pred.npy', y_pred_test_lgb_uncal)
np.save('/content/lgbm_cal_train_pred.npy', y_pred_train_lgb_cal)
np.save('/content/lgbm_cal_test_pred.npy', y_pred_test_lgb_cal)
print(" Saved predictions")

# Save feature importance
feature_importance_df.to_csv('/content/feature_importance.csv', index=False)
print(" Saved feature_importance.csv")

# Save results
lgb_results = {
    'lightgbm_uncalibrated': {
        'train': {'log_loss': train_logloss_lgb_uncal, 'brier': train_brier_lgb_uncal, 'auc': train_auc_lgb_uncal, 'accuracy': train_acc_lgb_uncal},
        'test': {'log_loss': test_logloss_lgb_uncal, 'brier': test_brier_lgb_uncal, 'auc': test_auc_lgb_uncal, 'accuracy': test_acc_lgb_uncal}
    },
    'lightgbm_calibrated': {
        'train': {'log_loss': train_logloss_lgb_cal, 'brier': train_brier_lgb_cal, 'auc': train_auc_lgb_cal, 'accuracy': train_acc_lgb_cal},
        'test': {'log_loss': test_logloss_lgb_cal, 'brier': test_brier_lgb_cal, 'auc': test_auc_lgb_cal, 'accuracy': test_acc_lgb_cal}
    },
    'best_iteration': int(lgb_model.best_iteration),
    'n_features': len(feature_names)
}

with open('/content/lightgbm_results.json', 'w') as f:
    json.dump(lgb_results, f, indent=2)

print(" Saved lightgbm_results.json")

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "=" * 80)
print("STEP 5 COMPLETE: Summary")
print("=" * 80)

print("""
 LightGBM model trained and calibrated successfully!

Accomplishment:
----------------------
1. Trained LightGBM with early stopping
2. Applied isotonic calibration for better probability estimates
3. Comprehensive evaluation on train and test sets
4. Feature importance analysis
5. Comparison with all baseline models

Key Results:
------------
""")
print(f"  Best Model: LightGBM Calibrated")
print(f"  Test Performance:")
print(f"    - Log Loss: {test_logloss_lgb_cal:.4f}")
print(f"    - Brier Score: {test_brier_lgb_cal:.4f}")
print(f"    - ROC AUC: {test_auc_lgb_cal:.4f}")
print(f"    - Accuracy: {test_acc_lgb_cal:.4f}")
print(f"  Improvement over best baseline: {improvement_cal:+.2f}%")

print("""
Files Saved:
------------
  - lightgbm_model.txt (trained model)
  - isotonic_calibrator.pkl (isotonic regression calibrator)
  - lgbm_uncal_train_pred.npy, lgbm_uncal_test_pred.npy
  - lgbm_cal_train_pred.npy, lgbm_cal_test_pred.npy
  - feature_importance.csv
  - lightgbm_results.json
"""
)

print("=" * 80)
print("END OF STEP 5")
print("=" * 80)

STEP 5: MAIN MODEL TRAINING (LightGBM)

STEP 5.1: Loading Prepared Data
 Loaded feature matrices
  X_train: (7200, 39)
  X_test: (2800, 39)
  y_train: (7200,)
  y_test: (2800,)

 Loaded feature names (39 features)
 Loaded baseline results for comparison

STEP 5.2: Creating Validation Split for Calibration
Train/Validation Split:
  Training: 5,800 rounds
  Validation: 1,400 rounds
  Test: 2,800 rounds

STEP 5.3: Training LightGBM Model

LightGBM Hyperparameters:
  objective: binary
  metric: binary_logloss
  boosting_type: gbdt
  num_leaves: 31
  max_depth: -1
  learning_rate: 0.05
  feature_fraction: 0.9
  bagging_fraction: 0.8
  bagging_freq: 5
  verbose: -1
  random_state: 42
  n_jobs: -1

Creating LightGBM datasets...
 Datasets created

Training LightGBM with early stopping...
(This may take 1-2 minutes...)

 Training complete!
  Best iteration: 33
  Training log loss: 0.6271
  Validation log loss: 0.6674

STEP 5.4: Evaluating Uncalibrated LightGBM

Prediction statistics (uncalibrat

**CS2 Match Prediction System - Step 6: Comprehensive Evaluation & Calibration Analysis**

**This script**:
1. Loads all model predictions
2. Performs stratified analysis (by map, side, round type)
3. Generates calibration curves and reliability diagrams
4. Calculates Expected Calibration Error (ECE)
5. Creates comprehensive performance report
6. Generates visualizations


In [15]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, brier_score_loss, roc_auc_score, accuracy_score
import json
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print("STEP 6: COMPREHENSIVE EVALUATION & CALIBRATION ANALYSIS")
print("=" * 80)

# ============================================================================
# SECTION 1: Load All Data and Predictions
# ============================================================================
print("\n" + "=" * 80)
print("STEP 6.1: Loading All Data and Predictions")
print("=" * 80)

# Load test data
df_test = pd.read_csv('/content/test_data.csv')
y_test = np.load('/content/y_test.npy', allow_pickle=True)

print(f" Loaded test data: {len(df_test):,} rounds")

# Load all predictions
predictions = {
    'Baseline A': np.load('/content/baseline_a_test_pred.npy', allow_pickle=True),
    'Baseline B': np.load('/content/baseline_b_test_pred.npy', allow_pickle=True),
    'Baseline B+': np.load('/content/baseline_b_plus_test_pred.npy', allow_pickle=True),
    'LightGBM': np.load('/content/lgbm_uncal_test_pred.npy', allow_pickle=True),
    'LightGBM Cal': np.load('/content/lgbm_cal_test_pred.npy', allow_pickle=True)
}

print(f" Loaded predictions for {len(predictions)} models")

# ============================================================================
# SECTION 2: Calculate Expected Calibration Error (ECE)
# ============================================================================
print("\n" + "=" * 80)
print("STEP 6.2: Calculating Expected Calibration Error (ECE)")
print("=" * 80)

def calculate_ece(y_true, y_pred, n_bins=15):
    """
    Calculate Expected Calibration Error using quantile bins
    """
    # Create bins based on prediction quantiles
    bin_edges = np.percentile(y_pred, np.linspace(0, 100, n_bins + 1))
    bin_edges = np.unique(bin_edges)  # Remove duplicates

    ece = 0.0
    bin_info = []

    for i in range(len(bin_edges) - 1):
        # Get samples in this bin
        mask = (y_pred >= bin_edges[i]) & (y_pred < bin_edges[i+1])
        if i == len(bin_edges) - 2:  # Last bin includes right edge
            mask = (y_pred >= bin_edges[i]) & (y_pred <= bin_edges[i+1])

        if mask.sum() == 0:
            continue

        bin_preds = y_pred[mask]
        bin_true = y_true[mask]

        # Calculate metrics for this bin
        conf = bin_preds.mean()  # Confidence (mean predicted prob)
        acc = bin_true.mean()    # Accuracy (true win rate)
        count = mask.sum()

        # Add to ECE
        ece += (count / len(y_pred)) * abs(conf - acc)

        bin_info.append({
            'bin_start': bin_edges[i],
            'bin_end': bin_edges[i+1],
            'confidence': conf,
            'accuracy': acc,
            'count': count,
            'gap': abs(conf - acc)
        })

    return ece, bin_info

print(f"Calculating ECE with {15} quantile bins...\n")

ece_results = {}
for model_name, y_pred in predictions.items():
    ece, bin_info = calculate_ece(y_test, y_pred, n_bins=15)
    ece_results[model_name] = {'ece': ece, 'bins': bin_info}
    print(f"{model_name:<20s} ECE: {ece:.4f}")

print("\n ECE calculated for all models")

# ============================================================================
# SECTION 3: Stratified Analysis - By Map
# ============================================================================
print("\n" + "=" * 80)
print("STEP 6.3: Stratified Analysis - By Map")
print("=" * 80)

maps = df_test['map'].unique()
print(f"\nAnalyzing performance across {len(maps)} maps:\n")

map_results = {model: {} for model in predictions.keys()}

for map_name in sorted(maps):
    mask = df_test['map'] == map_name
    y_test_map = y_test[mask]

    if len(y_test_map) < 50:  # Skip if too few samples
        continue

    print(f"{map_name}:")
    print(f"  Samples: {len(y_test_map)}")
    print(f"  Win rate: {y_test_map.mean():.3f}")

    for model_name, y_pred in predictions.items():
        y_pred_map = y_pred[mask]
        logloss = log_loss(y_test_map, y_pred_map)
        auc = roc_auc_score(y_test_map, y_pred_map)

        map_results[model_name][map_name] = {
            'log_loss': logloss,
            'auc': auc,
            'n_samples': len(y_test_map)
        }

        print(f"    {model_name:<20s} Log Loss: {logloss:.4f}, AUC: {auc:.4f}")
    print()

# ============================================================================
# SECTION 4: Stratified Analysis - By Side
# ============================================================================
print("\n" + "=" * 80)
print("STEP 6.4: Stratified Analysis - By Side")
print("=" * 80)

sides = ['CT', 'T']
print(f"\nAnalyzing performance by side:\n")

side_results = {model: {} for model in predictions.keys()}

for side in sides:
    mask = df_test['side'] == side
    y_test_side = y_test[mask]

    print(f"{side} Side:")
    print(f"  Samples: {len(y_test_side)}")
    print(f"  Win rate: {y_test_side.mean():.3f}")

    for model_name, y_pred in predictions.items():
        y_pred_side = y_pred[mask]
        logloss = log_loss(y_test_side, y_pred_side)
        auc = roc_auc_score(y_test_side, y_pred_side)

        side_results[model_name][side] = {
            'log_loss': logloss,
            'auc': auc,
            'n_samples': len(y_test_side)
        }

        print(f"    {model_name:<20s} Log Loss: {logloss:.4f}, AUC: {auc:.4f}")
    print()

# ============================================================================
# SECTION 5: Stratified Analysis - By Round Type
# ============================================================================
print("\n" + "=" * 80)
print("STEP 6.5: Stratified Analysis - By Round Type")
print("=" * 80)

print(f"\nAnalyzing performance by round type:\n")

round_type_results = {model: {} for model in predictions.keys()}

# Pistol rounds
mask_pistol = df_test['is_pistol'] == 1
if mask_pistol.sum() > 20:
    print(f"Pistol Rounds:")
    print(f"  Samples: {mask_pistol.sum()}")
    print(f"  Win rate: {y_test[mask_pistol].mean():.3f}")

    for model_name, y_pred in predictions.items():
        logloss = log_loss(y_test[mask_pistol], y_pred[mask_pistol])
        auc = roc_auc_score(y_test[mask_pistol], y_pred[mask_pistol])

        round_type_results[model_name]['pistol'] = {
            'log_loss': logloss,
            'auc': auc,
            'n_samples': mask_pistol.sum()
        }

        print(f"    {model_name:<20s} Log Loss: {logloss:.4f}, AUC: {auc:.4f}")
    print()

# Full buy rounds (high equipment value)
equip_threshold = df_test['equip_value'].quantile(0.75)
mask_full = df_test['equip_value'] >= equip_threshold
if mask_full.sum() > 20:
    print(f"Full Buy Rounds (equip >= {equip_threshold:.0f}):")
    print(f"  Samples: {mask_full.sum()}")
    print(f"  Win rate: {y_test[mask_full].mean():.3f}")

    for model_name, y_pred in predictions.items():
        logloss = log_loss(y_test[mask_full], y_pred[mask_full])
        auc = roc_auc_score(y_test[mask_full], y_pred[mask_full])

        round_type_results[model_name]['full'] = {
            'log_loss': logloss,
            'auc': auc,
            'n_samples': mask_full.sum()
        }

        print(f"    {model_name:<20s} Log Loss: {logloss:.4f}, AUC: {auc:.4f}")
    print()

# Eco/Force rounds (low equipment value)
mask_eco = df_test['equip_value'] <= df_test['equip_value'].quantile(0.25)
if mask_eco.sum() > 20:
    print(f"Eco/Force Rounds (equip <= {df_test['equip_value'].quantile(0.25):.0f}):")
    print(f"  Samples: {mask_eco.sum()}")
    print(f"  Win rate: {y_test[mask_eco].mean():.3f}")

    for model_name, y_pred in predictions.items():
        logloss = log_loss(y_test[mask_eco], y_pred[mask_eco])
        auc = roc_auc_score(y_test[mask_eco], y_pred[mask_eco])

        round_type_results[model_name]['eco'] = {
            'log_loss': logloss,
            'auc': auc,
            'n_samples': mask_eco.sum()
        }

        print(f"    {model_name:<20s} Log Loss: {logloss:.4f}, AUC: {auc:.4f}")
    print()

# ============================================================================
# SECTION 6: Generate Calibration Plots
# ============================================================================
print("\n" + "=" * 80)
print("STEP 6.6: Generating Calibration Plots")
print("=" * 80)

print("\nGenerating reliability diagrams...")

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for idx, (model_name, y_pred) in enumerate(predictions.items()):
    ax = axes[idx]

    # Get bin info
    ece, bin_info = calculate_ece(y_test, y_pred, n_bins=15)

    # Extract data for plotting
    confidences = [b['confidence'] for b in bin_info]
    accuracies = [b['accuracy'] for b in bin_info]
    counts = [b['count'] for b in bin_info]

    # Plot calibration curve
    ax.plot([0, 1], [0, 1], 'k--', label='Perfect calibration', alpha=0.5)
    scatter = ax.scatter(confidences, accuracies, s=np.array(counts)/5,
                        alpha=0.6, c=range(len(confidences)), cmap='viridis')
    ax.plot(confidences, accuracies, 'b-', alpha=0.3, linewidth=2)

    ax.set_xlabel('Predicted Probability (Confidence)', fontsize=10)
    ax.set_ylabel('Actual Win Rate (Accuracy)', fontsize=10)
    ax.set_title(f'{model_name}\nECE: {ece:.4f}', fontsize=11, fontweight='bold')
    ax.grid(True, alpha=0.3)
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.legend(fontsize=8)

# Hide the 6th subplot if we only have 5 models
if len(predictions) < 6:
    axes[5].axis('off')

plt.tight_layout()
plt.savefig('/content/calibration_curves.png', dpi=150, bbox_inches='tight')
print(" Saved calibration_curves.png")
plt.close()

# ============================================================================
# SECTION 7: Save Complete Results
# ============================================================================
print("\n" + "=" * 80)
print("STEP 6.7: Saving Complete Results")
print("=" * 80)

# Compile all results (convert numpy types to native Python)
def convert_to_native(obj):
    """Convert numpy types to native Python types for JSON serialization"""
    if isinstance(obj, dict):
        return {k: convert_to_native(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [convert_to_native(item) for item in obj]
    elif isinstance(obj, (np.int64, np.int32)):
        return int(obj)
    elif isinstance(obj, (np.float64, np.float32)):
        return float(obj)
    else:
        return obj

complete_results = {
    'overall_ece': {model: float(ece_results[model]['ece']) for model in predictions.keys()},
    'stratified_by_map': convert_to_native(map_results),
    'stratified_by_side': convert_to_native(side_results),
    'stratified_by_round_type': convert_to_native(round_type_results)
}

with open('/content/evaluation_results.json', 'w') as f:
    json.dump(complete_results, f, indent=2)

print(" Saved evaluation_results.json")

# Create summary report
print("\n" + "=" * 80)
print("CREATING SUMMARY REPORT")
print("=" * 80)

report = []
report.append("=" * 80)
report.append("CS2 MATCH PREDICTION SYSTEM - FINAL REPORT")
report.append("=" * 80)
report.append("")
report.append("Project Objectives:")
report.append("-------------------")
report.append(" Extract freeze-time features from CS2 demos")
report.append(" Build Elo rating system with event-based freezing")
report.append(" Train baseline models (Map+Side, Logistic Regression)")
report.append(" Train main model (LightGBM with calibration)")
report.append(" Evaluate with proper cross-validation")
report.append(" Generate calibration analysis")
report.append("")
report.append("Dataset Summary:")
report.append("----------------")
report.append(f"Total rounds: 10,000")
report.append(f"Training rounds: 7,200 (72%)")
report.append(f"Test rounds: 2,800 (28%)")
report.append(f"Matches: 101")
report.append(f"Teams: 34")
report.append(f"Maps: 7")
report.append(f"Events: 7")
report.append("")
report.append("Model Performance (Test Set):")
report.append("------------------------------")

# Load all results
with open('/content/baseline_results.json', 'r') as f:
    baseline_results = json.load(f)
with open('/content/lightgbm_results.json', 'r') as f:
    lgbm_results = json.load(f)

models_summary = [
    ('Baseline A (Map+Side)', baseline_results['baseline_a']['test']),
    ('Baseline B (No Elo)', baseline_results['baseline_b']['test']),
    ('Baseline B+ (With Elo)', baseline_results['baseline_b_plus']['test']),
    ('LightGBM (Uncalibrated)', lgbm_results['lightgbm_uncalibrated']['test']),
    ('LightGBM (Calibrated)', lgbm_results['lightgbm_calibrated']['test'])
]

report.append(f"{'Model':<30} {'Log Loss':<12} {'Brier':<12} {'AUC':<12} {'ECE':<12}")
report.append("-" * 78)

# Map model names to ECE keys
model_to_ece_key = {
    'Baseline A (Map+Side)': 'Baseline A',
    'Baseline B (No Elo)': 'Baseline B',
    'Baseline B+ (With Elo)': 'Baseline B+',
    'LightGBM (Uncalibrated)': 'LightGBM',
    'LightGBM (Calibrated)': 'LightGBM Cal'
}

for model_name, metrics in models_summary:
    ece_key = model_to_ece_key.get(model_name, model_name)
    ece = ece_results[ece_key]['ece']
    report.append(f"{model_name:<30} {metrics['log_loss']:<12.4f} {metrics['brier']:<12.4f} {metrics['auc']:<12.4f} {ece:<12.4f}")

report.append("")
report.append("Key Findings:")
report.append("-------------")
report.append(f" Best model: LightGBM Calibrated")
report.append(f" Test log loss: {lgbm_results['lightgbm_calibrated']['test']['log_loss']:.4f}")
report.append(f" Test AUC: {lgbm_results['lightgbm_calibrated']['test']['auc']:.4f}")
report.append(f" Calibration quality (ECE): {ece_results['LightGBM Cal']['ece']:.4f}")
report.append("")
report.append("Most Important Features:")
report.append("------------------------")

# Load feature importance
feature_importance = pd.read_csv('/content/feature_importance.csv')
top_features = feature_importance.head(10)

for idx, row in top_features.iterrows():
    report.append(f"  {idx+1:2d}. {row['feature']:<30s} {row['importance']:8.1f}")

report.append("")
report.append("Stratified Performance Insights:")
report.append("--------------------------------")
report.append("By Side:")
report.append(f"  CT-side win rate: {y_test[df_test['side']=='CT'].mean():.3f}")
report.append(f"  T-side win rate: {y_test[df_test['side']=='T'].mean():.3f}")
report.append("")
report.append("By Map (Top 3 by samples):")
map_counts = df_test['map'].value_counts()
for map_name in map_counts.head(3).index:
    mask = df_test['map'] == map_name
    report.append(f"  {map_name}: Win rate {y_test[mask].mean():.3f}, {mask.sum()} rounds")

report.append("")
report.append("")
report.append("=" * 80)
report.append("END OF REPORT")
report.append("=" * 80)

# Print and save report
report_text = "\n".join(report)
print("\n" + report_text)

with open('/content/FINAL_REPORT.txt', 'w') as f:
    f.write(report_text)

print(f"\n Saved FINAL_REPORT.txt")

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "=" * 80)
print("STEP 6 COMPLETE: PROJECT FINISHED!")
print("=" * 80)

print("""
 Congratulations! The CS2 Match Prediction System is complete!

""")

print("=" * 80)
print("END OF STEP 6 - PROJECT COMPLETE!")
print("=" * 80)

STEP 6: COMPREHENSIVE EVALUATION & CALIBRATION ANALYSIS

STEP 6.1: Loading All Data and Predictions
 Loaded test data: 2,800 rounds
 Loaded predictions for 5 models

STEP 6.2: Calculating Expected Calibration Error (ECE)
Calculating ECE with 15 quantile bins...

Baseline A           ECE: 0.0315
Baseline B           ECE: 0.0501
Baseline B+          ECE: 0.0384
LightGBM             ECE: 0.0250
LightGBM Cal         ECE: 0.0182

 ECE calculated for all models

STEP 6.3: Stratified Analysis - By Map

Analyzing performance across 7 maps:

de_ancient:
  Samples: 300
  Win rate: 0.527
    Baseline A           Log Loss: 0.6468, AUC: 0.6538
    Baseline B           Log Loss: 0.6633, AUC: 0.6226
    Baseline B+          Log Loss: 0.6622, AUC: 0.6129
    LightGBM             Log Loss: 0.6596, AUC: 0.6486
    LightGBM Cal         Log Loss: 0.6549, AUC: 0.6549

de_dust2:
  Samples: 500
  Win rate: 0.570
    Baseline A           Log Loss: 0.6716, AUC: 0.5796
    Baseline B           Log Loss: 0.6804,