In [47]:
pip install openpyxl


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m26.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [48]:
import pandas as pd
import pickle

# Load matchups (16 games to predict)
matchups = pd.read_excel("../data/raw/WHSDSC_Rnd1_matchups.xlsx")

# Load team stats (your aggregated team features)
team_stats = pd.read_csv("../data/processed/team_stats.csv")

# Load model artifacts
model = pickle.load(open("../outputs/baseline_model.pkl", "rb"))
scaler = pickle.load(open("../outputs/baseline_scaler.pkl", "rb"))

print(f"Matchups: {matchups.shape}")
print(f"Team stats: {team_stats.shape}")
print(f"Model loaded: {type(model)}")


Matchups: (16, 4)
Team stats: (32, 8)
Model loaded: <class 'sklearn.linear_model._logistic.LogisticRegression'>


In [49]:
# See what columns model_input.csv had
model_input = pd.read_csv("../data/processed/model_input.csv")

# Get feature columns (exclude game_id, teams, target)
feature_cols = [c for c in model_input.columns 
                if c not in ['game_id', 'home_team', 'away_team', 'home_win', 'home_goals', 'away_goals']]

print(f"Model expects {len(feature_cols)} features:")
print(feature_cols[:10])  # Show first 10


Model expects 37 features:
['home_xg', 'away_xg', 'toi', 'home_shots', 'away_shots', 'home_assists', 'away_assists', 'home_penalties_committed', 'away_penalties_committed', 'home_games']


In [50]:
# Build complete team features from model_input
print("Building complete team features from model_input...")

# Get the EXACT 28 features the scaler expects
trained_features = list(scaler.feature_names_in_)
print(f"Model expects {len(trained_features)} features")

# Get ALL home/away columns EXCEPT team name columns
exclude = ['home_team', 'away_team', 'home_goals', 'away_goals', 'home_assists', 'away_assists', 'home_win']
all_home_cols = [c for c in model_input.columns if c.startswith('home_') and c not in exclude]
all_away_cols = [c for c in model_input.columns if c.startswith('away_') and c not in exclude]

print(f"Using {len(all_home_cols)} home columns and {len(all_away_cols)} away columns")

# Build team feature dictionary
team_features_dict = {}

for team in model_input['home_team'].unique():
    # Get all games where team played as home
    home_games = model_input[model_input['home_team'] == team]
    home_stats = home_games[all_home_cols].mean()
    
    # Get all games where team played as away
    away_games = model_input[model_input['away_team'] == team]
    away_stats = away_games[all_away_cols].mean()
    
    # Average home and away stats, remove prefix
    combined_stats = {}
    for col in all_home_cols:
        base_name = col.replace('home_', '')
        combined_stats[base_name] = home_stats[col]
    
    team_features_dict[team] = combined_stats

print(f"‚úÖ Built features for {len(team_features_dict)} teams")

# Generate predictions
predictions = []

print(f"\nGenerating predictions for 16 games...\n")

for _, row in matchups.iterrows():
    game_id = row['game_id']
    home = row['home_team'].lower().strip()
    away = row['away_team'].lower().strip()
    
    # Get features
    home_feats = team_features_dict.get(home, {})
    away_feats = team_features_dict.get(away, {})
    
    # Build feature row with ONLY the 28 trained features
    feature_dict = {}
    for col in trained_features:
        if col.startswith('home_'):
            base = col.replace('home_', '')
            feature_dict[col] = home_feats.get(base, 0)
        elif col.startswith('away_'):
            base = col.replace('away_', '')
            feature_dict[col] = away_feats.get(base, 0)
    
    # Create dataframe with exact feature order
    X = pd.DataFrame([feature_dict])[trained_features]
    
    # Scale and predict
    X_scaled = scaler.transform(X.fillna(0))
    prob = model.predict_proba(X_scaled)[0, 1]
    
    predictions.append({
        'game_id': game_id,
        'home_win_prob': prob
    })
    
    print(f"  {game_id}: {home:12} vs {away:12} ‚Üí Home win: {prob:.1%}")

# Save
submission = pd.DataFrame(predictions)
submission.to_csv("../outputs/round1_predictions.csv", index=False)

print(f"\n{'='*60}")
print(f"‚úÖ SAVED: outputs/round1_predictions.csv")
print(f"üìä Total predictions: {len(predictions)}")
print(f"{'='*60}")


Building complete team features from model_input...
Model expects 28 features
Using 17 home columns and 17 away columns
‚úÖ Built features for 32 teams

Generating predictions for 16 games...

  game_1: brazil       vs kazakhstan   ‚Üí Home win: 83.7%
  game_2: netherlands  vs mongolia     ‚Üí Home win: 74.1%
  game_3: peru         vs rwanda       ‚Üí Home win: 73.1%
  game_4: thailand     vs oman         ‚Üí Home win: 75.3%
  game_5: pakistan     vs germany      ‚Üí Home win: 70.9%
  game_6: india        vs usa          ‚Üí Home win: 76.3%
  game_7: panama       vs switzerland  ‚Üí Home win: 66.9%
  game_8: iceland      vs canada       ‚Üí Home win: 69.2%
  game_9: china        vs france       ‚Üí Home win: 65.1%
  game_10: philippines  vs morocco      ‚Üí Home win: 58.2%
  game_11: ethiopia     vs saudi_arabia ‚Üí Home win: 62.2%
  game_12: singapore    vs new_zealand  ‚Üí Home win: 50.9%
  game_13: guatemala    vs south_korea  ‚Üí Home win: 60.1%
  game_14: uk           vs mexico   

In [53]:
# Validation checks
sub = pd.read_csv("../outputs/round1_predictions.csv")

print("VALIDATION CHECKLIST:")
print("="*60)

# Check 1: Row count
assert sub.shape[0] == 16, f"‚ùå Need 16 rows, got {sub.shape[0]}"
print("‚úÖ Row count: 16")

# Check 2: Column count
assert sub.shape[1] == 2, f"‚ùå Need 2 columns, got {sub.shape[1]}"
print("‚úÖ Column count: 2")

# Check 3: Column names
assert list(sub.columns) == ['game_id', 'home_win_prob'], f"‚ùå Wrong columns: {sub.columns}"
print("‚úÖ Column names: game_id, home_win_prob")

# Check 4: Probability range
assert sub['home_win_prob'].between(0, 1).all(), "‚ùå Probabilities must be 0-1"
print("‚úÖ All probabilities between 0 and 1")

# Check 5: No missing values
assert sub['home_win_prob'].notna().all(), "‚ùå Missing probabilities"
print("‚úÖ No missing values")

# Check 6: Game IDs match
expected_ids = [f'game_{i}' for i in range(1, 17)]
assert set(sub['game_id']) == set(expected_ids), "‚ùå Game IDs don't match"
print("‚úÖ All 16 game IDs present")

print("="*60)
print("üéØ FILE READY FOR SUBMISSION")
print("\nPreview:")
print(sub.head(3))
print(f"\nFile location: outputs/round1_predictions.csv")


VALIDATION CHECKLIST:
‚úÖ Row count: 16
‚úÖ Column count: 2
‚úÖ Column names: game_id, home_win_prob
‚úÖ All probabilities between 0 and 1
‚úÖ No missing values
‚úÖ All 16 game IDs present
üéØ FILE READY FOR SUBMISSION

Preview:
  game_id  home_win_prob
0  game_1       0.837057
1  game_2       0.740722
2  game_3       0.730924

File location: outputs/round1_predictions.csv
