In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
import warnings


warnings.filterwarnings('ignore')

In [2]:

# ===== FEATURE WEIGHTS CONFIGURATION =====
# Adjust these weights to change the importance of different feature groups
FEATURE_WEIGHTS = {
    'team_id': 1.0,             # Team identifier importance
    'overall_performance': 1.0,  # PPG, win rates, etc.
    'goal_stats': 1.0,          # Goal-related statistics
    'home_away_stats': 1.0,     # Home/away specific performance
    'form': 1.0,                # Recent form (last 5 games)
    'h2h_stats': 1.0,           # Head-to-head statistics
    'game_stats': 1.0,          # Shots, corners, etc.
    'matchup_stats': 1.0,       # Attack vs defense, momentum
}

In [3]:
# =========================================

# Load the match data from excel file
df = pd.read_excel('match_data.xlsx', sheet_name='Match Stats')

# Data exploration
print("Total matches in dataset:", len(df))

# Convert team names to strings to avoid comparison errors
df['home_team'] = df['home_team'].astype(str)
df['away_team'] = df['away_team'].astype(str)
df = df[(df['home_team'] != '0') & (df['away_team'] != '0')]

# Standardize team names
df['home_team'] = df['home_team'].replace('Loyola Maryland', 'Loyola')
df['away_team'] = df['away_team'].replace('Loyola Maryland', 'Loyola')

# Create team encodings
teams = sorted(set(df['home_team'].unique()) | set(df['away_team'].unique()))
team_to_id = {team: i for i, team in enumerate(teams)}
df['home_team_id'] = df['home_team'].map(team_to_id)
df['away_team_id'] = df['away_team'].map(team_to_id)

print("Teams in dataset:", teams)
print("Total number of teams:", len(teams))
print("Bucknell matches:", len(df[(df['home_team'] == 'Bucknell') | (df['away_team'] == 'Bucknell')]))

Total matches in dataset: 59
Teams in dataset: ['American', 'Army West Point', 'BU', 'Bucknell', 'Colgate', 'Holy Cross', 'Lafayette', 'Lehigh', 'Loyola', 'Navy']
Total number of teams: 10
Bucknell matches: 13


In [4]:

# Ensure date column exists and is properly formatted
if 'Date' not in df.columns:
    # If date doesn't exist, create a placeholder with match sequence
    df['Date'] = pd.date_range(start='2023-01-01', periods=len(df), freq='D')
else:
    # Convert date format if needed
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Sort by date to ensure chronological order
df = df.sort_values('Date')

In [5]:
# Create result column - from perspective of team1 vs team2
# W = team1 wins, L = team1 loses, D = draw
df['Result'] = np.where(df['goals_home'] > df['goals_away'], 'W',
                        np.where(df['goals_home'] < df['goals_away'], 'L', 'D'))

# Compute goal difference
df['goal_diff'] = df['goals_home'] - df['goals_away']
df['total_goals'] = df['goals_home'] + df['goals_away']
df['home_shot_conversion'] = df['goals_home'] / df['shots_home'].replace(0, 1)  # Avoid division by zero
df['away_shot_conversion'] = df['goals_away'] / df['shots_away'].replace(0, 1)  # Avoid division by zero

In [6]:
# Create team statistics based on previous matches
team_stats = {}

# Initialize team stats
for team in teams:
    team_stats[team] = {
        'matches_played': 0,
        'wins': 0,
        'draws': 0,
        'losses': 0,
        'goals_for': 0,
        'goals_against': 0,
        'shots': 0,
        'shots_on_target': 0,
        'corners': 0,
        'fouls': 0,
        'home_wins': 0,
        'home_draws': 0,
        'home_losses': 0,
        'away_wins': 0,
        'away_draws': 0,
        'away_losses': 0,
        'home_matches': 0,
        'away_matches': 0,
        'home_goals_for': 0,
        'home_goals_against': 0,
        'away_goals_for': 0,
        'away_goals_against': 0,
        'last_5_results': []  # Store last 5 results for form
    }

In [7]:
# Create head-to-head records with proper string keys
h2h_stats = {}
for team1 in teams:
    team1_str = str(team1)  # Ensure string
    h2h_stats[team1_str] = {}
    for team2 in teams:
        team2_str = str(team2)  # Ensure string
        if team1_str != team2_str:
            h2h_stats[team1_str][team2_str] = {
                'matches': 0,
                'wins': 0,
                'draws': 0,
                'losses': 0,
                'goals_for': 0,
                'goals_against': 0
            }


In [8]:
# Calculate team stats from match data
for _, row in df.iterrows():
    home_team = str(row['home_team'])  # Ensure string
    away_team = str(row['away_team'])  # Ensure string

    # Update head-to-head records
    if home_team != away_team:  # Skip if same team (shouldn't happen in real data)
        # Safe access with error checking
        if home_team in h2h_stats and away_team in h2h_stats[home_team]:
            h2h_stats[home_team][away_team]['matches'] += 1

            if row['Result'] == 'W':  # Home team wins
                h2h_stats[home_team][away_team]['wins'] += 1
            elif row['Result'] == 'L':  # Away team wins
                h2h_stats[home_team][away_team]['losses'] += 1
            else:  # Draw
                h2h_stats[home_team][away_team]['draws'] += 1

            h2h_stats[home_team][away_team]['goals_for'] += row['goals_home']
            h2h_stats[home_team][away_team]['goals_against'] += row['goals_away']

        if away_team in h2h_stats and home_team in h2h_stats[away_team]:
            h2h_stats[away_team][home_team]['matches'] += 1

            if row['Result'] == 'W':  # Home team wins
                h2h_stats[away_team][home_team]['losses'] += 1
            elif row['Result'] == 'L':  # Away team wins
                h2h_stats[away_team][home_team]['wins'] += 1
            else:  # Draw
                h2h_stats[away_team][home_team]['draws'] += 1

            h2h_stats[away_team][home_team]['goals_for'] += row['goals_away']
            h2h_stats[away_team][home_team]['goals_against'] += row['goals_home']

    # Update home team stats
    team_stats[home_team]['matches_played'] += 1
    team_stats[home_team]['home_matches'] += 1

    if row['Result'] == 'W':
        team_stats[home_team]['wins'] += 1
        team_stats[home_team]['home_wins'] += 1
    elif row['Result'] == 'D':
        team_stats[home_team]['draws'] += 1
        team_stats[home_team]['home_draws'] += 1
    else:
        team_stats[home_team]['losses'] += 1
        team_stats[home_team]['home_losses'] += 1

    team_stats[home_team]['goals_for'] += row['goals_home']
    team_stats[home_team]['goals_against'] += row['goals_away']
    team_stats[home_team]['home_goals_for'] += row['goals_home']
    team_stats[home_team]['home_goals_against'] += row['goals_away']
    team_stats[home_team]['shots'] += row['shots_home']
    team_stats[home_team]['corners'] += row['corners_home']
    team_stats[home_team]['fouls'] += row['fouls_home']

    # Add result to last 5 results (from home team perspective)
    team_stats[home_team]['last_5_results'].append(row['Result'])
    if len(team_stats[home_team]['last_5_results']) > 5:
        team_stats[home_team]['last_5_results'] = team_stats[home_team]['last_5_results'][-5:]

    # Update away team stats
    team_stats[away_team]['matches_played'] += 1
    team_stats[away_team]['away_matches'] += 1

    if row['Result'] == 'L':
        team_stats[away_team]['wins'] += 1
        team_stats[away_team]['away_wins'] += 1
    elif row['Result'] == 'D':
        team_stats[away_team]['draws'] += 1
        team_stats[away_team]['away_draws'] += 1
    else:
        team_stats[away_team]['losses'] += 1
        team_stats[away_team]['away_losses'] += 1

    team_stats[away_team]['goals_for'] += row['goals_away']
    team_stats[away_team]['goals_against'] += row['goals_home']
    team_stats[away_team]['away_goals_for'] += row['goals_away']
    team_stats[away_team]['away_goals_against'] += row['goals_home']
    team_stats[away_team]['shots'] += row['shots_away']
    team_stats[away_team]['corners'] += row['corners_away']
    team_stats[away_team]['fouls'] += row['fouls_away']

    # Add result to last 5 results (from away team perspective)
    away_result = 'W' if row['Result'] == 'L' else 'L' if row['Result'] == 'W' else 'D'
    team_stats[away_team]['last_5_results'].append(away_result)
    if len(team_stats[away_team]['last_5_results']) > 5:
        team_stats[away_team]['last_5_results'] = team_stats[away_team]['last_5_results'][-5:]


In [9]:
# Create calculated stats
for team in team_stats:
    stats = team_stats[team]
    matches = stats['matches_played']
    home_matches = max(stats['home_matches'], 1)  # Avoid division by zero
    away_matches = max(stats['away_matches'], 1)  # Avoid division by zero

    if matches > 0:
        stats['points'] = stats['wins'] * 3 + stats['draws']
        stats['ppg'] = stats['points'] / matches
        stats['win_rate'] = stats['wins'] / matches
        stats['draw_rate'] = stats['draws'] / matches
        stats['loss_rate'] = stats['losses'] / matches
        stats['goal_diff'] = stats['goals_for'] - stats['goals_against']
        stats['avg_goals_for'] = stats['goals_for'] / matches
        stats['avg_goals_against'] = stats['goals_against'] / matches
        stats['avg_shots'] = stats['shots'] / matches
        stats['avg_corners'] = stats['corners'] / matches
        stats['avg_fouls'] = stats['fouls'] / matches

        # Home-away specific stats
        stats['home_ppg'] = (stats['home_wins'] * 3 + stats['home_draws']) / home_matches
        stats['away_ppg'] = (stats['away_wins'] * 3 + stats['away_draws']) / away_matches
        stats['home_win_rate'] = stats['home_wins'] / home_matches
        stats['away_win_rate'] = stats['away_wins'] / away_matches
        stats['home_avg_goals_for'] = stats['home_goals_for'] / home_matches
        stats['home_avg_goals_against'] = stats['home_goals_against'] / home_matches
        stats['away_avg_goals_for'] = stats['away_goals_for'] / away_matches
        stats['away_avg_goals_against'] = stats['away_goals_against'] / away_matches

        # Form calculation (points from last 5 games)
        recent_results = stats['last_5_results']
        form_points = sum([3 if res == 'W' else 1 if res == 'D' else 0 for res in recent_results])
        stats['form'] = form_points / (len(recent_results) * 3)  # As percentage of maximum possible
    else:
        # Default values for teams with no matches
        stats['points'] = 0
        stats['ppg'] = 0
        stats['win_rate'] = 0
        stats['draw_rate'] = 0
        stats['loss_rate'] = 0
        stats['goal_diff'] = 0
        stats['avg_goals_for'] = 0
        stats['avg_goals_against'] = 0
        stats['avg_shots'] = 0
        stats['avg_corners'] = 0
        stats['avg_fouls'] = 0
        stats['home_ppg'] = 0
        stats['away_ppg'] = 0
        stats['home_win_rate'] = 0
        stats['away_win_rate'] = 0
        stats['home_avg_goals_for'] = 0
        stats['home_avg_goals_against'] = 0
        stats['away_avg_goals_for'] = 0
        stats['away_avg_goals_against'] = 0
        stats['form'] = 0.5  # Neutral form for teams with no data


In [10]:
# Create team statistics dataframe
team_stats_df = pd.DataFrame([
    {
        'Team': team,
        'Matches': stats['matches_played'],
        'Wins': stats['wins'],
        'Draws': stats['draws'],
        'Losses': stats['losses'],
        'GF': stats['goals_for'],
        'GA': stats['goals_against'],
        'GD': stats['goal_diff'],
        'Points': stats['points'],
        'PPG': stats['ppg'],
        'Win%': stats['win_rate'] * 100,
        'Home PPG': stats['home_ppg'],
        'Away PPG': stats['away_ppg'],
        'Form': stats['form'] * 100
    }
    for team, stats in team_stats.items()
]).sort_values('Points', ascending=False)

print("\nTeam Performance Table:")
print(team_stats_df[['Team', 'Matches', 'Wins', 'Draws', 'Losses', 'Points', 'PPG', 'GF', 'GA', 'GD', 'Form']])



Team Performance Table:
              Team  Matches  Wins  Draws  Losses  Points       PPG  GF  GA  \
3         Bucknell       13    10      0       3      30  2.307692  18   5   
1  Army West Point       14     8      1       5      25  1.785714  21  10   
2               BU       12     6      6       0      24  2.000000  22  10   
8           Loyola       14     4      5       5      17  1.214286  22  17   
4          Colgate       13     4      4       5      16  1.230769  18  22   
9             Navy       14     4      3       7      15  1.071429  14  25   
6        Lafayette        9     2      4       3      10  1.111111   8   6   
0         American        9     2      2       5       8  0.888889  10  19   
5       Holy Cross        9     2      2       5       8  0.888889   6  12   
7           Lehigh        9     2      1       6       7  0.777778   8  21   

   GD       Form  
3  13  80.000000  
1  11  60.000000  
2  12  86.666667  
8   5  40.000000  
4  -4  33.333333  
9 

In [11]:
# Enhanced feature engineering with configurable weights
def create_feature_vector(row):
    home_team = str(row['home_team'])  # Ensure string
    away_team = str(row['away_team'])  # Ensure string

    home_stats = team_stats[home_team]
    away_stats = team_stats[away_team]

    # Get head-to-head stats if available
    # Use safe access with default values
    if home_team in h2h_stats and away_team in h2h_stats[home_team]:
        h2h = h2h_stats[home_team][away_team]

        # Calculate head-to-head strength (how well home team performs against away team)
        if h2h['matches'] > 0:
            h2h_win_rate = h2h['wins'] / h2h['matches']
            h2h_ppg = (h2h['wins'] * 3 + h2h['draws']) / h2h['matches']
            h2h_goal_diff = ((h2h['goals_for'])- h2h['goals_against']) / h2h['matches']

        else:
            # Default values if no h2h history
            h2h_win_rate = 0.5
            h2h_ppg = home_stats['ppg'] - away_stats['ppg'] + 0.5
            h2h_goal_diff = 0
    else:
        # Default values if h2h relationship not found
        h2h_win_rate = 0.5
        h2h_ppg = home_stats['ppg'] - away_stats['ppg'] + 0.5
        h2h_goal_diff = 0

    # Create feature groups with their corresponding weights

    # Group 1: Team IDs
    team_id_features = [
        row['home_team_id'],
        row['away_team_id']
    ]

    # Group 2: Overall team performance
    overall_performance_features = [
        home_stats['ppg'],
        away_stats['ppg'],
        home_stats['win_rate'],
        away_stats['win_rate'],
        home_stats['draw_rate'],
        away_stats['draw_rate']
    ]

    # Group 3: Goal stats
    goal_stats_features = [
        home_stats['avg_goals_for'],
        home_stats['avg_goals_against'],
        away_stats['avg_goals_for'],
        away_stats['avg_goals_against']
    ]

    # Group 4: Home/away specific stats
    home_away_stats_features = [
        home_stats['home_ppg'],
        away_stats['away_ppg'],
        home_stats['home_win_rate'],
        away_stats['away_win_rate'],
        home_stats['home_avg_goals_for'],
        home_stats['home_avg_goals_against'],
        away_stats['away_avg_goals_for'],
        away_stats['away_avg_goals_against']
    ]

    # Group 5: Recent form
    form_features = [
        home_stats['form'],
        away_stats['form']
    ]

    # Group 6: Head-to-head stats
    h2h_stats_features = [
        h2h_win_rate,
        h2h_ppg,
        h2h_goal_diff
    ]

    # Group 7: Game stats
    game_stats_features = [
        home_stats['avg_shots'],
        away_stats['avg_shots'],
        home_stats['avg_corners'],
        away_stats['avg_corners']
    ]

    # Group 8: Strategic matchup insights
    matchup_stats_features = [
        home_stats['avg_goals_for'] - away_stats['avg_goals_against'],  # Home attack vs Away defense
        away_stats['avg_goals_for'] - home_stats['avg_goals_against'],  # Away attack vs Home defense
        home_stats['goal_diff'] / max(home_stats['matches_played'], 1),
        away_stats['goal_diff'] / max(away_stats['matches_played'], 1)
    ]

    # Apply weights to each feature group
    team_id_features = [x * FEATURE_WEIGHTS['team_id'] for x in team_id_features]
    overall_performance_features = [x * FEATURE_WEIGHTS['overall_performance'] for x in overall_performance_features]
    goal_stats_features = [x * FEATURE_WEIGHTS['goal_stats'] for x in goal_stats_features]
    home_away_stats_features = [x * FEATURE_WEIGHTS['home_away_stats'] for x in home_away_stats_features]
    form_features = [x * FEATURE_WEIGHTS['form'] for x in form_features]
    h2h_stats_features = [x * FEATURE_WEIGHTS['h2h_stats'] for x in h2h_stats_features]
    game_stats_features = [x * FEATURE_WEIGHTS['game_stats'] for x in game_stats_features]
    matchup_stats_features = [x * FEATURE_WEIGHTS['matchup_stats'] for x in matchup_stats_features]

    # Combine all feature groups into one vector
    features = (
            team_id_features +
            overall_performance_features +
            goal_stats_features +
            home_away_stats_features +
            form_features +
            h2h_stats_features +
            game_stats_features +
            matchup_stats_features
    )

    return features

In [12]:
def print_feature_details():
    # Feature groups as defined in the create_feature_vector function
    feature_groups = {
        'Team IDs': [
            'Home Team ID',
            'Away Team ID'
        ],
        'Overall Performance': [
            'Home Team Points per Game (PPG)',
            'Away Team Points per Game (PPG)',
            'Home Team Win Rate',
            'Away Team Win Rate',
            'Home Team Draw Rate',
            'Away Team Draw Rate'
        ],
        'Goal Stats': [
            'Home Team Avg Goals For',
            'Home Team Avg Goals Against',
            'Away Team Avg Goals For',
            'Away Team Avg Goals Against'
        ],
        'Home/Away Specific Stats': [
            'Home Team Home PPG',
            'Away Team Away PPG',
            'Home Team Home Win Rate',
            'Away Team Away Win Rate',
            'Home Team Home Avg Goals For',
            'Home Team Home Avg Goals Against',
            'Away Team Away Avg Goals For',
            'Away Team Away Avg Goals Against'
        ],
        'Recent Form': [
            'Home Team Recent Form',
            'Away Team Recent Form'
        ],
        'Head-to-Head Stats': [
            'Head-to-Head Win Rate',
            'Head-to-Head Points per Game',
            'Head-to-Head Goal Difference'
        ],
        'Game Stats': [
            'Home Team Avg Shots',
            'Away Team Avg Shots',
            'Home Team Avg Corners',
            'Away Team Avg Corners'
        ],
        'Strategic Matchup Insights': [
            'Home Attack vs Away Defense',
            'Away Attack vs Home Defense',
            'Home Team Goal Difference Ratio',
            'Away Team Goal Difference Ratio'
        ]
    }

    # Assuming FEATURE_WEIGHTS is defined globally or passed in
    print("=== Feature Group Weights ===")
    total_features = 0
    for group_name, features in feature_groups.items():
        group_weight = FEATURE_WEIGHTS.get(group_name.lower().replace(' ', '_'), 0)
        print(f"\n{group_name} (Weight: {group_weight}):")
        for feature in features:
            print(f"  - {feature}")
            total_features += 1

    print(f"\nTotal Number of Features: {total_features}")

    # Optionally, print out the raw weights dictionary
    print("\n=== Raw Feature Weights ===")
    for key, value in FEATURE_WEIGHTS.items():
        print(f"{key}: {value}")


# Call the function to print feature details
print_feature_details()


=== Feature Group Weights ===

Team IDs (Weight: 0):
  - Home Team ID
  - Away Team ID

Overall Performance (Weight: 1.0):
  - Home Team Points per Game (PPG)
  - Away Team Points per Game (PPG)
  - Home Team Win Rate
  - Away Team Win Rate
  - Home Team Draw Rate
  - Away Team Draw Rate

Goal Stats (Weight: 1.0):
  - Home Team Avg Goals For
  - Home Team Avg Goals Against
  - Away Team Avg Goals For
  - Away Team Avg Goals Against

Home/Away Specific Stats (Weight: 0):
  - Home Team Home PPG
  - Away Team Away PPG
  - Home Team Home Win Rate
  - Away Team Away Win Rate
  - Home Team Home Avg Goals For
  - Home Team Home Avg Goals Against
  - Away Team Away Avg Goals For
  - Away Team Away Avg Goals Against

Recent Form (Weight: 0):
  - Home Team Recent Form
  - Away Team Recent Form

Head-to-Head Stats (Weight: 0):
  - Head-to-Head Win Rate
  - Head-to-Head Points per Game
  - Head-to-Head Goal Difference

Game Stats (Weight: 1.0):
  - Home Team Avg Shots
  - Away Team Avg Shots
  - H

In [13]:
# Prepare train/test data
X = np.array([create_feature_vector(row) for _, row in df.iterrows()])
y = np.array([0 if r == 'W' else 1 if r == 'D' else 2 for r in df['Result']])

# List of feature names (for importance analysis)
feature_names = [
    # Team IDs
    'Home Team ID', 'Away Team ID',

    # Overall performance
    'Home PPG', 'Away PPG',
    'Home Win Rate', 'Away Win Rate',
    'Home Draw Rate', 'Away Draw Rate',

    # Goal stats
    'Home Avg Goals For', 'Home Avg Goals Against',
    'Away Avg Goals For', 'Away Avg Goals Against',

    # Home-away stats
    'Home PPG at Home', 'Away PPG Away',
    'Home Win Rate at Home', 'Away Win Rate Away',
    'Home Avg Goals For at Home', 'Home Avg Goals Against at Home',
    'Away Avg Goals For Away', 'Away Avg Goals Against Away',

    # Form
    'Home Recent Form', 'Away Recent Form',

    # H2H stats
    'H2H Home Win Rate', 'H2H Home PPG', 'H2H Goal Diff per Game',

    # Game stats
    'Home Avg Shots', 'Away Avg Shots',
    'Home Avg Corners', 'Away Avg Corners',

    # Matchup stats
    'Home Attack vs Away Defense', 'Away Attack vs Home Defense',
    'Home Goal Diff per Game', 'Away Goal Diff per Game'
]


In [14]:
# Train a Random Forest classifier with tuned parameters
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42
)
rf_model.fit(X, y)


In [15]:
# Evaluate the model with cross-validation
# Use TimeSeriesSplit for chronological data
tscv = TimeSeriesSplit(n_splits=5)
rf_cv_scores = cross_val_score(rf_model, X, y, cv=tscv)

print(f"\nRandom Forest Cross-validation accuracy: {np.mean(rf_cv_scores):.4f}")

# Use the Random Forest model for predictions
model = rf_model
print("Using Random Forest model for predictions")

# Display which features are being weighted more heavily
print("\nFeature Group Weights:")
for group, weight in FEATURE_WEIGHTS.items():
    print(f"{group}: {weight:.2f}")

# Feature importance
if hasattr(model, 'feature_importances_'):
    print("\nFeature Importance (Top 15):")
    feature_importance = sorted(zip(feature_names, model.feature_importances_),
                                key=lambda x: x[1], reverse=True)
    for i, (feature, importance) in enumerate(feature_importance[:15]):
        print(f"{i+1}. {feature}: {importance:.4f}")



Random Forest Cross-validation accuracy: 0.8222
Using Random Forest model for predictions

Feature Group Weights:
team_id: 1.00
overall_performance: 1.00
goal_stats: 1.00
home_away_stats: 1.00
form: 1.00
h2h_stats: 1.00
game_stats: 1.00
matchup_stats: 1.00

Feature Importance (Top 15):
1. H2H Goal Diff per Game: 0.2150
2. H2H Home PPG: 0.2107
3. H2H Home Win Rate: 0.1266
4. Away Attack vs Home Defense: 0.0390
5. Home Draw Rate: 0.0271
6. Away Draw Rate: 0.0237
7. Home Goal Diff per Game: 0.0216
8. Home Avg Goals Against at Home: 0.0203
9. Home PPG: 0.0200
10. Home Avg Goals Against: 0.0197
11. Home Attack vs Away Defense: 0.0194
12. Away Win Rate Away: 0.0190
13. Home Avg Shots: 0.0187
14. Away Team ID: 0.0179
15. Home PPG at Home: 0.0175


In [16]:
# Make predictions for Bucknell against all other teams
bucknell_predictions = []

# Get all other teams
other_teams = [team for team in teams if str(team) != 'Bucknell']

# For each opponent, predict Bucknell as both home and away
for opponent in other_teams:
    opponent_str = str(opponent)  # Ensure string

    # Bucknell as home team
    home_features = create_feature_vector({
        'home_team': 'Bucknell',
        'away_team': opponent_str,
        'home_team_id': team_to_id['Bucknell'],
        'away_team_id': team_to_id[opponent_str]
    })

    # Bucknell as away team
    away_features = create_feature_vector({
        'home_team': opponent_str,
        'away_team': 'Bucknell',
        'home_team_id': team_to_id[opponent_str],
        'away_team_id': team_to_id['Bucknell']
    })

    # Predict both scenarios
    home_pred = model.predict([home_features])[0]
    away_pred = model.predict([away_features])[0]

    # Get probability estimates
    home_probs = model.predict_proba([home_features])[0]
    away_probs = model.predict_proba([away_features])[0]

    # Convert to match results from Bucknell's perspective
    bucknell_home_result = 'Win' if home_pred == 0 else 'Draw' if home_pred == 1 else 'Loss'
    bucknell_away_result = 'Win' if away_pred == 2 else 'Draw' if away_pred == 1 else 'Loss'

    # Add predictions
    bucknell_predictions.append({
        'Opponent': opponent_str,
        'Bucknell at Home': bucknell_home_result,
        'Win Prob (Home)': round(home_probs[0] * 100, 1),
        'Draw Prob (Home)': round(home_probs[1] * 100, 1),
        'Loss Prob (Home)': round(home_probs[2] * 100, 1),
        'Bucknell Away': bucknell_away_result,
        'Win Prob (Away)': round(away_probs[2] * 100, 1),
        'Draw Prob (Away)': round(away_probs[1] * 100, 1),
        'Loss Prob (Away)': round(away_probs[0] * 100, 1)
    })


In [17]:
# Create predictions dataframe
bucknell_pred_df = pd.DataFrame(bucknell_predictions)

# Display Bucknell predictions
print("\nBucknell Match Predictions:")
print(bucknell_pred_df[['Opponent', 'Bucknell at Home', 'Bucknell Away']])

# Calculate expected points
home_points = sum([3 if r == 'Win' else 1 if r == 'Draw' else 0 for r in bucknell_pred_df['Bucknell at Home']])
away_points = sum([3 if r == 'Win' else 1 if r == 'Draw' else 0 for r in bucknell_pred_df['Bucknell Away']])
total_points = home_points + away_points
matches = len(bucknell_pred_df) * 2

print(f"\nExpected Points for Bucknell:")
print(f"Home Points: {home_points} from {len(bucknell_pred_df)} matches")
print(f"Away Points: {away_points} from {len(bucknell_pred_df)} matches")
print(f"Total Points: {total_points} from {matches} matches")
print(f"Expected PPG: {total_points / matches:.2f}")



Bucknell Match Predictions:
          Opponent Bucknell at Home Bucknell Away
0         American              Win           Win
1  Army West Point             Loss          Loss
2               BU             Loss          Loss
3          Colgate              Win           Win
4       Holy Cross              Win           Win
5        Lafayette              Win           Win
6           Lehigh              Win           Win
7           Loyola              Win           Win
8             Navy              Win           Win

Expected Points for Bucknell:
Home Points: 21 from 9 matches
Away Points: 21 from 9 matches
Total Points: 42 from 18 matches
Expected PPG: 2.33


In [18]:
# Detailed prediction breakdown
print("\nDetailed Bucknell Predictions:")
for _, row in bucknell_pred_df.iterrows():
    print(f"vs {row['Opponent']}:")
    print(f"  At Home: {row['Bucknell at Home']} (Win: {row['Win Prob (Home)']}%, Draw: {row['Draw Prob (Home)']}%, Loss: {row['Loss Prob (Home)']}%)")
    print(f"  Away: {row['Bucknell Away']} (Win: {row['Win Prob (Away)']}%, Draw: {row['Draw Prob (Away)']}%, Loss: {row['Loss Prob (Away)']}%)")



Detailed Bucknell Predictions:
vs American:
  At Home: Win (Win: 96.8%, Draw: 0.9%, Loss: 2.3%)
  Away: Win (Win: 95.1%, Draw: 4.0%, Loss: 0.9%)
vs Army West Point:
  At Home: Loss (Win: 10.8%, Draw: 11.0%, Loss: 78.2%)
  Away: Loss (Win: 9.0%, Draw: 5.8%, Loss: 85.2%)
vs BU:
  At Home: Loss (Win: 11.0%, Draw: 28.0%, Loss: 61.0%)
  Away: Loss (Win: 1.7%, Draw: 4.0%, Loss: 94.3%)
vs Colgate:
  At Home: Win (Win: 94.3%, Draw: 2.9%, Loss: 2.9%)
  Away: Win (Win: 88.5%, Draw: 7.1%, Loss: 4.4%)
vs Holy Cross:
  At Home: Win (Win: 92.3%, Draw: 2.7%, Loss: 5.0%)
  Away: Win (Win: 83.6%, Draw: 13.8%, Loss: 2.6%)
vs Lafayette:
  At Home: Win (Win: 86.5%, Draw: 9.4%, Loss: 4.1%)
  Away: Win (Win: 83.5%, Draw: 13.4%, Loss: 3.1%)
vs Lehigh:
  At Home: Win (Win: 94.7%, Draw: 3.5%, Loss: 1.8%)
  Away: Win (Win: 95.4%, Draw: 2.2%, Loss: 2.4%)
vs Loyola:
  At Home: Win (Win: 91.9%, Draw: 4.8%, Loss: 3.4%)
  Away: Win (Win: 67.7%, Draw: 16.7%, Loss: 15.6%)
vs Navy:
  At Home: Win (Win: 91.7%, Draw: 7.

In [19]:
print("\n===== HOW TO USE THIS MODEL =====")
print("To change feature weights, modify the FEATURE_WEIGHTS dictionary at the top of the file.")
print("Examples:")
print("  FEATURE_WEIGHTS['form'] = 3.0       # Triple the importance of recent form")
print("  FEATURE_WEIGHTS['h2h_stats'] = 2.0  # Double the importance of head-to-head stats")
print("  FEATURE_WEIGHTS['game_stats'] = 0.5 # Halve the importance of shots/corners")
print("\nAvailable feature groups to weight:")
for group in FEATURE_WEIGHTS:
    print(f"  - {group}")
print("\nAfter changing weights, rerun the model to see the impact on predictions.")


===== HOW TO USE THIS MODEL =====
To change feature weights, modify the FEATURE_WEIGHTS dictionary at the top of the file.
Examples:
  FEATURE_WEIGHTS['form'] = 3.0       # Triple the importance of recent form
  FEATURE_WEIGHTS['h2h_stats'] = 2.0  # Double the importance of head-to-head stats
  FEATURE_WEIGHTS['game_stats'] = 0.5 # Halve the importance of shots/corners

Available feature groups to weight:
  - team_id
  - overall_performance
  - goal_stats
  - home_away_stats
  - form
  - h2h_stats
  - game_stats
  - matchup_stats

After changing weights, rerun the model to see the impact on predictions.


In [20]:
# Example of weight configuration that emphasizes form and h2h stats
def example_weight_configuration():
    print("\n===== EXAMPLE: EMPHASIZING FORM AND H2H STATS =====")
    # Save original weights
    original_weights = FEATURE_WEIGHTS.copy()

    # New weights configuration
    FEATURE_WEIGHTS['form'] = 3.0
    FEATURE_WEIGHTS['h2h_stats'] = 2.5
    FEATURE_WEIGHTS['team_id'] = 0.5
    FEATURE_WEIGHTS['goal_stats'] = .1

    print("New weight configuration:")
    for group, weight in FEATURE_WEIGHTS.items():
        print(f"{group}: {weight:.2f}")

    # Regenerate features with new weights
    X_weighted = np.array([create_feature_vector(row) for _, row in df.iterrows()])

    # Train model with these weights
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_weighted, y)

    # Cross-validate
    cv_scores = cross_val_score(model, X_weighted, y, cv=tscv)
    accuracy = np.mean(cv_scores)

    print(f"\nModel accuracy with this configuration: {accuracy:.4f}")

    # Restore original weights
    for k, v in original_weights.items():
        FEATURE_WEIGHTS[k] = v

# Run example weight configuration if you'd like to see a demonstration
# example_weight_configuration()
example_weight_configuration()


===== EXAMPLE: EMPHASIZING FORM AND H2H STATS =====
New weight configuration:
team_id: 0.50
overall_performance: 1.00
goal_stats: 0.10
home_away_stats: 1.00
form: 3.00
h2h_stats: 2.50
game_stats: 1.00
matchup_stats: 1.00

Model accuracy with this configuration: 0.8444


In [21]:
print(sorted(zip(feature_names, rf_model.feature_importances_), key=lambda x: x[1], reverse=True))


[('H2H Goal Diff per Game', np.float64(0.21495819245064435)), ('H2H Home PPG', np.float64(0.2106891306807313)), ('H2H Home Win Rate', np.float64(0.12659998249593676)), ('Away Attack vs Home Defense', np.float64(0.0389817286298139)), ('Home Draw Rate', np.float64(0.027055120763096286)), ('Away Draw Rate', np.float64(0.02373334086917673)), ('Home Goal Diff per Game', np.float64(0.02161787162943292)), ('Home Avg Goals Against at Home', np.float64(0.020300323544539714)), ('Home PPG', np.float64(0.020035207709896564)), ('Home Avg Goals Against', np.float64(0.019723818659346727)), ('Home Attack vs Away Defense', np.float64(0.01940024032339894)), ('Away Win Rate Away', np.float64(0.019035858003979576)), ('Home Avg Shots', np.float64(0.018737365208162528)), ('Away Team ID', np.float64(0.01790775257773389)), ('Home PPG at Home', np.float64(0.01746770985248475)), ('Home Win Rate at Home', np.float64(0.016887230930329514)), ('Away Avg Goals Against', np.float64(0.016113444667302558)), ('Away Rece