In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

print("========== LOADING DATA ==========")
# Load the match data from excel file
df = pd.read_excel('match_data.xlsx', sheet_name='Match Stats')

# Data exploration
print("\n========== DATASET OVERVIEW ==========")
print("Total matches in dataset:", len(df))
print("Teams in dataset:", sorted(set(df['home_team'].unique()) | set(df['away_team'].unique())))
print("Bucknell matches:", len(df[(df['home_team'] == 'Bucknell') | (df['away_team'] == 'Bucknell')]))
print("=======================================")

# Data preprocessing
print("\n========== DATA PREPROCESSING ==========")
# Convert date format if needed (assuming Date is already a column)
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Create result column - from perspective of team1 vs team2
# W = team1 wins, L = team1 loses, D = draw
df['Result'] = np.where(df['goals_home'] > df['goals_away'], 'W',
                        np.where(df['goals_home'] < df['goals_away'], 'L', 'D'))

# Compute goal difference
df['goal_diff'] = df['goals_home'] - df['goals_away']
df['total_goals'] = df['goals_home'] + df['goals_away']
df['home_shot_conversion'] = df['goals_home'] / df['shots_home'].replace(0, 1)  # Avoid division by zero
df['away_shot_conversion'] = df['goals_away'] / df['shots_away'].replace(0, 1)  # Avoid division by zero
print("Preprocessing complete.")
print("========================================")

# Create team encodings
teams = sorted(set(df['home_team'].unique()) | set(df['away_team'].unique()))
team_to_id = {team: i for i, team in enumerate(teams)}
df['home_team_id'] = df['home_team'].map(team_to_id)
df['away_team_id'] = df['away_team'].map(team_to_id)

print("\n========== CALCULATING TEAM STATISTICS ==========")
# Create team statistics based on previous matches
team_stats = {}

# Initialize team stats
for team in teams:
    team_stats[team] = {
        'matches_played': 0,
        'wins': 0,
        'draws': 0,
        'losses': 0,
        'goals_for': 0,
        'goals_against': 0,
        'shots': 0,
        'shots_on_target': 0,
        'corners': 0,
        'fouls': 0
    }

# Calculate team stats from match data
for _, row in df.iterrows():
    home_team = row['home_team']
    away_team = row['away_team']

    # Update home team stats
    team_stats[home_team]['matches_played'] += 1
    if row['Result'] == 'W':
        team_stats[home_team]['wins'] += 1
    elif row['Result'] == 'D':
        team_stats[home_team]['draws'] += 1
    else:
        team_stats[home_team]['losses'] += 1

    team_stats[home_team]['goals_for'] += row['goals_home']
    team_stats[home_team]['goals_against'] += row['goals_away']
    team_stats[home_team]['shots'] += row['shots_home']
    team_stats[home_team]['corners'] += row['corners_home']
    team_stats[home_team]['fouls'] += row['fouls_home']

    # Update away team stats
    team_stats[away_team]['matches_played'] += 1
    if row['Result'] == 'L':
        team_stats[away_team]['wins'] += 1
    elif row['Result'] == 'D':
        team_stats[away_team]['draws'] += 1
    else:
        team_stats[away_team]['losses'] += 1

    team_stats[away_team]['goals_for'] += row['goals_away']
    team_stats[away_team]['goals_against'] += row['goals_home']
    team_stats[away_team]['shots'] += row['shots_away']
    team_stats[away_team]['corners'] += row['corners_away']
    team_stats[away_team]['fouls'] += row['fouls_away']

# Create calculated stats
for team in team_stats:
    stats = team_stats[team]
    matches = stats['matches_played']

    if matches > 0:
        stats['points'] = stats['wins'] * 3 + stats['draws']
        stats['ppg'] = stats['points'] / matches
        stats['goal_diff'] = stats['goals_for'] - stats['goals_against']
        stats['avg_goals_for'] = stats['goals_for'] / matches
        stats['avg_goals_against'] = stats['goals_against'] / matches
        stats['avg_shots'] = stats['shots'] / matches
        stats['avg_corners'] = stats['corners'] / matches
        stats['avg_fouls'] = stats['fouls'] / matches
    else:
        stats['points'] = 0
        stats['ppg'] = 0
        stats['goal_diff'] = 0
        stats['avg_goals_for'] = 0
        stats['avg_goals_against'] = 0
        stats['avg_shots'] = 0
        stats['avg_corners'] = 0
        stats['avg_fouls'] = 0
print("Team statistics calculated.")
print("=================================================")

# Create team statistics dataframe
team_stats_df = pd.DataFrame([
    {
        'Team': team,
        'Matches': stats['matches_played'],
        'Wins': stats['wins'],
        'Draws': stats['draws'],
        'Losses': stats['losses'],
        'GF': stats['goals_for'],
        'GA': stats['goals_against'],
        'GD': stats['goal_diff'],
        'Points': stats['points'],
        'PPG': stats['ppg'],
        'Avg GF': stats['avg_goals_for'],
        'Avg GA': stats['avg_goals_against']
    }
    for team, stats in team_stats.items()
]).sort_values('Points', ascending=False)

# Add team rank column
team_stats_df['Rank'] = range(1, len(team_stats_df) + 1)
team_stats_df = team_stats_df[['Rank', 'Team', 'Matches', 'Wins', 'Draws', 'Losses', 'Points', 'PPG', 'GF', 'GA', 'GD']]

print("\n========== TEAM STANDINGS TABLE ==========")
print(team_stats_df)
print("==========================================")

# Feature engineering for match prediction
print("\n========== TRAINING PREDICTION MODEL ==========")
def create_feature_vector(row):
    home_team = row['home_team']
    away_team = row['away_team']

    home_stats = team_stats[home_team]
    away_stats = team_stats[away_team]

    # Create feature vector with team performance metrics
    features = [
        row['home_team_id'],
        row['away_team_id'],
        home_stats['avg_goals_for'],
        home_stats['avg_goals_against'],
        away_stats['avg_goals_for'],
        away_stats['avg_goals_against'],
        home_stats['ppg'],
        away_stats['ppg'],
        home_stats['avg_shots'],
        away_stats['avg_shots'],
        home_stats['avg_corners'],
        away_stats['avg_corners']
    ]

    return features

# Prepare train/test data
X = np.array([create_feature_vector(row) for _, row in df.iterrows()])
y = np.array([0 if r == 'W' else 1 if r == 'D' else 2 for r in df['Result']])

# Train a Random Forest classifier
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    class_weight='balanced',
    random_state=42
)
model.fit(X, y)
print("Model training complete.")
print("==============================================")

# Evaluate the model with cross-validation
cv_scores = cross_val_score(model, X, y, cv=5)
print("\n========== MODEL ACCURACY ==========")
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f}")
print("====================================")

# Feature importance
feature_names = [
    'Home Team ID', 'Away Team ID',
    'Home Avg Goals For', 'Home Avg Goals Against',
    'Away Avg Goals For', 'Away Avg Goals Against',
    'Home PPG', 'Away PPG',
    'Home Avg Shots', 'Away Avg Shots',
    'Home Avg Corners', 'Away Avg Corners'
]

print("\n========== FEATURE IMPORTANCE RANKING ==========")
for feature, importance in sorted(zip(feature_names, model.feature_importances_),
                                  key=lambda x: x[1], reverse=True):
    print(f"{feature}: {importance:.4f}")
print("==============================================")

print("\n========== PREDICTING FUTURE MATCHES ==========")
# Make predictions for Bucknell against all other teams
bucknell_predictions = []

# Get all other teams
other_teams = [team for team in teams if team != 'Bucknell']
print(f"Predicting Bucknell matches against {len(other_teams)} opponents...")

# For each opponent, predict Bucknell as both home and away
for opponent in other_teams:
    # Bucknell as home team
    home_features = create_feature_vector({
        'home_team': 'Bucknell',
        'away_team': opponent,
        'home_team_id': team_to_id['Bucknell'],
        'away_team_id': team_to_id[opponent]
    })

    # Bucknell as away team
    away_features = create_feature_vector({
        'home_team': opponent,
        'away_team': 'Bucknell',
        'home_team_id': team_to_id[opponent],
        'away_team_id': team_to_id['Bucknell']
    })

    # Predict both scenarios
    home_pred = model.predict([home_features])[0]
    away_pred = model.predict([away_features])[0]

    # Get probability estimates
    home_probs = model.predict_proba([home_features])[0]
    away_probs = model.predict_proba([away_features])[0]

    # Convert to match results from Bucknell's perspective
    bucknell_home_result = 'Win' if home_pred == 0 else 'Draw' if home_pred == 1 else 'Loss'
    bucknell_away_result = 'Win' if away_pred == 2 else 'Draw' if away_pred == 1 else 'Loss'

    # Add predictions
    bucknell_predictions.append({
        'Opponent': opponent,
        'Bucknell at Home': bucknell_home_result,
        'Win Prob (Home)': round(home_probs[0] * 100, 1),
        'Draw Prob (Home)': round(home_probs[1] * 100, 1),
        'Loss Prob (Home)': round(home_probs[2] * 100, 1),
        'Bucknell Away': bucknell_away_result,
        'Win Prob (Away)': round(away_probs[2] * 100, 1),
        'Draw Prob (Away)': round(away_probs[1] * 100, 1),
        'Loss Prob (Away)': round(away_probs[0] * 100, 1)
    })
print("Predictions complete.")
print("==============================================")

# Create predictions dataframe
bucknell_pred_df = pd.DataFrame(bucknell_predictions)

print("\n========== BUCKNELL MATCH PREDICTIONS SUMMARY ==========")
print(bucknell_pred_df[['Opponent', 'Bucknell at Home', 'Bucknell Away']])
print("=====================================================")

# Calculate expected points
home_points = sum([3 if r == 'Win' else 1 if r == 'Draw' else 0 for r in bucknell_pred_df['Bucknell at Home']])
away_points = sum([3 if r == 'Win' else 1 if r == 'Draw' else 0 for r in bucknell_pred_df['Bucknell Away']])
total_points = home_points + away_points
matches = len(bucknell_pred_df) * 2

print("\n========== EXPECTED POINTS PROJECTION ==========")
print(f"Home Points: {home_points} from {len(bucknell_pred_df)} matches")
print(f"Away Points: {away_points} from {len(bucknell_pred_df)} matches")
print(f"Total Points: {total_points} from {matches} matches")
print(f"Expected PPG: {total_points / matches:.2f}")
print("==============================================")

# Add a summary of Bucknell's predicted final position
print("\n========== BUCKNELL SEASON PROJECTION ==========")
# Calculate projected points
total_matches_per_team = len(teams) - 1  # Each team plays against every other team
matches_already_played = team_stats['Bucknell']['matches_played']
remaining_matches = total_matches_per_team * 2 - matches_already_played  # Home and away

# Current points + projected future points
current_points = team_stats['Bucknell']['points']
future_points = total_points / (len(other_teams) * 2) * remaining_matches
projected_total = current_points + future_points

print(f"Current Bucknell Points: {current_points}")
print(f"Projected Additional Points: {future_points:.1f}")
print(f"Projected Final Points: {projected_total:.1f}")

# Find where Bucknell would rank with these points
team_points = [(team, stats['points']) for team, stats in team_stats.items()]
team_points.sort(key=lambda x: x[1], reverse=True)
current_rank = [i+1 for i, (team, _) in enumerate(team_points) if team == 'Bucknell'][0]

print(f"Current Rank: {current_rank} of {len(teams)}")
print("==============================================")

# Add a summary of win/loss/draw totals for Bucknell predictions
print("\n========== BUCKNELL PREDICTION SUMMARY ==========")
home_wins = sum(1 for result in bucknell_pred_df['Bucknell at Home'] if result == 'Win')
home_draws = sum(1 for result in bucknell_pred_df['Bucknell at Home'] if result == 'Draw')
home_losses = sum(1 for result in bucknell_pred_df['Bucknell at Home'] if result == 'Loss')

away_wins = sum(1 for result in bucknell_pred_df['Bucknell Away'] if result == 'Win')
away_draws = sum(1 for result in bucknell_pred_df['Bucknell Away'] if result == 'Draw')
away_losses = sum(1 for result in bucknell_pred_df['Bucknell Away'] if result == 'Loss')

total_wins = home_wins + away_wins
total_draws = home_draws + away_draws
total_losses = home_losses + away_losses

print(f"Predicted Home Record: {home_wins}-{home_draws}-{home_losses} (W-D-L)")
print(f"Predicted Away Record: {away_wins}-{away_draws}-{away_losses} (W-D-L)")
print(f"Predicted Overall Record: {total_wins}-{total_draws}-{total_losses} (W-D-L)")
print(f"Predicted Win Percentage: {total_wins / (total_wins + total_draws + total_losses):.1%}")
print("==============================================")

print("\n========== DETAILED MATCH-BY-MATCH PREDICTIONS ==========")
for _, row in bucknell_pred_df.iterrows():
    print(f"vs {row['Opponent']}:")
    print(f"  At Home: {row['Bucknell at Home']} (Win: {row['Win Prob (Home)']}%, Draw: {row['Draw Prob (Home)']}%, Loss: {row['Loss Prob (Home)']}%)")
    print(f"  Away: {row['Bucknell Away']} (Win: {row['Win Prob (Away)']}%, Draw: {row['Draw Prob (Away)']}%, Loss: {row['Loss Prob (Away)']}%)")
print("========================================================")


Total matches in dataset: 59
Teams in dataset: ['American', 'Army West Point', 'BU', 'Boston U.', 'Bucknell', 'Colgate', 'Holy Cross', 'Lafayette', 'Lehigh', 'Loyola', 'Loyola Maryland', 'Navy']
Bucknell matches: 13

Preprocessing complete.

Team statistics calculated.

    Rank             Team  Matches  Wins  Draws  Losses  Points       PPG  GF  \
4      1         Bucknell       13    10      0       3      30  2.307692  18   
1      2  Army West Point       14     8      1       5      25  1.785714  21   
10     3  Loyola Maryland       12     4      5       3      17  1.416667  22   
5      4          Colgate       14     4      4       6      16  1.142857  20   
11     5             Navy       14     4      3       7      15  1.071429  14   
2      6               BU        6     4      2       0      14  2.333333  11   
3      7        Boston U.        7     3      4       0      13  1.857143  14   
7      8        Lafayette        9     2      4       3      10  1.111111   8   