In [19]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')

In [21]:

import pandas as pd
import numpy as np

print("========== LOADING DATA ==========")
# Load the match data from the 'Match Stats' sheet of the Excel workbook
df = pd.read_excel('match_data.xlsx', sheet_name='Match Stats')

# Data cleaning
print("\n========== DATA CLEANING ==========")
team_columns = ['home_team', 'away_team']

# Remove rows whose team cells are missing or hold the numeric placeholder 0.
# The trailing .copy() makes the result an independent frame, so the column
# assignments below modify it directly instead of a slice of the loaded data.
df = df.dropna(subset=team_columns)
df = df[(df['home_team'] != 0) & (df['away_team'] != 0)].copy()

# Convert all team names to strings and strip whitespace
for column in team_columns:
    df[column] = df[column].astype(str).str.strip()

# A placeholder stored as text ("0", possibly padded with spaces) or a blank
# cell is not equal to the integer 0, so it passes the numeric filter above.
# Drop those rows now that names are normalised; otherwise "0" or "" would be
# treated as a team everywhere downstream.
df = df[~df[team_columns].isin(['0', '']).any(axis=1)].copy()
print("Data cleaning complete.")

# Dataset overview: match count plus the alphabetical list of every team that
# appears on either side of a fixture. `teams` is reused by later steps.
print("\n========== DATASET OVERVIEW ==========")
print("Total matches in dataset:", len(df))
home_names = set(df['home_team'])
away_names = set(df['away_team'])
teams = sorted(home_names.union(away_names))
print("Teams in dataset:", teams)
print(f"Total teams: {len(teams)}")
print("=======================================")

# Data preprocessing
print("\n========== DATA PREPROCESSING ==========")
# Parse match dates; values that cannot be parsed become NaT instead of raising
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# A match without both scores has no result. Comparisons against NaN are
# always False, so such a row would fall through to the draw label below and
# its NaN goals would propagate into every per-team total. Drop it instead.
has_score = df['goals_home'].notna() & df['goals_away'].notna()
if not has_score.all():
    print(f"Dropping {int((~has_score).sum())} matches with missing scores.")
df = df[has_score].copy()

# Result column, from the home side's perspective: W(in) / L(oss) / D(raw)
df['Result'] = np.select(
    [df['goals_home'] > df['goals_away'], df['goals_home'] < df['goals_away']],
    ['W', 'L'],
    default='D',
)

# Compute additional statistics
df['goal_diff'] = df['goals_home'] - df['goals_away']
df['total_goals'] = df['goals_home'] + df['goals_away']

# Goals per shot. Non-positive or missing shot counts are masked to NaN before
# dividing, so no division by zero is evaluated; those rows get efficiency 0.
home_shots = df['shots_home'].where(df['shots_home'] > 0)
away_shots = df['shots_away'].where(df['shots_away'] > 0)
df['home_shot_efficiency'] = (df['goals_home'] / home_shots).fillna(0)
df['away_shot_efficiency'] = (df['goals_away'] / away_shots).fillna(0)
print("Preprocessing complete.")
print("========================================")

# Integer code for each team: its position in the sorted `teams` list.
team_to_id = dict(zip(teams, range(len(teams))))
for side in ('home', 'away'):
    df[f'{side}_team_id'] = df[f'{side}_team'].map(team_to_id)

print("\n========== CALCULATING TEAM STATISTICS ==========")
# Per-team running totals, keyed by team name. Every counter starts at zero.
team_stats = {
    name: dict.fromkeys(
        ('matches_played', 'wins', 'draws', 'losses',
         'goals_for', 'goals_against', 'shots', 'corners', 'fouls'),
        0,
    )
    for name in teams
}

# Walk through every match once and credit both participants.
# 'Result' is expressed from the home side's point of view, so the code that
# counts as a win is 'W' for the home team and 'L' for the away team.
for _, fixture in df.iterrows():
    outcome = fixture['Result']
    for side, other_side, winning_code in (('home', 'away', 'W'), ('away', 'home', 'L')):
        record = team_stats[fixture[f'{side}_team']]

        record['matches_played'] += 1
        if outcome == winning_code:
            record['wins'] += 1
        elif outcome == 'D':
            record['draws'] += 1
        else:
            record['losses'] += 1

        record['goals_for'] += fixture[f'goals_{side}']
        record['goals_against'] += fixture[f'goals_{other_side}']
        record['shots'] += fixture[f'shots_{side}']
        record['corners'] += fixture[f'corners_{side}']
        record['fouls'] += fixture[f'fouls_{side}']
# Derive points, goal difference and per-match averages from the raw totals.
# A team with no recorded matches gets 0 for every derived value.
per_match_fields = (
    ('goals_for', 'avg_goals_for'),
    ('goals_against', 'avg_goals_against'),
    ('shots', 'avg_shots'),
    ('corners', 'avg_corners'),
    ('fouls', 'avg_fouls'),
    ('wins', 'win_rate'),
)
for record in team_stats.values():
    played = record['matches_played']
    has_played = played > 0

    # 3 points for a win, 1 for a draw
    record['points'] = record['wins'] * 3 + record['draws'] if has_played else 0
    record['ppg'] = record['points'] / played if has_played else 0
    record['goal_diff'] = record['goals_for'] - record['goals_against'] if has_played else 0
    for total_key, average_key in per_match_fields:
        record[average_key] = record[total_key] / played if has_played else 0
print("Team statistics calculated.")
print("=================================================")

# Create the current-standings table, one row per team.
# Only the columns that are displayed are built. Passing `columns=` explicitly
# keeps the frame well-formed (and sortable) even when there are no teams.
standings_columns = ['Team', 'Matches', 'Wins', 'Draws', 'Losses',
                     'Points', 'PPG', 'GF', 'GA', 'GD', 'Win Rate']
team_stats_df = pd.DataFrame(
    [
        {
            'Team': team,
            'Matches': record['matches_played'],
            'Wins': record['wins'],
            'Draws': record['draws'],
            'Losses': record['losses'],
            'Points': record['points'],
            'PPG': record['ppg'],
            'GF': record['goals_for'],
            'GA': record['goals_against'],
            'GD': record['goal_diff'],
            'Win Rate': record['win_rate'],
        }
        for team, record in team_stats.items()
    ],
    columns=standings_columns,
)

# Rank by points, breaking ties on goal difference and then goals scored.
# Sorting on 'Points' alone leaves the order of level teams to the sort
# algorithm, so their ranks would be arbitrary.
team_stats_df = team_stats_df.sort_values(['Points', 'GD', 'GF'], ascending=False)

# Add team rank column as the first column
team_stats_df.insert(0, 'Rank', range(1, len(team_stats_df) + 1))

print("\n========== CURRENT TEAM STANDINGS ==========")
print(team_stats_df)
print("============================================")

# Feature engineering for match prediction
print("\n========== TRAINING PREDICTION MODEL ==========")
def create_feature_vector(row):
    """Build the model input for one fixture.

    Parameters
    ----------
    row : mapping
        Must provide 'home_team', 'away_team', 'home_team_id' and
        'away_team_id' (a DataFrame row or a plain dict both work).

    Returns
    -------
    list
        14 values: the two team ids, then home/away goal averages
        (for, against), followed by home/away pairs of points per game,
        average shots, average corners and win rate. The per-team numbers
        are read from the module-level ``team_stats`` dictionary.
    """
    home = team_stats[row['home_team']]
    away = team_stats[row['away_team']]

    vector = [row['home_team_id'], row['away_team_id']]

    # Goal averages are grouped per team: home (for, against), then away.
    for side in (home, away):
        vector.append(side['avg_goals_for'])
        vector.append(side['avg_goals_against'])

    # Remaining metrics are interleaved: home value first, then away value.
    for metric in ('ppg', 'avg_shots', 'avg_corners', 'win_rate'):
        vector.append(home[metric])
        vector.append(away[metric])

    return vector

# Assemble the training matrix: one feature vector per historical match.
feature_rows = []
for _, fixture in df.iterrows():
    feature_rows.append(create_feature_vector(fixture))
X = np.array(feature_rows)

# Encode the target: home win -> 0, draw -> 1, anything else (away win) -> 2.
outcome_codes = {'W': 0, 'D': 1}
y = np.array([outcome_codes.get(result, 2) for result in df['Result']])

# Random Forest classifier; class_weight='balanced' reweights the outcome
# classes and the fixed random_state makes the fitted forest repeatable.
forest_params = dict(
    n_estimators=200,
    max_depth=None,
    min_samples_split=2,
    class_weight='balanced',
    random_state=42,
)
model = RandomForestClassifier(**forest_params)
model.fit(X, y)
print("Model training complete.")
print("==============================================")

# 5-fold cross-validated accuracy of the same estimator configuration.
# NOTE(review): the features come from team_stats, which is aggregated over
# every row of df, including the matches held out in each fold, so this
# score is likely optimistic. Confirm before relying on it.
cv_scores = cross_val_score(model, X, y, cv=5)
print("\n========== MODEL ACCURACY ==========")
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f}")
print("====================================")

# Human-readable labels, in the same order as the values returned by
# create_feature_vector.
feature_names = [
    'Home Team ID',
    'Away Team ID',
    'Home Avg Goals For',
    'Home Avg Goals Against',
    'Away Avg Goals For',
    'Away Avg Goals Against',
    'Home PPG',
    'Away PPG',
    'Home Avg Shots',
    'Away Avg Shots',
    'Home Avg Corners',
    'Away Avg Corners',
    'Home Win Rate',
    'Away Win Rate',
]

print("\n========== FEATURE IMPORTANCE RANKING ==========")
importances = model.feature_importances_
# Feature positions ordered from most to least important
indices = np.flip(np.argsort(importances))
for position in range(len(feature_names)):
    feature_idx = indices[position]
    print(f"{position + 1}. {feature_names[feature_idx]}: {importances[feature_idx]:.4f}")
print("==============================================")

print("\n========== PREDICTING FUTURE MATCHES ==========")
# Generate all possible matchups for a single round-robin tournament.
# Home advantage is assigned by coin flip. A dedicated, seeded generator is
# used so the fixture list, and therefore every prediction and standing
# derived from it, is identical on each run of the notebook.
FIXTURE_SEED = 42
fixture_rng = np.random.default_rng(FIXTURE_SEED)

fixtures = []
for i, home_team in enumerate(teams):
    for away_team in teams[i+1:]:  # Each team plays each other once
        # Randomly assign home/away (50/50 chance)
        if fixture_rng.random() > 0.5:
            fixtures.append((home_team, away_team))
        else:
            fixtures.append((away_team, home_team))

print(f"Generated {len(fixtures)} fixtures for the tournament.")

# Make predictions for all fixtures
HOME_WIN, DRAW, AWAY_WIN = 0, 1, 2  # outcome codes used when building y

match_columns = ['Home', 'Away', 'Prediction', 'Home Win Prob', 'Draw Prob',
                 'Away Win Prob', 'Home Points', 'Away Points']
match_predictions = []

if fixtures:  # predict_proba cannot be called on an empty batch
    # Build every feature vector first and score them in one call instead of
    # invoking the 200-tree forest twice per fixture.
    fixture_features = [
        create_feature_vector({
            'home_team': home_team,
            'away_team': away_team,
            'home_team_id': team_to_id[home_team],
            'away_team_id': team_to_id[away_team]
        })
        for home_team, away_team in fixtures
    ]
    fixture_probs = model.predict_proba(fixture_features)

    # predict_proba columns follow model.classes_, which only contains the
    # outcomes present in the training data. Look columns up by outcome code
    # rather than by position, so a missing class (e.g. no draws in the data)
    # yields probability 0 instead of an IndexError or a mislabelled column.
    class_column = {int(label): column for column, label in enumerate(model.classes_)}

    for (home_team, away_team), probs in zip(fixtures, fixture_probs):
        # Equivalent to model.predict: the class with the highest probability
        pred = int(model.classes_[np.argmax(probs)])

        def outcome_pct(code):
            """Probability of outcome `code` as a percentage with one decimal."""
            if code not in class_column:
                return 0.0
            return round(float(probs[class_column[code]]) * 100, 1)

        # Determine result and points
        if pred == HOME_WIN:
            result = f"{home_team} Win"
            home_points, away_points = 3, 0
        elif pred == DRAW:
            result = "Draw"
            home_points, away_points = 1, 1
        else:  # Away win
            result = f"{away_team} Win"
            home_points, away_points = 0, 3

        # Add to predictions
        match_predictions.append({
            'Home': home_team,
            'Away': away_team,
            'Prediction': result,
            'Home Win Prob': outcome_pct(HOME_WIN),
            'Draw Prob': outcome_pct(DRAW),
            'Away Win Prob': outcome_pct(AWAY_WIN),
            'Home Points': home_points,
            'Away Points': away_points
        })

print("Predictions complete.")
print("==============================================")

# Create match predictions dataframe. Explicit columns keep the selections
# below valid even when no fixtures were generated.
match_df = pd.DataFrame(match_predictions, columns=match_columns)

print("\n========== PREDICTED MATCH RESULTS ==========")
# Show first 10 matches
print(match_df[['Home', 'Away', 'Prediction', 'Home Win Prob', 'Draw Prob', 'Away Win Prob']].head(10))
print("...")  # Indicate there are more matches
print(f"Total matches: {len(match_df)}")
print("===========================================")

# Calculate standings based on predictions
predicted_standings = {team: {'W': 0, 'D': 0, 'L': 0, 'Points': 0, 'GF': 0, 'GA': 0} for team in teams}

# The losing side's goal estimate is its scoring average scaled down by this factor
LOSER_GOAL_FACTOR = 0.8


def _estimate_decisive_score(winner_stats, loser_stats):
    """Return (winner_goals, loser_goals) for a predicted non-draw.

    The winner scores its rounded average goals-for, but at least 1. The
    loser's goals are estimated from its own scoring average (goals it
    concedes say nothing about goals it scores) and capped one below the
    winner, so the scoreline can never contradict the predicted result.
    """
    winner_goals = max(1, round(winner_stats['avg_goals_for']))
    loser_estimate = max(0, round(loser_stats['avg_goals_for'] * LOSER_GOAL_FACTOR))
    return winner_goals, min(winner_goals - 1, loser_estimate)


# Process each match prediction to update the standings
for _, match in match_df.iterrows():
    home_team = match['Home']
    away_team = match['Away']
    home_row = predicted_standings[home_team]
    away_row = predicted_standings[away_team]

    # Update win/draw/loss records and estimate a scoreline for the result
    if match['Home Points'] == 3:  # Home win
        home_row['W'] += 1
        away_row['L'] += 1
        home_goals, away_goals = _estimate_decisive_score(team_stats[home_team], team_stats[away_team])
    elif match['Away Points'] == 3:  # Away win
        away_row['W'] += 1
        home_row['L'] += 1
        away_goals, home_goals = _estimate_decisive_score(team_stats[away_team], team_stats[home_team])
    else:  # Draw: both sides get the mean of home attack and away defence
        home_row['D'] += 1
        away_row['D'] += 1
        home_goals = max(0, round((team_stats[home_team]['avg_goals_for'] + team_stats[away_team]['avg_goals_against']) / 2))
        away_goals = home_goals

    # Update points
    home_row['Points'] += match['Home Points']
    away_row['Points'] += match['Away Points']

    # Update goals for/against
    home_row['GF'] += home_goals
    home_row['GA'] += away_goals
    away_row['GF'] += away_goals
    away_row['GA'] += home_goals

# Create predicted standings dataframe, one row per team.
# Passing `columns=` explicitly keeps the frame well-formed (and sortable)
# even when there are no teams.
predicted_columns = ['Team', 'Matches', 'Wins', 'Draws', 'Losses', 'GF', 'GA', 'GD', 'Points']
predicted_standings_df = pd.DataFrame(
    [
        {
            'Team': team,
            'Matches': tally['W'] + tally['D'] + tally['L'],
            'Wins': tally['W'],
            'Draws': tally['D'],
            'Losses': tally['L'],
            'GF': tally['GF'],
            'GA': tally['GA'],
            'GD': tally['GF'] - tally['GA'],
            'Points': tally['Points']
        }
        for team, tally in predicted_standings.items()
    ],
    columns=predicted_columns,
)

# Rank by points, breaking ties on goal difference and then goals scored.
# Predicted points are sums of 0/1/3 over a handful of matches, so ties are
# common; sorting on 'Points' alone would rank level teams arbitrarily.
predicted_standings_df = predicted_standings_df.sort_values(['Points', 'GD', 'GF'], ascending=False)

# Add rank column as the first column
predicted_standings_df.insert(0, 'Rank', range(1, len(predicted_standings_df) + 1))

print("\n========== PREDICTED FINAL STANDINGS ==========")
print(predicted_standings_df)
print("===============================================")

# Win percentage and points per game for each team, computed column-wise.
# Match counts of zero are masked to NaN before dividing so that both metrics
# fall back to 0 for a team without fixtures.
matches_played_nonzero = predicted_standings_df['Matches'].where(predicted_standings_df['Matches'] > 0)
predicted_standings_df['Win%'] = (predicted_standings_df['Wins'] / matches_played_nonzero).fillna(0) * 100

# Add points per game
predicted_standings_df['PPG'] = (predicted_standings_df['Points'] / matches_played_nonzero).fillna(0)

print("\n========== TEAM PERFORMANCE METRICS ==========")
# Rounded copy for display only; predicted_standings_df keeps full precision
team_metrics = predicted_standings_df[['Rank', 'Team', 'Points', 'PPG', 'Win%', 'GF', 'GA', 'GD']].copy()
team_metrics['Win%'] = team_metrics['Win%'].round(1)
team_metrics['PPG'] = team_metrics['PPG'].round(2)
print(team_metrics)
print("============================================")

# Per-team report: predicted record, points, rank and a line per fixture.
print("\n========== DETAILED TEAM PREDICTIONS ==========")
# Points earned in a match -> outcome label; anything else counts as a loss
outcome_by_points = {3: "Win", 1: "Draw"}

for team in teams:
    # Fixtures in which this team takes part, on either side
    involves_team = (match_df['Home'] == team) | (match_df['Away'] == team)
    team_matches = match_df[involves_team]
    matches_played = len(team_matches)

    # One pass over the fixtures collects both the record and the
    # match-by-match lines printed further down.
    tally = {"Win": 0, "Draw": 0, "Loss": 0}
    points = 0
    match_lines = []
    for _, match in team_matches.iterrows():
        if match['Home'] == team:
            side, opponent_column = 'Home', 'Away'
        else:
            side, opponent_column = 'Away', 'Home'

        earned = match[f'{side} Points']
        outcome = outcome_by_points.get(earned, "Loss")
        tally[outcome] += 1
        points += earned
        match_lines.append(
            f"  vs {match[opponent_column]} ({side}): {outcome} - Win Prob: {match[f'{side} Win Prob']}%"
        )

    # Team summary
    rank = predicted_standings_df.loc[predicted_standings_df['Team'] == team, 'Rank'].values[0]
    print(f"\n--- {team} ---")
    print(f"Predicted Record: {tally['Win']}-{tally['Draw']}-{tally['Loss']} (W-D-L)")
    print(f"Predicted Points: {points} ({points/matches_played:.2f} PPG)")
    print(f"Predicted Rank: {rank}")

    # Match-by-match predictions
    print("\nMatch Predictions:")
    for line in match_lines:
        print(line)
print("=================================================")

# Export fixtures and standings to CSV.
# The export is best-effort: a file-system failure (read-only directory, file
# locked by another program, ...) is reported with its cause and the notebook
# carries on. Only OSError is caught, so programming errors still surface
# instead of being hidden by a bare `except`.
try:
    match_df.to_csv("predicted_fixtures.csv", index=False)
    predicted_standings_df.to_csv("predicted_standings.csv", index=False)
except OSError as exc:
    print(f"\nNote: Could not export results to CSV files: {exc}")
else:
    print("\nPrediction results exported to CSV files:")
    print("- predicted_fixtures.csv")
    print("- predicted_standings.csv")

# TODO: find parameter weights for the model
# TODO: combine the "BU" and "Boston U." team names into one team
# TODO: add one more year of data
# TODO: find out which algorithm the AI model uses




Data cleaning complete.

Total matches in dataset: 58
Teams in dataset: ['American', 'Army West Point', 'BU', 'Bucknell', 'Colgate', 'Holy Cross', 'Lafayette', 'Lehigh', 'Loyola', 'Navy']
Total teams: 10

Preprocessing complete.

Team statistics calculated.

   Rank             Team  Matches  Wins  Draws  Losses  Points       PPG  GF  \
3     1         Bucknell       13    10      0       3      30  2.307692  18   
1     2  Army West Point       14     8      1       5      25  1.785714  21   
2     3               BU       12     6      6       0      24  2.000000  22   
8     4           Loyola       14     4      5       5      17  1.214286  22   
4     5          Colgate       13     4      4       5      16  1.230769  18   
9     6             Navy       14     4      3       7      15  1.071429  14   
6     7        Lafayette        9     2      4       3      10  1.111111   8   
0     8         American        9     2      2       5       8  0.888889  10   
5     9       Holy C

In [None]:
import shap
import matplotlib.pyplot as plt

# Create the explainer
explainer = shap.TreeExplainer(model)

# Calculate SHAP values for all matches
shap_values = explainer.shap_values(X)

# Depending on the installed SHAP version, a multiclass tree model yields
# either a list with one (n_samples, n_features) array per class or a single
# (n_samples, n_features, n_classes) array. Normalise to the per-class list so
# the plotting code below works with both.
if isinstance(shap_values, list):
    shap_values_per_class = shap_values
else:
    shap_array = np.asarray(shap_values)
    if shap_array.ndim == 3:
        shap_values_per_class = [shap_array[:, :, k] for k in range(shap_array.shape[2])]
    else:
        shap_values_per_class = [shap_array]
expected_values = np.atleast_1d(explainer.expected_value)

# Create a summary plot.
# show=False keeps the figure open: with the default show=True the plot is
# displayed and closed first, and savefig then writes an empty image.
plt.figure(figsize=(10, 8))
shap.summary_plot(shap_values_per_class, X, feature_names=feature_names, plot_type="bar", show=False)
plt.savefig('shap_summary.png', bbox_inches='tight')
plt.show()

# Detailed plot for a specific match, explaining the first output class
i = 0  # index of match to explain
# force_plot creates its own matplotlib figure, so the size is passed to it
# directly instead of opening a separate (unused) figure beforehand.
shap.force_plot(
    expected_values[0],
    shap_values_per_class[0][i, :],
    X[i, :],
    feature_names=feature_names,
    matplotlib=True,
    show=False,
    figsize=(10, 6),
)
plt.savefig('shap_force_plot.png', bbox_inches='tight')
plt.show()