In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, TimeSeriesSplit
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Load the match data from excel file
df = pd.read_excel('2023_2024patriot_league_match_data.xlsx', sheet_name='Match Stats')

# Ensure team names are strings to avoid sorting errors.
# This has to happen BEFORE the '0' filter below: a placeholder stored as the
# number 0 never compares equal to the string '0', so filtering first would let
# it through and it would later show up as a team called '0'.
df['home_team'] = df['home_team'].astype(str)
df['away_team'] = df['away_team'].astype(str)

# Filter out any matches involving team '0'.
# .copy() makes the result an independent frame, so the column assignments
# that follow do not write into a slice of the original.
df = df[(df['home_team'] != '0') & (df['away_team'] != '0')].copy()
print("Total matches after filtering team '0':", len(df))

# Standardize team names function
def standardize_team_name(name):
    """Collapse known aliases of a team onto one canonical spelling.

    Boston University and Loyola Maryland appear under several spellings;
    any other name is handed back untouched.
    """
    bu_aliases = ('BU', 'Boston U.', 'Boston University')
    loyola_aliases = ('Loyola', 'Loyola Maryland')

    if name in bu_aliases:
        return 'Boston University'
    if name in loyola_aliases:
        return 'Loyola Maryland'
    return name

# Apply standardization to the dataframe so every alias of a team shares one name
df['home_team'] = df['home_team'].apply(standardize_team_name)
df['away_team'] = df['away_team'].apply(standardize_team_name)

# Create team encodings: ids follow the alphabetical order of the team names
teams = sorted(list(set(df['home_team'].unique()) | set(df['away_team'].unique())))
team_to_id = {team: i for i, team in enumerate(teams)}
df['home_team_id'] = df['home_team'].map(team_to_id)
df['away_team_id'] = df['away_team'].map(team_to_id)

print("Corrected Teams in dataset:", teams)
print("Total number of teams:", len(teams))
print("Bucknell matches:", len(df[(df['home_team'] == 'Bucknell') | (df['away_team'] == 'Bucknell')]))

# Ensure date column exists and is properly formatted
if 'Date' not in df.columns:
    # If date doesn't exist, create a placeholder: one consecutive day per row,
    # in the current row order, starting 2022-01-01
    df['Date'] = pd.date_range(start='2022-01-01', periods=len(df), freq='D')
else:
    # Convert date format if needed; errors='coerce' is requested, so values
    # that cannot be parsed are not raised as errors
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Sort by date to ensure chronological order
df = df.sort_values('Date')

# Create result column - always from the HOME team's perspective
# W = home team wins, L = home team loses, D = anything else (level scores)
df['Result'] = np.where(df['goals_home'] > df['goals_away'], 'W',
                        np.where(df['goals_home'] < df['goals_away'], 'L', 'D'))

# Per-match derived columns: goal difference (home minus away), total goals,
# and goals per shot for each side
df['goal_diff'] = df['goals_home'] - df['goals_away']
df['total_goals'] = df['goals_home'] + df['goals_away']
df['home_shot_conversion'] = df['goals_home'] / df['shots_home'].replace(0, 1)  # a shot count of 0 is treated as 1 to avoid division by zero
df['away_shot_conversion'] = df['goals_away'] / df['shots_away'].replace(0, 1)  # a shot count of 0 is treated as 1 to avoid division by zero
# Running totals per team; every counter starts at zero and is filled in by
# the match loop further down
_TEAM_COUNTER_KEYS = (
    'matches_played', 'wins', 'draws', 'losses',
    'goals_for', 'goals_against',
    'shots', 'shots_on_target', 'corners', 'fouls',
    'home_wins', 'home_draws', 'home_losses',
    'away_wins', 'away_draws', 'away_losses',
    'home_matches', 'away_matches',
    'home_goals_for', 'home_goals_against',
    'away_goals_for', 'away_goals_against',
)

team_stats = {}
for team in teams:
    team_stats[team] = dict.fromkeys(_TEAM_COUNTER_KEYS, 0)
    # Rolling window of the most recent results, used for the form figure
    team_stats[team]['last_5_results'] = []

# Head-to-head records: h2h_stats[a][b] is a's record against b
_H2H_COUNTER_KEYS = ('matches', 'wins', 'draws', 'losses', 'goals_for', 'goals_against')

h2h_stats = {}
for team1 in teams:
    h2h_stats[team1] = {}
    for team2 in teams:
        if team1 == team2:
            continue  # a team has no record against itself
        h2h_stats[team1][team2] = dict.fromkeys(_H2H_COUNTER_KEYS, 0)

# Fold every match into the per-team and head-to-head running totals
def _bump_result(record, venue, outcome):
    """Count one W/D/L on a team record, overall and for `venue` ('home' or 'away')."""
    bucket = {'W': 'wins', 'D': 'draws'}.get(outcome, 'losses')
    record[bucket] += 1
    record[f'{venue}_{bucket}'] += 1


def _record_side(record, venue, outcome, scored, conceded, shots, corners, fouls):
    """Add one match to a team record, seen from that team's own side."""
    record['matches_played'] += 1
    record[f'{venue}_matches'] += 1
    _bump_result(record, venue, outcome)

    record['goals_for'] += scored
    record['goals_against'] += conceded
    record[f'{venue}_goals_for'] += scored
    record[f'{venue}_goals_against'] += conceded
    record['shots'] += shots
    record['corners'] += corners
    record['fouls'] += fouls

    # Keep only the five most recent results
    record['last_5_results'].append(outcome)
    del record['last_5_results'][:-5]


for _, row in df.iterrows():
    home_team = row['home_team']
    away_team = row['away_team']

    # 'Result' is stored from the home side; mirror it for the away side
    home_result = row['Result']
    away_result = {'W': 'L', 'L': 'W'}.get(home_result, 'D')

    # Head-to-head records (skipped if a team is listed against itself)
    if home_team != away_team:
        home_vs_away = h2h_stats[home_team][away_team]
        away_vs_home = h2h_stats[away_team][home_team]

        home_vs_away['matches'] += 1
        away_vs_home['matches'] += 1

        h2h_bucket = {'W': 'wins', 'L': 'losses'}
        home_vs_away[h2h_bucket.get(home_result, 'draws')] += 1
        away_vs_home[h2h_bucket.get(away_result, 'draws')] += 1

        home_vs_away['goals_for'] += row['goals_home']
        home_vs_away['goals_against'] += row['goals_away']
        away_vs_home['goals_for'] += row['goals_away']
        away_vs_home['goals_against'] += row['goals_home']

    # Home side first, then the away side, each from its own perspective
    _record_side(
        team_stats[home_team], 'home', home_result,
        row['goals_home'], row['goals_away'],
        row['shots_home'], row['corners_home'], row['fouls_home'],
    )
    _record_side(
        team_stats[away_team], 'away', away_result,
        row['goals_away'], row['goals_home'],
        row['shots_away'], row['corners_away'], row['fouls_away'],
    )

# Turn the raw totals into per-game rates and summary figures
_ZERO_DEFAULT_KEYS = (
    'points', 'ppg', 'win_rate', 'draw_rate', 'loss_rate', 'goal_diff',
    'avg_goals_for', 'avg_goals_against', 'avg_shots', 'avg_corners', 'avg_fouls',
    'home_ppg', 'away_ppg', 'home_win_rate', 'away_win_rate',
    'home_avg_goals_for', 'home_avg_goals_against',
    'away_avg_goals_for', 'away_avg_goals_against',
)

for team in team_stats:
    stats = team_stats[team]
    matches = stats['matches_played']
    # Venue denominators are floored at 1 so a team with no home (or away)
    # matches gets 0 rather than a division error
    home_matches = max(stats['home_matches'], 1)
    away_matches = max(stats['away_matches'], 1)

    if matches <= 0:
        # No matches at all: every rate is zero and form is neutral
        stats.update(dict.fromkeys(_ZERO_DEFAULT_KEYS, 0))
        stats['form'] = 0.5
        continue

    # Three points for a win, one for a draw
    stats['points'] = stats['wins'] * 3 + stats['draws']
    stats['goal_diff'] = stats['goals_for'] - stats['goals_against']

    # Overall per-game figures
    stats.update({
        'ppg': stats['points'] / matches,
        'win_rate': stats['wins'] / matches,
        'draw_rate': stats['draws'] / matches,
        'loss_rate': stats['losses'] / matches,
        'avg_goals_for': stats['goals_for'] / matches,
        'avg_goals_against': stats['goals_against'] / matches,
        'avg_shots': stats['shots'] / matches,
        'avg_corners': stats['corners'] / matches,
        'avg_fouls': stats['fouls'] / matches,
    })

    # Venue-specific figures
    stats.update({
        'home_ppg': (stats['home_wins'] * 3 + stats['home_draws']) / home_matches,
        'away_ppg': (stats['away_wins'] * 3 + stats['away_draws']) / away_matches,
        'home_win_rate': stats['home_wins'] / home_matches,
        'away_win_rate': stats['away_wins'] / away_matches,
        'home_avg_goals_for': stats['home_goals_for'] / home_matches,
        'home_avg_goals_against': stats['home_goals_against'] / home_matches,
        'away_avg_goals_for': stats['away_goals_for'] / away_matches,
        'away_avg_goals_against': stats['away_goals_against'] / away_matches,
    })

    # Form: points taken from the stored recent results as a share of the
    # maximum available from those games
    recent_results = stats['last_5_results']
    form_points = sum({'W': 3, 'D': 1}.get(res, 0) for res in recent_results)
    stats['form'] = form_points / (len(recent_results) * 3)

# League-table style summary, one row per team, highest points first
_summary_rows = []
for _team_name, _team_record in team_stats.items():
    _summary_rows.append({
        'Team': _team_name,
        'Matches': _team_record['matches_played'],
        'Wins': _team_record['wins'],
        'Draws': _team_record['draws'],
        'Losses': _team_record['losses'],
        'GF': _team_record['goals_for'],
        'GA': _team_record['goals_against'],
        'GD': _team_record['goal_diff'],
        'Points': _team_record['points'],
        'PPG': _team_record['ppg'],
        'Win%': _team_record['win_rate'] * 100,      # shown as a percentage
        'Home PPG': _team_record['home_ppg'],
        'Away PPG': _team_record['away_ppg'],
        'Form': _team_record['form'] * 100,          # shown as a percentage
    })

team_stats_df = pd.DataFrame(_summary_rows).sort_values('Points', ascending=False)

# Enhanced feature engineering for match prediction
def create_feature_vector(row):
    """Build the model input for one fixture.

    Parameters:
    row: Mapping with 'home_team', 'away_team', 'home_team_id' and
         'away_team_id' (a DataFrame row or a plain dict both work).

    Returns:
    A list of 33 values in a fixed order: team ids, overall rates, goal
    averages, venue-specific rates, form, head-to-head figures, shot/corner
    averages, attack-vs-defence gaps and goal difference per game.

    Reads the module-level `team_stats` and `h2h_stats` tables.
    """
    home_team, away_team = row['home_team'], row['away_team']
    home = team_stats[home_team]
    away = team_stats[away_team]

    # Home side's record against this opponent, or an all-zero record when
    # the pairing is unknown
    if home_team != away_team and away_team in h2h_stats.get(home_team, {}):
        h2h = h2h_stats[home_team][away_team]
    else:
        h2h = {'matches': 0, 'wins': 0, 'draws': 0, 'losses': 0, 'goals_for': 0, 'goals_against': 0}

    meetings = h2h.get('matches', 0)
    if meetings > 0:
        h2h_win_rate = h2h['wins'] / h2h['matches']
        h2h_ppg = (h2h['wins'] * 3 + h2h['draws']) / h2h['matches']
        h2h_goal_diff = (h2h['goals_for'] - h2h['goals_against']) / h2h['matches']
    else:
        # No meetings on record: neutral win rate, and a PPG stand-in derived
        # from the gap between the two sides' overall PPG
        h2h_win_rate = 0.5
        h2h_ppg = home['ppg'] - away['ppg'] + 0.5
        h2h_goal_diff = 0

    team_ids = [row['home_team_id'], row['away_team_id']]

    overall = [
        home['ppg'], away['ppg'],
        home['win_rate'], away['win_rate'],
        home['draw_rate'], away['draw_rate'],
    ]

    goals = [
        home['avg_goals_for'], home['avg_goals_against'],
        away['avg_goals_for'], away['avg_goals_against'],
    ]

    # Home side's home record against away side's away record
    venue = [
        home['home_ppg'], away['away_ppg'],
        home['home_win_rate'], away['away_win_rate'],
        home['home_avg_goals_for'], home['home_avg_goals_against'],
        away['away_avg_goals_for'], away['away_avg_goals_against'],
    ]

    form = [home['form'], away['form']]

    head_to_head = [h2h_win_rate, h2h_ppg, h2h_goal_diff]

    game_stats = [
        home['avg_shots'], away['avg_shots'],
        home['avg_corners'], away['avg_corners'],
    ]

    matchup = [
        home['avg_goals_for'] - away['avg_goals_against'],  # home attack vs away defence
        away['avg_goals_for'] - home['avg_goals_against'],  # away attack vs home defence
    ]

    # Goal difference per game over every recorded match (not a recent window)
    goal_diff_per_game = [
        home['goal_diff'] / max(home['matches_played'], 1),
        away['goal_diff'] / max(away['matches_played'], 1),
    ]

    return (team_ids + overall + goals + venue + form
            + head_to_head + game_stats + matchup + goal_diff_per_game)

# Function to create a simplified model equation for Excel
def create_excel_equation(model, feature_names, output_class=0):
    """
    Turn a model's feature importances into a weighted-sum formula string.

    Parameters:
    model: Object exposing a `feature_importances_` sequence
    feature_names: Names matching the importances position by position
    output_class: Accepted for interface compatibility; not used here

    Returns:
    (formula, coefficients). `formula` is "=" followed by "<weight>*<name>"
    terms joined with "+", leaving out any feature whose name contains
    'Team ID'. `coefficients` maps every feature name to its importance.
    Returns (None, None) if the model has no `feature_importances_`.
    """
    if not hasattr(model, 'feature_importances_'):
        print("Model does not have feature importances attribute")
        return None, None

    # Name -> importance for every feature, team ids included
    coefficients = dict(zip(feature_names, model.feature_importances_))

    # Team ids are categorical codes, so they get no term in the sum
    excel_terms = [
        f"{weight:.4f}*{name}"
        for name, weight in coefficients.items()
        if 'Team ID' not in name
    ]

    return "=" + "+".join(excel_terms), coefficients

# Class for custom weighted predictions
class CustomWeightedPredictor:
    """Pairs a fitted classifier with a hand-chosen table of feature weights.

    The weights are normalised, printed, and used to build a formula string.
    They are NOT applied to predictions: `predict` and `predict_proba`
    forward straight to `base_model`.
    """

    def __init__(self, base_model, feature_names, custom_weights=None):
        """Store the model and derive a normalised weight per feature.

        Parameters:
        base_model: Object providing `predict` and `predict_proba`
        feature_names: Ordered feature names (must support `in` and `.index`)
        custom_weights: Optional {feature name: weight}; a built-in table is
                        used when omitted
        """
        self.base_model = base_model
        self.feature_names = feature_names

        if custom_weights is not None:
            self.custom_weights = custom_weights
        else:
            # Built-in weighting, dominated by the head-to-head features
            self.custom_weights = {
                'H2H Home PPG': 0.25,
                'H2H Goal Diff per Game': 0.20,
                'H2H Home Win Rate': 0.15,
                'Away Attack vs Home Defense': 0.08,
                'Home Draw Rate': 0.02,
                'Home PPG': 0.04,
                'Away PPG': 0.04,
                'Home Recent Form': 0.06,
                'Away Recent Form': 0.06,
                'Home Avg Goals For': 0.05,
                'Away Avg Goals For': 0.05
            }

        # One slot per feature; anything without a custom weight stays at zero
        weights = np.zeros(len(feature_names))
        for name, value in self.custom_weights.items():
            if name not in feature_names:
                print(f"Warning: Feature '{name}' not found in model")
                continue
            weights[feature_names.index(name)] = value

        # Rescale to sum to 1 (left as-is when the total is not positive)
        total = np.sum(weights)
        if total > 0:
            weights = weights / total
        self.weight_array = weights

        # Report the fifteen largest weights
        print("\nCustomized Feature Importance:")
        ranked = sorted(zip(feature_names, self.weight_array),
                        key=lambda pair: pair[1], reverse=True)
        for rank, (name, share) in enumerate(ranked[:15], start=1):
            print(f"{rank}. {name}: {share:.4f}")

    def predict(self, X):
        """Return the base model's class predictions for X (weights unused)."""
        return self.base_model.predict(X)

    def predict_proba(self, X):
        """Return the base model's class probabilities for X (weights unused)."""
        return self.base_model.predict_proba(X)

    def get_excel_formula(self, output_class=0):
        """Build "=<w>*<feature>+..." from the normalised weights.

        Terms are ordered by descending weight. Features whose name contains
        'Team ID' and weights of 0.001 or less are left out. `output_class`
        is accepted for interface compatibility and is not used.
        """
        ranked = sorted(zip(self.feature_names, self.weight_array),
                        key=lambda pair: pair[1], reverse=True)

        terms = [
            f"{weight:.4f}*{name}"
            for name, weight in ranked
            if 'Team ID' not in name and weight > 0.001
        ]

        return "=" + "+".join(terms)

# Create a simplified logistic regression model for interpretability
def create_logistic_regression_model(X, y, feature_names):
    """
    Fit a logistic regression on standardized features and describe it as text.

    Parameters:
    X: Feature matrix (one row per match, one column per entry in feature_names)
    y: Target vector; the printed labels assume 0 = home win, 1 = draw,
       2 = away win, with all three classes present
    feature_names: List of feature names, in the column order of X

    Returns:
    Tuple (lr_model, scaler, formulas):
      lr_model - the fitted LogisticRegression
      scaler   - the fitted StandardScaler; apply it to any new feature
                 vector before calling lr_model
      formulas - {'Home Win' | 'Draw' | 'Away Win': formula string}

    Note: the model is trained on the scaler's output, so every coefficient
    printed here or embedded in a formula multiplies the STANDARDIZED value
    of its feature, not the raw value. The formulas also leave out team-id
    features and coefficients smaller than 0.01 in magnitude, so they
    approximate the model's logits rather than reproduce them.
    """
    class_names = ['Home Win', 'Draw', 'Away Win']

    # Scale features for better convergence
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # `multi_class` is intentionally not passed. With the lbfgs solver and more
    # than two classes scikit-learn already fits a multinomial model, and the
    # argument is deprecated in recent releases, so keeping it would break on
    # upgrade (the notebook suppresses the deprecation warning globally).
    lr_model = LogisticRegression(solver='lbfgs', max_iter=1000, C=1.0)
    lr_model.fit(X_scaled, y)

    # Print the ten largest coefficients (by magnitude) for each outcome
    print("\nLogistic Regression Coefficients (for easier interpretation):")
    for i, class_name in enumerate(class_names):
        print(f"\nFor {class_name}:")
        coeffs = sorted(zip(feature_names, lr_model.coef_[i]), key=lambda x: abs(x[1]), reverse=True)
        for feature, coeff in coeffs[:10]:
            print(f"  {feature}: {coeff:.4f}")

    # One linear formula per outcome: intercept first, then coefficient*feature
    formulas = {}
    for i, class_name in enumerate(class_names):
        terms = [f"{lr_model.intercept_[i]:.4f}"]

        for feature, coeff in zip(feature_names, lr_model.coef_[i]):
            # Skip team IDs (categorical codes) and near-zero coefficients
            if 'Team ID' in feature or abs(coeff) < 0.01:
                continue
            terms.append(f"{coeff:.4f}*{feature}")

        formulas[class_name] = "=" + "+".join(terms)

    return lr_model, scaler, formulas

# Create a function to generate predictions
def generate_predictions(model, custom_weighted=False):
    """Predict Bucknell's result against every other team, home and away.

    Parameters:
    model: Object with `predict` and `predict_proba`; class 0 is read as a
           home win, 1 as a draw and 2 as an away win
    custom_weighted: Kept for interface compatibility; both settings take
                     exactly the same path

    Returns:
    DataFrame with one row per opponent: the predicted result and the
    win/draw/loss percentages (rounded to one decimal) for each venue,
    all expressed from Bucknell's side.

    Relies on the module-level `teams`, `team_to_id` and
    `create_feature_vector`.
    """
    def _fixture_features(home, away):
        # Minimal stand-in for a match row
        return create_feature_vector({
            'home_team': home,
            'away_team': away,
            'home_team_id': team_to_id[home],
            'away_team_id': team_to_id[away]
        })

    def _label_for_bucknell(predicted, winning_class):
        # winning_class is the class index meaning "Bucknell won"
        if predicted == winning_class:
            return 'Win'
        if predicted == 1:
            return 'Draw'
        return 'Loss'

    def _pct(probability):
        return round(probability * 100, 1)

    bucknell_predictions = []
    for opponent in teams:
        if opponent == 'Bucknell':
            continue

        home_features = _fixture_features('Bucknell', opponent)  # Bucknell hosting
        away_features = _fixture_features(opponent, 'Bucknell')  # Bucknell visiting

        home_pred = model.predict([home_features])[0]
        away_pred = model.predict([away_features])[0]
        home_probs = model.predict_proba([home_features])[0]
        away_probs = model.predict_proba([away_features])[0]

        # At home a Bucknell win is class 0; away it is class 2, so the
        # win/loss probability columns swap between the two venues
        bucknell_predictions.append({
            'Opponent': opponent,
            'Bucknell at Home': _label_for_bucknell(home_pred, 0),
            'Win Prob (Home)': _pct(home_probs[0]),
            'Draw Prob (Home)': _pct(home_probs[1]),
            'Loss Prob (Home)': _pct(home_probs[2]),
            'Bucknell Away': _label_for_bucknell(away_pred, 2),
            'Win Prob (Away)': _pct(away_probs[2]),
            'Draw Prob (Away)': _pct(away_probs[1]),
            'Loss Prob (Away)': _pct(away_probs[0])
        })

    return pd.DataFrame(bucknell_predictions)

# Prepare train/test data
# NOTE(review): create_feature_vector reads team_stats / h2h_stats, which were
# accumulated over EVERY row of df, including the match each row describes and
# all later ones. The features therefore contain information about the target,
# so accuracy measured on X, y is optimistic. Building the stats only from
# matches played before each fixture would remove the leak.
X = np.array([create_feature_vector(row) for _, row in df.iterrows()])
# Target coding: 0 = home win, 1 = draw, 2 = away win
y = np.array([0 if r == 'W' else 1 if r == 'D' else 2 for r in df['Result']])

# List of feature names (for importance analysis).
# Must stay in the same order as the list returned by create_feature_vector.
feature_names = [
    'Home Team ID', 'Away Team ID',
    'Home PPG', 'Away PPG',
    'Home Win Rate', 'Away Win Rate',
    'Home Draw Rate', 'Away Draw Rate',
    'Home Avg Goals For', 'Home Avg Goals Against',
    'Away Avg Goals For', 'Away Avg Goals Against',
    'Home PPG at Home', 'Away PPG Away',
    'Home Win Rate at Home', 'Away Win Rate Away',
    'Home Avg Goals For at Home', 'Home Avg Goals Against at Home',
    'Away Avg Goals For Away', 'Away Avg Goals Against Away',
    'Home Recent Form', 'Away Recent Form',
    'H2H Home Win Rate', 'H2H Home PPG', 'H2H Goal Diff per Game',
    'Home Avg Shots', 'Away Avg Shots',
    'Home Avg Corners', 'Away Avg Corners',
    'Home Attack vs Away Defense', 'Away Attack vs Home Defense',
    'Home Goal Diff per Game', 'Away Goal Diff per Game'
]

# The names are maintained by hand; fail loudly if they drift out of step with
# the feature vector instead of silently mislabelling importances and formulas.
assert X.ndim == 2 and X.shape[1] == len(feature_names), (
    f"feature matrix has shape {X.shape} but {len(feature_names)} feature names are listed"
)

# Fit two tree ensembles, compare them, and keep the stronger one as `model`
print("\nTraining models with enhanced features...")

# Random Forest with tuned parameters
rf_settings = dict(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,
)
rf_model = RandomForestClassifier(**rf_settings)
rf_model.fit(X, y)

# Gradient Boosting
gb_settings = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
gb_model = GradientBoostingClassifier(**gb_settings)
gb_model.fit(X, y)

# Score both with TimeSeriesSplit: each fold trains on earlier rows and tests
# on the rows that follow them
tscv = TimeSeriesSplit(n_splits=5)
rf_cv_scores = cross_val_score(rf_model, X, y, cv=tscv)
gb_cv_scores = cross_val_score(gb_model, X, y, cv=tscv)

rf_cv_mean = np.mean(rf_cv_scores)
gb_cv_mean = np.mean(gb_cv_scores)
print(f"\nRandom Forest Cross-validation accuracy: {rf_cv_mean:.4f}")
print(f"Gradient Boosting Cross-validation accuracy: {gb_cv_mean:.4f}")

# Random Forest wins ties
rf_is_best = rf_cv_mean >= gb_cv_mean
model = rf_model if rf_is_best else gb_model
print(
    "Using Random Forest model for predictions (better performance)"
    if rf_is_best
    else "Using Gradient Boosting model for predictions (better performance)"
)

# Top fifteen importances of the selected model, before any custom weighting
if hasattr(model, 'feature_importances_'):
    print("\nOriginal Feature Importance:")
    feature_importance = sorted(
        zip(feature_names, model.feature_importances_),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for rank, (feature, importance) in enumerate(feature_importance[:15], start=1):
        print(f"{rank}. {feature}: {importance:.4f}")

# Train a logistic regression model for interpretability.
# `scaler` must be applied to any feature vector before it is given to lr_model.
lr_model, scaler, lr_formulas = create_logistic_regression_model(X, y, feature_names)

# Custom weights - modify these values as needed
custom_weights = {
    'H2H Home PPG': 0.25,                # Increased from ~0.21
    'H2H Goal Diff per Game': 0.20,      # Similar to original
    'H2H Home Win Rate': 0.15,           # Increased from ~0.14
    'Away Attack vs Home Defense': 0.08, # Increased from ~0.05
    'Home Draw Rate': 0.02,              # Decreased from ~0.03
    'Home PPG': 0.04,                    # Increased from ~0.02
    'Away PPG': 0.04,                    # New emphasis
    'Home Recent Form': 0.06,            # New emphasis on form
    'Away Recent Form': 0.06,            # New emphasis on form
    'Home Avg Goals For': 0.05,          # New emphasis on offense
    'Away Avg Goals For': 0.05           # New emphasis on offense
}

# Create a custom weighted predictor around the selected model
custom_model = CustomWeightedPredictor(model, feature_names, custom_weights)

# Generate Excel formulas.
# The first formula is printed under a "Random Forest" heading, so it is built
# from rf_model explicitly. Using `model` here would silently print Gradient
# Boosting importances whenever that model won the comparison above.
rf_formula, rf_coefs = create_excel_equation(rf_model, feature_names, output_class=0)
custom_formula = custom_model.get_excel_formula(output_class=0)

print("\n==== EXCEL FORMULAS ====")
print("\n1. Random Forest Model Formula (Home Win Probability):")
print(rf_formula)

print("\n2. Custom Weighted Formula (Based on your custom weights):")
print(custom_formula)

print("\n3. Logistic Regression Formulas (more suitable for Excel):")
for outcome, formula in lr_formulas.items():
    print(f"\nFor {outcome}:")
    print(formula)

# Generate and print predictions for Random Forest model
# (rf_model is used here regardless of which model won the comparison above)
print("\n=== Random Forest Model Predictions ===")
rf_bucknell_pred_df = generate_predictions(rf_model)
print("\nBucknell Match Predictions with Random Forest:")
print(rf_bucknell_pred_df[['Opponent', 'Bucknell at Home', 'Bucknell Away']])

# Calculate expected points for Random Forest model.
# Points are tallied from the single predicted outcome of each fixture
# (3 for a Win, 1 for a Draw, 0 otherwise); the probabilities are not used.
rf_home_points = sum([3 if r == 'Win' else 1 if r == 'Draw' else 0 for r in rf_bucknell_pred_df['Bucknell at Home']])
rf_away_points = sum([3 if r == 'Win' else 1 if r == 'Draw' else 0 for r in rf_bucknell_pred_df['Bucknell Away']])
rf_total_points = rf_home_points + rf_away_points
rf_matches = len(rf_bucknell_pred_df) * 2  # one home and one away fixture per opponent

print(f"\nExpected Points for Bucknell with Random Forest Model:")
print(f"Home Points: {rf_home_points} from {len(rf_bucknell_pred_df)} matches")
print(f"Away Points: {rf_away_points} from {len(rf_bucknell_pred_df)} matches")
print(f"Total Points: {rf_total_points} from {rf_matches} matches")
print(f"Expected PPG: {rf_total_points / rf_matches:.2f}")

# Per-opponent breakdown with the class probabilities behind each prediction
print("\nDetailed Bucknell Predictions with Random Forest Model:")
for _, row in rf_bucknell_pred_df.iterrows():
    print(f"vs {row['Opponent']}:")
    print(f"  At Home: {row['Bucknell at Home']} (Win: {row['Win Prob (Home)']}%, Draw: {row['Draw Prob (Home)']}%, Loss: {row['Loss Prob (Home)']}%)")
    print(f"  Away: {row['Bucknell Away']} (Win: {row['Win Prob (Away)']}%, Draw: {row['Draw Prob (Away)']}%, Loss: {row['Loss Prob (Away)']}%)")

# Existing custom model predictions.
# NOTE(review): custom_model forwards predict/predict_proba to the model it
# wraps, so these figures are that model's own predictions; the custom weights
# only influence the printed weights and formula.
print("\n=== Custom Weighted Model Predictions ===")
bucknell_pred_df = generate_predictions(custom_model, custom_weighted=True)
print("\nBucknell Match Predictions with Customized Weights:")
print(bucknell_pred_df[['Opponent', 'Bucknell at Home', 'Bucknell Away']])

# Calculate expected points (same tallying rule as for the Random Forest above)
home_points = sum([3 if r == 'Win' else 1 if r == 'Draw' else 0 for r in bucknell_pred_df['Bucknell at Home']])
away_points = sum([3 if r == 'Win' else 1 if r == 'Draw' else 0 for r in bucknell_pred_df['Bucknell Away']])
total_points = home_points + away_points
matches = len(bucknell_pred_df) * 2  # one home and one away fixture per opponent

print(f"\nExpected Points for Bucknell with Customized Model:")
print(f"Home Points: {home_points} from {len(bucknell_pred_df)} matches")
print(f"Away Points: {away_points} from {len(bucknell_pred_df)} matches")
print(f"Total Points: {total_points} from {matches} matches")
print(f"Expected PPG: {total_points / matches:.2f}")

# Per-opponent breakdown with the class probabilities behind each prediction
print("\nDetailed Bucknell Predictions with Customized Weights:")
for _, row in bucknell_pred_df.iterrows():
    print(f"vs {row['Opponent']}:")
    print(f"  At Home: {row['Bucknell at Home']} (Win: {row['Win Prob (Home)']}%, Draw: {row['Draw Prob (Home)']}%, Loss: {row['Loss Prob (Home)']}%)")
    print(f"  Away: {row['Bucknell Away']} (Win: {row['Win Prob (Away)']}%, Draw: {row['Draw Prob (Away)']}%, Loss: {row['Loss Prob (Away)']}%)")

# Check the logistic regression equation by hand against the fitted model
print("\n=== Testing Logistic Regression Equation with Real Values ===")

# Get a real match example
test_match = {
    'home_team': 'Bucknell',
    'away_team': 'Lafayette',  # Replace with an actual opponent
    'home_team_id': team_to_id['Bucknell'],
    'away_team_id': team_to_id['Lafayette']
}

# Create feature vector for this match
test_features = create_feature_vector(test_match)

# The logistic regression was trained on standardized inputs, so its
# coefficients only make sense when multiplied by standardized values.
# Everything below therefore works with `scaled_row`, not the raw features.
scaled_features = scaler.transform([test_features])
scaled_row = scaled_features[0]

# Get predicted probabilities from the model itself (the reference answer)
lr_probs = lr_model.predict_proba(scaled_features)[0]

print(f"\nTest Match: {test_match['home_team']} vs {test_match['away_team']}")
print("Feature Values Used in Equation:")
for feature, value, scaled_value in zip(feature_names, test_features, scaled_row):
    print(f"{feature}: {value:.4f} (standardized: {scaled_value:.4f})")

print("\nLogistic Regression Model Prediction Probabilities:")
print(f"Home Win: {lr_probs[0]:.4f} ({lr_probs[0]*100:.1f}%)")
print(f"Draw: {lr_probs[1]:.4f} ({lr_probs[1]*100:.1f}%)")
print(f"Away Win: {lr_probs[2]:.4f} ({lr_probs[2]*100:.1f}%)")

# Manual calculation for demonstration
print("\nManual Calculation of Home Win Probability:")
home_win_formula = lr_formulas['Home Win']
print(f"Formula: {home_win_formula}")

# Home-win logit: intercept plus coefficient * standardized feature value
manual_calc = lr_model.intercept_[0]
print(f"Starting with intercept: {manual_calc:.4f}")

for feature, coeff, feature_val in zip(feature_names, lr_model.coef_[0], scaled_row):
    if abs(coeff) >= 0.01:  # Skip near-zero coefficients
        term_value = coeff * feature_val
        print(f"  + {coeff:.4f} * {feature} ({feature_val:.4f}) = {term_value:.4f}")
        manual_calc += term_value

print(f"Raw logit score for Home Win: {manual_calc:.4f}")

# Logits for the other two classes, computed in full on the same standardized row
logits = np.array([manual_calc, 0.0, 0.0])
for i in range(1, 3):
    logits[i] = lr_model.intercept_[i] + np.dot(lr_model.coef_[i], scaled_row)

# Softmax over the three logits. Subtracting the maximum first leaves the
# result unchanged but keeps exp() from overflowing on large scores.
exp_logits = np.exp(logits - np.max(logits))
probs = exp_logits / np.sum(exp_logits)

# These should agree with the model's probabilities above, up to the small
# terms skipped in the home-win sum
print("Calculated probabilities using manual formula:")
print(f"Home Win: {probs[0]:.4f} ({probs[0]*100:.1f}%)")
print(f"Draw: {probs[1]:.4f} ({probs[1]*100:.1f}%)")
print(f"Away Win: {probs[2]:.4f} ({probs[2]*100:.1f}%)")

Total matches after filtering team '0': 181
Corrected Teams in dataset: ['American', 'Army West Point', 'Boston University', 'Bucknell', 'Colgate', 'Holy Cross', 'Lafayette', 'Lehigh', 'Loyola Maryland', 'Navy']
Total number of teams: 10
Bucknell matches: 37

Training models with enhanced features...

Random Forest Cross-validation accuracy: 0.6933
Gradient Boosting Cross-validation accuracy: 0.7000
Using Gradient Boosting model for predictions (better performance)

Original Feature Importance:
1. H2H Home PPG: 0.4583
2. H2H Home Win Rate: 0.1482
3. Away Attack vs Home Defense: 0.0813
4. Home Attack vs Away Defense: 0.0731
5. Home Avg Shots: 0.0366
6. H2H Goal Diff per Game: 0.0342
7. Away Avg Shots: 0.0309
8. Away Team ID: 0.0209
9. Home Recent Form: 0.0148
10. Home PPG: 0.0107
11. Home Draw Rate: 0.0091
12. Home Goal Diff per Game: 0.0089
13. Home Avg Goals For: 0.0087
14. Home Avg Goals Against: 0.0074
15. Away Win Rate Away: 0.0057

Logistic Regression Coefficients (for easier inte

In [2]:

# Export to excel - team-level stats and head-to-head stats go to separate files

# Column layout of the team workbook, in output order: (column, stats key, mode).
#   'required' -> stats[key]               (KeyError if the key is missing)
#   'optional' -> stats.get(key, 0)
#   'percent'  -> stats.get(key, 0) * 100  (rate stored as a fraction)
team_column_spec = (
    # Basic stats
    ('Matches_Played', 'matches_played', 'required'),
    ('Wins', 'wins', 'required'),
    ('Draws', 'draws', 'required'),
    ('Losses', 'losses', 'required'),
    ('Goals_For', 'goals_for', 'required'),
    ('Goals_Against', 'goals_against', 'required'),
    ('Goal_Diff', 'goal_diff', 'optional'),
    ('Points', 'points', 'optional'),
    ('PPG', 'ppg', 'optional'),
    # Home/Away breakdown
    ('Home_Matches', 'home_matches', 'required'),
    ('Away_Matches', 'away_matches', 'required'),
    ('Home_Wins', 'home_wins', 'required'),
    ('Home_Draws', 'home_draws', 'required'),
    ('Home_Losses', 'home_losses', 'required'),
    ('Away_Wins', 'away_wins', 'required'),
    ('Away_Draws', 'away_draws', 'required'),
    ('Away_Losses', 'away_losses', 'required'),
    # Advanced stats
    ('Win_Rate', 'win_rate', 'percent'),
    ('Draw_Rate', 'draw_rate', 'percent'),
    ('Loss_Rate', 'loss_rate', 'percent'),
    ('Home_Win_Rate', 'home_win_rate', 'percent'),
    ('Away_Win_Rate', 'away_win_rate', 'percent'),
    ('Home_PPG', 'home_ppg', 'optional'),
    ('Away_PPG', 'away_ppg', 'optional'),
    # Offensive/Defensive metrics
    ('Avg_Goals_For', 'avg_goals_for', 'optional'),
    ('Avg_Goals_Against', 'avg_goals_against', 'optional'),
    ('Home_Avg_Goals_For', 'home_avg_goals_for', 'optional'),
    ('Home_Avg_Goals_Against', 'home_avg_goals_against', 'optional'),
    ('Away_Avg_Goals_For', 'away_avg_goals_for', 'optional'),
    ('Away_Avg_Goals_Against', 'away_avg_goals_against', 'optional'),
    # Game stats
    ('Avg_Shots', 'avg_shots', 'optional'),
    ('Avg_Corners', 'avg_corners', 'optional'),
    ('Avg_Fouls', 'avg_fouls', 'optional'),
    # Form
    ('Form', 'form', 'percent'),
)

detailed_team_stats = []  # one row per team
h2h_stats_data = []       # one row per (team, opponent) pairing

for team in teams:
    # The '0' placeholder team is never exported
    if team == '0':
        continue

    team_record = team_stats[team]

    # Assemble this team's row (H2H figures are handled separately below)
    team_data = {'Team': team}
    for column, key, mode in team_column_spec:
        if mode == 'required':
            team_data[column] = team_record[key]
        elif mode == 'percent':
            team_data[column] = team_record.get(key, 0) * 100
        else:
            team_data[column] = team_record.get(key, 0)

    detailed_team_stats.append(team_data)

    # Head-to-head rows for the second Excel file
    team_h2h = h2h_stats.get(team, {})
    for opponent in teams:
        # No rows for the '0' team or for a team against itself
        if opponent in ('0', team):
            continue
        # Only pairings that have an H2H entry produce a row
        if opponent not in team_h2h:
            continue

        h2h = team_h2h[opponent]
        played = h2h['matches']

        # Per-match metrics; all fall back to 0 when no matches were played
        if played > 0:
            win_pct = h2h['wins'] / played * 100  # as percentage
            points_per_game = (h2h['wins'] * 3 + h2h['draws']) / played
            goal_diff_per_game = (h2h['goals_for'] - h2h['goals_against']) / played
        else:
            win_pct = points_per_game = goal_diff_per_game = 0

        h2h_stats_data.append({
            'Team': team,
            'Opponent': opponent,
            'Matches': played,
            'Wins': h2h['wins'],
            'Draws': h2h['draws'],
            'Losses': h2h['losses'],
            'Goals_For': h2h['goals_for'],
            'Goals_Against': h2h['goals_against'],
            'Win_Rate': win_pct,
            'PPG': points_per_game,
            'Goal_Diff_Per_Game': goal_diff_per_game,
        })

# Turn the collected rows into DataFrames
detailed_stats_df = pd.DataFrame(detailed_team_stats)
h2h_stats_df = pd.DataFrame(h2h_stats_data)

# Write each DataFrame to its own Excel file (no index column)
team_stats_filename = 'patriotleage_stats.xlsx'
h2h_stats_filename = 'patriot_league_h2h_statistics.xlsx'

for frame, filename in (
    (detailed_stats_df, team_stats_filename),
    (h2h_stats_df, h2h_stats_filename),
):
    frame.to_excel(filename, index=False)

print(f"\nTeam statistics exported to {team_stats_filename}")
print(f"Head-to-head statistics exported to {h2h_stats_filename}")


Team statistics exported to patriotleage_stats.xlsx
Head-to-head statistics exported to patriot_league_h2h_statistics.xlsx
