In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

print("========== LOADING DATA ==========")
# Load the match data from excel file
df = pd.read_excel('match_data.xlsx', sheet_name='Match Stats')

# Data exploration
print("\n========== DATASET OVERVIEW ==========")
print("Total matches in dataset:", len(df))
teams = sorted(set(df['home_team'].unique()) | set(df['away_team'].unique()))
print("Teams in dataset:", teams)
print(f"Total teams: {len(teams)}")
print("=======================================")

# Data preprocessing
print("\n========== DATA PREPROCESSING ==========")
# Convert date format if needed
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Create result column
df['Result'] = np.where(df['goals_home'] > df['goals_away'], 'W',
                        np.where(df['goals_home'] < df['goals_away'], 'L', 'D'))

# Compute additional statistics
df['goal_diff'] = df['goals_home'] - df['goals_away']
df['total_goals'] = df['goals_home'] + df['goals_away']
df['home_shot_efficiency'] = np.where(df['shots_home'] > 0, df['goals_home'] / df['shots_home'], 0)
df['away_shot_efficiency'] = np.where(df['shots_away'] > 0, df['goals_away'] / df['shots_away'], 0)
print("Preprocessing complete.")
print("========================================")

# Create team encodings
team_to_id = {team: i for i, team in enumerate(teams)}
df['home_team_id'] = df['home_team'].map(team_to_id)
df['away_team_id'] = df['away_team'].map(team_to_id)

print("\n========== CALCULATING TEAM STATISTICS ==========")
# Create team statistics based on previous matches
team_stats = {}

# Initialize team stats
for team in teams:
    team_stats[team] = {
        'matches_played': 0,
        'wins': 0,
        'draws': 0,
        'losses': 0,
        'goals_for': 0,
        'goals_against': 0,
        'shots': 0,
        'corners': 0,
        'fouls': 0
    }

# Calculate team stats from match data
for _, row in df.iterrows():
    home_team = row['home_team']
    away_team = row['away_team']

    # Update home team stats
    team_stats[home_team]['matches_played'] += 1
    if row['Result'] == 'W':
        team_stats[home_team]['wins'] += 1
    elif row['Result'] == 'D':
        team_stats[home_team]['draws'] += 1
    else:
        team_stats[home_team]['losses'] += 1

    team_stats[home_team]['goals_for'] += row['goals_home']
    team_stats[home_team]['goals_against'] += row['goals_away']
    team_stats[home_team]['shots'] += row['shots_home']
    team_stats[home_team]['corners'] += row['corners_home']
    team_stats[home_team]['fouls'] += row['fouls_home']

    # Update away team stats
    team_stats[away_team]['matches_played'] += 1
    if row['Result'] == 'L':
        team_stats[away_team]['wins'] += 1
    elif row['Result'] == 'D':
        team_stats[away_team]['draws'] += 1
    else:
        team_stats[away_team]['losses'] += 1

    team_stats[away_team]['goals_for'] += row['goals_away']
    team_stats[away_team]['goals_against'] += row['goals_home']
    team_stats[away_team]['shots'] += row['shots_away']
    team_stats[away_team]['corners'] += row['corners_away']
    team_stats[away_team]['fouls'] += row['fouls_away']

# Calculate averages and derived statistics
for team in team_stats:
    stats = team_stats[team]
    matches = stats['matches_played']

    if matches > 0:
        stats['points'] = stats['wins'] * 3 + stats['draws']
        stats['ppg'] = stats['points'] / matches
        stats['goal_diff'] = stats['goals_for'] - stats['goals_against']
        stats['avg_goals_for'] = stats['goals_for'] / matches
        stats['avg_goals_against'] = stats['goals_against'] / matches
        stats['avg_shots'] = stats['shots'] / matches
        stats['avg_corners'] = stats['corners'] / matches
        stats['avg_fouls'] = stats['fouls'] / matches
        stats['win_rate'] = stats['wins'] / matches
    else:
        stats['points'] = 0
        stats['ppg'] = 0
        stats['goal_diff'] = 0
        stats['avg_goals_for'] = 0
        stats['avg_goals_against'] = 0
        stats['avg_shots'] = 0
        stats['avg_corners'] = 0
        stats['avg_fouls'] = 0
        stats['win_rate'] = 0
print("Team statistics calculated.")
print("=================================================")

# Create team statistics dataframe
team_stats_df = pd.DataFrame([
    {
        'Team': team,
        'Matches': stats['matches_played'],
        'Wins': stats['wins'],
        'Draws': stats['draws'],
        'Losses': stats['losses'],
        'GF': stats['goals_for'],
        'GA': stats['goals_against'],
        'GD': stats['goal_diff'],
        'Points': stats['points'],
        'PPG': stats['ppg'],
        'Win Rate': stats['win_rate'],
        'Avg GF': stats['avg_goals_for'],
        'Avg GA': stats['avg_goals_against']
    }
    for team, stats in team_stats.items()
]).sort_values('Points', ascending=False)

# Add team rank column
team_stats_df['Rank'] = range(1, len(team_stats_df) + 1)
team_stats_df = team_stats_df[['Rank', 'Team', 'Matches', 'Wins', 'Draws', 'Losses', 'Points', 'PPG', 'GF', 'GA', 'GD', 'Win Rate']]

print("\n========== CURRENT TEAM STANDINGS ==========")
print(team_stats_df)
print("============================================")

# Feature engineering for match prediction
print("\n========== TRAINING PREDICTION MODEL ==========")
def create_feature_vector(row):
    home_team = row['home_team']
    away_team = row['away_team']

    home_stats = team_stats[home_team]
    away_stats = team_stats[away_team]

    # Create feature vector with team performance metrics
    features = [
        row['home_team_id'],
        row['away_team_id'],
        home_stats['avg_goals_for'],
        home_stats['avg_goals_against'],
        away_stats['avg_goals_for'],
        away_stats['avg_goals_against'],
        home_stats['ppg'],
        away_stats['ppg'],
        home_stats['avg_shots'],
        away_stats['avg_shots'],
        home_stats['avg_corners'],
        away_stats['avg_corners'],
        home_stats['win_rate'],
        away_stats['win_rate']
    ]

    return features

# Prepare training data
X = np.array([create_feature_vector(row) for _, row in df.iterrows()])
y = np.array([0 if r == 'W' else 1 if r == 'D' else 2 for r in df['Result']])

# Scale the features (important for logistic regression)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create a custom feature weight multiplier (adjust these to emphasize different features)
feature_weights = np.ones(X.shape[1])
# Example: Make shots more important by multiplying by 3
feature_weights[8] = 3.0  # Home Avg Shots
feature_weights[9] = 3.0  # Away Avg Shots
# Example: Make goals against more important
feature_weights[3] = 2.0  # Home Avg Goals Against
feature_weights[5] = 2.0  # Away Avg Goals Against

# Apply weights to scaled features
X_weighted = X_scaled.copy()
for i, weight in enumerate(feature_weights):
    X_weighted[:, i] *= weight

# Train a Logistic Regression classifier
model = LogisticRegression(
    class_weight='balanced',
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
    random_state=42
)
model.fit(X_weighted, y)
print("Model training complete.")
print("==============================================")

# Evaluate the model
cv_scores = cross_val_score(model, X_weighted, y, cv=5)
print("\n========== MODEL ACCURACY ==========")
print(f"Cross-validation accuracy: {np.mean(cv_scores):.4f}")
print("====================================")

# Feature names for reference
feature_names = [
    'Home Team ID', 'Away Team ID',
    'Home Avg Goals For', 'Home Avg Goals Against',
    'Away Avg Goals For', 'Away Avg Goals Against',
    'Home PPG', 'Away PPG',
    'Home Avg Shots', 'Away Avg Shots',
    'Home Avg Corners', 'Away Avg Corners',
    'Home Win Rate', 'Away Win Rate'
]

print("\n========== MODEL COEFFICIENTS ==========")
# For each class (Home Win, Draw, Away Win)
outcomes = ['Home Win', 'Draw', 'Away Win']
for i, outcome in enumerate(outcomes):
    print(f"\nCoefficients for {outcome}:")
    # Sort coefficients by absolute magnitude
    coefs = model.coef_[i]
    sorted_indices = np.argsort(np.abs(coefs))[::-1]

    for idx in sorted_indices:
        sign = '+' if coefs[idx] > 0 else '-'
        print(f"{sign} {abs(coefs[idx]):.4f} × {feature_names[idx]}")

    # Intercept for this class
    print(f"Intercept: {model.intercept_[i]:.4f}")

# Create a visualization of the coefficients
plt.figure(figsize=(15, 10))
for i, outcome in enumerate(outcomes):
    plt.subplot(3, 1, i+1)
    sorted_indices = np.argsort(model.coef_[i])
    plt.barh([feature_names[j] for j in sorted_indices], model.coef_[i][sorted_indices])
    plt.title(f'Feature Coefficients for {outcome}')
    plt.xlabel('Coefficient Value')
plt.tight_layout()
plt.savefig('logistic_regression_coefficients.png')

print("\n========== PREDICTING FUTURE MATCHES ==========")
# Generate all possible matchups for a single round-robin tournament
fixtures = []
for i, home_team in enumerate(teams):
    for away_team in teams[i+1:]:  # Each team plays each other once
        # Randomly assign home/away (50/50 chance)
        if np.random.random() > 0.5:
            fixtures.append((home_team, away_team))
        else:
            fixtures.append((away_team, home_team))

print(f"Generated {len(fixtures)} fixtures for the tournament.")

# Make predictions for all fixtures
match_predictions = []
for home_team, away_team in fixtures:
    # Create feature vector
    features = create_feature_vector({
        'home_team': home_team,
        'away_team': away_team,
        'home_team_id': team_to_id[home_team],
        'away_team_id': team_to_id[away_team]
    })

    # Scale and weight features the same way as during training
    features_scaled = scaler.transform([features])[0]
    features_weighted = features_scaled.copy()
    for i, weight in enumerate(feature_weights):
        features_weighted[i] *= weight

    # Predict match outcome
    pred = model.predict([features_weighted])[0]
    probs = model.predict_proba([features_weighted])[0]

    # Determine result and points
    if pred == 0:  # Home win
        result = f"{home_team} Win"
        home_points = 3
        away_points = 0
    elif pred == 1:  # Draw
        result = "Draw"
        home_points = 1
        away_points = 1
    else:  # Away win
        result = f"{away_team} Win"
        home_points = 0
        away_points = 3

    # Add to predictions
    match_predictions.append({
        'Home': home_team,
        'Away': away_team,
        'Prediction': result,
        'Home Win Prob': round(probs[0] * 100, 1),
        'Draw Prob': round(probs[1] * 100, 1),
        'Away Win Prob': round(probs[2] * 100, 1),
        'Home Points': home_points,
        'Away Points': away_points
    })

print("Predictions complete.")
print("=============================================")

# Continue with the rest of your code for displaying/analyzing predictions


Total matches in dataset: 59


TypeError: '<' not supported between instances of 'str' and 'int'