In [281]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [282]:
# Read the raw match records from the 'Sheet2' worksheet of the workbook
df = pd.read_excel(io='match_data.xlsx', sheet_name='Sheet2')


In [283]:
# Data preprocessing
# Parse the 'Date' column into datetimes; errors='coerce' turns any value
# that cannot be parsed into NaT instead of raising.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
df['Date'] = parsed_dates

In [284]:
# Integer-encode the categorical columns via pandas' category codes.
# Each column is encoded on its own, so home_team_code and away_team_code
# are separate code spaces.
categorical_sources = {
    "home_team_code": "home_team",
    "away_team_code": "away_team",
    "stadium_code": "Stadium",
}
for code_col, source_col in categorical_sources.items():
    df[code_col] = df[source_col].astype("category").cat.codes

# Day of the week taken from the parsed match date
df["day_of_week"] = df["Date"].dt.dayofweek

In [285]:
# Create the match result from the home team's perspective:
#   'W' - home scored more, 'L' - home scored fewer, 'D' - otherwise.
# Vectorised with np.select instead of a row-by-row iterrows() loop. Rows
# where neither strict inequality holds (equal scores, or a missing score,
# since comparisons with NaN are False) fall through to the 'D' default,
# which is exactly what the loop's final `else` branch did.
home_goals = df['goals_home']
away_goals = df['goals_away']

df["Result"] = np.select(
    [home_goals > away_goals, home_goals < away_goals],
    ['W', 'L'],
    default='D',
)

In [286]:
# Numeric class labels for the model: home win -> 1, home loss -> 0, draw -> 2
result_to_target = {"W": 1, "L": 0, "D": 2}
df["target"] = df["Result"].map(result_to_target)

In [287]:
# Per-team average of goals scored when playing at home.
# NOTE(review): the column is called 'home_ppg', but the value is mean goals,
# not points per game.
# NOTE(review): the mean is taken over every row of df, i.e. before the
# train/test split, and goals are what the target is derived from.
home_stats = (
    df.groupby('home_team')['goals_home']
    .mean()
    .rename_axis('team')
    .reset_index(name='home_ppg')
)

In [288]:
# Per-team average of goals scored when playing away.
# NOTE(review): 'away_ppg' holds mean goals, not points per game, and is
# computed over every row of df (before the train/test split).
away_stats = (
    df.groupby('away_team')['goals_away']
    .mean()
    .rename_axis('team')
    .reset_index(name='away_ppg')
)

In [289]:
# Attach the per-team averages to every match row.
# Any stat columns left over from a previous run of this cell are dropped
# first: without that, re-running the cell makes pandas suffix the clashing
# names (home_ppg_x / home_ppg_y) and the 'home_ppg' / 'away_ppg' predictors
# disappear.
df = df.drop(columns=['home_ppg', 'away_ppg'], errors='ignore')

# Rename the stats key to the match column it joins on, so the merge uses a
# single shared key and no throwaway 'team' columns are added to df.
# how='left' keeps every match row, in its original order.
df = df.merge(home_stats.rename(columns={'team': 'home_team'}), on='home_team', how='left')
df = df.merge(away_stats.rename(columns={'team': 'away_team'}), on='away_team', how='left')

In [290]:
# Remove the helper key columns left behind by the merges, if they exist
# (errors='ignore' makes this a no-op when they are absent).
df = df.drop(columns=['team_x', 'team_y'], errors='ignore')

In [291]:
# Feature columns fed to the model, in the order the model sees them
predictors = [
    "home_team_code",
    "away_team_code",
    "stadium_code",
    "day_of_week",
    "home_ppg",
    "away_ppg",
]

In [292]:
# Split data into training and testing sets
# Since all data is from Sep-Oct 2024, use a random split instead of date-based
from sklearn.model_selection import train_test_split

In [293]:
# Use 70% for training, 15% for testing, 15% for future predictions
# Two-stage random split: hold out 30% of the rows, then halve that hold-out
# into a test set and a "future" set. Both stages use the same fixed seed.
# Note that future_data is therefore a random subset of rows from df.
split_seed = 42
train_data, temp_data = train_test_split(df, test_size=0.3, random_state=split_seed)
test_data, future_data = train_test_split(temp_data, test_size=0.5, random_state=split_seed)

In [294]:
# Configure the random forest: 1000 trees, depth capped at 100, at least 5
# samples needed to split a node, class weights balanced, fixed seed.
rf_params = dict(
    n_estimators=1000,
    max_depth=100,
    min_samples_split=5,
    class_weight='balanced',
    random_state=100,
)
rf = RandomForestClassifier(**rf_params)

# Fit on the training rows only
rf.fit(train_data[predictors], train_data["target"])

In [295]:
# Predict the result class for every row of the test set
test_features = test_data[predictors]
preds = rf.predict(test_features)

In [296]:
# Share of test rows whose predicted class equals the actual class
accuracy = accuracy_score(y_true=test_data["target"], y_pred=preds)
print(f"Model accuracy: {accuracy:.4f}")

Model accuracy: 0.6667


In [297]:
# Cross-tabulate actual vs. predicted class codes for the test set
combined = pd.DataFrame({"actual": test_data["target"], "predicted": preds})
confusion_matrix = pd.crosstab(combined["actual"], combined["predicted"])

print("\nConfusion Matrix:")
print(confusion_matrix)


Confusion Matrix:
predicted  0  1
actual         
0          2  0
1          1  2
2          1  0


In [298]:
# Make predictions for the held-out "future" matches
future_predictions = rf.predict(future_data[predictors])

# future_data was produced by train_test_split, so it is derived from df;
# assigning a column on it in place can raise pandas' SettingWithCopyWarning.
# assign() returns a new frame with the extra column and avoids the warning.
future_data = future_data.assign(prediction=future_predictions)

In [299]:
# Teams whose predicted results are analysed below
# (names must match the values used in the home_team / away_team columns)
team1, team2, team3 = "Bucknell", "Lehigh", "Loyola Maryland"

In [300]:
# For each team, keep the rows of future_data in which it appears as either
# the home side or the away side.
team1_matches, team2_matches, team3_matches = (
    future_data[future_data[['home_team', 'away_team']].eq(team).any(axis=1)]
    for team in (team1, team2, team3)
)

In [301]:
# Function to calculate points based on predictions
def calculate_points(team_matches, team_name):
    """Total the league points the predictions award to ``team_name``.

    Parameters
    ----------
    team_matches : pandas.DataFrame
        Rows to score; each row must provide 'home_team' and 'prediction'.
        Prediction codes: 1 = home side wins, 0 = away side wins, 2 = draw.
    team_name : str
        The team being scored. On any row where it is not the home side it is
        treated as the away side, so callers should pass only that team's
        matches.

    Returns
    -------
    int
        3 points per predicted win for the team, 1 per predicted draw and
        0 for anything else.
    """
    total = 0
    for _, fixture in team_matches.iterrows():
        outcome = fixture['prediction']

        # A draw is worth one point no matter which side the team is on
        if outcome == 2:
            total += 1
            continue

        # The code that counts as a win depends on the side: 1 at home, 0 away
        winning_code = 1 if fixture['home_team'] == team_name else 0
        if outcome == winning_code:
            total += 3

    return total

In [302]:
# Score each team's predicted matches
team1_points, team2_points, team3_points = (
    calculate_points(matches, team)
    for matches, team in (
        (team1_matches, team1),
        (team2_matches, team2),
        (team3_matches, team3),
    )
)

In [303]:
# Points each team already has in the standings.
# These are placeholders: replace them with the actual current points from
# your league standings.
team1_current_points, team2_current_points, team3_current_points = 10, 12, 9

In [304]:
# Report the points each team is predicted to collect
print(f"\nProjected points from remaining matches:")
for label, projected in ((team1, team1_points), (team2, team2_points), (team3, team3_points)):
    print(f"{label}: {projected} points")


Projected points from remaining matches:
Bucknell: 0 points
Lehigh: 0 points
Loyola Maryland: 3 points


In [305]:
# Current points plus projected points, per team
print(f"\nProjected final standings:")
final_totals = (
    (team1, team1_current_points + team1_points),
    (team2, team2_current_points + team2_points),
    (team3, team3_current_points + team3_points),
)
for label, total in final_totals:
    print(f"{label}: {total} points")


Projected final standings:
Bucknell: 10 points
Lehigh: 12 points
Loyola Maryland: 12 points


In [306]:
# Show up to ten rows of future_data with their raw prediction codes
# 1 means home team wins, 0 means away team wins, 2 means draw
detail_columns = ['Date', 'home_team', 'away_team', 'prediction']
print("\nDetailed predictions for future matches:")
print(future_data[detail_columns].head(10))


Detailed predictions for future matches:
         Date   home_team        away_team  prediction
12 2024-10-05     Colgate  Loyola Maryland           2
6  2024-09-25        Navy  Loyola Maryland           1
25 2024-10-19  Holy Cross  Loyola Maryland           2
30 2024-10-26     Colgate  Army West Point           1
39 2024-09-21    Bucknell          Colgate           0
8  2024-09-28   Lafayette  Loyola Maryland           2
13 2024-10-05    American        Lafayette           0


In [307]:
# Map prediction codes to result labels for better readability
prediction_map = {1: "Home Win", 0: "Away Win", 2: "Draw"}

# future_data originates from train_test_split, so adding a column to it in
# place can raise pandas' SettingWithCopyWarning. assign() builds a new frame
# with the extra column instead.
future_data = future_data.assign(
    prediction_result=future_data['prediction'].map(prediction_map)
)

print("\nUpcoming matches with readable predictions:")
print(future_data[['Date', 'home_team', 'away_team', 'prediction_result']].head(10))


Upcoming matches with readable predictions:
         Date   home_team        away_team prediction_result
12 2024-10-05     Colgate  Loyola Maryland              Draw
6  2024-09-25        Navy  Loyola Maryland          Home Win
25 2024-10-19  Holy Cross  Loyola Maryland              Draw
30 2024-10-26     Colgate  Army West Point          Home Win
39 2024-09-21    Bucknell          Colgate          Away Win
8  2024-09-28   Lafayette  Loyola Maryland              Draw
13 2024-10-05    American        Lafayette          Away Win
