In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [44]:
# Load the match data from excel file
df = pd.read_excel('match_data.xlsx', sheet_name='Sheet2')

In [45]:
# Data preprocessing
# Convert date to datetime format
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Create encodings for categorical variables
df["home_team_code"] = df["home_team"].astype("category").cat.codes
df["away_team_code"] = df["away_team"].astype("category").cat.codes
df["stadium_code"] = df["Stadium"].astype("category").cat.codes
df["day_of_week"] = df["Date"].dt.dayofweek

In [46]:
# Create target variable based on goals scored
result_list = []
for idx, row in df.iterrows():
    if row['goals_home'] > row['goals_away']:
        result_list.append('W')
    elif row['goals_home'] < row['goals_away']:
        result_list.append('L')
    else:
        result_list.append('D')

df["Result"] = result_list

# Map results to target values
df["target"] = df["Result"].map({"W": 1, "L": 0, "D": 2})

In [47]:
# Calculate home and away points per game
# Group by home team and calculate average goals
home_stats = df.groupby('home_team')['goals_home'].mean().reset_index()
home_stats.columns = ['team', 'home_ppg']

# Group by away team and calculate average goals
away_stats = df.groupby('away_team')['goals_away'].mean().reset_index()
away_stats.columns = ['team', 'away_ppg']

# Merge with the main dataframe
df = pd.merge(df, home_stats, left_on='home_team', right_on='team', how='left')
df = pd.merge(df, away_stats, left_on='away_team', right_on='team', how='left')

In [48]:
# Drop the unnecessary columns
df.drop(['team_x', 'team_y'], axis=1, inplace=True, errors='ignore')

In [49]:
# Define predictors (features) for the model
predictors = ["home_team_code", "away_team_code", "stadium_code", "day_of_week",
              "home_ppg", "away_ppg"]

# Split data using all non-Bucknell matches for training and keep Bucknell matches for prediction
bucknell_matches = df[(df['home_team'] == 'Bucknell') | (df['away_team'] == 'Bucknell')]
other_matches = df[(df['home_team'] != 'Bucknell') & (df['away_team'] != 'Bucknell')]

print(f"Total Bucknell matches found: {len(bucknell_matches)}")

Total Bucknell matches found: 6


In [50]:
# Use other matches for training
training_data, testing_data = train_test_split(other_matches, test_size=0.2, random_state=42)

In [51]:
# Initialize and train the model
rf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=15,
    min_samples_split=2,
    class_weight='balanced',
    random_state=42
)
rf.fit(training_data[predictors], training_data["target"])

In [52]:
# Evaluate model accuracy
test_preds = rf.predict(testing_data[predictors])
accuracy = accuracy_score(testing_data["target"], test_preds)
print(f"\nModel accuracy on non-Bucknell matches: {accuracy:.4f}")


Model accuracy on non-Bucknell matches: 0.1250


In [53]:
# Make predictions for Bucknell matches
bucknell_predictions = rf.predict(bucknell_matches[predictors])
bucknell_matches.loc[:, "prediction"] = bucknell_predictions

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bucknell_matches.loc[:, "prediction"] = bucknell_predictions


In [54]:
# Map prediction codes to result labels for better readability
prediction_map = {1: "Home Win", 0: "Away Win", 2: "Draw"}
bucknell_matches.loc[:, 'prediction_result'] = bucknell_matches['prediction'].map(prediction_map)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bucknell_matches.loc[:, 'prediction_result'] = bucknell_matches['prediction'].map(prediction_map)


In [55]:
# Display actual vs predicted results for Bucknell
bucknell_matches.loc[:, 'actual_result'] = bucknell_matches['Result'].map({"W": "Home Win", "L": "Away Win", "D": "Draw"})

print("\nBucknell matches with predictions:")
print(bucknell_matches[['Date', 'home_team', 'away_team', 'goals_home', 'goals_away',
                        'actual_result', 'prediction_result']])



Bucknell matches with predictions:
         Date home_team        away_team  goals_home  goals_away  \
14 2024-10-11      Navy         Bucknell           1           0   
19 2024-10-16  Bucknell           Loyola           0           1   
32 2024-10-30  Bucknell           Lehigh           0           2   
37 2024-10-05  Bucknell       Holy Cross           0           1   
39 2024-09-21  Bucknell          Colgate           0           3   
41 2024-09-28  Bucknell  Army West Point           1           0   

   actual_result prediction_result  
14      Home Win          Home Win  
19      Away Win              Draw  
32      Away Win              Draw  
37      Away Win              Draw  
39      Away Win              Draw  
41      Home Win              Draw  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bucknell_matches.loc[:, 'actual_result'] = bucknell_matches['Result'].map({"W": "Home Win", "L": "Away Win", "D": "Draw"})


In [56]:
# Calculate predicted points for Bucknell
bucknell_points = 0
for _, match in bucknell_matches.iterrows():
    is_home = match['home_team'] == 'Bucknell'
    prediction = match['prediction']

    # Home team wins (1)
    if is_home and prediction == 1:
        bucknell_points += 3
    # Away team wins (0)
    elif not is_home and prediction == 0:
        bucknell_points += 3
    # Draw (2)
    elif prediction == 2:
        bucknell_points += 1

print(f"\nPredicted points for Bucknell: {bucknell_points}")


Predicted points for Bucknell: 5


In [57]:
# Calculate home/away win rates
home_wins = len(bucknell_matches[(bucknell_matches['home_team'] == 'Bucknell') &
                                 (bucknell_matches['prediction'] == 1)])
away_wins = len(bucknell_matches[(bucknell_matches['away_team'] == 'Bucknell') &
                                 (bucknell_matches['prediction'] == 0)])
draws = len(bucknell_matches[bucknell_matches['prediction'] == 2])

total_home_matches = len(bucknell_matches[bucknell_matches['home_team'] == 'Bucknell'])
total_away_matches = len(bucknell_matches[bucknell_matches['away_team'] == 'Bucknell'])

print(f"\nBucknell home win rate: {home_wins}/{total_home_matches if total_home_matches > 0 else 'No home matches'}")
print(f"Bucknell away win rate: {away_wins}/{total_away_matches if total_away_matches > 0 else 'No away matches'}")
print(f"Bucknell draws: {draws}/{len(bucknell_matches)}")


Bucknell home win rate: 0/5
Bucknell away win rate: 0/1
Bucknell draws: 5/6


In [58]:
# Compare predictions to actual results if available
matches_with_results = bucknell_matches[bucknell_matches['Result'].notna()]
if len(matches_with_results) > 0:
    correct_predictions = sum(matches_with_results['prediction'] == matches_with_results['target'])
    print(f"\nAccuracy for Bucknell matches: {correct_predictions}/{len(matches_with_results)} " +
          f"({correct_predictions/len(matches_with_results):.2%})")


Accuracy for Bucknell matches: 1/6 (16.67%)


In [59]:
# Print detailed match-by-match predictions
print("\nDetailed Bucknell match predictions:")
for _, match in bucknell_matches.iterrows():
    date = match['Date'].strftime('%Y-%m-%d')
    if match['home_team'] == 'Bucknell':
        opponent = match['away_team']
        location = "Home"
    else:
        opponent = match['home_team']
        location = "Away"

    prediction = match['prediction_result']

    print(f"{date} | Bucknell vs {opponent} ({location}) | Prediction: {prediction}")


Detailed Bucknell match predictions:
2024-10-11 | Bucknell vs Navy (Away) | Prediction: Home Win
2024-10-16 | Bucknell vs Loyola (Home) | Prediction: Draw
2024-10-30 | Bucknell vs Lehigh (Home) | Prediction: Draw
2024-10-05 | Bucknell vs Holy Cross (Home) | Prediction: Draw
2024-09-21 | Bucknell vs Colgate (Home) | Prediction: Draw
2024-09-28 | Bucknell vs Army West Point (Home) | Prediction: Draw
