# Final Project
| Group 1:       |     |
| -------------- | --- |
| Student Names:             |UCIDs:|
| Samiul Haque               |30169845|
| Elias Poitras-Whitecalf    |30193066|
| Ryan Graham                |30171130|



### Imports


In [1]:
import kagglehub
import pandas as pd
import seaborn as sns
import numpy as np
import sklearn
import matplotlib
import warnings
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import sklearn.model_selection as sk
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from dateutil.relativedelta import relativedelta
from datetime import datetime
import os

# Blanket filter: no category is given, so this silences every warning for the
# rest of the session (the original intent was to quiet deprecation warnings).
warnings.filterwarnings('ignore')

# dataset_download returns the local directory holding the dataset files
# (a kagglehub cache folder in the recorded run).
dataset_dir = kagglehub.dataset_download("evangora/premier-league-data")

# Build the CSV path with os.path.join so the platform separator is used;
# concatenating '/matches.csv' produced mixed '\' and '/' on Windows.
path = os.path.join(dataset_dir, 'matches.csv')

print("Path to dataset files:", path)

Path to dataset files: C:\Users\samiu\.cache\kagglehub\datasets\evangora\premier-league-data\versions\1/matches.csv


In [2]:
# Read the match-level CSV located by the download cell into a DataFrame.
data = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows to sanity-check the column layout.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Season,Date,Home,xG,Home Goals,Away Goals,xG.1,Away,Attendance,Venue
0,0,2023/2024,2023-08-11,Burnley,0.3,0.0,3.0,1.9,Manchester City,21572.0,Turf Moor
1,1,2023/2024,2023-08-12,Arsenal,0.8,2.0,1.0,1.2,Nott'ham Forest,59984.0,Emirates Stadium
2,2,2023/2024,2023-08-12,Everton,2.7,0.0,1.0,1.5,Fulham,39940.0,Goodison Park
3,3,2023/2024,2023-08-12,Sheffield Utd,0.5,0.0,1.0,1.9,Crystal Palace,31194.0,Bramall Lane
4,4,2023/2024,2023-08-12,Brighton,4.0,4.0,1.0,1.5,Luton Town,31872.0,The American Express Community Stadium


### Data Cleaning

In [3]:
# Report missing values per column before anything is touched.
print(f"BEFORE DATA CLEANSING: \n {data.isnull().sum()}\n")

# Parse the date strings; values that cannot be parsed become NaT instead of raising.
data = data.assign(Date=pd.to_datetime(data['Date'], errors='coerce'))
print("Original dataset shape:", data.shape)

# The 30-year window is anchored on the newest match in the file, not on today's date.
max_date = data['Date'].max()
min_date = max_date - relativedelta(years=30)

# Rows whose Date is NaT never satisfy the comparison, so they are filtered out here as well.
within_window = data['Date'] >= min_date
data = data.loc[within_window]
print("\nFiltered dataset shape (last 30 years):", data.shape)

# Discard columns the models do not use, then drop matches with no recorded
# attendance, since Attendance is the prediction target.
unused_columns = ["Unnamed: 0", "Season", "xG", "xG.1"]
data = data.drop(columns=unused_columns).dropna(subset=['Attendance'])

print("\nAFTER DATA CLEANSING:")
print(f"\n {data.dtypes}")
print(f"\n {data.isnull().sum()}")

BEFORE DATA CLEANSING: 
 Unnamed: 0        0
Season            0
Date           9961
Home           9961
xG            57869
Home Goals     9961
Away Goals     9961
xG.1          57869
Away           9961
Attendance    49083
Venue         48123
dtype: int64

Original dataset shape: (60529, 11)

Filtered dataset shape (last 30 years): (11482, 11)

AFTER DATA CLEANSING:

 Date          datetime64[ns]
Home                  object
Home Goals           float64
Away Goals           float64
Away                  object
Attendance           float64
Venue                 object
dtype: object

 Date          0
Home          0
Home Goals    0
Away Goals    0
Away          0
Attendance    0
Venue         0
dtype: int64


In [4]:
# Split the match date into calendar features, then discard the raw timestamp
# so only model-ready columns remain.
match_date = data['Date'].dt
data = data.assign(
    Month=match_date.month,
    day_of_week=match_date.dayofweek,  # Monday=0 ... Sunday=6
    Year=match_date.year,
).drop(columns=['Date'])

print(f"\n {data.dtypes}")
data.head()


 Home            object
Home Goals     float64
Away Goals     float64
Away            object
Attendance     float64
Venue           object
Month            int32
day_of_week      int32
Year             int32
dtype: object


Unnamed: 0,Home,Home Goals,Away Goals,Away,Attendance,Venue,Month,day_of_week,Year
0,Burnley,0.0,3.0,Manchester City,21572.0,Turf Moor,8,4,2023
1,Arsenal,2.0,1.0,Nott'ham Forest,59984.0,Emirates Stadium,8,5,2023
2,Everton,0.0,1.0,Fulham,39940.0,Goodison Park,8,5,2023
3,Sheffield Utd,0.0,1.0,Crystal Palace,31194.0,Bramall Lane,8,5,2023
4,Brighton,4.0,1.0,Luton Town,31872.0,The American Express Community Stadium,8,5,2023


### TODO: each model needs scaling and encoding inside its pipeline
### TODO: add a parameter grid as well (GridSearchCV)

### KNN


In [None]:
#Sam

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Feature roles -----------------------------------------------------------
# Month and day_of_week are stored as integers but are one-hot encoded as categories.
categorical_features = ['Home', 'Away', 'Venue', 'Month', 'day_of_week']

# Every remaining column that is neither categorical nor the target is scaled as numeric.
# NOTE(review): with the columns shown after cleaning this resolves to Home Goals,
# Away Goals and Year; the goal columns are match results -- confirm they are
# meant to be predictors of attendance.
_non_numeric = categorical_features + ['Attendance']
numerical_features = [name for name in data.columns if name not in _non_numeric]

# --- Target / feature split and 80/20 hold-out -------------------------------
y = data['Attendance']
X = data.drop(columns=['Attendance'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Preprocessing: scale numeric columns, one-hot encode categorical ones ---
# handle_unknown='ignore' lets categories absent from the training rows be
# encoded as all zeros at prediction time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# --- Model: preprocessing followed by a KNN regressor ------------------------
knn_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('knn', KNeighborsRegressor()),
    ]
)

# Grid keys use the '<step name>__<parameter>' convention of Pipeline.
param_grid = dict(
    knn__n_neighbors=[3, 5, 7, 10],
    knn__weights=['uniform', 'distance'],
    knn__metric=['euclidean', 'manhattan'],
)

# 5-fold cross-validated search on the training split, ranked by R^2.
grid_search = GridSearchCV(
    estimator=knn_pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# best_estimator_ is the winning pipeline refit on the full training split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# NOTE(review): if weights='distance' is selected, each training row is its own
# zero-distance neighbour, so the training R^2 tends to be ~1.0 and is not
# informative; judge the model on the held-out metrics.
for label, value in (
    ("Best Parameters:", grid_search.best_params_),
    ("Training R^2 Score:", best_model.score(X_train, y_train)),
    ("R^2 Score:", r2_score(y_test, y_pred)),
    ("MAE:", mean_absolute_error(y_test, y_pred)),
    ("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred))),
):
    print(label, value)


Best Parameters: {'knn__metric': 'manhattan', 'knn__n_neighbors': 10, 'knn__weights': 'distance'}
Training R^2 Score: 1.0
R^2 Score: 0.8873952166785928
MAE: 2872.6951132063477
RMSE: 4629.221017639981


### Gradient Boosted Forest

In [None]:
# TODO(Ryan): implement the gradient boosting model in this cell.

### Linear Regression

In [None]:
# TODO(Elias): implement the linear regression model in this cell.