In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv('training_dataset.csv')
test_df = pd.read_csv('testing_dataset.csv')

In [3]:
train_df.shape, test_df.shape

((15461, 258), (3866, 258))

In [4]:
# Print all column names of the train DataFrame
print(train_df.columns.tolist())

['home_team_id', 'away_team_id', 'year', 'home_team_goal', 'away_team_goal', 'home_player_overall_rating_1', 'home_player_overall_rating_2', 'home_player_overall_rating_3', 'home_player_overall_rating_4', 'home_player_overall_rating_5', 'home_player_overall_rating_6', 'home_player_overall_rating_7', 'home_player_overall_rating_8', 'home_player_overall_rating_9', 'home_player_overall_rating_10', 'home_player_overall_rating_11', 'home_player_potential_1', 'home_player_potential_2', 'home_player_potential_3', 'home_player_potential_4', 'home_player_potential_5', 'home_player_potential_6', 'home_player_potential_7', 'home_player_potential_8', 'home_player_potential_9', 'home_player_potential_10', 'home_player_potential_11', 'home_player_crossing_1', 'home_player_crossing_2', 'home_player_crossing_3', 'home_player_crossing_4', 'home_player_crossing_5', 'home_player_crossing_6', 'home_player_crossing_7', 'home_player_crossing_8', 'home_player_crossing_9', 'home_player_crossing_10', 'home_pla

In [5]:
# 1. Home and Away Team Average Rating
home_rating_cols = [f'home_player_overall_rating_{i}' for i in range(1, 12)]
away_rating_cols = [f'away_player_overall_rating_{i}' for i in range(1, 12)]

train_df['home_team_avg_rating'] = train_df[home_rating_cols].mean(axis=1)
train_df['away_team_avg_rating'] = train_df[away_rating_cols].mean(axis=1)

test_df['home_team_avg_rating'] = test_df[home_rating_cols].mean(axis=1)
test_df['away_team_avg_rating'] = test_df[away_rating_cols].mean(axis=1)

# 2. Team Rating Difference
train_df['rating_difference'] = train_df['home_team_avg_rating'] - train_df['away_team_avg_rating']
test_df['rating_difference'] = test_df['home_team_avg_rating'] - test_df['away_team_avg_rating']

# 3. Average Player Potential
home_potential_cols = [f'home_player_potential_{i}' for i in range(1, 12)]
away_potential_cols = [f'away_player_potential_{i}' for i in range(1, 12)]

train_df['home_team_avg_potential'] = train_df[home_potential_cols].mean(axis=1)
train_df['away_team_avg_potential'] = train_df[away_potential_cols].mean(axis=1)

test_df['home_team_avg_potential'] = test_df[home_potential_cols].mean(axis=1)
test_df['away_team_avg_potential'] = test_df[away_potential_cols].mean(axis=1)

# 4. Team Potential Difference
train_df['potential_difference'] = train_df['home_team_avg_potential'] - train_df['away_team_avg_potential']
test_df['potential_difference'] = test_df['home_team_avg_potential'] - test_df['away_team_avg_potential']

# 5. Average Player Crossing Ability
home_crossing_cols = [f'home_player_crossing_{i}' for i in range(1, 12)]
away_crossing_cols = [f'away_player_crossing_{i}' for i in range(1, 12)]

train_df['home_team_avg_crossing'] = train_df[home_crossing_cols].mean(axis=1)
train_df['away_team_avg_crossing'] = train_df[away_crossing_cols].mean(axis=1)

test_df['home_team_avg_crossing'] = test_df[home_crossing_cols].mean(axis=1)
test_df['away_team_avg_crossing'] = test_df[away_crossing_cols].mean(axis=1)

# 6. Average Player Preferred Foot (Proportion Left-Footed)
home_left_foot_cols = [f'home_player_preferred_foot_left_{i}' for i in range(1, 12)]
away_left_foot_cols = [f'away_player_preferred_foot_left_{i}' for i in range(1, 12)]

train_df['home_team_left_foot_ratio'] = train_df[home_left_foot_cols].mean(axis=1)
train_df['away_team_left_foot_ratio'] = train_df[away_left_foot_cols].mean(axis=1)

test_df['home_team_left_foot_ratio'] = test_df[home_left_foot_cols].mean(axis=1)
test_df['away_team_left_foot_ratio'] = test_df[away_left_foot_cols].mean(axis=1)

In [6]:
train_df.shape, test_df.shape

((15461, 268), (3866, 268))

In [7]:
# Print all column names of the train DataFrame
print(train_df.columns.tolist())

['home_team_id', 'away_team_id', 'year', 'home_team_goal', 'away_team_goal', 'home_player_overall_rating_1', 'home_player_overall_rating_2', 'home_player_overall_rating_3', 'home_player_overall_rating_4', 'home_player_overall_rating_5', 'home_player_overall_rating_6', 'home_player_overall_rating_7', 'home_player_overall_rating_8', 'home_player_overall_rating_9', 'home_player_overall_rating_10', 'home_player_overall_rating_11', 'home_player_potential_1', 'home_player_potential_2', 'home_player_potential_3', 'home_player_potential_4', 'home_player_potential_5', 'home_player_potential_6', 'home_player_potential_7', 'home_player_potential_8', 'home_player_potential_9', 'home_player_potential_10', 'home_player_potential_11', 'home_player_crossing_1', 'home_player_crossing_2', 'home_player_crossing_3', 'home_player_crossing_4', 'home_player_crossing_5', 'home_player_crossing_6', 'home_player_crossing_7', 'home_player_crossing_8', 'home_player_crossing_9', 'home_player_crossing_10', 'home_pla

In [8]:
# 1. Home/Away Team High Attacking Work Rate Count
train_df['home_high_attacking_work_rate_count'] = train_df[[f'home_player_attacking_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)
train_df['away_high_attacking_work_rate_count'] = train_df[[f'away_player_attacking_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)

test_df['home_high_attacking_work_rate_count'] = test_df[[f'home_player_attacking_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)
test_df['away_high_attacking_work_rate_count'] = test_df[[f'away_player_attacking_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)

# 2. Home/Away Team High Defensive Work Rate Count
train_df['home_high_defensive_work_rate_count'] = train_df[[f'home_player_defensive_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)
train_df['away_high_defensive_work_rate_count'] = train_df[[f'away_player_defensive_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)

test_df['home_high_defensive_work_rate_count'] = test_df[[f'home_player_defensive_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)
test_df['away_high_defensive_work_rate_count'] = test_df[[f'away_player_defensive_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)

# 3. Home/Away Team Medium Attacking Work Rate Count
train_df['home_medium_attacking_work_rate_count'] = train_df[[f'home_player_attacking_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)
train_df['away_medium_attacking_work_rate_count'] = train_df[[f'away_player_attacking_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)

test_df['home_medium_attacking_work_rate_count'] = test_df[[f'home_player_attacking_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)
test_df['away_medium_attacking_work_rate_count'] = test_df[[f'away_player_attacking_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)

# 4. Home/Away Team Medium Defensive Work Rate Count
train_df['home_medium_defensive_work_rate_count'] = train_df[[f'home_player_defensive_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)
train_df['away_medium_defensive_work_rate_count'] = train_df[[f'away_player_defensive_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)

test_df['home_medium_defensive_work_rate_count'] = test_df[[f'home_player_defensive_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)
test_df['away_medium_defensive_work_rate_count'] = test_df[[f'away_player_defensive_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)

# 5. Home/Away Average Player Coordinates
train_df['home_avg_player_coordinate'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)
train_df['away_avg_player_coordinate'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)

test_df['home_avg_player_coordinate'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)
test_df['away_avg_player_coordinate'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)

In [9]:
# Average Coordinate Position for Home and Away Teams
train_df['home_avg_coordinate'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)
train_df['away_avg_coordinate'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)

test_df['home_avg_coordinate'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)
test_df['away_avg_coordinate'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)

# Coordinate Spread (Standard Deviation) for Home and Away Teams
train_df['home_coordinate_std'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].std(axis=1)
train_df['away_coordinate_std'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].std(axis=1)

test_df['home_coordinate_std'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].std(axis=1)
test_df['away_coordinate_std'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].std(axis=1)

# Coordinate Range (Max - Min) for Home and Away Teams
train_df['home_coordinate_range'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].max(axis=1) - train_df[[f'coordinates_{i}' for i in range(1, 12)]].min(axis=1)
train_df['away_coordinate_range'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].max(axis=1) - train_df[[f'coordinates_{i}' for i in range(1, 12)]].min(axis=1)

test_df['home_coordinate_range'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].max(axis=1) - test_df[[f'coordinates_{i}' for i in range(1, 12)]].min(axis=1)
test_df['away_coordinate_range'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].max(axis=1) - test_df[[f'coordinates_{i}' for i in range(1, 12)]].min(axis=1)

# Median Coordinate for Home and Away Teams
train_df['home_coordinate_median'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].median(axis=1)
train_df['away_coordinate_median'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].median(axis=1)

test_df['home_coordinate_median'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].median(axis=1)
test_df['away_coordinate_median'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].median(axis=1)

# Coordinate Quartiles for Home and Away Teams
train_df['home_coordinate_q1'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.25, axis=1)
train_df['home_coordinate_q3'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.75, axis=1)
train_df['away_coordinate_q1'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.25, axis=1)
train_df['away_coordinate_q3'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.75, axis=1)

test_df['home_coordinate_q1'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.25, axis=1)
test_df['home_coordinate_q3'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.75, axis=1)
test_df['away_coordinate_q1'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.25, axis=1)
test_df['away_coordinate_q3'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.75, axis=1)

# Coordinate Density (Proximity Count) for Home and Away Teams within 10-unit range
proximity_threshold = 10
train_df['home_coordinate_density'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].apply(lambda row: sum(abs(row - row.mean()) < proximity_threshold), axis=1)
train_df['away_coordinate_density'] = train_df[[f'coordinates_{i}' for i in range(1, 12)]].apply(lambda row: sum(abs(row - row.mean()) < proximity_threshold), axis=1)

test_df['home_coordinate_density'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].apply(lambda row: sum(abs(row - row.mean()) < proximity_threshold), axis=1)
test_df['away_coordinate_density'] = test_df[[f'coordinates_{i}' for i in range(1, 12)]].apply(lambda row: sum(abs(row - row.mean()) < proximity_threshold), axis=1)

# Coordinate Gradient (Difference in Average Position between Home and Away Teams)
train_df['coordinate_gradient'] = train_df['home_avg_coordinate'] - train_df['away_avg_coordinate']
test_df['coordinate_gradient'] = test_df['home_avg_coordinate'] - test_df['away_avg_coordinate']

In [10]:
train_df.shape, test_df.shape

((15461, 293), (3866, 293))

In [11]:
# Print all column names of the train DataFrame
print(train_df.columns.tolist())

['home_team_id', 'away_team_id', 'year', 'home_team_goal', 'away_team_goal', 'home_player_overall_rating_1', 'home_player_overall_rating_2', 'home_player_overall_rating_3', 'home_player_overall_rating_4', 'home_player_overall_rating_5', 'home_player_overall_rating_6', 'home_player_overall_rating_7', 'home_player_overall_rating_8', 'home_player_overall_rating_9', 'home_player_overall_rating_10', 'home_player_overall_rating_11', 'home_player_potential_1', 'home_player_potential_2', 'home_player_potential_3', 'home_player_potential_4', 'home_player_potential_5', 'home_player_potential_6', 'home_player_potential_7', 'home_player_potential_8', 'home_player_potential_9', 'home_player_potential_10', 'home_player_potential_11', 'home_player_crossing_1', 'home_player_crossing_2', 'home_player_crossing_3', 'home_player_crossing_4', 'home_player_crossing_5', 'home_player_crossing_6', 'home_player_crossing_7', 'home_player_crossing_8', 'home_player_crossing_9', 'home_player_crossing_10', 'home_pla

In [12]:
reduced_train_data = train_df[['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal',
                      'home_team_avg_rating', 'away_team_avg_rating', 'rating_difference', 'home_team_avg_potential',
                       'away_team_avg_potential', 'potential_difference', 'home_team_avg_crossing', 'away_team_avg_crossing',
                       'home_team_left_foot_ratio', 'away_team_left_foot_ratio', 'home_high_attacking_work_rate_count',
                       'away_high_attacking_work_rate_count', 'home_high_defensive_work_rate_count',
                       'away_high_defensive_work_rate_count', 'home_medium_attacking_work_rate_count',
                       'away_medium_attacking_work_rate_count', 'home_medium_defensive_work_rate_count',
                       'away_medium_defensive_work_rate_count', 'home_avg_player_coordinate', 'away_avg_player_coordinate',
                       'home_avg_coordinate', 'away_avg_coordinate', 'home_coordinate_std', 'away_coordinate_std',
                       'home_coordinate_range', 'away_coordinate_range', 'home_coordinate_median', 'away_coordinate_median',
                       'home_coordinate_q1', 'home_coordinate_q3', 'away_coordinate_q1', 'away_coordinate_q3',
                       'home_coordinate_density', 'away_coordinate_density', 'coordinate_gradient']]

In [13]:
reduced_test_data = test_df[['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal',
                      'home_team_avg_rating', 'away_team_avg_rating', 'rating_difference', 'home_team_avg_potential',
                       'away_team_avg_potential', 'potential_difference', 'home_team_avg_crossing', 'away_team_avg_crossing',
                       'home_team_left_foot_ratio', 'away_team_left_foot_ratio', 'home_high_attacking_work_rate_count',
                       'away_high_attacking_work_rate_count', 'home_high_defensive_work_rate_count',
                       'away_high_defensive_work_rate_count', 'home_medium_attacking_work_rate_count',
                       'away_medium_attacking_work_rate_count', 'home_medium_defensive_work_rate_count',
                       'away_medium_defensive_work_rate_count', 'home_avg_player_coordinate', 'away_avg_player_coordinate',
                       'home_avg_coordinate', 'away_avg_coordinate', 'home_coordinate_std', 'away_coordinate_std',
                       'home_coordinate_range', 'away_coordinate_range', 'home_coordinate_median', 'away_coordinate_median',
                       'home_coordinate_q1', 'home_coordinate_q3', 'away_coordinate_q1', 'away_coordinate_q3',
                       'home_coordinate_density', 'away_coordinate_density', 'coordinate_gradient']]

In [14]:
# Print all column names of the train DataFrame
print(reduced_train_data.columns.tolist())

['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal', 'home_team_avg_rating', 'away_team_avg_rating', 'rating_difference', 'home_team_avg_potential', 'away_team_avg_potential', 'potential_difference', 'home_team_avg_crossing', 'away_team_avg_crossing', 'home_team_left_foot_ratio', 'away_team_left_foot_ratio', 'home_high_attacking_work_rate_count', 'away_high_attacking_work_rate_count', 'home_high_defensive_work_rate_count', 'away_high_defensive_work_rate_count', 'home_medium_attacking_work_rate_count', 'away_medium_attacking_work_rate_count', 'home_medium_defensive_work_rate_count', 'away_medium_defensive_work_rate_count', 'home_avg_player_coordinate', 'away_avg_player_coordinate', 'home_avg_coordinate', 'away_avg_coordinate', 'home_coordinate_std', 'away_coordinate_std', 'home_coordinate_range', 'away_coordinate_range', 'home_coordinate_median', 'away_coordinate_median', 'home_coordinate_q1', 'home_coordinate_q3', 'away_coordinate_q1', 'away_coordinate_q3', 'hom

In [15]:
reduced_train_data.shape, reduced_test_data.shape

((15461, 40), (3866, 40))

In [41]:
from sklearn.preprocessing import StandardScaler

# Assuming train_df and test_df are your dataframes
scaler = StandardScaler()
scaled_features = scaler.fit_transform(reduced_train_data.drop(columns=['home_team_goal', 'away_team_goal']))  # adjust accordingly
reduced_train_data_scaled = pd.DataFrame(scaled_features, columns=reduced_train_data.columns.drop(['home_team_goal', 'away_team_goal']))

In [42]:
scaled_features = scaler.fit_transform(reduced_test_data.drop(columns=['home_team_goal', 'away_team_goal']))  # adjust accordingly
reduced_test_data_scaled = pd.DataFrame(scaled_features, columns=reduced_test_data.columns.drop(['home_team_goal', 'away_team_goal']))

In [43]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [44]:
# Step 1: Assuming your data is loaded into a DataFrame called `train`
# Standardizing the data
features = reduced_train_data.drop(columns=['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal'])
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [45]:
# Step 2: Apply PCA
pca = PCA()  # Start with all components to analyze explained variance
pca.fit(scaled_features)
explained_variance = pca.explained_variance_ratio_

In [46]:
# Step 3: Choose Number of Components
# Decide on the number of components to keep. For example, we can choose components
# that explain 95% of the variance.
cumulative_variance = np.cumsum(explained_variance)
n_components = np.argmax(cumulative_variance >= 0.95) + 1
print(f"Number of components explaining 95% of variance: {n_components}")

Number of components explaining 95% of variance: 12


In [47]:
# Step 4: Apply PCA with selected number of components
pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(scaled_features)

In [48]:
# Step 5: Create a new DataFrame with reduced dimensions
reduced_train_data = pd.DataFrame(principal_components, columns=[f'PC{i+1}' for i in range(n_components)])

# Optional: Add back the target columns for use in modeling or further analysis
reduced_train_data[['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal']] = train_df[['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal']]

reduced_train_data

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,year,home_team_id,away_team_id,home_team_goal,away_team_goal
0,2.724429,-1.907256,-2.623855,1.719569,-0.922370,-0.962967,-1.253615,-0.476036,0.032124,-1.954659,1.545993,1.329240,2012,3,241,0,2
1,4.766831,-3.723791,-1.401960,-1.712564,-2.010460,1.612211,1.767170,-3.268669,-0.439655,0.488881,1.382822,0.131018,2013,32,229,4,0
2,-3.981062,2.023503,3.241320,-1.191261,-0.762034,-0.579076,-0.150251,-0.681908,0.919211,-0.328448,-0.127384,0.830523,2014,245,12,2,0
3,-1.928791,-0.120702,4.415725,3.799329,0.252410,-0.918285,-0.740016,-0.052281,-0.633349,1.862393,0.154801,1.274719,2011,141,233,3,0
4,-4.033523,1.583380,2.738028,-2.657715,-0.079536,-0.916236,1.023986,-1.464871,1.538152,-0.952823,0.869558,0.717163,2015,206,144,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15456,-0.805586,-0.432064,2.125792,-3.452090,1.541997,-0.581859,-0.870796,-0.320821,1.943523,1.092848,0.366545,0.910746,2010,103,226,1,1
15457,-4.297407,1.860759,3.775535,1.887415,-0.889741,-0.386914,0.563820,0.881678,-0.219436,-0.063816,1.188697,1.161871,2010,226,213,2,1
15458,-4.291861,1.627730,3.368028,-1.459004,-0.560148,-0.491514,0.209844,-0.456101,1.746953,-0.355598,0.151706,0.560489,2013,206,141,1,1
15459,1.278301,-0.258789,-3.837613,-0.757885,2.402503,0.431762,-0.130086,-0.439709,-0.138387,1.163497,0.331493,0.258456,2013,271,251,2,4


In [49]:
# Assuming you have a test dataset called `test` and you've already standardized the training data
# Standardizing the test data using the scaler fitted on training data
scaled_test_features = scaler.transform(reduced_test_data.drop(columns=['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal']))

test_principal_components = pca.transform(scaled_test_features)

reduced_test_data = pd.DataFrame(test_principal_components, columns=[f'PC{i+1}' for i in range(n_components)])

reduced_test_data[['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal']] = test_df[['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal']]

reduced_test_data

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,year,home_team_id,away_team_id,home_team_goal,away_team_goal
0,1.630561,-2.249250,-0.524155,-0.549864,-0.902698,-1.776331,0.247365,0.543704,0.563802,-0.128153,-1.382953,0.325771,2009,134,140,1,3
1,0.827909,-2.398722,1.075982,-1.042393,-0.586072,-1.643190,0.202886,0.679349,-1.683617,0.151475,-0.484141,-0.967544,2009,94,224,4,2
2,2.077500,-2.251081,-1.693700,-1.253705,-0.493027,-1.482156,0.385394,0.772172,0.240634,0.179421,-1.135315,-1.478514,2009,6,143,2,0
3,1.370962,-2.350428,-0.459176,-0.527006,-0.709688,-1.420650,-0.194863,0.150315,0.024506,-0.154762,-1.063859,-0.548190,2009,146,140,1,2
4,1.733803,-2.492851,-3.056308,1.357638,0.268925,0.494911,-1.452418,0.166040,0.291524,-0.525000,-0.926695,-1.674425,2009,136,150,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3861,0.102823,-0.071965,-0.954431,-1.406009,-2.065335,1.035571,0.201178,0.189953,-0.430278,0.047835,-0.094133,-0.474428,2014,229,124,2,3
3862,0.472760,1.819889,-1.826294,3.481171,-1.114361,-0.492959,-0.910258,-0.807561,-0.878684,-1.747709,0.797991,0.050545,2014,124,236,3,1
3863,-1.228708,2.043410,-2.065226,-2.606827,-1.102444,-0.354894,-0.037752,0.563403,-1.128659,0.000188,1.795434,-0.135639,2014,129,124,3,1
3864,-1.470017,2.314941,-2.232498,2.434983,-1.484617,-1.500124,2.308043,-0.442000,-0.354484,0.645846,-1.057850,0.687516,2014,124,55,3,1


In [16]:
reduced_train_data.shape, reduced_test_data.shape

((15461, 40), (3866, 40))

In [17]:
def determine_winner(row):
    if row['home_team_goal'] > row['away_team_goal']:
        return 1
    elif row['home_team_goal'] < row['away_team_goal']:
        return 0
    else:
        return 1  # If it's a tie, return 1 which means home team won

def calculate_goal_difference(row):
    if row['home_team_goal'] > row['away_team_goal']:
        return row['home_team_goal'] - row['away_team_goal']
    elif row['home_team_goal'] < row['away_team_goal']:
        return row['away_team_goal'] - row['home_team_goal']
    else:
        return 0  # If it's a tie, goal difference is zero

# Apply the functions to create the new columns
reduced_train_data['winner'] = reduced_train_data.apply(determine_winner, axis=1)
reduced_train_data['winby'] = reduced_train_data.apply(calculate_goal_difference, axis=1)

reduced_train_data['winner'] = reduced_train_data['winner'].astype('int')
reduced_train_data['winner'] = reduced_train_data['winner'].astype('category')
reduced_train_data['winby'] = reduced_train_data['winby'].astype('int')

reduced_train_data = reduced_train_data[reduced_train_data['winby'] != 0]

reduced_train_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_train_data['winner'] = reduced_train_data.apply(determine_winner, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_train_data['winby'] = reduced_train_data.apply(calculate_goal_difference, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_train_data['winner'] = re

Unnamed: 0,year,home_team_id,away_team_id,home_team_goal,away_team_goal,home_team_avg_rating,away_team_avg_rating,rating_difference,home_team_avg_potential,away_team_avg_potential,...,away_coordinate_median,home_coordinate_q1,home_coordinate_q3,away_coordinate_q1,away_coordinate_q3,home_coordinate_density,away_coordinate_density,coordinate_gradient,winner,winby
0,2012,3,241,0,2,62.636364,60.272727,2.363636,70.090909,65.818182,...,43.0,27.0,63.5,27.0,63.5,3,3,0.0,0,2
1,2013,32,229,4,0,60.454545,64.363636,-3.909091,67.454545,69.818182,...,51.0,21.5,65.5,21.5,65.5,3,3,0.0,1,4
2,2014,245,12,2,0,76.272727,80.454545,-4.181818,80.909091,83.363636,...,52.0,34.0,59.5,34.0,59.5,4,4,0.0,1,2
3,2011,141,233,3,0,83.363636,74.090909,9.272727,86.909091,79.545455,...,51.0,30.5,65.5,30.5,65.5,3,3,0.0,1,3
4,2015,206,144,0,1,74.818182,81.545455,-6.727273,80.727273,83.000000,...,52.0,33.0,59.5,33.0,59.5,4,4,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15452,2010,257,69,1,0,65.727273,73.181818,-7.454545,75.000000,81.000000,...,51.0,21.5,65.5,21.5,65.5,3,3,0.0,1,1
15453,2015,19,269,3,2,73.363636,70.272727,3.090909,77.090909,75.000000,...,52.0,33.0,59.5,33.0,59.5,4,4,0.0,1,1
15457,2010,226,213,2,1,82.181818,75.636364,6.545455,86.454545,81.272727,...,52.0,33.0,59.5,33.0,59.5,4,4,0.0,1,1
15459,2013,271,251,2,4,65.454545,65.545455,-0.090909,69.818182,71.272727,...,51.0,30.5,65.5,30.5,65.5,3,3,0.0,0,2


In [18]:
def determine_winner(row):
    if row['home_team_goal'] > row['away_team_goal']:
        return 1
    elif row['home_team_goal'] < row['away_team_goal']:
        return 0
    else:
        return 1  # If it's a tie, return 1 which means home team won

def calculate_goal_difference(row):
    if row['home_team_goal'] > row['away_team_goal']:
        return row['home_team_goal'] - row['away_team_goal']
    elif row['home_team_goal'] < row['away_team_goal']:
        return row['away_team_goal'] - row['home_team_goal']
    else:
        return 0  # If it's a tie, goal difference is zero

# Apply the functions to create the new columns
reduced_test_data['winner'] = reduced_test_data.apply(determine_winner, axis=1)
reduced_test_data['winby'] = reduced_test_data.apply(calculate_goal_difference, axis=1)

# Apply the functions to create the new columns
reduced_test_data['winner'] = reduced_test_data.apply(determine_winner, axis=1)
reduced_test_data['winby'] = reduced_test_data.apply(calculate_goal_difference, axis=1)

reduced_test_data['winner'] = reduced_test_data['winner'].astype('int')
reduced_test_data['winner'] = reduced_test_data['winner'].astype('category')
reduced_test_data['winby'] = reduced_test_data['winby'].astype('int')

reduced_test_data = reduced_test_data[reduced_test_data['winby'] != 0]

reduced_test_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_test_data['winner'] = reduced_test_data.apply(determine_winner, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_test_data['winby'] = reduced_test_data.apply(calculate_goal_difference, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_test_data['winner'] = reduced

Unnamed: 0,year,home_team_id,away_team_id,home_team_goal,away_team_goal,home_team_avg_rating,away_team_avg_rating,rating_difference,home_team_avg_potential,away_team_avg_potential,...,away_coordinate_median,home_coordinate_q1,home_coordinate_q3,away_coordinate_q1,away_coordinate_q3,home_coordinate_density,away_coordinate_density,coordinate_gradient,winner,winby
0,2009,134,140,1,3,64.727273,65.090909,-0.363636,66.909091,69.454545,...,43.0,27.0,63.5,27.0,63.5,3,3,0.0,0,2
1,2009,94,224,4,2,66.545455,69.090909,-2.545455,72.636364,74.454545,...,43.0,27.0,63.5,27.0,63.5,3,3,0.0,1,2
2,2009,6,143,2,0,62.000000,63.818182,-1.818182,66.636364,68.818182,...,43.0,27.0,63.5,27.0,63.5,3,3,0.0,1,2
3,2009,146,140,1,2,63.090909,65.000000,-1.909091,67.545455,70.545455,...,43.0,27.0,63.5,27.0,63.5,3,3,0.0,0,1
5,2009,199,6,2,0,61.363636,60.818182,0.545455,67.272727,66.545455,...,43.0,27.0,63.5,27.0,63.5,3,3,0.0,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3860,2014,123,129,1,2,59.909091,62.454545,-2.545455,62.727273,68.545455,...,52.0,33.0,59.5,33.0,59.5,4,4,0.0,0,1
3861,2014,229,124,2,3,65.000000,68.090909,-3.090909,69.818182,73.636364,...,50.0,27.5,60.5,27.5,60.5,4,4,0.0,0,1
3862,2014,124,236,3,1,69.181818,60.454545,8.727273,74.181818,66.818182,...,47.0,32.5,63.0,32.5,63.0,4,4,0.0,1,2
3863,2014,129,124,3,1,63.000000,68.181818,-5.181818,68.363636,72.909091,...,52.0,33.0,59.5,33.0,59.5,4,4,0.0,1,2


In [19]:
X_train = reduced_train_data[[col for col in reduced_train_data.columns if col not in ['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal', 'winner', 'winby']]]
y_train = reduced_train_data['winner']

X_test = reduced_test_data[[col for col in reduced_test_data.columns if col not in ['year','home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal', 'winner', 'winby']]]
y_test = reduced_test_data['winner']

In [20]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [21]:
# 5. Gradient Boosting Classifier
gradient_boosting = GradientBoostingClassifier()
gradient_boosting.fit(X_train, y_train)
y_pred_gradient_boosting = gradient_boosting.predict(X_test)

print("\nGradient Boosting Classifier:")
print(confusion_matrix(y_test, y_pred_gradient_boosting))
print(classification_report(y_test, y_pred_gradient_boosting))
print("Accuracy:", accuracy_score(y_test, y_pred_gradient_boosting))


Gradient Boosting Classifier:
[[ 490  607]
 [ 274 1500]]
              precision    recall  f1-score   support

           0       0.64      0.45      0.53      1097
           1       0.71      0.85      0.77      1774

    accuracy                           0.69      2871
   macro avg       0.68      0.65      0.65      2871
weighted avg       0.68      0.69      0.68      2871

Accuracy: 0.6931382793451759


In [22]:
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [23]:
# Define individual models
model1 = GradientBoostingClassifier()
model2 = RandomForestClassifier()
model3 = SVC(probability=True)  # SVC needs probability=True for soft voting

# Create a Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('gb', model1), ('rf', model2), ('svc', model3)], voting='soft')  # 'hard' for majority voting, 'soft' for probability averaging

# Fit the model
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred_voting = voting_clf.predict(X_test)

# Evaluate the model
print("Voting Classifier:")
print(confusion_matrix(y_test, y_pred_voting))
print(classification_report(y_test, y_pred_voting))
print("Accuracy:", accuracy_score(y_test, y_pred_voting))

Voting Classifier:
[[ 482  615]
 [ 262 1512]]
              precision    recall  f1-score   support

           0       0.65      0.44      0.52      1097
           1       0.71      0.85      0.78      1774

    accuracy                           0.69      2871
   macro avg       0.68      0.65      0.65      2871
weighted avg       0.69      0.69      0.68      2871

Accuracy: 0.6945315221177291


In [24]:
X_train = reduced_train_data[[col for col in reduced_train_data.columns if col not in ['year', 'home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal', 'winner', 'winby']]]
y_train = reduced_train_data['winby']

X_test = reduced_test_data[[col for col in reduced_test_data.columns if col not in ['year','home_team_id', 'away_team_id', 'home_team_goal', 'away_team_goal', 'winner', 'winby']]]
y_test = reduced_test_data['winby']

In [25]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [26]:
# Function to evaluate and print regression model performance
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    correct_predictions = np.round(y_pred) == y_true
    accuracy = np.mean(correct_predictions)  # Mean of correct predictions
    print(f"Mean Absolute Error: {mae:.4f}")
    print(f"Mean Squared Error: {mse:.4f}")
    print(f"Root Mean Squared Error: {rmse:.4f}")
    print(f"R² Score: {r2:.4f}\n")
    print(f"Accuracy Score: {accuracy:.4f}\n")

In [28]:
# 3. Support Vector Regressor
svr_model = SVR(kernel='rbf')
svr_model.fit(X_train, y_train)
y_pred_svr = np.round(svr_model.predict(X_test)).astype(int)  # Round and convert to int
print("Support Vector Regressor Performance:")
evaluate_model(y_test, y_pred_svr)

Support Vector Regressor Performance:
Mean Absolute Error: 0.8192
Mean Squared Error: 1.5639
Root Mean Squared Error: 1.2506
R² Score: -0.3317

Accuracy Score: 0.4343



In [27]:
from sklearn.ensemble import VotingRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Define individual regression models
model1 = SVR(kernel='rbf')
model2 = LinearRegression()
model3 = RandomForestRegressor(n_estimators=100, random_state=42)

# Create a Voting Regressor
voting_regressor = VotingRegressor(estimators=[
    ('svr', model1), ('lr', model2), ('rf', model3)
])

# Fit the model
voting_regressor.fit(X_train, y_train)

# Make predictions
y_pred_voting = voting_regressor.predict(X_test)

# Evaluate the model
evaluate_model(y_test, y_pred_voting)

Mean Absolute Error: 0.8295
Mean Squared Error: 1.1674
Root Mean Squared Error: 1.0805
R² Score: 0.0060

Accuracy Score: 0.3260



In [29]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# Define individual base models
base_models = [
    ('svr', SVR(kernel='rbf')),
    ('lr', LinearRegression()),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42))
]

# Define the meta-model (this model will learn from the base model predictions)
meta_model = Ridge()

# Create a Stacking Regressor
stacking_regressor = StackingRegressor(estimators=base_models, final_estimator=meta_model)

# Fit the stacking regressor
stacking_regressor.fit(X_train, y_train)

# Make predictions
y_pred_stacking = stacking_regressor.predict(X_test)

# Evaluate the model
evaluate_model(y_test, y_pred_stacking)

Mean Absolute Error: 0.8328
Mean Squared Error: 1.1479
Root Mean Squared Error: 1.0714
R² Score: 0.0225

Accuracy Score: 0.2971



In [30]:
final_test_df = pd.read_csv('final_testing_dataset.csv')

In [31]:
# 1. Home and Away Team Average Rating
home_rating_cols = [f'home_player_overall_rating_{i}' for i in range(1, 12)]
away_rating_cols = [f'away_player_overall_rating_{i}' for i in range(1, 12)]

final_test_df['home_team_avg_rating'] = final_test_df[home_rating_cols].mean(axis=1)
final_test_df['away_team_avg_rating'] = final_test_df[away_rating_cols].mean(axis=1)

# 2. Team Rating Difference
final_test_df['rating_difference'] = final_test_df['home_team_avg_rating'] - final_test_df['away_team_avg_rating']

# 3. Average Player Potential
home_potential_cols = [f'home_player_potential_{i}' for i in range(1, 12)]
away_potential_cols = [f'away_player_potential_{i}' for i in range(1, 12)]

final_test_df['home_team_avg_potential'] = final_test_df[home_potential_cols].mean(axis=1)
final_test_df['away_team_avg_potential'] = final_test_df[away_potential_cols].mean(axis=1)

# 4. Team Potential Difference
final_test_df['potential_difference'] = final_test_df['home_team_avg_potential'] - final_test_df['away_team_avg_potential']

# 5. Average Player Crossing Ability
home_crossing_cols = [f'home_player_crossing_{i}' for i in range(1, 12)]
away_crossing_cols = [f'away_player_crossing_{i}' for i in range(1, 12)]

final_test_df['home_team_avg_crossing'] = final_test_df[home_crossing_cols].mean(axis=1)
final_test_df['away_team_avg_crossing'] = final_test_df[away_crossing_cols].mean(axis=1)

# 6. Average Player Preferred Foot (Proportion Left-Footed)
home_left_foot_cols = [f'home_player_preferred_foot_left_{i}' for i in range(1, 12)]
away_left_foot_cols = [f'away_player_preferred_foot_left_{i}' for i in range(1, 12)]

final_test_df['home_team_left_foot_ratio'] = final_test_df[home_left_foot_cols].mean(axis=1)
final_test_df['away_team_left_foot_ratio'] = final_test_df[away_left_foot_cols].mean(axis=1)

In [32]:
# 1. Home/Away Team High Attacking Work Rate Count
final_test_df['home_high_attacking_work_rate_count'] = final_test_df[[f'home_player_attacking_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)
final_test_df['away_high_attacking_work_rate_count'] = final_test_df[[f'away_player_attacking_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)

# 2. Home/Away Team High Defensive Work Rate Count
final_test_df['home_high_defensive_work_rate_count'] = final_test_df[[f'home_player_defensive_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)
final_test_df['away_high_defensive_work_rate_count'] = final_test_df[[f'away_player_defensive_work_rate_high_{i}' for i in range(1, 12)]].sum(axis=1)

# 3. Home/Away Team Medium Attacking Work Rate Count
final_test_df['home_medium_attacking_work_rate_count'] = final_test_df[[f'home_player_attacking_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)
final_test_df['away_medium_attacking_work_rate_count'] = final_test_df[[f'away_player_attacking_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)

# 4. Home/Away Team Medium Defensive Work Rate Count
final_test_df['home_medium_defensive_work_rate_count'] = final_test_df[[f'home_player_defensive_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)
final_test_df['away_medium_defensive_work_rate_count'] = final_test_df[[f'away_player_defensive_work_rate_medium_{i}' for i in range(1, 12)]].sum(axis=1)

# 5. Home/Away Average Player Coordinates
final_test_df['home_avg_player_coordinate'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)
final_test_df['away_avg_player_coordinate'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)

In [33]:
# Average Coordinate Position for Home and Away Teams
final_test_df['home_avg_coordinate'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)
final_test_df['away_avg_coordinate'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].mean(axis=1)

# Coordinate Spread (Standard Deviation) for Home and Away Teams
final_test_df['home_coordinate_std'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].std(axis=1)
final_test_df['away_coordinate_std'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].std(axis=1)

# Coordinate Range (Max - Min) for Home and Away Teams
final_test_df['home_coordinate_range'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].max(axis=1) - final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].min(axis=1)
final_test_df['away_coordinate_range'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].max(axis=1) - final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].min(axis=1)

# Median Coordinate for Home and Away Teams
final_test_df['home_coordinate_median'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].median(axis=1)
final_test_df['away_coordinate_median'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].median(axis=1)

# Coordinate Quartiles for Home and Away Teams
final_test_df['home_coordinate_q1'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.25, axis=1)
final_test_df['home_coordinate_q3'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.75, axis=1)
final_test_df['away_coordinate_q1'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.25, axis=1)
final_test_df['away_coordinate_q3'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].quantile(0.75, axis=1)

# Coordinate Density (Proximity Count) for Home and Away Teams within 10-unit range
proximity_threshold = 10
final_test_df['home_coordinate_density'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].apply(lambda row: sum(abs(row - row.mean()) < proximity_threshold), axis=1)
final_test_df['away_coordinate_density'] = final_test_df[[f'coordinates_{i}' for i in range(1, 12)]].apply(lambda row: sum(abs(row - row.mean()) < proximity_threshold), axis=1)

# Coordinate Gradient (Difference in Average Position between Home and Away Teams)
final_test_df['coordinate_gradient'] = final_test_df['home_avg_coordinate'] - final_test_df['away_avg_coordinate']

In [36]:
final_test_df.shape

(1000, 290)

In [39]:
reduced_final_test_df = final_test_df[['home_team_id', 'away_team_id',
                      'home_team_avg_rating', 'away_team_avg_rating', 'rating_difference', 'home_team_avg_potential',
                       'away_team_avg_potential', 'potential_difference', 'home_team_avg_crossing', 'away_team_avg_crossing',
                       'home_team_left_foot_ratio', 'away_team_left_foot_ratio', 'home_high_attacking_work_rate_count',
                       'away_high_attacking_work_rate_count', 'home_high_defensive_work_rate_count',
                       'away_high_defensive_work_rate_count', 'home_medium_attacking_work_rate_count',
                       'away_medium_attacking_work_rate_count', 'home_medium_defensive_work_rate_count',
                       'away_medium_defensive_work_rate_count', 'home_avg_player_coordinate', 'away_avg_player_coordinate',
                       'home_avg_coordinate', 'away_avg_coordinate', 'home_coordinate_std', 'away_coordinate_std',
                       'home_coordinate_range', 'away_coordinate_range', 'home_coordinate_median', 'away_coordinate_median',
                       'home_coordinate_q1', 'home_coordinate_q3', 'away_coordinate_q1', 'away_coordinate_q3',
                       'home_coordinate_density', 'away_coordinate_density', 'coordinate_gradient']]

In [42]:
reduced_final_test_df = reduced_final_test_df.drop(columns=['home_team_id', 'away_team_id'])

In [43]:
reduced_final_test_df.shape

(1000, 35)

In [45]:
# Make predictions
y_pred_voting_final = voting_clf.predict(reduced_final_test_df)
y_pred_voting_final

array([0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,

In [48]:
# Make predictions
y_pred_stacking_final = voting_regressor.predict(reduced_final_test_df)
y_pred_stacking_final = np.round(y_pred_stacking_final).astype(int)
y_pred_stacking_final

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,

In [49]:
results_df = pd.DataFrame({
    'winner': y_pred_voting_final,
    'winby': y_pred_stacking_final
})
results_df

Unnamed: 0,winner,winby
0,0,2
1,1,2
2,1,2
3,1,2
4,1,2
...,...,...
995,1,2
996,1,2
997,0,2
998,1,2


In [52]:
from google.colab import files
results_df.to_csv('results_df.csv', index=False)
files.download('results_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [54]:
from google.colab import files
reduced_train_data.to_csv('reduced_train_data.csv', index=False)
files.download('reduced_train_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [56]:
from google.colab import files
reduced_test_data.to_csv('reduced_test_data.csv', index=False)
files.download('reduced_test_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [58]:
from google.colab import files
reduced_final_test_df.to_csv('reduced_final_test_df.csv', index=False)
files.download('reduced_final_test_df.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>