# Creating the dataset

The purpose of this notebook is to establish the process for converting the rows of all the players in both squads of a game into a single row for use as the neural network input.

## Imports

In [1]:
import pandas as pd
import numpy as np
from db_connection import local_pl_stats_connector


## Method

Create database connection variable

In [2]:
# Lift pandas' display limits so wide/long frames are shown in full.
for display_option in ("display.max_rows", "display.max_columns"):
	pd.set_option(display_option, None)

# Shared handle used by every query in this notebook.
db = local_pl_stats_connector

Generate a dataframe containing all the career stats (up to and including the season of the game in question) of the players 
playing in a specific match

In [3]:
def create_player_stats_for_match(game_season: str, home_team_id: str, away_team_id: str, less_than_or_equal_to: str = "<=") -> pd.DataFrame:
	"""
		Generate a dataframe of historic per-ninety rows for the players linked to a match.

		Every row of `historic_player_per_ninety` is returned for each player who has a row
		for either team (home or away) in a season satisfying
		`hpn.season <less_than_or_equal_to> game_season`. The match columns selected below
		(ids, goals, shots, ...) are joined onto every row.

		Args:
			game_season: season of the match, compared against `match.season` and `hpn.season`.
			home_team_id: id of the home team of the match.
			away_team_id: id of the away team of the match.
			less_than_or_equal_to: SQL comparison operator applied to `hpn.season`.
				Defaults to "<=" so that callers which omit it keep working.

		Returns:
			Whatever `db.get_df` returns for the query (a dataframe).

		Raises:
			ValueError: if `less_than_or_equal_to` is not a plain comparison operator.
	"""
	# The operator is pasted into the SQL text as-is, so only accept known operators.
	allowed_operators = ("<=", "<", "=", ">=", ">", "!=", "<>")
	if less_than_or_equal_to not in allowed_operators:
		raise ValueError(
			f"less_than_or_equal_to must be one of {allowed_operators}, got {less_than_or_equal_to!r}"
		)

	# NOTE(review): season and team ids are interpolated directly into the query. They should
	# only ever come from the match table; switch to bound parameters if db.get_df supports them.
	return db.get_df(f"""
		SELECT 
			hpn.*, m.id AS match_id, m.competition_id, m.home_team_id, m.away_team_id, m.referee_id, 
			m.home_goals, m.away_goals, m.home_shots, m.away_shots, m.home_shots_on_target, 
			m.away_shots_on_target, m.home_corners, m.away_corners, m.home_fouls, m.away_fouls, 
			m.home_yellow_cards, m.away_yellow_cards, m.home_red_cards, m.away_red_cards
		FROM historic_player_per_ninety hpn
		JOIN match m
			ON m.season = '{game_season}'
			AND m.home_team_id = '{home_team_id}'
			AND m.away_team_id = '{away_team_id}'
		WHERE player_id IN ( 
			SELECT player_id FROM historic_player_per_ninety hpn
			JOIN match m
				ON m.season = '{game_season}'
				AND m.home_team_id = '{home_team_id}'
				AND m.away_team_id = '{away_team_id}'
				AND hpn.team_id IN (m.home_team_id, m.away_team_id)
			WHERE hpn.season {less_than_or_equal_to} '{game_season}'
		)
			AND hpn.season {less_than_or_equal_to} '{game_season}'
	""")

Collect match facts for a particular game based on the season it occurred and the home and away teams

In [4]:
def create_match_facts_for_match(game_season: str, home_team_id: str, away_team_id: str) -> pd.DataFrame:
	"""
		Fetch the rows of the match table for one fixture.

		The fixture is identified by its season together with the home and away team ids;
		all columns of the match table are returned.
	"""
	query = f"""
		SELECT * FROM match 
		WHERE season = '{game_season}' and home_team_id = '{home_team_id}' and away_team_id = '{away_team_id}'
	"""
	return db.get_df(query)

Create a list of lists holding the home team id, away team id, season and match id of every game in the match table, for use in matching the players to the correct matches and teams

In [5]:
def get_match_column_values(all_matches: pd.DataFrame) -> list:
	"""
		Extract the identifying values of every match as a list of lists.

		Args:
			all_matches: dataframe with at least the columns "home_team_id",
				"away_team_id", "season" and "id".

		Returns:
			One inner list per row, in row order, laid out as
			[home_team_id, away_team_id, season, id]. Empty list for an empty frame.
	"""
	columns_to_extract = ["home_team_id", "away_team_id", "season", "id"]

	# itertuples over just the needed columns avoids building a Series per row (as
	# iterrows does) and keeps each column's own value type instead of upcasting
	# across every column of the frame.
	selected = all_matches[columns_to_extract]
	return [list(row_values) for row_values in selected.itertuples(index=False, name=None)]

Columns being excluded, along with the output columns for the NN

In [6]:
# Match-outcome columns (one home/away pair per statistic).
output_columns = [
	"home_goals", "away_goals", "home_shots", "away_shots", "home_shots_on_target", "away_shots_on_target",
	"home_corners", "away_corners", "home_fouls", "away_fouls", "home_yellow_cards", "away_yellow_cards",
	"home_red_cards", "away_red_cards"
]
# Match identifier columns followed by the same outcome columns as output_columns.
match_columns = [
	"match_id", "competition_id", "home_team_id", "away_team_id", "referee_id",
	"home_goals", "away_goals", "home_shots", "away_shots", "home_shots_on_target", "away_shots_on_target",
	"home_corners", "away_corners", "home_fouls", "away_fouls", "home_yellow_cards", "away_yellow_cards",
	"home_red_cards", "away_red_cards"
]
# Per-player statistic columns shared by the three derived lists below.
stats_columns = [
	"goals","assists","non_penalty_goals","penalties_scored","penalties_attempted","yellow_cards","red_cards","expected_goals",
	"non_penalty_expected_goals","expected_assisted_goals","progressive_carries","progressive_passes","progressive_passes_received","total_passing_distance","total_progressive_passing_distance","short_passes_completed","short_passes_attempted","medium_passes_completed","medium_passes_attempted",
	"long_passes_completed","long_passes_attempted","expected_assists","key_passes","passes_into_final_third","passes_into_penalty_area","crosses_into_penalty_area","shots","shots_on_target","average_shot_distance","shots_from_free_kicks",
	"shots_from_penalties","touches","touches_in_defensive_penalty_area","touches_in_defensive_third","touches_in_middle_third","touches_in_attacking_third","touches_in_attacking_penalty_area","live_ball_touches","take_ons_attempted","take_ons_succeeded","times_tackled_during_take_on",
	"carries","total_carrying_distance","progressive_carrying_distance","carries_into_final_third","carries_into_penalty_area","miscontrols","dispossessed","passes_received","tackles","tackles_won","defensive_third_tackles",
	"middle_third_tackles","attacking_third_tackles","dribblers_tackled","dribbler_tackles_attempted","shots_blocked","passes_blocked","interceptions","clearances","errors_leading_to_shot","goals_against","shots_on_target_against","saves","clean_sheets","penalties_faced","penalties_allowed","penalties_saved","penalties_missed"
]
# Columns used when aggregating per player: the grouping key plus playing time and stats.
player_stats_columns = ["player_id", "minutes_played","ninetys"] + stats_columns
# Numeric columns that get rescaled and differenced (no id columns, no "ninetys").
pure_stats_columns = ["minutes_played"] + stats_columns
# Columns used when aggregating per team: the grouping key plus the stats.
team_stats_columns = ["team_id"] + stats_columns

Create a dictionary keyed by match id, where each value is a dataframe containing the career stats of the players involved in that match, for every game in the match table

In [7]:
# Load every match, then reduce each one to [home_team_id, away_team_id, season, id].
all_matches = db.get_df("SELECT * FROM match")
match_values = get_match_column_values(all_matches)

complete_dataset = pd.DataFrame()
players_in_match = {}
df = ''
columns_to_remove = ["_plus_", "_minus", "_divided_by_",]

for match in match_values:
	home_team_id, away_team_id, season, match_id = match

	df = create_player_stats_for_match(season, home_team_id, away_team_id, "<=")

	# Drop every column whose name contains one of the marker substrings.
	columns = [
		col for col in df.columns
		if any(marker in col for marker in columns_to_remove)
	]
	df = df.drop(columns=columns)

	# One dataframe of player rows per match id.
	players_in_match[match_id] = df

In [8]:
def get_all_players_in_match(season: str, home_team_id: str, away_team_id: str) -> pd.DataFrame:
	"""
		Fetch the player stat rows for one match and strip the derived columns.

		Args:
			season: season of the match.
			home_team_id: id of the home team.
			away_team_id: id of the away team.

		Returns:
			The dataframe from `create_player_stats_for_match` (seasons up to and
			including `season`) without any column whose name contains "_plus_",
			"_minus" or "_divided_by_".
	"""
	columns_to_remove = ["_plus_", "_minus", "_divided_by_"]

	# The season comparison operator is a required argument of the query helper;
	# "<=" matches how the per-match loop in this notebook calls it.
	df = create_player_stats_for_match(season, home_team_id, away_team_id, "<=")

	derived_columns = [col for col in df.columns if any(word in col for word in columns_to_remove)]
	return df.drop(columns=derived_columns)

Group by the player id and calculate the yearly mean of each stat over their careers

In [9]:
# Work on a copy: the team-id normalisation below writes into the frame, and the cached
# entry in players_in_match must stay untouched so this cell gives the same result on re-run.
df = players_in_match["m-00001"].copy()

specified_team_ids = ['t-00001', 't-00013']

# Players with at least one row for each of the two teams. Both lookups happen before
# any write so the second is not affected by the first assignment.
first_team_players = df.loc[df["team_id"] == specified_team_ids[0], "player_id"].unique()
second_team_players = df.loc[df["team_id"] == specified_team_ids[1], "player_id"].unique()

# Label every row of such a player with that team; the second team wins when a player
# has rows for both.
df.loc[df["player_id"].isin(first_team_players), "team_id"] = specified_team_ids[0]
df.loc[df["player_id"].isin(second_team_players), "team_id"] = specified_team_ids[1]

# Per-player totals divided by the number of distinct seasons the player has rows for.
averaged_columns = [column for column in player_stats_columns if column != "player_id"]
seasons_played = df.groupby("player_id")["season"].nunique()
per_season_stats = df.groupby("player_id")[averaged_columns].sum().div(seasons_played, axis=0)

# The remaining columns (team, season, match facts) come from each player's own first row,
# joined on player_id so that they always belong to the player they are shown next to.
player_details = (
    df.drop(columns=averaged_columns)
    .drop_duplicates(subset="player_id")
    .set_index("player_id")
)

# One row per player, sorted by player_id, default integer index, original column order.
df = per_season_stats.join(player_details).reset_index()[df.columns.tolist()]

df.head()

Unnamed: 0,player_id,team_id,minutes_played,ninetys,goals,assists,non_penalty_goals,penalties_scored,penalties_attempted,yellow_cards,red_cards,expected_goals,non_penalty_expected_goals,expected_assisted_goals,progressive_carries,progressive_passes,progressive_passes_received,total_passing_distance,total_progressive_passing_distance,short_passes_completed,short_passes_attempted,medium_passes_completed,medium_passes_attempted,long_passes_completed,long_passes_attempted,expected_assists,key_passes,passes_into_final_third,passes_into_penalty_area,crosses_into_penalty_area,shots,shots_on_target,average_shot_distance,shots_from_free_kicks,shots_from_penalties,touches,touches_in_defensive_penalty_area,touches_in_defensive_third,touches_in_middle_third,touches_in_attacking_third,touches_in_attacking_penalty_area,live_ball_touches,take_ons_attempted,take_ons_succeeded,times_tackled_during_take_on,carries,total_carrying_distance,progressive_carrying_distance,carries_into_final_third,carries_into_penalty_area,miscontrols,dispossessed,passes_received,tackles,tackles_won,defensive_third_tackles,middle_third_tackles,attacking_third_tackles,dribblers_tackled,dribbler_tackles_attempted,shots_blocked,passes_blocked,interceptions,clearances,errors_leading_to_shot,goals_against,shots_on_target_against,saves,clean_sheets,penalties_faced,penalties_allowed,penalties_saved,penalties_missed,season,match_id,competition_id,home_team_id,away_team_id,referee_id,home_goals,away_goals,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_corners,away_corners,home_fouls,away_fouls,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards
0,p-00001,t-00001,366.0,4.1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.2,0.2,0.0,0.0,6.0,1.0,4270.0,1894.0,72.0,74.0,134.0,146.0,17.0,22.0,0.0,0.0,6.0,0.0,0.0,2.0,2.0,10.4,0.0,0.0,288.0,28.0,141.0,140.0,9.0,4.0,288.0,0.0,0.0,0.0,158.0,484.0,257.0,0.0,0.0,2.0,1.0,179.0,2.0,2.0,0.0,2.0,0.0,0.0,1.0,3.0,1.0,6.0,22.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
1,p-00002,t-00001,2225.0,24.7,2.0,0.0,2.0,0.0,0.0,4.0,0.0,1.9,1.9,0.4,25.0,124.0,10.0,31024.0,12028.0,523.0,574.0,841.0,910.0,185.0,241.0,0.7,7.0,137.0,3.0,0.0,9.0,2.0,9.4,0.0,0.0,2061.0,163.0,864.0,1110.0,96.0,16.0,2061.0,3.0,2.0,1.0,1253.0,6119.0,3571.0,25.0,0.0,12.0,4.0,1358.0,43.0,21.0,27.0,12.0,4.0,19.0,33.0,15.0,13.0,54.0,138.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
2,p-00003,t-00001,1716.0,19.0,3.0,9.0,3.0,0.0,0.0,2.0,0.0,2.3,2.3,5.8,74.0,116.0,150.0,11351.0,2875.0,464.0,542.0,249.0,309.0,40.0,106.0,4.4,44.0,70.0,32.0,4.0,36.0,16.0,40.1,1.0,0.0,1294.0,9.0,138.0,546.0,627.0,58.0,1294.0,56.0,30.0,26.0,792.0,4735.0,2508.0,69.0,13.0,43.0,42.0,984.0,33.0,24.0,13.0,14.0,6.0,10.0,46.0,2.0,22.0,15.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
3,p-00004,t-00001,1846.0,20.5,7.0,8.0,7.0,0.0,0.0,0.0,0.0,6.1,6.1,5.4,61.0,134.0,161.0,16341.0,4329.0,556.0,627.0,390.0,457.0,69.0,97.0,5.3,28.0,102.0,36.0,3.0,56.0,25.0,16.6,0.0,0.0,1480.0,16.0,174.0,770.0,552.0,91.0,1480.0,36.0,29.0,7.0,1082.0,5745.0,2709.0,54.0,18.0,49.0,37.0,1214.0,35.0,27.0,18.0,17.0,0.0,8.0,44.0,3.0,11.0,24.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
4,p-00005,t-00013,2202.0,24.5,14.0,4.0,12.0,2.0,2.0,1.0,0.0,13.7,12.1,4.4,39.0,59.0,233.0,6693.0,1185.0,369.0,450.0,120.0,170.0,13.0,25.0,4.0,36.0,18.0,31.0,3.0,66.0,32.0,13.6,0.0,2.0,980.0,8.0,58.0,308.0,620.0,173.0,978.0,45.0,33.0,12.0,621.0,2501.0,906.0,12.0,27.0,62.0,50.0,756.0,24.0,17.0,6.0,11.0,7.0,4.0,27.0,3.0,16.0,7.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0


In [10]:
def group_stats_by_player_for_home_and_away_teams(df: pd.DataFrame, player_columns=None) -> pd.DataFrame:
    """
    Collapse the per-season rows of a match's players into one row per player.

    Each player is first labelled with the match team they have rows for (the away
    team wins if a player has rows for both). The stat columns are then summed per
    player and divided by the number of distinct seasons the player has rows for.

    Args:
        df: player rows for a single match. Needs "player_id", "team_id", "season",
            "home_team_id", "away_team_id" and the stat columns. It is not modified.
        player_columns: "player_id" plus the columns to average. Defaults to the
            notebook-level `player_stats_columns`.

    Returns:
        A new dataframe with one row per player, sorted by player_id, with a default
        integer index and the same column order as `df`. Columns that are not averaged
        are taken from the player's first row in `df`. An empty `df` yields an empty copy.
    """
    if player_columns is None:
        player_columns = player_stats_columns

    if df.empty:
        return df.copy()

    home_team_id = df["home_team_id"].iloc[0]
    away_team_id = df["away_team_id"].iloc[0]

    # Never write into the caller's frame.
    df = df.copy()

    # Look up both squads before any write so the second lookup is not affected by the
    # first assignment; assigning the away team last gives it precedence.
    home_players = df.loc[df["team_id"] == home_team_id, "player_id"].unique()
    away_players = df.loc[df["team_id"] == away_team_id, "player_id"].unique()
    df.loc[df["player_id"].isin(home_players), "team_id"] = home_team_id
    df.loc[df["player_id"].isin(away_players), "team_id"] = away_team_id

    averaged_columns = [column for column in player_columns if column != "player_id"]
    seasons_played = df.groupby("player_id")["season"].nunique()
    per_season_stats = df.groupby("player_id")[averaged_columns].sum().div(seasons_played, axis=0)

    # Join the non-averaged columns on player_id rather than by row position, so each
    # player keeps their own team_id / season instead of those of an unrelated row.
    player_details = (
        df.drop(columns=averaged_columns)
        .drop_duplicates(subset="player_id")
        .set_index("player_id")
    )

    return per_season_stats.join(player_details).reset_index()[df.columns.tolist()]

Per 90 stats - the stats produced by each player per 90 mins of the season

In [11]:
ninety_mins_per_season = 38

# Scale every pure stat column down by the constant above and write the result back.
scaled_stats = df[pure_stats_columns] / ninety_mins_per_season
df.loc[:, pure_stats_columns] = scaled_stats

df.head()

Unnamed: 0,player_id,team_id,minutes_played,ninetys,goals,assists,non_penalty_goals,penalties_scored,penalties_attempted,yellow_cards,red_cards,expected_goals,non_penalty_expected_goals,expected_assisted_goals,progressive_carries,progressive_passes,progressive_passes_received,total_passing_distance,total_progressive_passing_distance,short_passes_completed,short_passes_attempted,medium_passes_completed,medium_passes_attempted,long_passes_completed,long_passes_attempted,expected_assists,key_passes,passes_into_final_third,passes_into_penalty_area,crosses_into_penalty_area,shots,shots_on_target,average_shot_distance,shots_from_free_kicks,shots_from_penalties,touches,touches_in_defensive_penalty_area,touches_in_defensive_third,touches_in_middle_third,touches_in_attacking_third,touches_in_attacking_penalty_area,live_ball_touches,take_ons_attempted,take_ons_succeeded,times_tackled_during_take_on,carries,total_carrying_distance,progressive_carrying_distance,carries_into_final_third,carries_into_penalty_area,miscontrols,dispossessed,passes_received,tackles,tackles_won,defensive_third_tackles,middle_third_tackles,attacking_third_tackles,dribblers_tackled,dribbler_tackles_attempted,shots_blocked,passes_blocked,interceptions,clearances,errors_leading_to_shot,goals_against,shots_on_target_against,saves,clean_sheets,penalties_faced,penalties_allowed,penalties_saved,penalties_missed,season,match_id,competition_id,home_team_id,away_team_id,referee_id,home_goals,away_goals,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_corners,away_corners,home_fouls,away_fouls,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards
0,p-00001,t-00001,9.631579,4.1,0.026316,0.0,0.026316,0.0,0.0,0.0,0.0,0.005263,0.005263,0.0,0.0,0.157895,0.026316,112.368421,49.842105,1.894737,1.947368,3.526316,3.842105,0.447368,0.578947,0.0,0.0,0.157895,0.0,0.0,0.052632,0.052632,0.273684,0.0,0.0,7.578947,0.736842,3.710526,3.684211,0.236842,0.105263,7.578947,0.0,0.0,0.0,4.157895,12.736842,6.763158,0.0,0.0,0.052632,0.026316,4.710526,0.052632,0.052632,0.0,0.052632,0.0,0.0,0.026316,0.078947,0.026316,0.157895,0.578947,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
1,p-00002,t-00001,58.552632,24.7,0.052632,0.0,0.052632,0.0,0.0,0.105263,0.0,0.05,0.05,0.010526,0.657895,3.263158,0.263158,816.421053,316.526316,13.763158,15.105263,22.131579,23.947368,4.868421,6.342105,0.018421,0.184211,3.605263,0.078947,0.0,0.236842,0.052632,0.247368,0.0,0.0,54.236842,4.289474,22.736842,29.210526,2.526316,0.421053,54.236842,0.078947,0.052632,0.026316,32.973684,161.026316,93.973684,0.657895,0.0,0.315789,0.105263,35.736842,1.131579,0.552632,0.710526,0.315789,0.105263,0.5,0.868421,0.394737,0.342105,1.421053,3.631579,0.078947,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
2,p-00003,t-00001,45.157895,19.0,0.078947,0.236842,0.078947,0.0,0.0,0.052632,0.0,0.060526,0.060526,0.152632,1.947368,3.052632,3.947368,298.710526,75.657895,12.210526,14.263158,6.552632,8.131579,1.052632,2.789474,0.115789,1.157895,1.842105,0.842105,0.105263,0.947368,0.421053,1.055263,0.026316,0.0,34.052632,0.236842,3.631579,14.368421,16.5,1.526316,34.052632,1.473684,0.789474,0.684211,20.842105,124.605263,66.0,1.815789,0.342105,1.131579,1.105263,25.894737,0.868421,0.631579,0.342105,0.368421,0.157895,0.263158,1.210526,0.052632,0.578947,0.394737,0.263158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
3,p-00004,t-00001,48.578947,20.5,0.184211,0.210526,0.184211,0.0,0.0,0.0,0.0,0.160526,0.160526,0.142105,1.605263,3.526316,4.236842,430.026316,113.921053,14.631579,16.5,10.263158,12.026316,1.815789,2.552632,0.139474,0.736842,2.684211,0.947368,0.078947,1.473684,0.657895,0.436842,0.0,0.0,38.947368,0.421053,4.578947,20.263158,14.526316,2.394737,38.947368,0.947368,0.763158,0.184211,28.473684,151.184211,71.289474,1.421053,0.473684,1.289474,0.973684,31.947368,0.921053,0.710526,0.473684,0.447368,0.0,0.210526,1.157895,0.078947,0.289474,0.631579,0.157895,0.026316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
4,p-00005,t-00013,57.947368,24.5,0.368421,0.105263,0.315789,0.052632,0.052632,0.026316,0.0,0.360526,0.318421,0.115789,1.026316,1.552632,6.131579,176.131579,31.184211,9.710526,11.842105,3.157895,4.473684,0.342105,0.657895,0.105263,0.947368,0.473684,0.815789,0.078947,1.736842,0.842105,0.357895,0.0,0.052632,25.789474,0.210526,1.526316,8.105263,16.315789,4.552632,25.736842,1.184211,0.868421,0.315789,16.342105,65.815789,23.842105,0.315789,0.710526,1.631579,1.315789,19.894737,0.631579,0.447368,0.157895,0.289474,0.184211,0.105263,0.710526,0.078947,0.421053,0.184211,0.131579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0


In [12]:
def create_per_90_stats(df: pd.DataFrame) -> pd.DataFrame:
	"""
		Divide every column listed in `pure_stats_columns` by 38.

		The scaled values are written back into `df` itself, and that same
		frame is returned.
	"""
	ninety_mins_per_season = 38

	scaled_stats = df[pure_stats_columns].div(ninety_mins_per_season)
	df.loc[:, pure_stats_columns] = scaled_stats
	return df

Normalize stats per 90 - the stats for each player taking into account their average involvement per 90

In [13]:
minutes_per_game = 90

# Weight each row's stats by its minutes_played value expressed as a share of 90 minutes.
involvement = df["minutes_played"] / minutes_per_game
df[pure_stats_columns] = df[pure_stats_columns].mul(involvement, axis=0)

# The playing-time columns have served their purpose once the weighting is applied.
df = df.drop(columns=["minutes_played", "ninetys"])
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[pure_stats_columns] = df[pure_stats_columns].apply(lambda x: x * (df["minutes_played"] / 90))


Unnamed: 0,player_id,team_id,goals,assists,non_penalty_goals,penalties_scored,penalties_attempted,yellow_cards,red_cards,expected_goals,non_penalty_expected_goals,expected_assisted_goals,progressive_carries,progressive_passes,progressive_passes_received,total_passing_distance,total_progressive_passing_distance,short_passes_completed,short_passes_attempted,medium_passes_completed,medium_passes_attempted,long_passes_completed,long_passes_attempted,expected_assists,key_passes,passes_into_final_third,passes_into_penalty_area,crosses_into_penalty_area,shots,shots_on_target,average_shot_distance,shots_from_free_kicks,shots_from_penalties,touches,touches_in_defensive_penalty_area,touches_in_defensive_third,touches_in_middle_third,touches_in_attacking_third,touches_in_attacking_penalty_area,live_ball_touches,take_ons_attempted,take_ons_succeeded,times_tackled_during_take_on,carries,total_carrying_distance,progressive_carrying_distance,carries_into_final_third,carries_into_penalty_area,miscontrols,dispossessed,passes_received,tackles,tackles_won,defensive_third_tackles,middle_third_tackles,attacking_third_tackles,dribblers_tackled,dribbler_tackles_attempted,shots_blocked,passes_blocked,interceptions,clearances,errors_leading_to_shot,goals_against,shots_on_target_against,saves,clean_sheets,penalties_faced,penalties_allowed,penalties_saved,penalties_missed,season,match_id,competition_id,home_team_id,away_team_id,referee_id,home_goals,away_goals,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_corners,away_corners,home_fouls,away_fouls,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards
0,p-00001,t-00001,0.002816,0.0,0.002816,0.0,0.0,0.0,0.0,0.000563,0.000563,0.0,0.0,0.016898,0.002816,12.025392,5.33398,0.20277,0.208403,0.377378,0.411173,0.047876,0.061958,0.0,0.0,0.016898,0.0,0.0,0.005633,0.005633,0.029289,0.0,0.0,0.81108,0.078855,0.397091,0.394275,0.025346,0.011265,0.81108,0.0,0.0,0.0,0.444968,1.363066,0.723777,0.0,0.0,0.005633,0.002816,0.504109,0.005633,0.005633,0.0,0.005633,0.0,0.0,0.002816,0.008449,0.002816,0.016898,0.061958,0.005633,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
1,p-00002,t-00001,0.034241,0.0,0.034241,0.0,0.0,0.068483,0.0,0.032529,0.032529,0.006848,0.428016,2.122961,0.171207,531.151123,205.927208,8.954101,9.827255,14.398469,15.579794,3.167321,4.126077,0.011984,0.119845,2.345529,0.051362,0.0,0.154086,0.034241,0.160934,0.0,0.0,35.285665,2.790666,14.792244,19.003924,1.643583,0.27393,35.285665,0.051362,0.034241,0.017121,21.452178,104.761273,61.13785,0.428016,0.0,0.205448,0.068483,23.249846,0.736188,0.359534,0.462258,0.205448,0.068483,0.325292,0.564982,0.25681,0.222568,0.924515,2.36265,0.051362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
2,p-00003,t-00001,0.039612,0.118837,0.039612,0.0,0.0,0.026408,0.0,0.030369,0.030369,0.076584,0.977101,1.531671,1.980609,149.879317,37.961681,6.126685,7.156602,3.287812,4.080055,0.528163,1.399631,0.058098,0.580979,0.924284,0.42253,0.052816,0.475346,0.211265,0.529483,0.013204,0.0,17.086057,0.118837,1.822161,7.209418,8.278947,0.765836,17.086057,0.739428,0.396122,0.343306,10.457618,62.521237,33.115789,0.91108,0.171653,0.567775,0.554571,12.992798,0.435734,0.316898,0.171653,0.184857,0.079224,0.132041,0.607387,0.026408,0.290489,0.198061,0.132041,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
3,p-00004,t-00001,0.099431,0.113635,0.099431,0.0,0.0,0.0,0.0,0.086647,0.086647,0.076704,0.866467,1.903386,2.286904,232.11362,61.49072,7.89763,8.90614,5.539705,6.491397,0.980102,1.377824,0.075283,0.397722,1.448846,0.511357,0.042613,0.795445,0.355109,0.235793,0.0,0.0,21.022468,0.22727,2.47156,10.937365,7.840813,1.292598,21.022468,0.511357,0.411927,0.099431,15.369129,81.604109,38.47964,0.767036,0.255679,0.696014,0.525562,17.244106,0.497153,0.383518,0.255679,0.241474,0.0,0.113635,0.624992,0.042613,0.156248,0.340905,0.085226,0.014204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
4,p-00005,t-00013,0.237211,0.067775,0.203324,0.033887,0.033887,0.016944,0.0,0.232128,0.205018,0.074552,0.660803,0.999677,3.947876,113.404017,20.078255,6.252216,7.624654,2.033241,2.880425,0.220268,0.423592,0.067775,0.609972,0.304986,0.525254,0.050831,1.118283,0.542198,0.230434,0.0,0.033887,16.604801,0.135549,0.982733,5.218652,10.505078,2.931256,16.570914,0.762465,0.559141,0.203324,10.522022,42.376131,15.35097,0.203324,0.457479,1.050508,0.847184,12.809418,0.406648,0.288042,0.101662,0.18638,0.118606,0.067775,0.457479,0.050831,0.271099,0.118606,0.084718,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0


In [14]:
def create_contribution_per_90_stats(df: pd.DataFrame) -> pd.DataFrame:
	"""
		Weight the pure stats columns by each row's share of a 90 minute game.

		Every column in `pure_stats_columns` is multiplied by `minutes_played / 90`
		(written back into `df`), after which a frame without the "minutes_played"
		and "ninetys" columns is returned.
	"""
	minutes_per_game = 90

	involvement = df["minutes_played"] / minutes_per_game
	df[pure_stats_columns] = df[pure_stats_columns].mul(involvement, axis=0)
	return df.drop(columns=["minutes_played", "ninetys"])

In [15]:
df = df.drop(columns=["player_id"])

# Sum the stats per team; the totals get a fresh 0..n-1 index from reset_index.
team_totals = df[team_stats_columns].groupby("team_id").sum().reset_index()

# Index-aligned write-back: the totals land on the first rows of the frame.
df[team_stats_columns] = team_totals

# Keep only as many leading rows as there are teams.
df = df[df.index < df["team_id"].nunique()]
df

Unnamed: 0,team_id,goals,assists,non_penalty_goals,penalties_scored,penalties_attempted,yellow_cards,red_cards,expected_goals,non_penalty_expected_goals,expected_assisted_goals,progressive_carries,progressive_passes,progressive_passes_received,total_passing_distance,total_progressive_passing_distance,short_passes_completed,short_passes_attempted,medium_passes_completed,medium_passes_attempted,long_passes_completed,long_passes_attempted,expected_assists,key_passes,passes_into_final_third,passes_into_penalty_area,crosses_into_penalty_area,shots,shots_on_target,average_shot_distance,shots_from_free_kicks,shots_from_penalties,touches,touches_in_defensive_penalty_area,touches_in_defensive_third,touches_in_middle_third,touches_in_attacking_third,touches_in_attacking_penalty_area,live_ball_touches,take_ons_attempted,take_ons_succeeded,times_tackled_during_take_on,carries,total_carrying_distance,progressive_carrying_distance,carries_into_final_third,carries_into_penalty_area,miscontrols,dispossessed,passes_received,tackles,tackles_won,defensive_third_tackles,middle_third_tackles,attacking_third_tackles,dribblers_tackled,dribbler_tackles_attempted,shots_blocked,passes_blocked,interceptions,clearances,errors_leading_to_shot,goals_against,shots_on_target_against,saves,clean_sheets,penalties_faced,penalties_allowed,penalties_saved,penalties_missed,season,match_id,competition_id,home_team_id,away_team_id,referee_id,home_goals,away_goals,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_corners,away_corners,home_fouls,away_fouls,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards
0,t-00001,1.036188,0.74309,0.903132,0.133056,0.166174,0.645576,0.049546,0.930222,0.797282,0.703223,12.513789,22.194275,25.555579,3683.514635,1430.09012,94.039281,108.610826,82.654909,100.451485,23.553024,44.953139,0.649788,6.216721,16.859172,5.045191,1.083934,6.598092,2.469167,4.601497,0.153786,0.133056,325.898076,36.879678,92.761919,141.628909,94.846768,12.544114,325.731902,9.290643,5.655609,3.610049,186.648584,1099.123261,573.273346,9.857618,2.524369,7.385426,4.746953,214.148892,6.750608,4.569021,3.303309,2.654524,0.792775,2.103555,6.5014,1.448392,4.040682,5.102255,11.676693,0.363227,1.146045,3.219544,2.026731,0.265205,0.142421,0.119037,0.023384,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0
1,t-00013,1.21452,1.147207,1.161042,0.053478,0.073069,1.405871,0.103363,1.166705,1.106291,0.96683,15.806833,42.490628,37.508872,6695.30157,2411.770552,169.980117,194.868729,150.344437,179.053193,42.238974,76.634811,0.988176,7.940905,31.822561,8.548638,1.261404,11.619152,3.986711,6.336347,0.662396,0.053478,569.389166,46.329663,154.646391,272.715774,147.183818,20.12817,569.316097,14.569691,9.448923,5.102909,325.186365,1760.99436,903.282071,13.116028,4.141613,11.666828,8.910311,367.248807,15.693352,10.67954,8.222845,5.888943,1.581564,5.544945,14.691936,2.423515,6.082448,9.043475,21.098576,0.49763,1.080602,3.166436,2.064404,0.182518,0.068444,0.022815,0.045629,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0


In [16]:
def group_stats_by_team(df: pd.DataFrame) -> pd.DataFrame:
	"""
		Sum the player rows into one row per team.

		"player_id" is dropped, the columns in `team_stats_columns` are summed per
		"team_id", and the totals are written back by index onto the leading rows.
		Only as many leading rows as there are distinct team ids are returned.
	"""
	team_level = df.drop(columns=["player_id"])

	team_totals = team_level[team_stats_columns].groupby("team_id").sum().reset_index()
	team_level[team_stats_columns] = team_totals

	number_of_teams = team_level["team_id"].nunique()
	return team_level[team_level.index < number_of_teams]

Combine the rows for each team into a single row, by subtracting the away team values from the home team values

In [17]:
home = df["home_team_id"].unique().tolist()[0]
away = df["away_team_id"].unique().tolist()[0]

columns = df.columns.to_list()

# Row selectors for the two teams, built once and reused for every column.
home_rows = df["team_id"] == home
away_rows = df["team_id"] == away

final_df = {}
for column in columns:
	home_value = df.loc[home_rows, column].iloc[0]
	if column in pure_stats_columns:
		# Stat columns become the home value minus the away value.
		final_df[column] = home_value - df.loc[away_rows, column].iloc[0]
	else:
		# Every other column is copied from the home team's row.
		final_df[column] = home_value

result = pd.DataFrame(final_df, index=[0]).drop(columns=["team_id"])
result

Unnamed: 0,goals,assists,non_penalty_goals,penalties_scored,penalties_attempted,yellow_cards,red_cards,expected_goals,non_penalty_expected_goals,expected_assisted_goals,progressive_carries,progressive_passes,progressive_passes_received,total_passing_distance,total_progressive_passing_distance,short_passes_completed,short_passes_attempted,medium_passes_completed,medium_passes_attempted,long_passes_completed,long_passes_attempted,expected_assists,key_passes,passes_into_final_third,passes_into_penalty_area,crosses_into_penalty_area,shots,shots_on_target,average_shot_distance,shots_from_free_kicks,shots_from_penalties,touches,touches_in_defensive_penalty_area,touches_in_defensive_third,touches_in_middle_third,touches_in_attacking_third,touches_in_attacking_penalty_area,live_ball_touches,take_ons_attempted,take_ons_succeeded,times_tackled_during_take_on,carries,total_carrying_distance,progressive_carrying_distance,carries_into_final_third,carries_into_penalty_area,miscontrols,dispossessed,passes_received,tackles,tackles_won,defensive_third_tackles,middle_third_tackles,attacking_third_tackles,dribblers_tackled,dribbler_tackles_attempted,shots_blocked,passes_blocked,interceptions,clearances,errors_leading_to_shot,goals_against,shots_on_target_against,saves,clean_sheets,penalties_faced,penalties_allowed,penalties_saved,penalties_missed,season,match_id,competition_id,home_team_id,away_team_id,referee_id,home_goals,away_goals,home_shots,away_shots,home_shots_on_target,away_shots_on_target,home_corners,away_corners,home_fouls,away_fouls,home_yellow_cards,away_yellow_cards,home_red_cards,away_red_cards
0,-0.178332,-0.404117,-0.25791,0.079578,0.093106,-0.760295,-0.053817,-0.236484,-0.309009,-0.263606,-3.293044,-20.296353,-11.953293,-3011.786934,-981.680432,-75.940836,-86.257902,-67.689528,-78.601708,-18.68595,-31.681671,-0.338387,-1.724184,-14.963389,-3.503447,-0.17747,-5.02106,-1.517544,-1.73485,-0.50861,0.079578,-243.49109,-9.449985,-61.884472,-131.086865,-52.33705,-7.584057,-243.584195,-5.279047,-3.793313,-1.492859,-138.537781,-661.871099,-330.008726,-3.25841,-1.617244,-4.281402,-4.163358,-153.099915,-8.942744,-6.110519,-4.919537,-3.234418,-0.788789,-3.44139,-8.190536,-0.975123,-2.041767,-3.94122,-9.421884,-0.134403,0.065443,0.053109,-0.037673,0.082687,0.073977,0.096222,-0.022245,0.0,2017-2018,m-00001,1,t-00001,t-00013,r-00001,4,3,27,6,10,3,9,4,9,12,0,1,0,0


In [18]:
def convert_team_rows_to_single_row(df: pd.DataFrame, stats_columns=None) -> pd.DataFrame:
	"""
		Collapse the per-team rows of a single match into one row.

		Columns listed in `stats_columns` become the home team's value minus the away team's value.
		Every other column takes the value from the home team's row.

		:param df: dataframe holding a "team_id" column plus "home_team_id" and "away_team_id" columns;
			the home and away ids are read from the first row
		:param stats_columns: names of the columns to express as a home - away difference.
			When omitted, the notebook-level `pure_stats_columns` list is used, so existing
			calls keep working unchanged
		:return: a one-row dataframe (index 0) without the "team_id" column
		:raises IndexError: if `df` is empty or a required team has no row
	"""
	if stats_columns is None:
		# Fall back to the notebook global; it must be defined before the call
		stats_columns = pure_stats_columns
	difference_columns = set(stats_columns)

	home = df["home_team_id"].iloc[0]
	away = df["away_team_id"].iloc[0]

	# The row selectors do not depend on the column, so build them once
	is_home_row = df["team_id"] == home
	is_away_row = df["team_id"] == away

	single_row = {}
	for column in df.columns:
		home_value = df.loc[is_home_row, column].iloc[0]
		if column in difference_columns:
			single_row[column] = home_value - df.loc[is_away_row, column].iloc[0]
		else:
			single_row[column] = home_value

	return pd.DataFrame(single_row, index=[0]).drop(columns=["team_id"])


Carry out this process for every match and generate a complete dataset covering all of them.

Entire method

In [19]:
all_matches = db.get_df("SELECT * FROM match")
match_values = get_match_column_values(all_matches)

# Substrings identifying derived columns that are dropped before aggregation.
# Note "_minus" has no trailing underscore, so it matches any column containing "_minus".
columns_to_remove = ["_plus_", "_minus", "_divided_by_",]

# ---------------------------------------------------------------------------
# Column lists. They are the same for every match, so they are built once here
# instead of on every loop iteration. All of them stay notebook-level names
# because functions defined in other cells may read them as globals.
# ---------------------------------------------------------------------------
output_columns = [
	"home_goals", "away_goals", "home_shots", "away_shots", "home_shots_on_target", "away_shots_on_target",
	"home_corners", "away_corners", "home_fouls", "away_fouls", "home_yellow_cards", "away_yellow_cards",
	"home_red_cards", "away_red_cards"
]
match_columns = ["match_id", "competition_id", "home_team_id", "away_team_id", "referee_id"] + output_columns

# Single source of truth for the stat columns; the other lists only add leading id/minutes columns
pure_stats_columns_no_minutes = [
	"goals","assists","non_penalty_goals","penalties_scored","penalties_attempted","yellow_cards","red_cards","expected_goals",
	"non_penalty_expected_goals","expected_assisted_goals","progressive_carries","progressive_passes","progressive_passes_received","total_passing_distance","total_progressive_passing_distance","short_passes_completed","short_passes_attempted","medium_passes_completed","medium_passes_attempted",
	"long_passes_completed","long_passes_attempted","expected_assists","key_passes","passes_into_final_third","passes_into_penalty_area","crosses_into_penalty_area","shots","shots_on_target","average_shot_distance","shots_from_free_kicks",
	"shots_from_penalties","touches","touches_in_defensive_penalty_area","touches_in_defensive_third","touches_in_middle_third","touches_in_attacking_third","touches_in_attacking_penalty_area","live_ball_touches","take_ons_attempted","take_ons_succeeded","times_tackled_during_take_on",
	"carries","total_carrying_distance","progressive_carrying_distance","carries_into_final_third","carries_into_penalty_area","miscontrols","dispossessed","passes_received","tackles","tackles_won","defensive_third_tackles",
	"middle_third_tackles","attacking_third_tackles","dribblers_tackled","dribbler_tackles_attempted","shots_blocked","passes_blocked","interceptions","clearances","errors_leading_to_shot","goals_against","shots_on_target_against","saves","clean_sheets","penalties_faced","penalties_allowed","penalties_saved","penalties_missed"
]
pure_stats_columns = ["minutes_played"] + pure_stats_columns_no_minutes
player_stats_columns = ["player_id", "minutes_played", "ninetys"] + pure_stats_columns_no_minutes
team_stats_columns = ["team_id"] + pure_stats_columns_no_minutes


def _player_stats_to_match_row(player_stats_df: pd.DataFrame):
	"""
		Reduce the player rows of one match to a single home-vs-away row.

		:param player_stats_df: output of create_player_stats_for_match for one match
		:return: the one-row dataframe, or None when fewer than two teams are represented
	"""
	derived_columns = [col for col in player_stats_df.columns if any(word in col for word in columns_to_remove)]
	stats_df = player_stats_df.drop(columns=derived_columns)

	stats_df = group_stats_by_player_for_home_and_away_teams(stats_df)

	# Both teams need players with recorded stats, otherwise no home/away comparison is possible
	if stats_df["team_id"].nunique() < 2:
		return None

	stats_df = create_per_90_stats(stats_df)
	stats_df = create_contribution_per_90_stats(stats_df)
	stats_df = group_stats_by_team(stats_df)
	return convert_team_rows_to_single_row(stats_df)


career_rows = []
form_rows = []

for match in match_values:

	season = match[2]
	home_team_id = match[0]
	away_team_id = match[1]
	match_id = match[3]

	career_df = create_player_stats_for_match(season, home_team_id, away_team_id, "<")
	form_df = create_player_stats_for_match(season, home_team_id, away_team_id, "=")

	if career_df.empty or form_df.empty:
		continue

	career_row = _player_stats_to_match_row(career_df)
	if career_row is None:
		continue

	form_row = _player_stats_to_match_row(form_df)
	if form_row is None:
		continue

	# A match is only kept when BOTH its career and form rows exist. The later
	# weighted combination adds the two together, so a match with only one of
	# them would end up with a partially weighted feature row.
	career_rows.append(career_row)
	form_rows.append(form_row)

# Concatenate once at the end instead of growing the frames inside the loop
complete_player_career_stats_for_match_df = pd.concat(career_rows) if career_rows else pd.DataFrame()
complete_player_form_stats_for_match_df = pd.concat(form_rows) if form_rows else pd.DataFrame()


Combine the form and career stats for all players at a ratio of 2:3 (form:career), i.e. a weight of 0.4 for form and 0.6 for career.

In [20]:
career_stats_ratio = 0.6
form_stats_ratio = 0.4

# The weighted sum below adds one career row and one form row per match.
# A match present in only one of the two frames would get a partially weighted
# row, so only matches found in both frames are combined.
matches_with_both = (
	set(complete_player_career_stats_for_match_df["match_id"])
	& set(complete_player_form_stats_for_match_df["match_id"])
)

# Work on copies so the per-match frames built earlier are left untouched
career_stats = complete_player_career_stats_for_match_df[
	complete_player_career_stats_for_match_df["match_id"].isin(matches_with_both)
].copy(deep=True)
form_stats = complete_player_form_stats_for_match_df[
	complete_player_form_stats_for_match_df["match_id"].isin(matches_with_both)
].copy(deep=True)

career_stats[pure_stats_columns_no_minutes] = career_stats[pure_stats_columns_no_minutes] * career_stats_ratio
form_stats[pure_stats_columns_no_minutes] = form_stats[pure_stats_columns_no_minutes] * form_stats_ratio

all_stats = pd.concat([career_stats, form_stats])
# Combined stats for all the players on both teams
all_match_stats = all_stats[pure_stats_columns_no_minutes + ["match_id"]]
# Match facts for all games (one row per match; the first occurrence is kept)
all_match_facts = all_stats[match_columns].drop_duplicates(subset='match_id')

# Summing the weighted career and form rows of each match gives the blended feature row
combined = all_match_stats.groupby("match_id").sum().reset_index()
combined = combined.merge(all_match_facts, how="inner", on=["match_id"])

In [21]:
# Write the dataset first so the preview is the cell's last expression and is actually rendered.
# to_csv is left at its default index=True, so the row index is written as an unnamed first column.
combined.to_csv("../final_combined_dataframe.csv")
combined.head()

# Conclusion