# Creating the dataset

The purpose of this notebook is to establish the process of converting the rows for all the players in both squads in the game, to a single row for use as the neural network input

## Imports

In [1]:
import pandas as pd
import numpy as np
from db_connection import local_pl_stats_connector
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import plotly.express as px

## Method

Create database connection variable

In [2]:
pd.set_option("display.max_rows", None)

db = local_pl_stats_connector

Generate a dataframe containing all the career stats of the players (up to and including season of the game in question) 
playing in a specific match

In [3]:
def create_player_stats_for_match(game_season: str, home_team_id: str, away_team_id: str) -> pd.DataFrame:
	return db.get_df(f"""
		SELECT * FROM historic_player_per_ninety
		WHERE player_id IN (
			SELECT player_id, m.id AS match_id FROM historic_player_per_ninety hpn
			JOIN match m
				ON m.season = '{game_season}'
				AND m.home_team_id = '{home_team_id}'
				AND m.away_team_id = '{away_team_id}'
				AND hpn.team_id IN (m.home_team_id, m.away_team_id)
			WHERE hpn.season <= '{game_season}'
		)
		JOIN match m
			ON m.season = '{game_season}'
			AND m.home_team_id = '{home_team_id}'
			AND m.away_team_id = '{away_team_id}'
	""")

Collect match facts for a particular game based on the season it occurred and the home and away teams

In [4]:
def create_match_facts_for_match(game_season: str, home_team_id: str, away_team_id: str) -> pd.DataFrame:
	"""
		Generate a dataframe containing all the match facts based on the season and teams
	"""
	return db.get_df(f"""
		SELECT * FROM match 
		WHERE season = '{game_season}' and home_team_id = '{home_team_id}' and away_team_id = '{away_team_id}'
	""")

Create a list of lists of the home and away team ids, and the season for all the games in the matches table, for use in matching the players to the correct matches and teams

In [5]:
def get_match_column_values(all_matches: pd.DataFrame) -> list:
	columns_to_extract = ["home_team_id", "away_team_id", "season"]
	values_list = []

	for index, row in all_matches.iterrows():
		row_values = [row[column] for column in columns_to_extract]
		values_list.append(row_values)

	return values_list

In [6]:
all_matches = db.get_df("SELECT * FROM match")
match_values = get_match_column_values(all_matches)

complete_dataset = pd.DataFrame()
df = ''

for match in match_values:

	season = match[2]
	home_team_id = match[0]
	away_team_id = match[1]

	df = create_player_stats_for_match(season, home_team_id, away_team_id)
	# combine all rows for all the players into one row row the match
	print(match)
	break
df.sort_values("player_id")

['t-00001', 't-00013', '2017-2018']


Unnamed: 0,player_id,minutes_played,ninetys,goals,assists,goals_plus_assists,non_penalty_goals,penalties_scored,penalties_attempted,yellow_cards,...,goals_against,shots_on_target_against,saves,clean_sheets,penalties_faced,penalties_allowed,penalties_saved,penalties_missed,season,team_id
343,p-00001,366.0,4.1,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,t-00001
270,p-00002,2225.0,24.7,2.0,0.0,2.0,2.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,t-00001
354,p-00003,922.0,10.2,1.0,5.0,6.0,1.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,t-00001
351,p-00003,794.0,8.8,2.0,4.0,6.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,t-00009
353,p-00003,922.0,10.2,1.0,5.0,6.0,1.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,t-00009
352,p-00003,794.0,8.8,2.0,4.0,6.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,t-00001
418,p-00004,1846.0,20.5,7.0,8.0,15.0,7.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,t-00001
275,p-00005,2202.0,24.5,14.0,4.0,18.0,12.0,2.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,t-00001
550,p-00006,1193.0,13.3,1.0,3.0,4.0,1.0,0.0,0.0,6.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,t-00001
399,p-00007,2163.0,24.0,4.0,8.0,12.0,4.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-2018,t-00001


Columns being excluded, along with the output columns for the NN

In [7]:
output_columns = [
	"home_goals", "away_goals", "home_shots", "away_shots", "home_shots_on_target", "away_shots_on_target",
	"home_corners", "away_corners", "home_fouls", "away_fouls", "home_yellow_cards", "away_yellow_cards",
	"home_red_cards", "away_red_cards"
]

columns_to_remove = ["_plus_", "_minus", "_divided_by_",]

career_columns = [
	"player_id", "minutes_played","ninetys","goals","assists","non_penalty_goals","penalties_scored","penalties_attempted","yellow_cards","red_cards","expected_goals",
	"non_penalty_expected_goals","expected_assisted_goals","progressive_carries","progressive_passes","progressive_passes_received","total_passing_distance","total_progressive_passing_distance","short_passes_completed","short_passes_attempted","medium_passes_completed","medium_passes_attempted",
	"long_passes_completed","long_passes_attempted","expected_assists","key_passes","passes_into_final_third","passes_into_penalty_area","crosses_into_penalty_area","shots","shots_on_target","average_shot_distance","shots_from_free_kicks",
	"shots_from_penalties","touches","touches_in_defensive_penalty_area","touches_in_defensive_third","touches_in_middle_third","touches_in_attacking_third","touches_in_attacking_penalty_area","live_ball_touches","take_ons_attempted","take_ons_succeeded","times_tackled_during_take_on",
	"carries","total_carrying_distance","progressive_carrying_distance","carries_into_final_third","carries_into_penalty_area","miscontrols","dispossessed","passes_received","tackles","tackles_won","defensive_third_tackles",
	"middle_third_tackles","attacking_third_tackles","dribblers_tackled","dribbler_tackles_attempted","shots_blocked","passes_blocked","interceptions","clearances","errors_leading_to_shot","goals_against","shots_on_target_against","saves","clean_sheets","penalties_faced","penalties_allowed","penalties_saved","penalties_missed","season", "team_id", "match_id"
]
player_stats_columns = [
	"player_id", "minutes_played","ninetys","goals","assists","non_penalty_goals","penalties_scored","penalties_attempted","yellow_cards","red_cards","expected_goals",
	"non_penalty_expected_goals","expected_assisted_goals","progressive_carries","progressive_passes","progressive_passes_received","total_passing_distance","total_progressive_passing_distance","short_passes_completed","short_passes_attempted","medium_passes_completed","medium_passes_attempted",
	"long_passes_completed","long_passes_attempted","expected_assists","key_passes","passes_into_final_third","passes_into_penalty_area","crosses_into_penalty_area","shots","shots_on_target","average_shot_distance","shots_from_free_kicks",
	"shots_from_penalties","touches","touches_in_defensive_penalty_area","touches_in_defensive_third","touches_in_middle_third","touches_in_attacking_third","touches_in_attacking_penalty_area","live_ball_touches","take_ons_attempted","take_ons_succeeded","times_tackled_during_take_on",
	"carries","total_carrying_distance","progressive_carrying_distance","carries_into_final_third","carries_into_penalty_area","miscontrols","dispossessed","passes_received","tackles","tackles_won","defensive_third_tackles",
	"middle_third_tackles","attacking_third_tackles","dribblers_tackled","dribbler_tackles_attempted","shots_blocked","passes_blocked","interceptions","clearances","errors_leading_to_shot","goals_against","shots_on_target_against","saves","clean_sheets","penalties_faced","penalties_allowed","penalties_saved","penalties_missed"
]

Remove columns with operations

In [8]:
columns = [col for col in df.columns if any(word in col for word in columns_to_remove)]
df = df.drop(columns=columns)

Group by the player id and calculate the yearly mean of each stat over their careers

In [9]:
player_career_stats = df[player_stats_columns].groupby("player_id").sum() / df.groupby("player_id")["season"].nunique()
player_career_stats

Unnamed: 0_level_0,assists,attacking_third_tackles,average_shot_distance,carries,carries_into_final_third,carries_into_penalty_area,clean_sheets,clearances,crosses_into_penalty_area,defensive_third_tackles,...,total_carrying_distance,total_passing_distance,total_progressive_passing_distance,touches,touches_in_attacking_penalty_area,touches_in_attacking_third,touches_in_defensive_penalty_area,touches_in_defensive_third,touches_in_middle_third,yellow_cards
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p-00001,,,,,,,,,,,...,,,,,,,,,,
p-00002,,,,,,,,,,,...,,,,,,,,,,
p-00003,,,,,,,,,,,...,,,,,,,,,,
p-00004,,,,,,,,,,,...,,,,,,,,,,
p-00005,,,,,,,,,,,...,,,,,,,,,,
p-00006,,,,,,,,,,,...,,,,,,,,,,
p-00007,,,,,,,,,,,...,,,,,,,,,,
p-00008,,,,,,,,,,,...,,,,,,,,,,
p-00009,,,,,,,,,,,...,,,,,,,,,,
p-00010,,,,,,,,,,,...,,,,,,,,,,


In [10]:
df.groupby("player_id")["season"].count()

player_id
p-00001    1
p-00002    1
p-00003    4
p-00004    1
p-00005    1
p-00006    1
p-00007    1
p-00008    1
p-00009    1
p-00010    1
p-00011    1
p-00012    1
p-00014    1
p-00015    1
p-00016    1
p-00017    1
p-00018    1
p-00019    1
p-00020    1
p-00021    1
p-00022    1
p-00023    1
p-00035    1
p-00036    1
p-00039    1
p-00049    2
p-00050    4
p-00054    1
p-00056    4
p-00057    4
p-00061    4
p-00067    1
p-00068    1
p-00069    1
p-00070    1
p-00071    1
p-00072    1
p-00073    1
p-00074    1
p-00075    1
p-00077    1
p-00078    1
p-00079    1
p-00080    1
p-00081    1
p-00082    1
p-00083    1
p-00084    1
p-00085    1
p-00087    1
p-00089    1
p-00093    1
p-00116    1
p-00118    1
p-00119    1
p-00120    1
p-00121    1
p-00122    1
p-00123    1
p-00124    1
p-00126    1
p-00127    1
p-00128    1
p-00130    1
p-00132    4
p-00133    1
p-00134    1
p-00135    1
p-00136    1
p-00137    1
p-00138    1
p-00139    1
p-00140    1
p-00141    1
p-00143    1
p-00144    1
p-

In [11]:
df[player_stats_columns].groupby("player_id").sum()

Unnamed: 0_level_0,minutes_played,ninetys,goals,assists,non_penalty_goals,penalties_scored,penalties_attempted,yellow_cards,red_cards,expected_goals,...,clearances,errors_leading_to_shot,goals_against,shots_on_target_against,saves,clean_sheets,penalties_faced,penalties_allowed,penalties_saved,penalties_missed
player_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
p-00001,366.0,4.1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.2,...,22.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p-00002,2225.0,24.7,2.0,0.0,2.0,0.0,0.0,4.0,0.0,1.9,...,138.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p-00003,3432.0,38.0,6.0,18.0,6.0,0.0,0.0,4.0,0.0,4.6,...,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p-00004,1846.0,20.5,7.0,8.0,7.0,0.0,0.0,0.0,0.0,6.1,...,6.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p-00005,2202.0,24.5,14.0,4.0,12.0,2.0,2.0,1.0,0.0,13.7,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p-00006,1193.0,13.3,1.0,3.0,1.0,0.0,0.0,6.0,0.0,1.1,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p-00007,2163.0,24.0,4.0,8.0,4.0,0.0,0.0,4.0,0.0,4.4,...,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p-00008,381.0,4.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,3.0,12.0,9.0,2.0,0.0,0.0,0.0,0.0
p-00009,1057.0,11.7,10.0,4.0,9.0,1.0,2.0,0.0,0.0,8.4,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
p-00010,822.0,9.1,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.1,...,40.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Calculate the proportion each row contributes to the singular