In [1]:
# Standard library
import os

# Third-party. Raise the pandas display limits to 60 columns / 300 rows so the
# wide match-stats frames and long null-count listings below are not truncated.
import pandas as pd
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_rows', 300)

# Project-local modules
import etl
import utils as u
# Load project configuration through the local utils helper.
# NOTE(review): presumably 'data' selects a data-specific config and the
# no-argument call returns the general one -- confirm against utils.get_config.
data_cfg = u.get_config('data')
cfg = u.get_config()

---

## Filter match stats function

#### (Note: I rerun the cells below each time I add a filter to the function, so to reproduce one of my checks, comment out the filter in the function that relates to that specific check)

##### I'll try to leave the checks below in the order in which I added the filters to the function

### Filters to perform
- Get only rows with at least 1 match stat - COMPLETE
- Get only rows where match is finished - COMPLETE
    - NOTE: Get matches that were in progress during the run and delete from logs then rerun to get that data
- Get only matches where the home and away team are both from the subset of teams we are interested in - COMPLETE

#### Investigating data

In [2]:
# Load the three raw CSV exports; the order of the paths below must match the
# order of the names on the left-hand side.
# NOTE(review): the saved output shows the tail of a warning emitted while
# loading -- possibly a pandas DtypeWarning about mixed-type columns; confirm
# and, if so, consider passing explicit dtypes.
match_events_raw, match_stats_raw, match_events_count_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/csv_of_json_data/match_events_raw.csv",
        "data/csv_of_json_data/match_stats_raw.csv",
        "data/csv_of_json_data/match_events_count_raw.csv",
    )
)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Build the working match-stats frame as one filter pipeline.
# Ids of the subset of teams we are interested in.
key_team_ids = u.get_ids_of_key_teams()

match_stats_filtered = (
    match_stats_raw
    # Keep only rows flagged as having match stats, then drop the flag column.
    .loc[match_stats_raw.has_match_stats]
    .drop(columns='has_match_stats')
    # Filter out rows that aren't "Match Finished"
    .loc[lambda stats: stats.fixture_status == "Match Finished"]
    # Keep only fixtures where both the home and the away side are key teams.
    .loc[lambda stats: stats['home_team_id'].isin(key_team_ids)
         & stats['away_team_id'].isin(key_team_ids)]
    # Remove 2 fixtures that are missing most data (can't do anything to fix these)
    .loc[lambda stats: stats.home_possession.notnull()]
    .copy()
)

In [4]:
# (rows, columns) of the filtered match-stats frame at this point in the notebook.
match_stats_filtered.shape

(7016, 54)

In [5]:
# Null count per column, restricted to fixtures whose league_type is 'League'.
is_league_fixture = match_stats_filtered.league_type == 'League'
match_stats_filtered.loc[is_league_fixture].isna().sum()

fixture_id               0
country                  0
league_name              0
league_id                0
league_type              0
league_season            0
fixture_date             0
fixture_round            0
fixture_status           0
fixture_elapsed          0
fixture_venue            0
fixture_referee       2671
fixture_result_ht        1
fixture_result_ft        0
fixture_result_et     6515
fixture_result_pen    6515
home_team_name           0
home_team_id             0
away_team_name           0
away_team_id             0
home_goals               0
away_goals               0
home_shots_ont           0
away_shots_ont           0
home_shots_offt          0
away_shots_offt          0
home_shots_tot           2
away_shots_tot           2
home_shots_inb           2
away_shots_inb           2
home_shots_outb          3
away_shots_outb          3
home_passes_acc          2
away_passes_acc          2
home_passes_tot          2
away_passes_tot          2
home_passes_pct       1093
a

In [6]:
# Null count per column, restricted to fixtures whose league_type is 'Cup'.
is_cup_fixture = match_stats_filtered.league_type == 'Cup'
match_stats_filtered.loc[is_cup_fixture].isna().sum()

fixture_id              0
country                 0
league_name             0
league_id               0
league_type             0
league_season           0
fixture_date            0
fixture_round           0
fixture_status          0
fixture_elapsed         0
fixture_venue           0
fixture_referee       165
fixture_result_ht       0
fixture_result_ft       0
fixture_result_et     460
fixture_result_pen    472
home_team_name          0
home_team_id            0
away_team_name          0
away_team_id            0
home_goals              0
away_goals              0
home_shots_ont          0
away_shots_ont          0
home_shots_offt         0
away_shots_offt         0
home_shots_tot          0
away_shots_tot          0
home_shots_inb         85
away_shots_inb         85
home_shots_outb        85
away_shots_outb        85
home_passes_acc        85
away_passes_acc        85
home_passes_tot        85
away_passes_tot        85
home_passes_pct       185
away_passes_pct       185
home_possess

In [7]:
# Rows that have no total-shots figure for the home side.
home_shots_tot_missing = match_stats_filtered.home_shots_tot.isna()
match_stats_filtered.loc[home_shots_tot_missing]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
4054,27411,Spain,Primera Division,87,League,2018,17/03/2019,Regular Season - 28,Match Finished,90,Estadio Municipal de Ipurúa (Eibar),David Medié,0-0,1-2,,,Eibar,545,Valladolid,720,1.0,2.0,3.0,3.0,9.0,4.0,,,,,,,,,,,,,0.58,0.42,5.0,0.0,4.0,5.0,16.0,14.0,4.0,2.0,0.0,0.0,1.0,2.0,2.0,1.0
4576,29264,Italy,Serie A,94,League,2018,17/03/2019,Regular Season - 28,Match Finished,90,Stadio Comunale Luigi Ferraris (Genova),M. Di Bello,0-0,2-0,,,Genoa,495,Juventus,496,2.0,0.0,5.0,0.0,7.0,4.0,,,,,,,,,,,,,0.43,0.57,6.0,6.0,3.0,1.0,12.0,11.0,2.0,3.0,0.0,0.0,0.0,3.0,7.0,2.0


---

### 1) Look at the single result where ht is null

In [8]:
# Rows that have no half-time result recorded.
ht_result_missing = match_stats_filtered.fixture_result_ht.isna()
match_stats_filtered.loc[ht_result_missing]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
3766,21650,Italy,Serie A,66,League,2016,28/08/2016,Regular Season - 2,Match Finished,90,Stadio Città del Tricolore (Reggio Emilia),,,0-3,,,Sassuolo,488,Pescara,525,0.0,3.0,3.0,5.0,5.0,7.0,13.0,15.0,4.0,9.0,9.0,6.0,304.0,487.0,394.0,574.0,0.77,0.85,0.4,0.6,2.0,7.0,2.0,3.0,13.0,21.0,2.0,2.0,0.0,0.0,5.0,1.0,5.0,3.0


In [9]:
# Method to check that half time and full time scores are correct:
# list the goal events of fixture 21650 (the row with a null half-time result)
# whose elapsed time is at most 45 minutes.
is_target_fixture = match_events_raw.fixture_id == 21650
is_goal_event = match_events_raw.event_type == 'Goal'
within_first_45 = match_events_raw.event_time_elapsed <= 45

match_events_raw[is_target_fixture & is_goal_event & within_first_45]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,event_team_id,event_team_name,event_player_name,event_time_elapsed,event_time_elapsed_plus,event_type,event_detail,event_comments
151497,21650,Italy,Serie A,66,League,2016,28/08/2016,Regular Season - 2,Match Finished,90,Stadio Città del Tricolore (Reggio Emilia),,,0-3,,,Sassuolo,488,Pescara,525,0.0,3.0,488.0,Sassuolo,Grégoire Defrel,38.0,,Goal,Normal Goal,


##### Solution: I can use match events to correct this - No need to filter

---

## 2) Checks

#### Checks to perform for possible data filling
- Goals <= shots_on_target
- Total shots = Sum(shots_ont + shots_offt + shots_bl)
- Total shots = Sum(shots_inb + shots_outb)
- Passes_pct = passes_acc/passes_tot
- Sum(Home_possession + away_possession) = 1
- If home_gksaves == (shots on target - goals scored)

### 2.1) Goals <= shots_on_target

In [10]:
# Fixtures where either side is recorded with more goals than shots on target.
# Each one is printed as a "home -- away" stat sheet so it can be checked by hand.
goals_exceed_shots_ont = (
    (match_stats_filtered.home_goals > match_stats_filtered.home_shots_ont)
    | (match_stats_filtered.away_goals > match_stats_filtered.away_shots_ont))
fixture_ids_to_check = match_stats_filtered[goals_exceed_shots_ont]['fixture_id']

# The stat columns are the same for every fixture, so they are built once here
# instead of on every loop iteration.
home_columns = ['home_goals','home_shots_tot', 'home_shots_ont',
                'home_possession', 'home_passes_tot', 'home_passes_pct', 'home_fouls',
                'home_yc', 'home_rc', 'home_offsides', 'home_corners', 'home_gksaves', 'home_shots_bl']
away_columns = ['away_goals', 'away_shots_tot', 'away_shots_ont',
                'away_possession', 'away_passes_tot', 'away_passes_pct', 'away_fouls',
                'away_yc', 'away_rc', 'away_offsides', 'away_corners', 'away_gksaves', 'away_shots_bl']
# Row labels: the away column names with the 'away_' prefix stripped.
columns = [x.split('away_')[-1] for x in away_columns]

for fixture in fixture_ids_to_check:
    fixture_data = match_stats_filtered[match_stats_filtered.fixture_id == fixture]

    # Header line: date and the two teams (.values[0] takes the first matching row).
    print(f"{fixture_data.fixture_date.values[0]}, {fixture_data.home_team_name.values[0]} vs {fixture_data.away_team_name.values[0]}")

    # One line per stat: "<label>: <home value> -- <away value>".
    for label, home_col, away_col in zip(columns, home_columns, away_columns):
        print(f"{label}: {fixture_data[home_col].values[0]} -- {fixture_data[away_col].values[0]}")

    # Blank lines to separate consecutive fixtures in the output.
    print("\n\n\n\n\n")


12/01/2019, Burnley vs Fulham
goals: 2.0 -- 1.0
shots_tot: 11.0 -- 12.0
shots_ont: 1.0 -- 4.0
possession: 0.41 -- 0.59
passes_tot: 419.0 -- 564.0
passes_pct: 0.67 -- 0.8
fouls: 5.0 -- 9.0
yc: 1.0 -- 2.0
rc: 0.0 -- 0.0
offsides: 2.0 -- 0.0
corners: 2.0 -- 6.0
gksaves: 2.0 -- 0.0
shots_bl: 5.0 -- 4.0






27/02/2019, Chelsea vs Tottenham
goals: 2.0 -- 0.0
shots_tot: 11.0 -- 9.0
shots_ont: 1.0 -- 0.0
possession: 0.46 -- 0.54
passes_tot: 480.0 -- 546.0
passes_pct: 0.81 -- 0.87
fouls: 7.0 -- 14.0
yc: 1.0 -- 1.0
rc: 0.0 -- 0.0
offsides: 1.0 -- 1.0
corners: 2.0 -- 2.0
gksaves: nan -- nan
shots_bl: 2.0 -- 4.0






30/03/2019, Burnley vs Wolves
goals: 2.0 -- 0.0
shots_tot: 6.0 -- 8.0
shots_ont: 1.0 -- 1.0
possession: 0.37 -- 0.63
passes_tot: 322.0 -- 534.0
passes_pct: 0.64 -- 0.77
fouls: 12.0 -- 10.0
yc: 1.0 -- 1.0
rc: nan -- nan
offsides: 1.0 -- 2.0
corners: 1.0 -- 5.0
gksaves: 1.0 -- 0.0
shots_bl: 2.0 -- 4.0






22/10/2017, Udinese vs Juventus
goals: 2.0 -- 6.0
shots_tot: 12.0 -- 13.0
sho

##### Conclusion: Checked these fixtures with google results and all fine except for fixture 35777 (The others can be explained by OG's in the match)
##### Solution: Removing fixture 35777 because too incorrect so can't trust the other data which is harder to confirm like gksaves etc

In [11]:
# Manually removing Amiens vs Lille match (35777)
# fixture_id is cast to int before comparing, as in the rest of this check.
is_amiens_lille = match_stats_filtered.fixture_id.astype(int) == 35777
match_stats_filtered = match_stats_filtered[~is_amiens_lille].copy()

### 2.2) Total shots = Sum(shots_ont + shots_offt + shots_bl)

In [12]:
# Consistency check: shots on target + off target + blocked should equal the
# total-shots figure. Each side is checked only on rows where all four values
# are present; any row displayed below is a mismatch.
home_shot_cols = ['home_shots_ont', 'home_shots_offt', 'home_shots_bl', 'home_shots_tot']
home_shots_not_null = match_stats_filtered[
    match_stats_filtered[home_shot_cols].notnull().all(axis=1)].copy()

home_component_sum = (home_shots_not_null.home_shots_ont
                      + home_shots_not_null.home_shots_offt
                      + home_shots_not_null.home_shots_bl)
display(home_shots_not_null[home_component_sum != home_shots_not_null.home_shots_tot])

away_shot_cols = ['away_shots_ont', 'away_shots_offt', 'away_shots_bl', 'away_shots_tot']
away_shots_not_null = match_stats_filtered[
    match_stats_filtered[away_shot_cols].notnull().all(axis=1)]

away_component_sum = (away_shots_not_null.away_shots_ont
                      + away_shots_not_null.away_shots_offt
                      + away_shots_not_null.away_shots_bl)
display(away_shots_not_null[away_component_sum != away_shots_not_null.away_shots_tot])


Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


##### Conclusion: NO ISSUES -> No filters required

### 2.3) Total shots = Sum(shots_inb + shots_outb)

In [13]:
# Consistency check: shots inside the box + shots outside the box should equal
# total shots. Each side is checked only on rows where all three values exist.
#
# The filtered frames are explicit copies: a new 'diff' column is added to
# them, and assigning onto a bare filtered slice makes pandas emit
# SettingWithCopyWarning.
home_shots_not_null = match_stats_filtered[
    (~match_stats_filtered.home_shots_inb.isnull()) &
    (~match_stats_filtered.home_shots_outb.isnull()) &
    (~match_stats_filtered.home_shots_tot.isnull())].copy()
# diff = (inside box + outside box) - total; non-zero means the identity fails.
home_shots_not_null['diff'] = (
    (home_shots_not_null.home_shots_inb + home_shots_not_null.home_shots_outb)
    - home_shots_not_null.home_shots_tot)

print(home_shots_not_null.shape)
home_inb_outb_wrong = home_shots_not_null[
    (home_shots_not_null.home_shots_inb + home_shots_not_null.home_shots_outb)
    != home_shots_not_null.home_shots_tot]

display(home_inb_outb_wrong[['fixture_date', 'home_team_name', 'away_team_name',
                             'home_shots_tot', 'home_shots_inb', 'home_shots_outb', 'diff']])

# Same check for the away side.
away_shots_not_null = match_stats_filtered[
    (~match_stats_filtered.away_shots_inb.isnull()) &
    (~match_stats_filtered.away_shots_outb.isnull()) &
    (~match_stats_filtered.away_shots_tot.isnull())].copy()
away_shots_not_null['diff'] = (
    (away_shots_not_null.away_shots_inb + away_shots_not_null.away_shots_outb)
    - away_shots_not_null.away_shots_tot)

print(away_shots_not_null.shape)
away_inb_outb_wrong = away_shots_not_null[
    (away_shots_not_null.away_shots_inb + away_shots_not_null.away_shots_outb)
    != away_shots_not_null.away_shots_tot]

display(away_inb_outb_wrong[['fixture_date', 'home_team_name', 'away_team_name',
                             'away_shots_tot', 'away_shots_inb', 'away_shots_outb', 'diff']])

(6927, 55)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,fixture_date,home_team_name,away_team_name,home_shots_tot,home_shots_inb,home_shots_outb,diff
685,20/05/2018,Lazio,Inter,14.0,8.0,7.0,1.0
713,05/05/2018,Juventus,Bologna,15.0,12.0,4.0,1.0
723,28/04/2018,Inter,Juventus,10.0,5.0,6.0,1.0
738,18/04/2018,AS Roma,Genoa,15.0,11.0,5.0,1.0
797,11/03/2018,Crotone,Sampdoria,14.0,11.0,4.0,1.0
822,17/02/2018,Genoa,Inter,9.0,7.0,3.0,1.0
831,10/02/2018,Napoli,Lazio,17.0,12.0,6.0,1.0
841,04/02/2018,Udinese,AC Milan,13.0,10.0,4.0,1.0
850,28/01/2018,Napoli,Bologna,9.0,6.0,4.0,1.0
881,30/12/2017,Bologna,Udinese,12.0,11.0,2.0,1.0


(6927, 55)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,fixture_date,home_team_name,away_team_name,away_shots_tot,away_shots_inb,away_shots_outb,diff
686,20/05/2018,Sassuolo,AS Roma,14.0,7.0,8.0,1.0
721,29/04/2018,Verona,Spal,17.0,10.0,8.0,1.0
723,28/04/2018,Inter,Juventus,13.0,7.0,7.0,1.0
734,21/04/2018,Spal,AS Roma,14.0,10.0,5.0,1.0
791,17/03/2018,Udinese,Sassuolo,12.0,6.0,7.0,1.0
796,11/03/2018,Cagliari,Lazio,12.0,8.0,5.0,1.0
809,25/02/2018,Sampdoria,Udinese,16.0,11.0,6.0,1.0
852,28/01/2018,Spal,Inter,10.0,8.0,3.0,1.0
876,05/01/2018,Chievo,Udinese,9.0,5.0,5.0,1.0
910,10/12/2017,Sassuolo,Crotone,8.0,5.0,4.0,1.0


In [14]:
# How many matches are affected
# (a fixture is counted once even when both its home and away figures are off)
affected_fixture_ids = (set(away_inb_outb_wrong.fixture_id.unique())
                        | set(home_inb_outb_wrong.fixture_id.unique()))
len(affected_fixture_ids)

275

In [15]:
# What kind of differences do we find:
# distinct values of total - (inside box + outside box), home side then away side.
home_box_sum = home_inb_outb_wrong.home_shots_inb + home_inb_outb_wrong.home_shots_outb
print(set(home_inb_outb_wrong.home_shots_tot - home_box_sum))

away_box_sum = away_inb_outb_wrong.away_shots_inb + away_inb_outb_wrong.away_shots_outb
print(set(away_inb_outb_wrong.away_shots_tot - away_box_sum))

{-1.0, -2.0}
{-1.0, -2.0}


In [16]:
# We observe the following 3 cases
# (rows where shots inside the box alone exceed the total-shots figure)
home_inb_over_total = match_stats_filtered.home_shots_inb > match_stats_filtered.home_shots_tot
away_inb_over_total = match_stats_filtered.away_shots_inb > match_stats_filtered.away_shots_tot

display(match_stats_filtered[home_inb_over_total])
display(match_stats_filtered[away_inb_over_total])

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
1842,11112,Germany,Bundesliga 1,35,League,2017,25/11/2017,Regular Season - 13,Match Finished,90,Signal-Iduna-Park (Dortmund),D. Aytekin,4-0,4-4,,,Borussia Dortmund,165,FC Schalke 04,174,4.0,4.0,6.0,7.0,3.0,5.0,11.0,12.0,12.0,10.0,0.0,2.0,283.0,330.0,382.0,417.0,0.74,0.79,0.46,0.54,3.0,5.0,1.0,2.0,18.0,20.0,5.0,5.0,1.0,0.0,3.0,3.0,2.0,0.0
3535,21419,Italy,Serie A,66,League,2016,19/02/2017,Regular Season - 25,Match Finished,90,Stadio Adriatico-Giovanni Cornacchia (Pescara),,3-0,5-0,,,Pescara,525,Genoa,495,5.0,0.0,6.0,2.0,2.0,6.0,8.0,13.0,9.0,11.0,0.0,2.0,307.0,424.0,396.0,506.0,0.78,0.84,0.42,0.58,5.0,9.0,4.0,0.0,14.0,15.0,2.0,1.0,0.0,0.0,2.0,2.0,0.0,5.0


Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
3233,20655,Spain,Primera Division,64,League,2016,09/01/2017,Regular Season - 17,Match Finished,90,Estadio El Sadar (Pamplona (Iruñea)),,1-2,3-3,,,Osasuna,727,Valencia,532,3.0,3.0,4.0,3.0,11.0,3.0,22.0,7.0,13.0,8.0,9.0,0.0,261.0,312.0,365.0,421.0,0.72,0.74,0.47,0.53,7.0,3.0,1.0,1.0,19.0,13.0,2.0,5.0,0.0,0.0,1.0,1.0,7.0,1.0


##### EVIDENCE: Total shots of all 3 matches were correct. Evidence suggests the issue is with the SHOTS_INB column

In [17]:
# Checking for the case of shots outside the box exceeding total shots
# (both frames come back empty)
home_outb_over_total = match_stats_filtered.home_shots_outb > match_stats_filtered.home_shots_tot
away_outb_over_total = match_stats_filtered.away_shots_outb > match_stats_filtered.away_shots_tot

display(match_stats_filtered[home_outb_over_total])
display(match_stats_filtered[away_outb_over_total])
# No cases for outside-the-box shots, possibly because it is unlikely that every shot is taken from outside the box

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


In [18]:
# Show the few cases where the difference is more than 1
home_diff_view = ['fixture_date', 'home_team_name', 'away_team_name',
                  'home_shots_tot', 'home_shots_inb', 'home_shots_outb', 'diff']
display(home_inb_outb_wrong.loc[home_inb_outb_wrong['diff'] > 1, home_diff_view])

away_diff_view = ['fixture_date', 'home_team_name', 'away_team_name',
                  'away_shots_tot', 'away_shots_inb', 'away_shots_outb', 'diff']
display(away_inb_outb_wrong.loc[away_inb_outb_wrong['diff'] > 1, away_diff_view])

Unnamed: 0,fixture_date,home_team_name,away_team_name,home_shots_tot,home_shots_inb,home_shots_outb,diff
1104,28/04/2018,Real Sociedad,Athletic Club,17.0,13.0,6.0,2.0
1240,28/01/2018,Leganes,Espanyol,11.0,8.0,5.0,2.0
5002,17/09/2017,Paris Saint Germain,Lyon,14.0,13.0,3.0,2.0


Unnamed: 0,fixture_date,home_team_name,away_team_name,away_shots_tot,away_shots_inb,away_shots_outb,diff
1024,20/09/2017,Benevento,AS Roma,21.0,13.0,10.0,2.0
1391,23/09/2017,Girona,Barcelona,9.0,7.0,4.0,2.0
2818,14/01/2017,Swansea,Arsenal,14.0,12.0,4.0,2.0


##### Conclusion: 275/6927 games are affected. Issue is most likely always with shots_inb column
##### Solution: Remove the difference from the SHOTS_INB column so the sum equals total shots -> No filters required

### 2.4) Passes_pct = passes_acc/passes_tot

In [19]:
# Consistency check: the reported pass percentage should equal
# accurate passes / total passes, rounded to 2 decimals. Each side is checked
# only on rows where all three values are present.
#
# The mismatch frames are explicit copies: a calc_* column is added to them,
# and assigning onto a bare filtered slice makes pandas emit
# SettingWithCopyWarning.
home_passes_not_null = match_stats_filtered[
    (~match_stats_filtered.home_passes_acc.isnull()) &
    (~match_stats_filtered.home_passes_tot.isnull()) &
    (~match_stats_filtered.home_passes_pct.isnull())]

print(home_passes_not_null.shape)
# Recomputed percentage, rounded per element with Python's round().
home_calc_pct = (
    home_passes_not_null.home_passes_acc / home_passes_not_null.home_passes_tot
    ).apply(lambda x: round(x, 2))
home_pct_mismatch = home_calc_pct != home_passes_not_null.home_passes_pct
home_pacces_pct_wrong = home_passes_not_null[home_pct_mismatch].copy()
home_pacces_pct_wrong['calc_home_passes_pct'] = home_calc_pct[home_pct_mismatch]

display(home_pacces_pct_wrong[[
    'fixture_date', 'home_team_name', 'away_team_name',
    'home_passes_acc', 'home_passes_tot', 'home_passes_pct', 'calc_home_passes_pct']])


# Same check for the away side.
away_passes_not_null = match_stats_filtered[
    (~match_stats_filtered.away_passes_acc.isnull()) &
    (~match_stats_filtered.away_passes_tot.isnull()) &
    (~match_stats_filtered.away_passes_pct.isnull())]

print(away_passes_not_null.shape)
away_calc_pct = (
    away_passes_not_null.away_passes_acc / away_passes_not_null.away_passes_tot
    ).apply(lambda x: round(x, 2))
away_pct_mismatch = away_calc_pct != away_passes_not_null.away_passes_pct
away_pacces_pct_wrong = away_passes_not_null[away_pct_mismatch].copy()
away_pacces_pct_wrong['calc_away_passes_pct'] = away_calc_pct[away_pct_mismatch]

display(away_pacces_pct_wrong[[
    'fixture_date', 'home_team_name', 'away_team_name',
    'away_passes_acc', 'away_passes_tot', 'away_passes_pct', 'calc_away_passes_pct']])

(5737, 54)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,fixture_date,home_team_name,away_team_name,home_passes_acc,home_passes_tot,home_passes_pct,calc_home_passes_pct
347,20/04/2019,Newcastle,Southampton,200.0,320.0,0.63,0.62
418,25/09/2018,Hannover 96,1899 Hoffenheim,429.0,520.0,0.83,0.82
425,28/09/2018,Hertha Berlin,Bayern Munich,203.0,280.0,0.73,0.72
933,25/11/2017,Cagliari,Inter,298.0,400.0,0.75,0.74
1047,27/08/2017,Crotone,Verona,180.0,288.0,0.63,0.62
1050,27/08/2017,Spal,Udinese,334.0,400.0,0.84,0.83
1158,18/03/2018,Barcelona,Athletic Club,507.0,600.0,0.85,0.84
2031,31/03/2018,Crystal Palace,Liverpool,195.0,312.0,0.63,0.62
2475,04/02/2017,Bayern Munich,FC Schalke 04,501.0,600.0,0.84,0.83
3513,05/03/2017,Cagliari,Inter,232.0,320.0,0.73,0.72


(5737, 54)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,fixture_date,home_team_name,away_team_name,away_passes_acc,away_passes_tot,away_passes_pct,calc_away_passes_pct
7,12/08/2018,Liverpool,West Ham,297.0,360.0,0.83,0.82
234,29/01/2019,Wolves,West Ham,298.0,400.0,0.75,0.74
407,21/09/2018,VfB Stuttgart,Fortuna Dusseldorf,298.0,400.0,0.75,0.74
1656,12/05/2018,Hertha Berlin,RB Leipzig,330.0,400.0,0.83,0.82
2090,31/01/2018,Everton,Leicester,319.0,440.0,0.73,0.72
2972,18/09/2016,Tottenham,Sunderland,140.0,224.0,0.63,0.62
3383,28/08/2016,Athletic Club,Barcelona,507.0,600.0,0.85,0.84
3454,23/04/2017,AC Milan,Empoli,232.0,320.0,0.73,0.72
3874,27/10/2018,Atletico Madrid,Real Sociedad,298.0,400.0,0.75,0.74


##### Conclusion: NO ISSUES. Differences are all of 0.01 and therefore probably just rounding errors -> No filters required

### 2.5) Sum(Home_possession + away_possession) = 1

In [20]:
# Consistency check: home and away possession should sum to 1 on rows where
# both values are present.
possession_not_null = match_stats_filtered[
    (~match_stats_filtered.home_possession.isnull()) &
    (~match_stats_filtered.away_possession.isnull())]

print(possession_not_null.shape)
possession_total = possession_not_null.home_possession + possession_not_null.away_possession
# Compare with a small tolerance rather than `!= 1`: the values are floats, so
# a pair that genuinely sums to 1 can land a rounding error away from exactly
# 1.0 and would otherwise be reported as wrong.
# .copy() so the column added below does not trigger SettingWithCopyWarning.
possession_sum_wrong = possession_not_null[(possession_total - 1).abs() > 1e-9].copy()
if len(possession_sum_wrong) > 0:
    # Computed from the mismatching rows themselves.
    possession_sum_wrong['calc_possession_sum'] = (
        possession_sum_wrong.home_possession + possession_sum_wrong.away_possession)

    display(possession_sum_wrong[[
        'fixture_date', 'home_team_name', 'away_team_name',
        'home_possession', 'away_possession', 'calc_possession_sum']])
else:
    print("All sum to 1")

(7015, 54)
All sum to 1


##### Conclusion: NO ISSUES. Possession numbers for all games sum to 1 -> No filters required

### 2.6) If home_gksaves == (shots on target - goals scored)

In [21]:
# Check whether a keeper's saves equal the opposing side's shots on target
# minus the opposing side's goals. Each side is checked only on rows where all
# three values involved are present.
home_gksaves_not_null = match_stats_filtered[
    match_stats_filtered[['home_gksaves', 'away_goals', 'away_shots_ont']]
    .notnull().all(axis=1)]
print(home_gksaves_not_null.shape)

away_gksaves_not_null = match_stats_filtered[
    match_stats_filtered[['away_gksaves', 'home_goals', 'home_shots_ont']]
    .notnull().all(axis=1)]
print(away_gksaves_not_null.shape)


# Home keeper: compared against the away side's on-target shots and goals.
home_expected_saves = (
    home_gksaves_not_null.away_shots_ont - home_gksaves_not_null.away_goals)
home_gksaves_wrong = home_gksaves_not_null[
    home_gksaves_not_null.home_gksaves != home_expected_saves]
display(home_gksaves_wrong[[
    'fixture_date', 'home_team_name', 'away_team_name',
    'home_gksaves', 'away_shots_ont', 'away_goals'
]])


# Away keeper: compared against the home side's on-target shots and goals.
away_expected_saves = (
    away_gksaves_not_null.home_shots_ont - away_gksaves_not_null.home_goals)
away_gksaves_wrong = away_gksaves_not_null[
    away_gksaves_not_null.away_gksaves != away_expected_saves]
display(away_gksaves_wrong[[
    'fixture_date', 'home_team_name', 'away_team_name',
    'away_gksaves', 'home_shots_ont', 'home_goals'
]])




(6990, 54)
(6991, 54)


Unnamed: 0,fixture_date,home_team_name,away_team_name,home_gksaves,away_shots_ont,away_goals
13,18/08/2018,Leicester,Wolves,2.0,3.0,0.0
28,26/08/2018,Newcastle,Chelsea,2.0,3.0,2.0
34,01/09/2018,Crystal Palace,Southampton,5.0,6.0,2.0
39,02/09/2018,Watford,Tottenham,2.0,2.0,1.0
41,15/09/2018,Bournemouth,Leicester,5.0,8.0,2.0
...,...,...,...,...,...,...
23433,04/01/2020,Reading,Blackpool,2.0,5.0,2.0
23540,07/01/2020,Manchester United,Manchester City,3.0,5.0,3.0
23628,28/01/2020,AC Milan,Torino,2.0,5.0,2.0
23635,22/01/2020,Juventus,AS Roma,4.0,4.0,1.0


Unnamed: 0,fixture_date,home_team_name,away_team_name,away_gksaves,home_shots_ont,home_goals
10,18/08/2018,Cardiff,Newcastle,0.0,1.0,0.0
13,18/08/2018,Leicester,Wolves,1.0,2.0,2.0
17,19/08/2018,Manchester City,Huddersfield,9.0,14.0,6.0
21,25/08/2018,Arsenal,West Ham,9.0,10.0,3.0
26,25/08/2018,Liverpool,Brighton,6.0,8.0,1.0
...,...,...,...,...,...,...
23412,15/01/2020,Fiorentina,Atalanta,0.0,3.0,2.0
23421,04/01/2020,Leicester,Wigan,5.0,6.0,2.0
23432,05/01/2020,Middlesbrough,Tottenham,2.0,4.0,1.0
23629,29/01/2020,Inter,Fiorentina,3.0,4.0,2.0


In [22]:
# Quick look to explain the difference in df size when we'd expect them to be the same
print(home_gksaves_not_null.shape)
print(away_gksaves_not_null.shape)

# Fixture ids present in the away-side frame but missing from the home-side
# one. The home ids are collected into a set once, so the membership test does
# not recompute .unique() and scan it for every away id.
home_fixture_ids = set(home_gksaves_not_null.fixture_id.unique())
away_only_fixture_ids = [x for x in away_gksaves_not_null.fixture_id.unique()
                         if x not in home_fixture_ids]
print(away_only_fixture_ids)

# Show the rows for exactly those fixtures, so this cell stays correct if the
# underlying data (and therefore the ids) change.
display(away_gksaves_not_null[
    away_gksaves_not_null.fixture_id.isin(away_only_fixture_ids)])

(6990, 54)
(6991, 54)
[121319]


Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
7073,121319,France,Coupe de la Ligue,433,Cup,2018,14/08/2018,1st Round,Match Finished,90,"Stade du Hainaut, Valenciennes",,0-1,0-1,,,Valenciennes,105,Lorient,97,0.0,1.0,8.0,1.0,2.0,5.0,10.0,6.0,,,,,,,,,,,0.54,0.46,6.0,1.0,,4.0,14.0,11.0,2.0,0.0,0.0,0.0,,9.0,,


In [23]:
# Distinct values of (reported home saves) - (away shots on target - away goals)
# among the mismatching rows.
home_saves_from_shots = home_gksaves_wrong.away_shots_ont - home_gksaves_wrong.away_goals
set(home_gksaves_wrong.home_gksaves - home_saves_from_shots)

{-3.0, -2.0, -1.0, 1.0, 2.0, 3.0}

In [24]:
# Distinct values of (reported away saves) - (home shots on target - home goals)
# among the mismatching rows.
away_saves_from_shots = away_gksaves_wrong.home_shots_ont - away_gksaves_wrong.home_goals
set(away_gksaves_wrong.away_gksaves - away_saves_from_shots)

{-3.0, -2.0, -1.0, 1.0, 2.0, 3.0}

##### Conclusion: NO. We can't use (shots_ont - goals) to get gksaves. Over 25% of the dataset doesn't agree with it and the differences vary from -3 to 3. There are only about 25-26 games that don't have gksaves after the filter to remove fixture 35777, so it's not worth the effort to get this data. Likely will just drop them

##### Solution: Will FILTER these matches if we need gksaves in the model

---

## 3) Performing manipulations to fill in NaN columns where possible

#### Manipulations to do
- Total shots -> (shots_ont + shots_offt + shots_bl)
- Total shots -> (shots_inb + shots_outb)
- Passes_pct -> (passes_acc/passes_tot)
- HT Result ->> Can fix using match events
- Red & Yellow Cards ->> Can fix using match events

### 3.1) Total shots -> (shots_ont + shots_offt + shots_bl)

In [25]:
# Null counts for the shot columns that make up shots_tot, home then away.
shot_parts = ['shots_ont', 'shots_offt', 'shots_bl', 'shots_tot']
home_shot_cols = [f'home_{part}' for part in shot_parts]
away_shot_cols = [f'away_{part}' for part in shot_parts]

print(match_stats_filtered[home_shot_cols].isna().sum())
print()
print(match_stats_filtered[away_shot_cols].isna().sum())

home_shots_ont       0
home_shots_offt      0
home_shots_bl      110
home_shots_tot       2
dtype: int64

away_shots_ont       0
away_shots_offt      0
away_shots_bl      110
away_shots_tot       2
dtype: int64


#### 3.1.1) Correct the shots_tot columns

In [26]:
# Rows where the home total-shots value is missing.
match_stats_filtered.loc[match_stats_filtered['home_shots_tot'].isna()]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
4054,27411,Spain,Primera Division,87,League,2018,17/03/2019,Regular Season - 28,Match Finished,90,Estadio Municipal de Ipurúa (Eibar),David Medié,0-0,1-2,,,Eibar,545,Valladolid,720,1.0,2.0,3.0,3.0,9.0,4.0,,,,,,,,,,,,,0.58,0.42,5.0,0.0,4.0,5.0,16.0,14.0,4.0,2.0,0.0,0.0,1.0,2.0,2.0,1.0
4576,29264,Italy,Serie A,94,League,2018,17/03/2019,Regular Season - 28,Match Finished,90,Stadio Comunale Luigi Ferraris (Genova),M. Di Bello,0-0,2-0,,,Genoa,495,Juventus,496,2.0,0.0,5.0,0.0,7.0,4.0,,,,,,,,,,,,,0.43,0.57,6.0,6.0,3.0,1.0,12.0,11.0,2.0,3.0,0.0,0.0,0.0,3.0,7.0,2.0


##### Investigation: We can see all the necessary columns are there so we will perform the calculation and add the code to the function

In [27]:
# Remember which fixtures lack shots_tot so they can be re-inspected after the fill.
shots_tot_null_fixture_ids = match_stats_filtered.loc[
    match_stats_filtered['home_shots_tot'].isna(), 'fixture_id'].tolist()

In [28]:
# Display the fixture ids whose home_shots_tot was null when the list was built.
shots_tot_null_fixture_ids

[27411, 29264]

In [29]:
# Fill missing shots_tot as shots_ont + shots_offt + shots_bl.
# The result is assigned back to the column: a chained
# `df[col].fillna(..., inplace=True)` operates on a temporary Series, which is
# deprecated and does not update the frame under pandas Copy-on-Write.
match_stats_filtered['home_shots_tot'] = match_stats_filtered['home_shots_tot'].fillna(
    match_stats_filtered['home_shots_ont'] +
    match_stats_filtered['home_shots_offt'] +
    match_stats_filtered['home_shots_bl']
)

match_stats_filtered['away_shots_tot'] = match_stats_filtered['away_shots_tot'].fillna(
    match_stats_filtered['away_shots_ont'] +
    match_stats_filtered['away_shots_offt'] +
    match_stats_filtered['away_shots_bl']
)

In [30]:
# Any rows still missing home_shots_tot after the fill.
match_stats_filtered.loc[match_stats_filtered['home_shots_tot'].isna()]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


In [31]:
# Re-display the fixtures that originally lacked shots_tot to inspect the filled values.
was_missing_shots_tot = match_stats_filtered['fixture_id'].isin(shots_tot_null_fixture_ids)
match_stats_filtered.loc[was_missing_shots_tot]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
4054,27411,Spain,Primera Division,87,League,2018,17/03/2019,Regular Season - 28,Match Finished,90,Estadio Municipal de Ipurúa (Eibar),David Medié,0-0,1-2,,,Eibar,545,Valladolid,720,1.0,2.0,3.0,3.0,9.0,4.0,14.0,8.0,,,,,,,,,,,0.58,0.42,5.0,0.0,4.0,5.0,16.0,14.0,4.0,2.0,0.0,0.0,1.0,2.0,2.0,1.0
4576,29264,Italy,Serie A,94,League,2018,17/03/2019,Regular Season - 28,Match Finished,90,Stadio Comunale Luigi Ferraris (Genova),M. Di Bello,0-0,2-0,,,Genoa,495,Juventus,496,2.0,0.0,5.0,0.0,7.0,4.0,19.0,6.0,,,,,,,,,,,0.43,0.57,6.0,6.0,3.0,1.0,12.0,11.0,2.0,3.0,0.0,0.0,0.0,3.0,7.0,2.0


#### 3.1.2) Correct the shots_bl columns

In [32]:
# Remember which fixtures lack shots_bl so they can be re-inspected after the fill.
shots_bl_null_fixture_ids = match_stats_filtered.loc[
    match_stats_filtered['home_shots_bl'].isna(), 'fixture_id'].tolist()
print(len(shots_bl_null_fixture_ids))

110


In [33]:
# Fill missing shots_bl as shots_tot - (shots_ont + shots_offt).
# The result is assigned back to the column: a chained
# `df[col].fillna(..., inplace=True)` operates on a temporary Series, which is
# deprecated and does not update the frame under pandas Copy-on-Write.
match_stats_filtered['home_shots_bl'] = match_stats_filtered['home_shots_bl'].fillna(
    match_stats_filtered['home_shots_tot'] -
    (match_stats_filtered['home_shots_ont'] + match_stats_filtered['home_shots_offt']))
match_stats_filtered['away_shots_bl'] = match_stats_filtered['away_shots_bl'].fillna(
    match_stats_filtered['away_shots_tot'] -
    (match_stats_filtered['away_shots_ont'] + match_stats_filtered['away_shots_offt']))

In [34]:
# Any rows still missing home_shots_bl after the fill.
match_stats_filtered.loc[match_stats_filtered['home_shots_bl'].isna()]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


In [35]:
# Re-display the fixtures that originally lacked shots_bl to inspect the filled values.
was_missing_shots_bl = match_stats_filtered['fixture_id'].isin(shots_bl_null_fixture_ids)
match_stats_filtered.loc[was_missing_shots_bl]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
127,192,England,Premier League,2,League,2018,24/11/2018,Regular Season - 13,Match Finished,90,"Vicarage Road, Watford","Jonathan Moss, England",0-0,0-3,,,Watford,38,Liverpool,40,0.0,3.0,1.0,7.0,4.0,3.0,5.0,10.0,3.0,7.0,2.0,3.0,250.0,526.0,360.0,637.0,0.69,0.83,0.36,0.64,5.0,5.0,4.0,4.0,12.0,13.0,0.0,2.0,0.0,1.0,4.0,1.0,0.0,0.0
944,8925,Italy,Serie A,28,League,2017,19/11/2017,Regular Season - 13,Match Finished,90,Stadio Ezio Scida (Crotone),P. Tagliavento,0-1,0-1,,,Crotone,501,Genoa,495,0.0,1.0,4.0,7.0,7.0,3.0,11.0,10.0,3.0,8.0,8.0,2.0,416.0,274.0,549.0,394.0,0.76,0.7,0.58,0.42,5.0,2.0,1.0,4.0,17.0,15.0,1.0,3.0,0.0,0.0,6.0,4.0,0.0,0.0
985,8966,Italy,Serie A,28,League,2017,21/10/2017,Regular Season - 9,Match Finished,90,Stadio Comunale Luigi Ferraris (Genova),G. Calvarese,3-0,5-0,,,Sampdoria,498,Crotone,501,5.0,0.0,5.0,2.0,1.0,6.0,6.0,8.0,6.0,5.0,0.0,3.0,701.0,283.0,777.0,362.0,0.9,0.78,0.68,0.32,4.0,0.0,0.0,3.0,11.0,20.0,1.0,3.0,0.0,0.0,2.0,0.0,0.0,0.0
1321,9764,Spain,Primera Division,30,League,2017,25/11/2017,Regular Season - 13,Match Finished,90,Estadio Benito Villamarín (Sevilla),Carlos Del Cerro,0-1,2-2,,,Real Betis,543,Girona,547,2.0,2.0,6.0,7.0,6.0,6.0,12.0,13.0,8.0,8.0,4.0,5.0,339.0,288.0,442.0,393.0,0.77,0.73,0.51,0.49,3.0,5.0,6.0,3.0,9.0,16.0,2.0,1.0,0.0,0.0,5.0,4.0,0.0,0.0
1657,10927,Germany,Bundesliga 1,35,League,2017,12/05/2018,Regular Season - 34,Match Finished,90,Schwarzwald-Stadion (Freiburg im Breisgau),,0-0,2-0,,,SC Freiburg,160,FC Augsburg,170,2.0,0.0,3.0,2.0,9.0,2.0,12.0,4.0,10.0,1.0,2.0,3.0,305.0,331.0,380.0,425.0,0.8,0.78,0.47,0.53,5.0,4.0,0.0,3.0,13.0,17.0,1.0,2.0,0.0,0.0,2.0,1.0,0.0,0.0
2360,16729,Germany,Bundesliga 1,54,League,2016,06/05/2017,Regular Season - 32,Match Finished,90,Commerzbank-Arena (Frankfurt am Main),,0-0,0-2,,,Eintracht Frankfurt,169,VfL Wolfsburg,161,0.0,2.0,2.0,3.0,5.0,4.0,7.0,7.0,4.0,6.0,3.0,1.0,373.0,358.0,467.0,459.0,0.8,0.78,0.5,0.5,2.0,4.0,1.0,1.0,12.0,14.0,1.0,2.0,0.0,0.0,1.0,2.0,0.0,0.0
2422,16791,Germany,Bundesliga 1,54,League,2016,18/03/2017,Regular Season - 25,Match Finished,90,RheinEnergieStadion (Köln),,3-0,4-2,,,FC Koln,192,Hertha Berlin,159,4.0,2.0,6.0,6.0,1.0,2.0,7.0,8.0,4.0,6.0,3.0,2.0,202.0,579.0,292.0,679.0,0.69,0.85,0.29,0.71,2.0,9.0,1.0,1.0,12.0,12.0,2.0,3.0,0.0,0.0,4.0,2.0,0.0,0.0
2449,16818,Germany,Bundesliga 1,54,League,2016,25/02/2017,Regular Season - 22,Match Finished,90,Red Bull Arena (Leipzig),,2-0,3-1,,,RB Leipzig,173,FC Koln,192,3.0,1.0,5.0,1.0,8.0,4.0,13.0,5.0,12.0,2.0,2.0,3.0,389.0,348.0,544.0,492.0,0.72,0.71,0.52,0.48,3.0,0.0,3.0,1.0,15.0,10.0,1.0,4.0,0.0,0.0,0.0,3.0,0.0,0.0
3139,20561,Spain,Primera Division,64,League,2016,11/03/2017,Regular Season - 27,Match Finished,90,Estadio La Rosaleda (Málaga),,0-1,1-2,,,Malaga,535,Alaves,542,1.0,2.0,9.0,2.0,6.0,2.0,15.0,4.0,13.0,4.0,2.0,0.0,387.0,260.0,488.0,369.0,0.79,0.7,0.57,0.43,5.0,3.0,0.0,1.0,17.0,13.0,2.0,3.0,0.0,1.0,0.0,8.0,0.0,0.0
3622,21506,Italy,Serie A,66,League,2016,15/12/2016,Regular Season - 3,Match Finished,90,Stadio Comunale Luigi Ferraris (Genova),,1-0,1-0,,,Genoa,495,Fiorentina,502,1.0,0.0,2.0,1.0,3.0,0.0,5.0,1.0,2.0,0.0,3.0,1.0,67.0,96.0,91.0,133.0,0.74,0.72,0.4,0.6,1.0,1.0,3.0,0.0,6.0,5.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0


### 3.2) Total shots -> (shots_inb + shots_outb)

In [36]:
# Null counts for inside-box / outside-box / total shots, home then away.
box_parts = ['shots_inb', 'shots_outb', 'shots_tot']
home_box_cols = [f'home_{part}' for part in box_parts]
away_box_cols = [f'away_{part}' for part in box_parts]

print(match_stats_filtered[home_box_cols].isna().sum())
print()
print(match_stats_filtered[away_box_cols].isna().sum())

home_shots_inb     87
home_shots_outb    88
home_shots_tot      0
dtype: int64

away_shots_inb     87
away_shots_outb    88
away_shots_tot      0
dtype: int64


#### 3.2.1) See if we can correct any of the rows

In [37]:
# Among rows missing home_shots_outb, count how many also miss the other inb/outb columns.
inb_outb_cols = ['home_shots_inb', 'home_shots_outb', 'away_shots_inb', 'away_shots_outb']
match_stats_filtered.loc[
    match_stats_filtered['home_shots_outb'].isna(), inb_outb_cols].isna().sum()

home_shots_inb     87
home_shots_outb    88
away_shots_inb     87
away_shots_outb    88
dtype: int64

So we know we can't do anything for 87 of the cases

In [38]:
# Rows where home_shots_outb is missing but home_shots_inb is present (outb is recoverable).
outb_missing = match_stats_filtered['home_shots_outb'].isna()
inb_present = match_stats_filtered['home_shots_inb'].notna()
match_stats_filtered[outb_missing & inb_present]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
3170,20592,Spain,Primera Division,64,League,2016,25/02/2017,Regular Season - 24,Match Finished,90,Estadio Municipal de Butarque (Leganés),,2-0,4-0,,,Leganes,537,Deportivo La Coruna,544,4.0,0.0,5.0,3.0,1.0,7.0,7.0,11.0,7.0,11.0,,,266.0,227.0,370.0,323.0,0.72,0.7,0.52,0.48,2.0,7.0,1.0,0.0,18.0,13.0,2.0,5.0,0.0,1.0,3.0,1.0,1.0,1.0


In [39]:
# For fixture 20592, derive shots_outb as shots_tot - shots_inb on both sides.
fixture_20592_rows = match_stats_filtered["fixture_id"] == 20592
for side in ('home', 'away'):
    match_stats_filtered.loc[fixture_20592_rows, f'{side}_shots_outb'] = (
        match_stats_filtered.loc[fixture_20592_rows, f'{side}_shots_tot']
        - match_stats_filtered.loc[fixture_20592_rows, f'{side}_shots_inb'])

In [40]:
# Show home shots_tot - shots_inb for fixture 20592 (the value written to home_shots_outb).
is_fixture_20592 = match_stats_filtered["fixture_id"] == 20592
(match_stats_filtered.loc[is_fixture_20592, 'home_shots_tot']
 - match_stats_filtered.loc[is_fixture_20592, 'home_shots_inb'])

3170    0.0
dtype: float64

In [41]:
# Re-run the check: rows with home_shots_outb missing while home_shots_inb is present.
still_outb_missing = match_stats_filtered['home_shots_outb'].isna()
has_inb = match_stats_filtered['home_shots_inb'].notna()
match_stats_filtered[still_outb_missing & has_inb]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


#### 3.2.2) Correct the shots_inb columns so inb+outb=tot

In [42]:
# Checking that the null columns will stay null - YES
(match_stats_filtered['home_shots_tot'] - match_stats_filtered['home_shots_outb']
    ).isnull().sum()

87

In [43]:
# Overwrite shots_inb with shots_tot - shots_outb on both sides so that inb + outb == tot.
for side in ('home', 'away'):
    match_stats_filtered[f'{side}_shots_inb'] = match_stats_filtered[
        f'{side}_shots_tot'].sub(match_stats_filtered[f'{side}_shots_outb'])

In [44]:
# Sanity check: for rows where inb, outb and tot are all present, list any
# fixture where inb + outb != tot (home side first, then away side).
# The filtered frames are copied before 'diff' is added; assigning a new column
# onto a filtered slice is what raises pandas' SettingWithCopyWarning.
home_shots_not_null = match_stats_filtered[
    (~match_stats_filtered.home_shots_inb.isnull()) &
    (~match_stats_filtered.home_shots_outb.isnull()) &
    (~match_stats_filtered.home_shots_tot.isnull())].copy()
# diff = (inb + outb) - tot; non-zero means the breakdown disagrees with the total
home_shots_not_null['diff'] = (
    (home_shots_not_null.home_shots_inb + home_shots_not_null.home_shots_outb)
    - home_shots_not_null.home_shots_tot)

print(home_shots_not_null.shape)
home_inb_outb_wrong = home_shots_not_null[
    (home_shots_not_null.home_shots_inb + home_shots_not_null.home_shots_outb)
    != home_shots_not_null.home_shots_tot]


display(home_inb_outb_wrong[['fixture_date', 'home_team_name', 'away_team_name',
                             'home_shots_tot', 'home_shots_inb', 'home_shots_outb', 'diff']])

away_shots_not_null = match_stats_filtered[
    (~match_stats_filtered.away_shots_inb.isnull()) &
    (~match_stats_filtered.away_shots_outb.isnull()) &
    (~match_stats_filtered.away_shots_tot.isnull())].copy()
away_shots_not_null['diff'] = (
    (away_shots_not_null.away_shots_inb + away_shots_not_null.away_shots_outb)
    - away_shots_not_null.away_shots_tot)

print(away_shots_not_null.shape)
away_inb_outb_wrong = away_shots_not_null[
    (away_shots_not_null.away_shots_inb + away_shots_not_null.away_shots_outb)
    != away_shots_not_null.away_shots_tot]

display(away_inb_outb_wrong[['fixture_date', 'home_team_name', 'away_team_name',
                             'away_shots_tot', 'away_shots_inb', 'away_shots_outb', 'diff']])

(6928, 55)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Unnamed: 0,fixture_date,home_team_name,away_team_name,home_shots_tot,home_shots_inb,home_shots_outb,diff


(6928, 55)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,fixture_date,home_team_name,away_team_name,away_shots_tot,away_shots_inb,away_shots_outb,diff


### 3.3) Passes_pct -> (passes_acc/passes_tot)

In [45]:
# For each home passes column, restrict to rows where it is null and count the
# nulls across all six passes columns.
passes_cols = [f'{side}_passes_{stat}'
               for side in ('home', 'away')
               for stat in ('acc', 'tot', 'pct')]
null_counts_when_missing = {
    col: match_stats_filtered.loc[
        match_stats_filtered[col].isna(), passes_cols].isna().sum()
    for col in ('home_passes_acc', 'home_passes_tot', 'home_passes_pct')}

print(null_counts_when_missing['home_passes_acc'])

print(null_counts_when_missing['home_passes_tot'])
print()
print(null_counts_when_missing['home_passes_pct'])

home_passes_acc    87
home_passes_tot    87
home_passes_pct    87
away_passes_acc    87
away_passes_tot    87
away_passes_pct    87
dtype: int64
home_passes_acc    87
home_passes_tot    87
home_passes_pct    87
away_passes_acc    87
away_passes_tot    87
away_passes_pct    87
dtype: int64

home_passes_acc      87
home_passes_tot      87
home_passes_pct    1278
away_passes_acc      87
away_passes_tot      87
away_passes_pct    1278
dtype: int64


We see that in the rows where passes_acc and passes_tot are missing, passes_pct is missing too. There is no way to correct these. Notice there are 87 of them. This is the same number as the rows where shots_inb and shots_outb were null, and I think they are the same rows. We may need to exclude these from the dataset. It is good that there are only 87.

#### 3.3.1) Correct the passes_pct column where we have passes_acc and passes_tot

In [46]:
# Rows where passes_pct is missing but passes_tot is present (i.e. pct can be computed).
pct_missing = match_stats_filtered['home_passes_pct'].isna()
tot_present = match_stats_filtered['home_passes_tot'].notna()
print(match_stats_filtered[pct_missing & tot_present].shape)
print(1191 + 87)  # Just a check to see we have what we expect

(1191, 54)
1278


In [47]:
# Checking that the null columns will stay null - YES
(match_stats_filtered['home_passes_acc']/match_stats_filtered['home_passes_tot']
    ).isnull().sum()

87

In [48]:
# Recompute passes_pct as passes_acc / passes_tot on both sides, rounded to
# 2 decimal places with Python's built-in round (applied element by element).
for side in ('home', 'away'):
    pass_ratio = match_stats_filtered[f'{side}_passes_acc'].div(
        match_stats_filtered[f'{side}_passes_tot'])
    match_stats_filtered[f'{side}_passes_pct'] = pass_ratio.apply(
        lambda ratio: round(ratio, 2))

In [49]:
# After the recompute: rows still missing passes_pct although passes_tot is present.
pct_still_missing = match_stats_filtered['home_passes_pct'].isna()
has_passes_tot = match_stats_filtered['home_passes_tot'].notna()
print(match_stats_filtered[pct_still_missing & has_passes_tot].shape)

(0, 54)


### 3.4) HT Result ->> Can fix using match events

In [50]:
# Count of fixtures with no half-time result.
ht_result_missing = match_stats_filtered['fixture_result_ht'].isna()
print(match_stats_filtered.loc[ht_result_missing, ['fixture_result_ht']].isna().sum())

fixture_result_ht    1
dtype: int64


#### 3.4.1) Correct the HT result column

In [51]:
# Rows where the half-time result is missing.
match_stats_filtered.loc[match_stats_filtered['fixture_result_ht'].isna()]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
3766,21650,Italy,Serie A,66,League,2016,28/08/2016,Regular Season - 2,Match Finished,90,Stadio Città del Tricolore (Reggio Emilia),,,0-3,,,Sassuolo,488,Pescara,525,0.0,3.0,3.0,5.0,5.0,7.0,13.0,15.0,4.0,9.0,9.0,6.0,304.0,487.0,394.0,574.0,0.77,0.85,0.4,0.6,2.0,7.0,2.0,3.0,13.0,21.0,2.0,2.0,0.0,0.0,5.0,1.0,5.0,3.0


In [52]:
# Manual correction of the goals and HT/FT results recorded for fixture 21650.
fixture_21650_rows = match_stats_filtered.fixture_id.astype(int) == 21650
corrected_values = {
    'home_goals': 2,
    'away_goals': 1,
    'fixture_result_ht': '1-0',
    'fixture_result_ft': '2-1',
}
for column, corrected in corrected_values.items():
    match_stats_filtered.loc[fixture_21650_rows, column] = corrected

In [53]:
# Rows still missing a half-time result, followed by the corrected fixture 21650 row.
display(match_stats_filtered.loc[match_stats_filtered['fixture_result_ht'].isna()])
display(match_stats_filtered.loc[match_stats_filtered['fixture_id'] == 21650])

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl
3766,21650,Italy,Serie A,66,League,2016,28/08/2016,Regular Season - 2,Match Finished,90,Stadio Città del Tricolore (Reggio Emilia),,1-0,2-1,,,Sassuolo,488,Pescara,525,2.0,1.0,3.0,5.0,5.0,7.0,13.0,15.0,4.0,9.0,9.0,6.0,304.0,487.0,394.0,574.0,0.77,0.85,0.4,0.6,2.0,7.0,2.0,3.0,13.0,21.0,2.0,2.0,0.0,0.0,5.0,1.0,5.0,3.0


In [54]:
# Goal events recorded for fixture 21650 in the raw match events.
is_fixture_21650_event = match_events_raw['fixture_id'] == 21650
is_goal_event = match_events_raw['event_type'] == 'Goal'
match_events_raw[is_fixture_21650_event & is_goal_event]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,event_team_id,event_team_name,event_player_name,event_time_elapsed,event_time_elapsed_plus,event_type,event_detail,event_comments
151497,21650,Italy,Serie A,66,League,2016,28/08/2016,Regular Season - 2,Match Finished,90,Stadio Città del Tricolore (Reggio Emilia),,,0-3,,,Sassuolo,488,Pescara,525,0.0,3.0,488.0,Sassuolo,Grégoire Defrel,38.0,,Goal,Normal Goal,
151498,21650,Italy,Serie A,66,League,2016,28/08/2016,Regular Season - 2,Match Finished,90,Stadio Città del Tricolore (Reggio Emilia),,,0-3,,,Sassuolo,488,Pescara,525,0.0,3.0,488.0,Sassuolo,Domenico Berardi,67.0,,Goal,Normal Goal,
151499,21650,Italy,Serie A,66,League,2016,28/08/2016,Regular Season - 2,Match Finished,90,Stadio Città del Tricolore (Reggio Emilia),,,0-3,,,Sassuolo,488,Pescara,525,0.0,3.0,525.0,Pescara,Rey Manaj,81.0,,Goal,Normal Goal,


#### Evidence: I've compared this match to Google results and it seems the home and away goals numbers are wrong. All other stats look correct though, and the match events were correct.
#### Solution: I have added to the function that creates the match stats csv so that it corrects the values of the row
#### Action: I think I will need to perform a check to see that the home and away goals figures match the events data (3.4.2)

#### 3.4.2) Check which matches have the goals not matching the events data

In [55]:
# Join the match-stats goals with the per-fixture goal event counts and list the
# fixtures where goals != normal_goal + own_goal + penalty on either side.
goal_event_cols = [f'{side}_{kind}'
                   for side in ('home', 'away')
                   for kind in ('normal_goal', 'penalty', 'own_goal', 'missed_penalty')]
count_checks = match_events_count_raw[['fixture_id'] + goal_event_cols].copy()

checks = (
    match_stats_filtered[['fixture_id', 'home_goals', 'away_goals']].copy()
    .merge(count_checks, on='fixture_id', how='inner'))

home_event_goals = (
    checks['home_normal_goal'] + checks['home_own_goal'] + checks['home_penalty'])
away_event_goals = (
    checks['away_normal_goal'] + checks['away_own_goal'] + checks['away_penalty'])

home_mismatch_ids = list(
    checks[checks['home_goals'] != home_event_goals].fixture_id.unique())
away_mismatch_ids = list(
    checks[checks['away_goals'] != away_event_goals].fixture_id.unique())
ids_goals_dont_add = list(set(home_mismatch_ids + away_mismatch_ids))

checks[checks.fixture_id.isin(ids_goals_dont_add)]

Unnamed: 0,fixture_id,home_goals,away_goals,home_normal_goal,home_penalty,home_own_goal,home_missed_penalty,away_normal_goal,away_penalty,away_own_goal,away_missed_penalty
5789,157067,1.0,2.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
5813,157091,1.0,5.0,2.0,0.0,0.0,0.0,5.0,0.0,0.0,1.0
5818,157096,2.0,1.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5823,157101,1.0,1.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
5827,157105,2.0,2.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
5828,157106,3.0,2.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0
5831,157109,3.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5851,157129,3.0,1.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
5857,157135,1.0,2.0,1.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0
5861,157139,1.0,2.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0


In [56]:
# For every fixture whose recorded goals disagree with the goal events, print:
#   <stats score>    <events score>    <date>     <home> vs <away>    <fixture id>
# Scalars are read with .iloc[0] rather than int(<Series>): calling int() on a
# single-element Series is deprecated in recent pandas and will raise in future.
for fixture_id in ids_goals_dont_add:
    fixture_stats_data = match_stats_filtered[match_stats_filtered.fixture_id == fixture_id]
    fixture_events_data = checks[checks.fixture_id == fixture_id]

    # First row for this fixture in each frame.
    stats_row = fixture_stats_data.iloc[0]
    events_row = fixture_events_data.iloc[0]

    # Goals according to the events: normal goals + own goals + penalties.
    home_events_goal = int(
        events_row['home_normal_goal'] +
        events_row['home_own_goal'] +
        events_row['home_penalty'])

    away_events_goal = int(
        events_row['away_normal_goal'] +
        events_row['away_own_goal'] +
        events_row['away_penalty'])

    stats_score = f"{int(events_row['home_goals'])}-{int(events_row['away_goals'])}"
    events_score = f"{home_events_goal}-{away_events_goal}"
    teams = f"{stats_row['home_team_name']} vs {stats_row['away_team_name']}"

    # Fields are separated by four spaces; the date keeps its extra trailing
    # space so the printed layout matches the previous output exactly.
    print("    ".join([
        stats_score,
        events_score,
        f"{stats_row['fixture_date']} ",
        teams,
        f"{fixture_id}"]))

print("Finished")

1-5    2-5    05/10/2019     Norwich vs Aston Villa    157091
2-1    3-1    19/10/2019     Aston Villa vs Brighton    157096
3-1    4-1    10/11/2019     Manchester United vs Brighton    157129
1-2    2-2    22/09/2019     Chelsea vs Liverpool    157067
1-1    1-2    20/10/2019     Manchester United vs Liverpool    157101
1-2    1-3    23/11/2019     Bournemouth vs Wolves    157135
2-2    3-2    27/10/2019     Arsenal vs Crystal Palace    157105
3-2    3-3    26/10/2019     Brighton vs Everton    157106
1-2    2-2    23/11/2019     Crystal Palace vs Liverpool    157139
0-2    0-3    23/11/2019     Everton vs Norwich    157140
3-0    4-0    26/10/2019     Manchester City vs Aston Villa    157109
2-1    3-1    23/11/2019     Manchester City vs Chelsea    157141
Finished


#### Solution: I have updated the function that creates the match events to drop events that didn't happen, and the function that creates the match stats to correct the row which had incorrect values

---

### 3.5) Red & Yellow Cards ->> Can fix using match events

In [57]:
# Card columns from the match stats, with their null counts.
card_stat_cols = ['fixture_id', 'home_yc', 'away_yc', 'home_rc', 'away_rc']
match_stats_csv_cards = match_stats_filtered[card_stat_cols].copy()
match_stats_csv_cards.isna().sum()

fixture_id       0
home_yc        232
away_yc        232
home_rc       1516
away_rc       1502
dtype: int64

##### NOTE: After doing some checks, the only issues are the times where the match_stat_csv has a higher card count than the match_events. This means that a match_event was not recorded in the match_events dataset. In the numerous cases I checked, when the match events count is larger than the match stats count, the match events is correct. Therefore we will only make changes to the cases where the match_events_data is wrong. The reason for this is we are going to use the match events data counts to overwrite the counts in the match_stats since a lot of the card values are null.
##### NOTE2: An issue is that, now that we know there are inaccuracies in the match events, it may be that when the match stats are null, the match events are still wrong. However, following checks, as far as I can see, the yc and rc columns are null because in that match there were 0 instances of a yellow or red card. Therefore it is safe to go ahead with this approach.

#### 3.5.1) Check which matches have the cards stats greater than in events data

In [58]:
# Join the event-derived card counts onto the match-stats card columns, then
# collect the rows where the match stats report MORE cards than the events do.
match_events_count_csv_cards = match_events_count_raw[[
    'fixture_id', 'home_red_card', 'away_red_card', 'home_yellow_card', 'away_yellow_card']]
cards_csv = match_stats_csv_cards.merge(
    match_events_count_csv_cards, how='left', on='fixture_id')

# Yellow cards: both sides are checked on rows where home_yc is present.
yc_csv_clean = cards_csv[cards_csv['home_yc'].notnull()]
home_yc_wrong = yc_csv_clean[yc_csv_clean['home_yc'].gt(yc_csv_clean['home_yellow_card'])]
away_yc_wrong = yc_csv_clean[yc_csv_clean['away_yc'].gt(yc_csv_clean['away_yellow_card'])]

# Red cards: each side is checked on rows where its own rc value is present.
home_rc_csv_clean = cards_csv[cards_csv['home_rc'].notnull()]
home_rc_wrong = home_rc_csv_clean[
    home_rc_csv_clean['home_rc'].gt(home_rc_csv_clean['home_red_card'])]

away_rc_csv_clean = cards_csv[cards_csv['away_rc'].notnull()]
away_rc_wrong = away_rc_csv_clean[
    away_rc_csv_clean['away_rc'].gt(away_rc_csv_clean['away_red_card'])]

# De-duplicated fixture ids across all four mismatch frames.
all_wrong_ids = [
    wrong_id
    for wrong_df in (home_rc_wrong, away_rc_wrong, home_yc_wrong, away_yc_wrong)
    for wrong_id in wrong_df.fixture_id.unique()]
ids_cards_wrong_in_events = list(set(all_wrong_ids))

for section_title, section_df in (("Home YC", home_yc_wrong),
                                  ("Home RC", home_rc_wrong),
                                  ("Away YC", away_yc_wrong),
                                  ("Away RC", away_rc_wrong)):
    display(section_title, section_df)

'Home YC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card
2961,20639,5.0,3.0,0.0,0.0,0.0,0.0,4.0,3.0
3071,20749,4.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0
5343,93299,2.0,4.0,0.0,0.0,0.0,0.0,1.0,4.0
5506,93462,3.0,3.0,1.0,1.0,1.0,1.0,2.0,3.0
6197,157623,5.0,4.0,,,0.0,0.0,3.0,4.0


'Home RC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card


'Away YC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card
659,2796,0.0,3.0,,1.0,0.0,0.0,0.0,2.0
1008,9001,1.0,4.0,0.0,1.0,0.0,0.0,1.0,3.0
3070,20748,3.0,5.0,1.0,1.0,1.0,1.0,3.0,4.0
3096,20774,2.0,4.0,0.0,0.0,0.0,0.0,2.0,3.0
3138,20816,0.0,4.0,0.0,0.0,0.0,0.0,0.0,3.0
6017,157443,0.0,2.0,,,0.0,0.0,0.0,1.0
6615,214221,1.0,4.0,0.0,1.0,0.0,1.0,1.0,3.0


'Away RC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card
659,2796,0.0,3.0,,1.0,0.0,0.0,0.0,2.0
1008,9001,1.0,4.0,0.0,1.0,0.0,0.0,1.0,3.0


In [59]:
# Count the distinct fixture ids collected in ids_cards_wrong_in_events.
len(ids_cards_wrong_in_events)

12

#### 3.5.2) Check which matches have a negative time_elapsed for card events

In [60]:
# Fixtures that contain at least one event whose elapsed time is below -1.
# NOTE(review): the section heading mentions card events, but this filter does
# not restrict the event type — confirm that is intended.
ids_neg_time_cards = match_events_raw.loc[
    match_events_raw['event_time_elapsed'] < -1, 'fixture_id'].unique()

# Fixture ids that are never printed by the loop below.
# NOTE(review): presumably fixtures that were already checked by hand — confirm.
neg_time_ids_to_skip = (
    152985, 153053, 16890, 16907, 17500, 20627, 20686, 20707, 20710, 20737,
    20740, 20746, 20756, 20761, 20780, 20781, 20793, 20800, 20815, 20819,
    20823, 21476, 21520, 21596, 78621, 78731, 78753, 78842,
)

for fixture_id in ids_neg_time_cards:
    fixture_events_data = match_events_raw[match_events_raw.fixture_id == fixture_id]

    if fixture_id not in neg_time_ids_to_skip:
        # Each value comes from the first event row of the fixture.
        # NOTE(review): that row is not necessarily the negative-time event.
        print(f"{fixture_id in ids_cards_wrong_in_events}: {fixture_id}     "
              f"{fixture_events_data.event_time_elapsed.tolist()[0]} -> {fixture_events_data.event_detail.tolist()[0]}   "
              f"{fixture_events_data.fixture_date.tolist()[0]}  "
              f"{fixture_events_data.home_team_name.tolist()[0]} vs {fixture_events_data.away_team_name.tolist()[0]}")

print("Finished")

Finished


#### 3.5.3) Identify matches where the match_events are incorrect

In [61]:
# One entry per comparison: (mismatch table, side, card colour).
card_dfs = [
    (home_rc_wrong, 'home', 'red'),
    (home_yc_wrong, 'home', 'yellow'),
    (away_rc_wrong, 'away', 'red'),
    (away_yc_wrong, 'away', 'yellow'),
]

print("This is to check there aren't missing match event rows\n")
for card_df, home_away, red_yellow in card_dfs:
    ids_of_incorrect_cards = list(card_df['fixture_id'].unique())
    # ha_upper is not used anywhere in this cell.
    ha_upper, ry_upper = home_away.upper(), red_yellow.upper()
    # First letter of the colour builds the stats column name (e.g. home_yc).
    ry = red_yellow[0]

    print("==================================================================")
    print(f"{home_away} {ry_upper} | stats - events")
    print("__________________________________________________________________")
    if not ids_of_incorrect_cards:
        print('NONE')

    for fixture_id in ids_of_incorrect_cards:
        fixture_events_data = match_events_raw.loc[
            match_events_raw['fixture_id'] == fixture_id]
        fixture_event_count = cards_csv.loc[cards_csv['fixture_id'] == fixture_id]

        # Fixture details are read from the first event row of the fixture.
        home_team = fixture_events_data['home_team_name'].tolist()[0]
        away_team = fixture_events_data['away_team_name'].tolist()[0]
        fixture_date = fixture_events_data['fixture_date'].tolist()[0]

        # Card count from the match stats versus the count from the events.
        stat_count = fixture_event_count[f"{home_away}_{ry}c"].tolist()[0]
        event_count = fixture_event_count[f"{home_away}_{red_yellow}_card"].tolist()[0]

        print(f"{fixture_id} :       "
              f"{stat_count} - {event_count}    "
              f"{fixture_date}  "
              f"{home_team} vs {away_team}")

    print("------------------------------------------------------------------\n")

This is to check there aren't missing match event rows

home RED | stats - events
__________________________________________________________________
NONE
------------------------------------------------------------------

home YELLOW | stats - events
__________________________________________________________________
20639 :       5.0 - 4.0    22/01/2017  Osasuna vs Sevilla
20749 :       4.0 - 3.0    15/10/2016  Real Betis vs Real Madrid
93299 :       2.0 - 1.0    05/02/2017  Toulouse vs Angers
93462 :       3.0 - 2.0    24/09/2016  SC Bastia vs Guingamp
157623 :       5.0 - 3.0    31/01/2020  Rennes vs Nantes
------------------------------------------------------------------

away RED | stats - events
__________________________________________________________________
2796 :       1.0 - 0.0    04/05/2019  Bayern Munich vs Hannover 96
9001 :       1.0 - 0.0    20/09/2017  Juventus vs Fiorentina
------------------------------------------------------------------

away YELLOW | stats - even

##### Solution: I've created a function in utils called `create_duplicate_row_with_modified_values` and added the relevant details to `data_config` so that we can create the rows for the missing match events.

In [62]:
# Card counts taken from the events-count table (id column plus the four card columns).
match_events_count_csv_cards = match_events_count_raw.loc[:, [
    'fixture_id',
    'home_red_card', 'away_red_card',
    'home_yellow_card', 'away_yellow_card',
]]
# Left join: every row of the match-stats card table is kept.
cards_csv = pd.merge(
    match_stats_csv_cards, match_events_count_csv_cards,
    how='left', on='fixture_id')

# This time flag rows where the stats report FEWER cards than the events.
# Yellow cards: both sides are checked on the rows where home_yc is present.
yc_csv_clean = cards_csv.loc[cards_csv['home_yc'].notnull()]
home_yc_wrong = yc_csv_clean.loc[
    yc_csv_clean['home_yc'].lt(yc_csv_clean['home_yellow_card'])]
away_yc_wrong = yc_csv_clean.loc[
    yc_csv_clean['away_yc'].lt(yc_csv_clean['away_yellow_card'])]

# Red cards: each side is checked on the rows where its own stat is present.
home_rc_csv_clean = cards_csv.loc[cards_csv['home_rc'].notnull()]
home_rc_wrong = home_rc_csv_clean.loc[
    home_rc_csv_clean['home_rc'].lt(home_rc_csv_clean['home_red_card'])]

away_rc_csv_clean = cards_csv.loc[cards_csv['away_rc'].notnull()]
away_rc_wrong = away_rc_csv_clean.loc[
    away_rc_csv_clean['away_rc'].lt(away_rc_csv_clean['away_red_card'])]

# Distinct fixtures flagged by any of the four comparisons above.
ids_cards_wrong_in_events = list(set().union(
    home_rc_wrong.fixture_id.unique(),
    away_rc_wrong.fixture_id.unique(),
    home_yc_wrong.fixture_id.unique(),
    away_yc_wrong.fixture_id.unique(),
))

display("Home YC", home_yc_wrong,
        "Home RC", home_rc_wrong,
        "Away YC", away_yc_wrong,
        "Away RC", away_rc_wrong)

'Home YC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card
1105,9569,5.0,2.0,1.0,0.0,1.0,0.0,6.0,2.0
1130,9596,1.0,1.0,0.0,0.0,0.0,0.0,3.0,1.0
1238,9718,6.0,3.0,1.0,0.0,1.0,0.0,7.0,3.0
1257,9746,0.0,4.0,0.0,0.0,0.0,0.0,1.0,4.0
1261,9751,7.0,4.0,1.0,0.0,1.0,0.0,8.0,4.0
1270,9761,3.0,3.0,1.0,0.0,1.0,0.0,4.0,3.0
1284,9776,2.0,3.0,0.0,0.0,0.0,0.0,3.0,3.0
1335,9829,2.0,7.0,1.0,1.0,1.0,1.0,3.0,7.0
1380,9874,1.0,2.0,1.0,0.0,1.0,0.0,2.0,2.0
2268,16893,4.0,3.0,1.0,0.0,1.0,0.0,5.0,3.0


'Home RC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card
708,8689,2.0,6.0,1.0,0.0,2.0,0.0,2.0,6.0
6890,232726,4.0,0.0,1.0,0.0,2.0,0.0,4.0,1.0


'Away YC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card
1085,9544,6.0,3.0,1.0,0.0,1.0,0.0,6.0,4.0
1138,9604,3.0,6.0,0.0,0.0,0.0,0.0,3.0,7.0
1159,9627,4.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0
1169,9641,3.0,3.0,0.0,1.0,0.0,1.0,3.0,4.0
1185,9660,3.0,3.0,0.0,0.0,0.0,0.0,3.0,4.0
1191,9666,3.0,1.0,0.0,0.0,0.0,0.0,3.0,2.0
1209,9686,2.0,2.0,0.0,0.0,0.0,0.0,2.0,3.0
1235,9715,3.0,5.0,0.0,0.0,0.0,0.0,3.0,6.0
1260,9750,3.0,2.0,0.0,0.0,0.0,0.0,3.0,3.0
1283,9775,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0


'Away RC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card
5304,93260,2.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0


In [63]:
# Event rows whose comment is exactly one of the two "Not on pitch" variants.
matches_not_on_pitch = match_events_raw.loc[
    match_events_raw['event_comments'].eq("Not on pitch")]
matches_not_on_pitch_unsp = match_events_raw.loc[
    match_events_raw['event_comments'].eq("Not on pitch, Unsportsmanlike conduct")]

# Distinct fixture ids that appear in either selection.
not_on_pitch_ids = list(set(
    list(matches_not_on_pitch.fixture_id.unique())
    + list(matches_not_on_pitch_unsp.fixture_id.unique())
))

In [64]:
# One entry per comparison: (mismatch table, side, card colour).
card_dfs = [(home_rc_wrong, 'home', 'red'), (home_yc_wrong, 'home', 'yellow'),
            (away_rc_wrong, 'away', 'red'), (away_yc_wrong, 'away', 'yellow')]

# Fixtures that are never printed by the report below: a hard-coded list plus
# every fixture in not_on_pitch_ids.
# NOTE(review): the hard-coded ids are presumably fixtures already checked by
# hand — confirm.
# Built once as a set so the membership test in the inner loop is a hash
# lookup instead of re-concatenating and scanning two lists per fixture.
fixture_ids_to_skip = {
    8689, 232726, 93260, 9569, 9596, 9718, 9746, 9751, 9761, 9776, 9829, 9874, 16893,
    16928, 17671, 20454, 20459, 20620, 20635, 20647, 20654, 20669, 20693, 20707, 20727,
    20730, 20793, 20817, 21506, 27164, 27173, 27183, 27213, 27260, 27263, 27328, 35564,
    57373, 93391, 93446, 121288, 157236, 157626, 157627, 209075,
    157260, 157613, 209083, 209086, 214216, 214238, 250170, 315973, 320852,
    214131, 214197, 214204, 315960, 315972, 321836, 9544, 9604, 9627, 9641, 9660, 9666,
    9686, 9715, 9750, 9775, 9804, 9833, 9851, 9866, 9886, 10980, 16750, 20444, 20584,
    20641, 20797, 20800, 21631, 27161, 27214, 35663, 37586, 93304, 93349, 121467,
    152982, 214176,
}.union(not_on_pitch_ids)

print("This is for checking that there aren't extra match event rows")
for card_df, home_away, red_yellow in card_dfs:
    ids_of_incorrect_cards = list(card_df.fixture_id.unique())

    ry_upper = red_yellow.upper()
    # First letter of the colour builds the stats column name (e.g. home_yc).
    ry = red_yellow[0]

    print("==================================================================")
    print(f"{home_away} {ry_upper} | stats - events")
    print("__________________________________________________________________")
    if len(ids_of_incorrect_cards) == 0:
        print('NONE')

    for fixture_id in ids_of_incorrect_cards:
        # Skip first, so no DataFrame lookups are done for excluded fixtures.
        if fixture_id in fixture_ids_to_skip:
            continue

        fixture_events_data = match_events_raw[match_events_raw.fixture_id == fixture_id]
        fixture_event_count = cards_csv[cards_csv.fixture_id == fixture_id]

        # Fixture details are read from the first event row of the fixture.
        home_team = list(fixture_events_data.home_team_name)[0]
        away_team = list(fixture_events_data.away_team_name)[0]
        fixture_date = list(fixture_events_data.fixture_date)[0]

        # Card count from the match stats versus the count from the events.
        stat_count = list(fixture_event_count[f"{home_away}_{ry}c"])[0]
        event_count = list(fixture_event_count[f"{home_away}_{red_yellow}_card"])[0]

        print((f"{fixture_id} :       "+
               f"{stat_count} - {event_count}    " +
               f"{fixture_date}  "+
               f"{home_team} vs {away_team}"))

    print("------------------------------------------------------------------\n")

This is for checking that there aren't extra match event rows
home RED | stats - events
__________________________________________________________________
------------------------------------------------------------------

home YELLOW | stats - events
__________________________________________________________________
------------------------------------------------------------------

away RED | stats - events
__________________________________________________________________
------------------------------------------------------------------

away YELLOW | stats - events
__________________________________________________________________
------------------------------------------------------------------



### NOTES

Once I've done this, I can just use the match_events_counts_csv to fill in the number of red and yellow cards for each team in the match stats dataframe. Then I can finish the filtering and end up with a clean dataframe in which no row has NaNs (depending on which variables the model deems important: I will keep only the important variables and then filter so there isn't a single missing value).

However, when I split the columns into separate files for the sake of the Bayesian modelling, I will keep every value that isn't NaN for each variable, so some variables may have different lengths, which may not be a problem.