In [1]:
# NOTE(review): `os` is not referenced in the cells visible in this part of the notebook.
import os

import pandas as pd
# Raise pandas' display limits to 60 columns and 300 rows for the tables shown below.
pd.set_option('display.max_columns', 60)
pd.set_option('display.max_rows', 300)

# NOTE(review): `etl` and `utils` look like project-local modules -- confirm they are
# importable from the notebook's working directory.
import etl
import utils as u
# Configuration is fetched twice through the utils helper: once with the 'data'
# argument and once with no argument.
# NOTE(review): what each call returns is not visible here, and neither `data_cfg`
# nor `cfg` is used in the visible cells -- confirm they are still needed.
data_cfg = u.get_config('data')
cfg = u.get_config()

---

#### Reading in data

In [2]:
# Load the three cleaned CSV extracts with pandas' default parsing options.
# Paths are relative to the notebook's working directory.
match_events_clean = pd.read_csv(
    filepath_or_buffer="data/csv_of_json_data/match_events_clean.csv")
match_stats_clean = pd.read_csv(
    filepath_or_buffer="data/csv_of_json_data/match_stats_clean.csv")
match_events_count_clean = pd.read_csv(
    filepath_or_buffer="data/csv_of_json_data/match_events_count_clean.csv")

  interactivity=interactivity, compiler=compiler, result=result)


## Checks to perform:
    - Check match_stats data matches match_events data
    - That the columns have the right relationships with each other:
        - Total shots = Sum(shots_ont + shots_offt + shots_bl)
        - Total shots = Sum(shots_inb + shots_outb)
        - Passes_pct = passes_acc/passes_tot (rounded to 2 decimal places)
        - Sum(Home_possession + away_possession) = 1

---

## 1) Check match_stats data matches match_events data

### 1.1) Goals

In [3]:
# Per-event-type goal counts derived from the events data, one row per fixture.
count_checks = match_events_count_clean[[
    'fixture_id',
    'home_normal_goal', 'home_penalty', 'home_own_goal', 'home_missed_penalty',
    'away_normal_goal', 'away_penalty', 'away_own_goal', 'away_missed_penalty']].copy()

# Put the goals reported in match_stats next to those counts. The inner join
# keeps only fixtures that are present in both tables.
checks = (
    match_stats_clean[['fixture_id', 'home_goals', 'away_goals']]
    .copy()
    .merge(count_checks, on='fixture_id', how='inner')
)

# Goals implied by the events: normal goals + own goals + penalties
# (missed penalties are carried along for display only).
home_goals_from_events = (
    checks['home_normal_goal'] + checks['home_own_goal'] + checks['home_penalty'])
away_goals_from_events = (
    checks['away_normal_goal'] + checks['away_own_goal'] + checks['away_penalty'])

home_mismatch_ids = checks.loc[
    checks['home_goals'] != home_goals_from_events, 'fixture_id'].unique()
away_mismatch_ids = checks.loc[
    checks['away_goals'] != away_goals_from_events, 'fixture_id'].unique()

# De-duplicated ids of fixtures where either side's goals do not add up.
ids_goals_dont_add = list(set(list(home_mismatch_ids) + list(away_mismatch_ids)))

# An empty table here means every fixture's goals agree between the two sources.
checks[checks.fixture_id.isin(ids_goals_dont_add)]

Unnamed: 0,fixture_id,home_goals,away_goals,home_normal_goal,home_penalty,home_own_goal,home_missed_penalty,away_normal_goal,away_penalty,away_own_goal,away_missed_penalty


### 1.2) Cards

In [4]:
# Card totals reported in match_stats ...
match_stats_csv_cards = match_stats_clean[
    ['fixture_id', 'home_yc', 'away_yc', 'home_rc', 'away_rc']].copy()

# ... next to the card counts derived from the events data. The left join keeps
# every match_stats fixture, whether or not it has a row in the events counts.
match_events_count_csv_cards = match_events_count_clean[[
    'fixture_id', 'home_red_card', 'away_red_card', 'home_yellow_card', 'away_yellow_card']]
cards_csv = match_stats_csv_cards.merge(
    match_events_count_csv_cards, on='fixture_id', how='left')

# Yellow cards: rows with a null home_yc are excluded from both comparisons.
# NOTE(review): the away check reuses the home_yc filter, whereas red cards are
# filtered per side below -- confirm that this is intentional.
yc_csv_clean = cards_csv[cards_csv.home_yc.notnull()]
home_yc_wrong = yc_csv_clean[yc_csv_clean['home_yc'].ne(yc_csv_clean['home_yellow_card'])]
away_yc_wrong = yc_csv_clean[yc_csv_clean['away_yc'].ne(yc_csv_clean['away_yellow_card'])]

# Red cards: each side is compared only on rows where its own stat is present.
home_rc_csv_clean = cards_csv[cards_csv.home_rc.notnull()]
home_rc_wrong = home_rc_csv_clean[
    home_rc_csv_clean['home_rc'].ne(home_rc_csv_clean['home_red_card'])]

away_rc_csv_clean = cards_csv[cards_csv.away_rc.notnull()]
away_rc_wrong = away_rc_csv_clean[
    away_rc_csv_clean['away_rc'].ne(away_rc_csv_clean['away_red_card'])]

# De-duplicated ids of fixtures that failed any of the four card comparisons.
ids_cards_wrong_in_events = list({
    fixture_id
    for wrong in (home_rc_wrong, away_rc_wrong, home_yc_wrong, away_yc_wrong)
    for fixture_id in wrong.fixture_id.unique()
})

# Show each set of mismatches under its own label; empty tables mean no mismatches.
for label, wrong in (
        ("Home YC", home_yc_wrong),
        ("Home RC", home_rc_wrong),
        ("Away YC", away_yc_wrong),
        ("Away RC", away_rc_wrong)):
    display(label, wrong)

'Home YC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card


'Home RC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card


'Away YC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card


'Away RC'

Unnamed: 0,fixture_id,home_yc,away_yc,home_rc,away_rc,home_red_card,away_red_card,home_yellow_card,away_yellow_card


### 1.3) Double-check that events have no zero or negative values in time elapsed

In [5]:
# Flag events whose elapsed time is zero or negative; an empty table means none exist.
non_positive_elapsed = match_events_clean['event_time_elapsed'].le(0)
match_events_clean.loc[non_positive_elapsed]

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,event_team_id,event_team_name,event_player_name,event_time_elapsed,event_time_elapsed_plus,event_type,event_detail,event_comments


## 2) Check match_stats column relationships are valid

### 2.1) Total shots = Sum(shots_ont + shots_offt + shots_bl)

In [6]:
# For each side, show fixtures where on-target + off-target + blocked shots do
# not equal the reported total. An empty table means every row is consistent.
for ha in ['home', 'away']:
    shots_by_outcome = (
        match_stats_clean[f"{ha}_shots_ont"]
        + match_stats_clean[f"{ha}_shots_offt"]
        + match_stats_clean[f"{ha}_shots_bl"]
    )
    outcome_mismatch = shots_by_outcome.ne(match_stats_clean[f"{ha}_shots_tot"])
    display(match_stats_clean[outcome_mismatch])

Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


### 2.2) Total shots = Sum(shots_inb + shots_outb)

In [7]:
# Only rows that have an inside-the-box figure can be validated.
# NOTE(review): the null filter looks at home_shots_inb only and is reused for
# the away comparison -- confirm that home and away values are always missing together.
stats_shotsinb_clean = match_stats_clean[match_stats_clean['home_shots_inb'].notnull()]

# For each side, show fixtures where inside-box + outside-box shots do not
# equal the reported total. An empty table means every row is consistent.
for ha in ['home', 'away']:
    shots_by_location = (
        stats_shotsinb_clean[f"{ha}_shots_inb"] + stats_shotsinb_clean[f"{ha}_shots_outb"])
    location_mismatch = shots_by_location.ne(stats_shotsinb_clean[f"{ha}_shots_tot"])
    display(stats_shotsinb_clean[location_mismatch])



Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


Unnamed: 0,fixture_id,country,league_name,league_id,league_type,league_season,fixture_date,fixture_round,fixture_status,fixture_elapsed,fixture_venue,fixture_referee,fixture_result_ht,fixture_result_ft,fixture_result_et,fixture_result_pen,home_team_name,home_team_id,away_team_name,away_team_id,home_goals,away_goals,home_shots_ont,away_shots_ont,home_shots_offt,away_shots_offt,home_shots_tot,away_shots_tot,home_shots_inb,away_shots_inb,home_shots_outb,away_shots_outb,home_passes_acc,away_passes_acc,home_passes_tot,away_passes_tot,home_passes_pct,away_passes_pct,home_possession,away_possession,home_corners,away_corners,home_offsides,away_offsides,home_fouls,away_fouls,home_yc,away_yc,home_rc,away_rc,home_gksaves,away_gksaves,home_shots_bl,away_shots_bl


### 2.3) Passes_pct = passes_acc/passes_tot

In [8]:
# Rows without pass statistics cannot be validated, so they are left out.
# NOTE(review): the null filter looks at home_passes_tot only and is reused for
# the away comparison -- confirm that home and away values are always missing together.
stats_no_null_pass = match_stats_clean[~match_stats_clean.home_passes_tot.isnull()]

for ha in ['home', 'away']:
    calc_col = f"calc_{ha}_passes_pct"

    # Recompute the pass percentage once (accurate / total, rounded to two
    # decimals) and reuse it for both the comparison and the displayed column.
    calc_passes_pct = (
        stats_no_null_pass[f"{ha}_passes_acc"] / stats_no_null_pass[f"{ha}_passes_tot"]
    ).apply(lambda x: round(x, 2))
    pct_mismatch = calc_passes_pct != stats_no_null_pass[f"{ha}_passes_pct"]

    # .assign() returns a new frame, so the calculated column is never written
    # onto a filtered slice of match_stats_clean (which raised
    # SettingWithCopyWarning before). The variable name is kept as-is so that
    # any later cell referring to it keeps working.
    pacces_pct_wrong = stats_no_null_pass[pct_mismatch].assign(
        **{calc_col: calc_passes_pct[pct_mismatch]})

    # An empty table means the stored percentage matches the recomputed one.
    display(pacces_pct_wrong[[
        'fixture_date', 'home_team_name', 'away_team_name',
        f"{ha}_passes_acc", f"{ha}_passes_tot", f"{ha}_passes_pct", calc_col]])


Unnamed: 0,fixture_date,home_team_name,away_team_name,home_passes_acc,home_passes_tot,home_passes_pct,calc_home_passes_pct


Unnamed: 0,fixture_date,home_team_name,away_team_name,away_passes_acc,away_passes_tot,away_passes_pct,calc_away_passes_pct


### 2.4) Sum(Home_possession + away_possession) = 1

In [9]:
# Possession shares are floats, so two valid shares can add up to a value that
# differs from 1 by a rounding error. Compare with a small tolerance instead of
# testing exact equality.
POSSESSION_SUM_TOL = 1e-9

possession_sum = match_stats_clean.home_possession + match_stats_clean.away_possession

# Negating the "close enough to 1" test (rather than testing "far from 1")
# keeps rows with a missing possession value flagged, exactly as `!= 1` did.
possession_sum_ok = (possession_sum - 1).abs() <= POSSESSION_SUM_TOL
possession_sum_wrong = match_stats_clean[~possession_sum_ok]

if len(possession_sum_wrong) > 0:
    # .assign() returns a new frame, so nothing is written onto a filtered
    # slice of match_stats_clean (which raised SettingWithCopyWarning before).
    possession_sum_wrong = possession_sum_wrong.assign(
        calc_possession_sum=possession_sum[~possession_sum_ok])

    display(possession_sum_wrong[[
        'fixture_date', 'home_team_name', 'away_team_name',
        'home_possession', 'away_possession', 'calc_possession_sum']])
else:
    print("All sum to 1")

All sum to 1
