# 0001.01 Data - Scope Football Data Co Uk Data

In [1]:
import pathlib
import sys
import datetime

import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt

%matplotlib inline

# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
# add the 'src' directory to path to import modules
src_dir = pathlib.Path().cwd().resolve().parent / 'src'
# sys.path entries must be plain strings: the import system ignores any other
# type, so appending the pathlib.Path object itself would have no effect.
# The membership check stops re-runs of this cell from stacking duplicates.
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))
# import my class code from the source
# %aimport src-dir.filename

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

PROJECT_DIR = pathlib.Path().cwd().resolve().parent

RAW_DATA_DIR = PROJECT_DIR / 'data' / '01-raw'
SCOPED_DATA_DIR = PROJECT_DIR / 'data' / '02-scoped'

### Interpretation Notes for football-data.co.uk Data Files


[Interpretation Notes for football-data.co.uk data files](../data/reference/notes.txt)

## Select the Top 16 European Leagues by Attendance

https://en.wikipedia.org/wiki/List_of_attendance_figures_at_domestic_professional_sports_leagues#Outdoor_sports

Referenced on 30 November 2019

## Select Raw Data Source, League and Seasons

In [2]:
# Each (nation, league) pair names one raw-data directory. List order is kept
# so that position i in 'nations' still lines up with position i in 'leagues'.
_nation_league_pairs = [
    ('germany', 'bundesliga'),
    ('england', 'english-premier-league'),
    ('spain', 'la-liga'),
    ('italy', 'serie-a'),
    ('france', 'ligue-1'),
    ('england', 'english-championship'),
    ('germany', 'bundesliga-2'),
    ('netherlands', 'eredivisie'),
    ('russian-federation', 'premier-league'),
    ('scotland', 'premiership'),
    ('portugal', 'primeira-liga'),
    ('switzerland', 'super-league'),
    ('belgium', 'first-division-a'),
    ('turkey', 'super-lig'),
    ('poland', 'ekstraklasa'),
    ('england', 'one'),
]

scope_data = {
    'nations': [pair[0] for pair in _nation_league_pairs],
    'leagues': [pair[1] for pair in _nation_league_pairs],
    # '2000-2001' through '2017-2018' inclusive
    'seasons': [str(year) + '-' + str(year + 1) for year in range(2000, 2018)],
}

In [3]:
def make_fdcuk_load_fps(top_level_dir, scope_data):
    """
    Build the paths of the raw football-data.co.uk season files that exist on disk.

    Parameters
    ----------
    top_level_dir : pathlib.Path or str
        Root of the raw data tree. Files are looked up at
        <top_level_dir>/football-data/<nation>/<league>/<season>/
        football-data-co-uk/season-data/<season>.csv
    scope_data : dict
        Must hold 'nations' and 'leagues' (parallel sequences, paired by
        position) and 'seasons' (tried for every nation/league pair).

    Returns
    -------
    list of pathlib.Path
        One entry per candidate file that exists, ordered by nation/league
        pair and then by season. Missing files are skipped silently.
    """
    # Use the argument rather than the notebook-level RAW_DATA_DIR, so the
    # function honours whatever root the caller passes in.
    root = pathlib.Path(top_level_dir)
    fps = []
    for nation, league in zip(scope_data['nations'], scope_data['leagues']):
        for season in scope_data['seasons']:
            fn = season + '.csv'
            stub = root / 'football-data' / nation / league / season
            fp = stub / 'football-data-co-uk' / 'season-data' / fn
            if fp.exists():
                fps.append(fp)
    return fps

def add_fdcuk_meta_data(df, fp):
    """
    Tag a season DataFrame with the nation, league and season taken from its path.

    The path is expected to end with
    <nation>/<league>/<season>/football-data-co-uk/season-data/<season>.csv
    which is the layout the path-building functions in this notebook produce.

    Parameters
    ----------
    df : pandas.DataFrame
        Season data. Modified in place and also returned.
    fp : pathlib.Path or str
        Path of the file `df` was read from.

    Returns
    -------
    pandas.DataFrame
        The same `df`, with 'nation', 'league' and 'season' columns set.

    Raises
    ------
    IndexError
        If `fp` has fewer than six path components.
    """
    fp = pathlib.Path(fp)
    # Index from the END of the path: the trailing layout is fixed, whereas
    # the number of leading directories depends on where the project is
    # checked out (and the separator differs between operating systems).
    parts = fp.parts
    df['nation'] = parts[-6]
    df['league'] = parts[-5]
    df['season'] = fp.stem
    return df

def read_badly_formed_csv_to_df(fp):
    """
    Read a season CSV whose rows do not all have the same number of fields.

    Such files make the normal parser fail with errors like
    "Expected 59 fields in line 17, saw 65". Here every non-blank line is
    split on commas, the first line is promoted to the column header and the
    rows are padded to the width of the longest one. All values stay strings.

    Workaround background:
    https://stackoverflow.com/questions/55188544/pandas-how-to-workaround-error-tokenizing-data

    Parameters
    ----------
    fp : pathlib.Path
        Path of the season file.

    Returns
    -------
    pandas.DataFrame
        The file contents plus the metadata columns added by
        add_fdcuk_meta_data and a 'bad' column set to 1 as a marker.

    Raises
    ------
    ValueError
        If the file contains no non-blank lines.
    """
    # Read the lines directly instead of pd.read_csv(..., sep='\n'):
    # current pandas refuses the line terminator as a separator.
    with open(fp, encoding="ISO-8859-1") as fh:
        rows = [line.rstrip('\r\n').split(',') for line in fh if line.strip()]
    if not rows:
        raise ValueError('no data lines found in ' + str(fp))

    season_df = pd.DataFrame(rows)
    # First line holds the column names; columns beyond the header width
    # end up with a missing name.
    season_df.columns = season_df.iloc[0]
    season_df = season_df.drop(index=0)
    season_df = add_fdcuk_meta_data(season_df, fp)
    season_df['bad'] = 1
    return season_df

    

def load_fdcuk_fps_as_dfs(fdcuk_fps):
    """
    Load every season file into a DataFrame tagged with nation/league/season.

    Each file is first read with the regular CSV parser (day-first dates,
    ISO-8859-1 encoding). Files the parser rejects are re-read with
    read_badly_formed_csv_to_df.

    Parameters
    ----------
    fdcuk_fps : iterable of pathlib.Path
        Season file paths.

    Returns
    -------
    list of pandas.DataFrame
        All cleanly parsed seasons first, followed by the seasons that needed
        the fallback reader (those carry a 'bad' column).
    """
    good_season_dfs = []
    bad_season_dfs = []
    for fp in fdcuk_fps:
        try:
            # Note: dayfirst=True
            # No error_bad_lines argument: raising on malformed rows is the
            # parser default, and the keyword no longer exists in pandas 2.x.
            season_df = pd.read_csv(fp, dayfirst=True, parse_dates=['Date'],
                                    engine='python', encoding="ISO-8859-1")
        except (pd.errors.ParserError, ValueError):
            # Only parse failures trigger the fallback. A bare `except:` would
            # also hide unrelated problems (wrong keyword, interrupt, ...)
            # and quietly send every file down the "bad" path.
            bad_season_dfs.append(read_badly_formed_csv_to_df(fp))
        else:
            good_season_dfs.append(add_fdcuk_meta_data(season_df, fp))

    return good_season_dfs + bad_season_dfs



def make_scoped_save_fps(top_level_dir, season_dfs, source = 'indatabet-com'):
    """
    Derive one output path per season DataFrame.

    The nation, league and season are taken from the first distinct value of
    the 'nation', 'league' and 'season' columns of each frame.

    Parameters
    ----------
    top_level_dir : pathlib.Path
        Root directory for the output tree.
    season_dfs : iterable of pandas.DataFrame
        Frames carrying 'nation', 'league' and 'season' columns.
    source : str
        Name of the directory placed directly under `top_level_dir`.

    Returns
    -------
    list of pathlib.Path
        <top_level_dir>/<source>/<nation>/<league>/<season>/<season>.csv
        for each frame, in input order.
    """
    def _label(frame, column):
        # first distinct value found in the column
        return frame[column].unique()[0]

    save_fps = []
    for frame in season_dfs:
        season = _label(frame, 'season')
        season_dir = (top_level_dir / source / _label(frame, 'nation')
                      / _label(frame, 'league') / season)
        save_fps.append(season_dir / (str(season) + '.csv'))
    return save_fps


def save_dfs_to_fps(dfs, fps):
    """
    Write each DataFrame to its paired file path as CSV (no index column).

    Parent directories are created as needed and existing files are
    overwritten. `dfs` and `fps` are paired by position; pairing stops at the
    end of the shorter one.

    Returns
    -------
    int
        Number of files written.
    """
    n_written = 0
    for frame, target in zip(dfs, fps):
        # exist_ok=True makes this a no-op when the directory is already there
        target.parent.mkdir(parents=True, exist_ok=True)
        frame.to_csv(target, index=False)
        n_written += 1
    return n_written

# Run the scoping pipeline end to end: locate the raw season files, load them,
# derive the matching output paths under SCOPED_DATA_DIR, then write the CSVs.
fdcuk_fps = make_fdcuk_load_fps(RAW_DATA_DIR, scope_data)
all_season_dfs = load_fdcuk_fps_as_dfs(fdcuk_fps)
scoped_fdcuk_fps = make_scoped_save_fps(SCOPED_DATA_DIR,
                                           all_season_dfs,
                                           source = 'football-data-co-uk')
n_saved = save_dfs_to_fps(all_season_dfs, scoped_fdcuk_fps)
# Display how many season files were written
n_saved

226

In [4]:
# NOTE(review): `stop` is not defined anywhere above, so this cell raises NameError.
# Presumably a deliberate way to halt "Run All" before the exploratory cells below - confirm.
stop

NameError: name 'stop' is not defined

In [None]:
# NOTE(review): at this point `bad_season_dfs` exists only as a local variable inside
# load_fdcuk_fps_as_dfs; a notebook-level list of that name is not created until a
# later cell, so on a fresh kernel this line raises NameError.
bad_season_dfs[-1].head()

In [None]:
# Specify scope of data for cleaning
source, nation, league = 'football-data', 'united-kingdom', 'english-premier-league'
# '2008-2009' through '2017-2018' inclusive
seasons = [str(year) + '-' + str(year + 1) for year in range(2008, 2018)]

In [None]:
def form_fdcu_fps(RAW_DATA_DIR, source, nation, league, seasons):
    """
    List the season data files that exist for one league.

    For every season the candidate path is
    <RAW_DATA_DIR>/<source>/<nation>/<league>/<season>/
    football-data-co-uk/season-data/<season>.csv

    Returns
    -------
    list of pathlib.Path
        The candidates that exist on disk, in the order of `seasons`.
        Seasons without a file are left out.
    """
    candidates = (
        RAW_DATA_DIR / source / nation / league / season
        / 'football-data-co-uk' / 'season-data' / (season + '.csv')
        for season in seasons
    )
    return [candidate for candidate in candidates if candidate.exists()]

fp = form_fdcu_fps(RAW_DATA_DIR, source, nation, league, seasons)
fp[0]

In [None]:
# (nation, league) directory pairs, kept in the original order
# poland, switzerland are multileague
_league_dirs = [
    ('germany', 'bundesliga'),
    ('united-kingdom', 'english-premier-league'),
    ('spain', 'la-liga'),
    ('italy', 'serie-a'),
    ('france', 'ligue-1'),
    ('united-kingdom', 'english-championship'),
    ('germany', 'bundesliga-2'),
    ('netherlands', 'eredivisie'),
    ('russian-federation', 'premier-league'),
    ('scotland', 'premiership'),
    ('portugal', 'primeira-liga'),
    ('switzerland', 'super-league'),
    ('belgium', 'first-division-a'),
    ('turkey', 'super-lig'),
    ('poland', 'ekstraklasa'),
    ('united-kingdom', 'one'),
]
nations = [pair[0] for pair in _league_dirs]
leagues = [pair[1] for pair in _league_dirs]

# '2000-2001' through '2017-2018' inclusive
seasons = [str(year) + '-' + str(year + 1) for year in range(2000, 2018)]

In [None]:
# Collect the existing season files for every (nation, league) pair.
# NOTE: the loop rebinds the notebook-level names `nation` and `league` (and `fps`),
# so after this cell they hold the values from the last pair, not the ones set earlier.
all_fps = []
for nation, league in zip(nations, leagues):
    fps = form_fdcu_fps(RAW_DATA_DIR, source, nation, league, seasons)
    all_fps.extend(fps)

# Number of season files found on disk
print(len(all_fps))

In [None]:
def read_helper(fp, df):
    """
    Tag a season DataFrame with the nation, league and season taken from its path.

    The path is expected to end with
    <nation>/<league>/<season>/football-data-co-uk/season-data/<season>.csv
    which is the layout form_fdcu_fps produces.

    Parameters
    ----------
    fp : pathlib.Path or str
        Path of the file `df` was read from.
    df : pandas.DataFrame
        Season data. Modified in place and also returned.

    Returns
    -------
    pandas.DataFrame
        The same `df`, with 'nation', 'league' and 'season' columns set.

    Raises
    ------
    IndexError
        If `fp` has fewer than six path components.
    """
    fp = pathlib.Path(fp)
    # Index from the END of the path: the trailing layout is fixed, whereas
    # the number of leading directories depends on where the project lives.
    parts = fp.parts
    df['nation'] = parts[-6]
    df['league'] = parts[-5]
    df['season'] = fp.stem
    return df
    
    

# Read every season file. Files the regular parser rejects are kept aside in
# bad_season_dfs as raw comma-split rows (header row NOT promoted).
good_season_dfs = []
bad_season_dfs = []
for fp in all_fps:
    try:
        # Note: dayfirst=True
        # No error_bad_lines argument: raising on malformed rows is the parser
        # default, and the keyword no longer exists in pandas 2.x.
        season_df = pd.read_csv(fp, dayfirst=True, parse_dates=['Date'],
                                engine='python', encoding="ISO-8859-1")
    except (pd.errors.ParserError, ValueError):
        # https://stackoverflow.com/questions/55188544/pandas-how-to-workaround-error-tokenizing-data
        # Skipping line 17: Expected 59 fields in line 17, saw 65
        # Split the lines by hand: current pandas rejects sep='\n' in read_csv.
        with open(fp, encoding="ISO-8859-1") as fh:
            raw_rows = [line.rstrip('\r\n').split(',') for line in fh if line.strip()]
        season_df = read_helper(fp, pd.DataFrame(raw_rows))
        bad_season_dfs.append(season_df)
    else:
        season_df = read_helper(fp, season_df)
        good_season_dfs.append(season_df)

# Only the cleanly parsed seasons go into the combined frame
df_orig = pd.concat(good_season_dfs, axis=0, sort=True)
df_orig.head()

In [None]:
df_orig.shape

In [None]:
# Drop any columns that are all null
df_orig = df_orig.dropna(axis=1, how='all')
df_orig.shape

In [None]:
df_orig.info(verbose=True, null_counts=True)

In [None]:
# Some files use different column names for the same thing,
# e.g. Away / AwayTeam, Home / HomeTeam, AG / FTAG, HG / FTHG.
# Fill the gaps in the preferred column from its alternative spelling.
_preferred_and_alternative = [
    ('AwayTeam', 'Away'),
    ('HomeTeam', 'Home'),
    ('FTHG', 'HG'),
    ('FTAG', 'AG'),
    ('FTR', 'Res'),
    ('PSH', 'PH'),
    ('PSD', 'PD'),
    ('PSA', 'PA'),
]
for _preferred, _alternative in _preferred_and_alternative:
    df_orig[_preferred] = df_orig[_preferred].fillna(df_orig[_alternative])

df_orig[['league', 'AwayTeam', 'Away', 'HomeTeam', 'Home', 'FTHG', 'HG', 'FTAG', 'AG', 'FTR', 'Res']].info()

In [None]:
df_orig = df_orig.drop(columns=['Away', 'Home', 'HG', 'AG', 'Res'])

In [None]:
# Looks like we have got some missing team names
# If everything else is empty, these may be empty lines at the bottom of the csvs
df_1 = df_orig[df_orig['AwayTeam'].isnull()]
df_1.head(30)

In [None]:
# These are full of nans - Drop these
df_2 = df_orig[~df_orig['AwayTeam'].isnull()]

In [None]:
df_2.shape

In [None]:
# Drop columns we definitely do not want
df_2 = df_2.drop(columns=['Attendance', 'Country', 'Div', 'Season', 'Time', 'ABP', 'HBP', 'Referee', 'HTR'])
df_2 = df_2.sort_values(by=['nation', 'league', 'season'])

In [None]:
# we definitely want to use better features than just goals
# Definitely want to use Shots and Shots on target
# These are coded as HS, AS, HST, AST
msno.matrix(df_2[['HS', 'AS', 'HST', 'AST']])
plt.show();

In [None]:
# We pre-sorted by nation, league, season, so it appears that the missing blocks are by season.
# It also seems that there is less shots-on-target data than shots data, so we will
# drop any season that has missing values for HST or AST

In [None]:
# Keep only the seasons in which every match has both shots-on-target values
st_dfs = []
for (nation, league, season), df in df_2.groupby(by=['nation', 'league', 'season']):
    if df[['HST', 'AST']].notnull().all().all():
        st_dfs.append(df)

In [None]:
df_cut = pd.concat(st_dfs, axis=0)

In [None]:
df_3 = df_cut.copy(deep=True)
df_3 = df_3.dropna(axis=1, how='all')
df_3 = df_3.sort_values(by=['nation', 'league', 'season'])
df_3.info(verbose=True, null_counts=True)

In [None]:
# Looks like there are 42200 potential records

In [None]:
# We have sorted on nation, league and season
# Remove the full columns, and review how the nulls fit into the overall dataframe

#df_null_cols = df_3[df_3.columns[df_3.isnull().any()]]
msno.matrix(df_3)
plt.show();

In [None]:
df_3_full_cols = df_3[df_3.columns[~df_3.isnull().any()]]
df_3_full_cols.info()

In [None]:
# We will be keeping all these so rename
rename_d = {'AC': 'a_corners', 'AR': 'a_redCards', 'AS': 'a_shots', 'AST': 'a_shotsOnTarget',
           'AY': 'a_yellowCards', 'AwayTeam': 'a',
            'Date': 'date', 'FTAG': 'a_ftGoals', 'FTHG': 'h_ftGoals', 'FTR': 'ftResult',
            'HC': 'h_corners', 'HR': 'h_redCards', 'HS': 'h_shots', 'HST': 'h_shotsOnTarget',
            'HY': 'h_yellowCards', 'HomeTeam': 'h',
           'HTAG': 'a_htGoals', 'HTHG': 'h_htGoals', 'AF': 'a_fouls', 'HF': 'h_fouls',
           'HHW': 'h_woodWork', 'AHW': 'a_woodWork',
           'AO': 'a_offsides', 'HO': 'h_offsides'}
df_3 = df_3.rename(columns=rename_d)

In [None]:
df_4 = df_3.copy(deep=True)
# df_null_cols = df[df.columns[df.isnull().any()]]
msno.matrix(df_4[df_4.columns[df_4.isnull().any()]])
plt.show();   
# for (nation, league, season), df in df_4.groupby(by=['nation', 'league', 'season']):
#     print(nation, league, season)
#     df_null_cols = df[df.columns[df.isnull().any()]]
#     msno.matrix(df_null_cols, figsize=(14,2))
#     plt.show();    

In [None]:
# Take a look at the columns with any nulls
# A full column has 42200 values
my_cols = [col for col in df_4.columns if col not in list(rename_d.values())]
df_5 = df_4[my_cols]
df_5[df_5.columns[df_5.isnull().any()]].info(verbose=True, null_counts=True)

In [None]:
# For each game, it would be ideal if, as well as the match statistics, we could have
# Mean odds, max odds, a full set of odds from a bookmaker, the closing odds
# 42200 Games
# 

In [None]:
# Keep only the seasons with a complete set of BbMx and B365 home/draw/away odds
_required_odds_cols = ['BbMxH', 'BbMxD', 'BbMxA', 'B365A', 'B365D', 'B365H']
bb_dfs = []
for (nation, league, season), df in df_4.groupby(by=['nation', 'league', 'season']):
    if df[_required_odds_cols].notnull().all().all():
        bb_dfs.append(df)

In [None]:
df_6 = pd.concat(bb_dfs)
df_6.info(verbose=True, null_counts=True)
#df_6[df_6.columns[~df_6.isnull().any()]].info(verbose=True, null_counts=True)


In [None]:
# Drop any columns with less than threshold
thresh = int(len(df_6) * 0.75)
df_7 = df_6.copy(deep=True)
df_7 = df_7.dropna(axis=1, thresh=thresh)
df_7.info(verbose=True, null_counts=True)

In [None]:
# Re-attach the PSC* columns in case the threshold drop removed them. Only the
# ones actually missing from df_7 are added: concatenating a column that
# survived the drop would leave two columns with the same name.
_closing_cols = [col for col in ['PSCA', 'PSCD', 'PSCH'] if col not in df_7.columns]
df = pd.concat([df_7, df_6[_closing_cols]], axis=1)

In [None]:
cols = [col for col in df.columns if not col.startswith(('h', 'a'))]#'h' not in col[0:2] or 'a' not in col[0:1]]
#or (col[0] != 'a')]
print(sorted(cols))
#df_7.columns#info(verbose=True, null_counts=True)

In [None]:
# Map the raw odds column codes to descriptive names.
# Codes with a plain H/D/A triple share one naming pattern, so build those in a loop.
columns = {}
for _code, _label in [('B365', 'bet365'), ('BW', 'BW'), ('IW', 'IW'),
                      ('LB', 'LB'), ('VC', 'VC'), ('WH', 'WH')]:
    columns[_code + 'H'] = 'odds_hwin_' + _label
    columns[_code + 'D'] = 'odds_draw_' + _label
    columns[_code + 'A'] = 'odds_awin_' + _label

columns.update({
    # counts / handicap
    'Bb1X2': 'n_Bb1X2',
    'BbOU': 'n_BbOU',
    'BbAH': 'n_BbAsian',
    'BbAHh': 'BbAsian_handicap',
    # home / draw / away: mean and max
    'BbAvH': 'odds_hwin_bbmean',
    'BbAvD': 'odds_draw_bbmean',
    'BbAvA': 'odds_awin_bbmean',
    'BbMxH': 'odds_hwin_bbmax',
    'BbMxD': 'odds_draw_bbmax',
    'BbMxA': 'odds_awin_bbmax',
    # over / under 2.5 goals: mean and max
    'BbAv<2.5': 'odds_ftgoalsu2.5_bbmean',
    'BbAv>2.5': 'odds_ftgoalso2.5_bbmean',
    'BbMx<2.5': 'odds_ftgoalsu2.5_bbmax',
    'BbMx>2.5': 'odds_ftgoalso2.5_bbmax',
    # asian handicap: mean and max
    'BbAvAHH': 'odds_asianhome_bbmean',
    'BbAvAHA': 'odds_asianaway_bbmean',
    'BbMxAHH': 'odds_asianhome_bbmax',
    'BbMxAHA': 'odds_asianaway_bbmax',
    # PSC* columns
    'PSCH': 'clodds_hwin_pinn',
    'PSCD': 'clodds_draw_pinn',
    'PSCA': 'clodds_away_pinn',
})
df = df.rename(columns=columns).reset_index(drop=True)

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.info(verbose=True, null_counts=True)

In [None]:
df.describe()

In [None]:
full_cols = df[df.columns[~df.isnull().any()]].columns
list(full_cols)

In [None]:
msno.matrix(df[df.columns[df.isnull().any()]])
plt.show();

In [None]:
df['nation'].value_counts()

In [None]:
df['league'].value_counts()

In [None]:
df['season'].value_counts().sort_index()

In [None]:
# Write one CSV per (nation, league, season), each sorted by match date, to
# CLEANED_DIR/football-data-co-uk/<nation>/<league>/<season>/<season>.csv
# NOTE(review): CLEANED_DIR is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel - confirm the intended directory.
# NOTE: the loop rebinds the notebook-level names nation, league, season and source.
for (nation, league, season), seas_df in df.groupby(by=['nation', 'league', 'season']):
    # work on a copy so the sort/reset below does not touch the grouped frame
    season_df = seas_df.copy(deep=True)
    season_df.sort_values(by=['date'], inplace=True)
    season_df.reset_index(drop=True, inplace=True)
    fn = season + '.csv'
    source = 'football-data-co-uk'
    save_dir = CLEANED_DIR / source / nation / league / season
    save_fp = save_dir / fn
    save_dir.mkdir(parents=True, exist_ok=True)
    season_df.to_csv(save_fp, index=False)

In [None]:
season_df.head()

In [None]:
season_df.tail()

In [None]:
season_df.info()

In [None]:
season_df.describe()

## Test Read

In [None]:
df = pd.read_csv(save_fp, parse_dates=['date'], index_col=None)
df.head()

In [None]:
df.info()

In [None]:
# NOTE(review): `stop` is an undefined name, so this cell raises NameError.
# Presumably a deliberate way to halt "Run All" here - confirm.
stop

## End

In [None]:
df_orig[1].value_counts()

In [None]:
# In order to clean up the data, we have to see all the seasons together, so we can see
# what data is missing across the full scope of the data
# Therefore, compile everything together and take a look
leagues = ['english-premier-league', 'la-liga', 'bundesliga', 'serie-a', 'ligue-1', 'primeira-liga',
           'russian-premier-league', 'ukranian-premier-league', 'eredivisie', 'superleague', 'super-lig',
           'superliga', 'belgian-pro-league']

# NOTE(review): a single `nation` is used for every league here, so only leagues that
# live under that nation's directory will yield any files - confirm this is intended.
all_fps = []
for league in leagues:
    all_fps.extend(form_fdcu_fps(RAW_DATA_DIR, source, nation, league, seasons))

season_dfs = []
for fp in all_fps:
    # Note: dayfirst=True
    season_df = pd.read_csv(fp, dayfirst=True, parse_dates=['Date'])
    # Add the season to help navigate the merged dataframe.
    # It comes from the file name (<season>.csv): form_fdcu_fps skips missing
    # files, so pairing the paths with `seasons` by position can mislabel them.
    season_df['season'] = fp.stem
    season_dfs.append(season_df)

df = pd.concat(season_dfs, axis=0, sort=True)
df = df.sort_values(by='Date')

df.head()

In [None]:
df.info()

## Review Complete, Partially Complete Data

In [None]:
df_full = df[df.columns[~df.isnull().any()]]
df_full.info()

In [None]:
df_with_nulls = df[df.columns[df.isnull().any()]]

In [None]:
msno.matrix(df_with_nulls)
plt.show();

In [None]:
# Looks like the data is missing based on the season
# The Pinnacle closing odds data PSCA, PSCD, PSCH is missing for the first few seasons, which is disappointing
# But even with missing values, we will move this data on to the next stage
# BbAH, BbAvAHA, BbAvAHH, BbMxAHA, and BbMxAHH have got more than 99% of their data
# i.e. 3410/3420 = 0.997, so we will keep these columns

In [None]:
# Preview the frame with sparse columns removed: a column is kept only when at
# least 60% of its rows are non-null. `how` is left out because `thresh` alone
# defines the rule, and recent pandas raises TypeError when both are passed.
# The result is only displayed; `df` itself is not reassigned.
df.dropna(thresh=df.shape[0]*0.6, axis=1)

In [None]:
# NOTE(review): `stop` is an undefined name, so this cell raises NameError.
# Presumably a deliberate way to halt "Run All" here - confirm.
stop

In [None]:
# NOTE(review): INTERIM_DATA_DIR is not defined in the visible notebook and
# save_file_name is only assigned in a later cell - confirm the intended run order.
df_read = pd.read_csv(INTERIM_DATA_DIR / league / save_file_name,
                      parse_dates=['date'], index_col=None)
# Show the frame that was just read back, not the older in-memory `df`
df_read.head()

In [None]:
# Grab the first data file and take a look
fp1 = form_fdcu_fps(RAW_DATA_DIR, source, nation, league, seasons)[0]
fp1

In [None]:
league = 'english-premier-league'
GAME_DATA_DIR = RAW_DATA_DIR / 'football-data' / 'United-Kingdom' / league

# seasons 2009-2010 through to 2017-2018
start_year, end_year = 2009, 2018
seasons = ['{}-{}'.format(year, year + 1) for year in range(start_year, end_year)]
print(GAME_DATA_DIR)
save_file_name = 'football-data-' + league + '.csv'

In [None]:
# Compile Seasons into one file
season_dfs=[]
for season in seasons:
    season_fp = GAME_DATA_DIR / season / 'football-data-co-uk' / 'season-data' / str(season + '.csv')
#     season_dfs = pd.read_csv(season_fp)
#     print(season_df.head())
    season_df = pd.read_csv(season_fp, dayfirst=True, parse_dates=['Date'])
    season_df['season'] = season
    season_dfs.append(season_df)

df = pd.concat(season_dfs, axis=0, sort=True)
    
df.head()

In [None]:
# Work on a deep copy so the in-place drop/rename below leave `df` untouched
seasons_df = df.copy(deep=True)

# Drop unnecessary columns
# Referee is not available for the full data set, and Half Time Result and Full Time Result can be calculated
seasons_df.drop(columns=['Div', 'FTR', 'HTR', 'Referee', ], inplace=True)
seasons_df.rename(columns={'Date': 'date', 'HomeTeam': 'h', 'AwayTeam': 'a',
                          'FTHG': 'h_ftgoals', 'FTAG': 'a_ftgoals', 'HTHG': 'h_htgoals', 'HTAG': 'a_htgoals',
                         'HS': 'h_shots', 'AS': 'a_shots', 'HST': 'h_sot', 'AST': 'a_sot',
                         'HF': 'h_fouls', 'AF': 'a_fouls', 'HC': 'h_corners', 'AC': 'a_corners',
                         'HY': 'h_ycards', 'AY': 'a_ycards', 'HR': 'h_rcards', 'AR': 'a_rcards'}, inplace=True)

# Odds columns. Every key appears exactly once: a repeated key in a dict literal
# silently keeps only its last value, which previously mapped BbAvAHH to the
# "asianaway" name and produced two identically named columns.
columns = {'B365H': 'odds_hwin_bet365', 'B365D': 'odds_draw_bet365', 'B365A': 'odds_awin_bet365',
          'BWH': 'odds_hwin_BW', 'BWD': 'odds_draw_BW', 'BWA': 'odds_awin_BW',
          'GBH': 'odds_hwin_GB', 'GBD': 'odds_draw_GB', 'GBA': 'odds_awin_GB',
          'IWH': 'odds_hwin_IW', 'IWD': 'odds_draw_IW', 'IWA': 'odds_awin_IW',
          'LBH': 'odds_hwin_LB', 'LBD': 'odds_draw_LB', 'LBA': 'odds_awin_LB',
          'SBH': 'odds_hwin_SB', 'SBD': 'odds_draw_SB', 'SBA': 'odds_awin_SB',
          'WHH': 'odds_hwin_WH', 'WHD': 'odds_draw_WH', 'WHA': 'odds_awin_WH',
          'SJH': 'odds_hwin_SJ', 'SJD': 'odds_draw_SJ', 'SJA': 'odds_awin_SJ',
          'VCH': 'odds_hwin_VC', 'VCD': 'odds_draw_VC', 'VCA': 'odds_awin_VC',
          'BSH': 'odds_hwin_BS', 'BSD': 'odds_draw_BS', 'BSA': 'odds_awin_BS',
          'PSH': 'odds_hwin_pinn', 'PSD': 'odds_draw_pinn', 'PSA': 'odds_awin_pinn',
           'Bb1X2': 'n_Bb1X2',
           'BbMxH': 'odds_hwin_bbmax', 'BbMxD': 'odds_draw_bbmax', 'BbMxA': 'odds_awin_bbmax',
           'BbAvH': 'odds_hwin_bbmean', 'BbAvD': 'odds_draw_bbmean', 'BbAvA': 'odds_awin_bbmean',
           'BbOU': 'n_BbOU',
           'BbMx>2.5': 'odds_ftgoalso2.5_bbmax', 'BbAv>2.5': 'odds_ftgoalso2.5_bbmean',
           'BbMx<2.5': 'odds_ftgoalsu2.5_bbmax', 'BbAv<2.5': 'odds_ftgoalsu2.5_bbmean',
           'BbAH': 'n_BbAsian',
           'BbAHh': 'BbAsian_handicap',
           'BbMxAHH': 'odds_asianhome_bbmax', 'BbAvAHH': 'odds_asianhome_bbmean',
           'BbMxAHA': 'odds_asianaway_bbmax', 'BbAvAHA': 'odds_asianaway_bbmean',
           'PSCH': 'clodds_hwin_pinn', 'PSCD': 'clodds_draw_pinn', 'PSCA': 'clodds_away_pinn'}
seasons_df.rename(columns=columns, inplace=True)


seasons_df.head()

In [None]:
seasons_df.describe()

In [None]:
seasons_df.info()

In [None]:
seasons_df.to_csv(INTERIM_DATA_DIR / league / save_file_name, index=False)

## Read Back Data

In [None]:
df = pd.read_csv(INTERIM_DATA_DIR / league / save_file_name,
                      parse_dates=['date'], index_col=None)
df.head()

In [None]:
df.info()