In [1]:
import pandas as pd
import numpy as np
import os
import requests
import datetime
import functools

# COLLECT RAW DATA

In [None]:
os.makedirs('data_raw', exist_ok=True)
base_url = "https://www.football-data.co.uk/mmz4281/{}/E0.csv"
all_data = []

for year in range(2010, 2024):

    season = f'{str(year - 1)[-2:]}{str(year)[-2:]}'
    url = base_url.format(season)
    
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(f'data_raw/E0_{year}.csv', 'wb') as f:
            f.write(response.content)
        print(f'Successfully loaded data for season {year - 1}/{year}')

        season_data = pd.read_csv(f'data_raw/E0_{year}.csv')
        all_data.append(season_data)
    
    except requests.exceptions.RequestException as e:
        print(f'Error loading {url}: {e}')

if all_data:
    combined_data = pd.concat(all_data, ignore_index=True)    
    combined_data.to_csv(f'data_raw/E0_combined_2010_2023.csv', index=False)
    print(f'Successfully saved combined data')
else:
    print('No data was downloaded.')


# PREPROCESS DATA

In [9]:
def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)

def union(*conditions):
    return functools.reduce(np.logical_or, conditions)

lookback_opp_matches = 3
lookback_matches = 5

In [63]:
data = pd.read_csv(f'data_raw/E0_combined_2010_2023.csv')

# date format
data.dropna(subset=['Date'], inplace=True)
data = data.dropna(subset=['Date'])
data['Date'] = pd.to_datetime(data['Date'], errors='coerce')
print(data['Date'].isna().sum())
data = data.dropna(subset=['Date'])
data['Date'] = data['Date'].dt.strftime('%Y-%m-%d')

  data['Date'] = pd.to_datetime(data['Date'], errors='coerce')


0


In [64]:
# average out betting odds
data['Hodds'] = np.mean(data[['B365H','BWH','IWH','LBH','PSH','WHH','SJH','VCH']],axis=1)
data['Dodds'] = np.mean(data[['B365D','BWD','IWD','LBD','PSD','WHD','SJD','VCD']],axis=1)
data['Aodds'] = np.mean(data[['B365A','BWA','IWA','LBA','PSA','WHA','SJA','VCA']],axis=1)

# Div,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,HS,AS,HST,AST,HF,AF,HC
# ,AC,HY,AY,HR,AR,B365H,B365D,B365A,BWH,BWD,BWA,IWH,IWD,IWA,LBH,LBD,LBA,PSH,PSD,PSA,WHH,WHD,WHA,SJH
# ,SJD,SJA,VCH,VCD,VCA,Bb1X2,BbMxH,BbAvH,BbMxD,BbAvD,BbMxA,BbAvA,BbOU,BbMx>2.5,BbAv>2.5,BbMx<2.5,BbAv<2.5
# ,BbAH,BbAHh,BbMxAHH,BbAvAHH,BbMxAHA,BbAvAHA,PSCH,PSCD,PSCA

use_col = ['Date','HomeTeam','AwayTeam','FTHG','FTAG','FTR','HTHG','HTAG','HTR','Referee','HS','AS','HST','AST',
            'HC','AC','HF','AF','HY','AY','HR','AR','Hodds','Dodds','Aodds']
data = data[use_col]


In [65]:
acc_hist = {'home_wins' : [], 'home_draws' : [], 'home_losses' : [], 'home_goals' : [], 'home_oppos_goals' : [],
            'home_shots' : [], 'home_oppos_shots' : [], 'home_shotontarget' : [], 'home_oppos_shotontarget' : [],
            'away_wins' : [], 'away_draws' : [], 'away_losses' : [], 'away_goals' : [], 'away_oppos_goals' : [],
            'away_shots' : [], 'away_oppos_shots' : [], 'away_shotontarget' : [], 'away_oppos_shotontarget' : [],
            'home_oppos_wins' : [], 'home_oppos_draws' : [], 'home_oppos_losses' : [],
            'home_fouls' : [], 'home_yellowcards' : [], 'home_redcards' : [], 'home_cornerkicks' : [],
            'home_oppos_cornerkicks' : [], 'home_oppos_fouls' : [], 'home_oppos_yellowcards' : [], 'home_oppos_redcards' : [],
            'away_fouls' : [], 'away_yellowcards' : [], 'away_redcards' : [], 'away_cornerkicks' : [],
            'away_oppos_cornerkicks' : [],'away_oppos_fouls' : [], 'away_oppos_yellowcards' : [], 'away_oppos_redcards' : [],
            
            'wins' : [], 'draws' : [], 'losses' : [], 
            'oppos_wins' : [], 'oppos_draws' : [], 'oppos_losses' : [],

            'goals' : [], 'shots' : [], 'shotontarget' : [], 
            'cornerkicks' : [], 'fouls' : [], 'yellowcards' : [], 'redcards' : [],  
            'oppos_goals' : [], 'oppos_shots' : [], 'oppos_shotontarget' : [], 
            'oppos_cornerkicks' : [], 'oppos_fouls' : [], 'oppos_yellowcards' : [], 'oppos_redcards' : []
            }

In [66]:
for row in data.iterrows() :
    hometeam = row[1]['HomeTeam']
    awayteam = row[1]['AwayTeam']
    date = row[1]['Date']

    # filter matches with same playing teams
    temp1 = data[conjunction(data['HomeTeam']==hometeam, data['AwayTeam']==awayteam)]
    temp2 = data[conjunction(data['HomeTeam']==awayteam, data['AwayTeam']==hometeam)]
    temp = pd.concat([temp1, temp2], axis=0)
    history = temp[temp['Date']<date].sort_values(by='Date').tail(lookback_opp_matches)
    # if opponent history is too short, continue
    if len(history) < lookback_opp_matches :
        for key in list(acc_hist.keys()) :
            acc_hist[key].append(np.nan)
        continue

    # compute average number of goals scored against opponent in the past N matches with the opponent
    home = history[history['HomeTeam'] == hometeam]
    away = history[history['AwayTeam'] == hometeam]
    home_sum = np.sum(home[['FTHG','FTAG','HS','AS','HST','AST','HC','AC','HF','AF','HY','AY','HR','AR']])
    away_sum = np.sum(away[['FTHG','FTAG','HS','AS','HST','AST','HC','AC','HF','AF','HY','AY','HR','AR']])


    # filter recent N matches of both home and away
    home = data[union(data['HomeTeam']==hometeam, data['AwayTeam']==hometeam)]
    home = home[home['Date']<date].sort_values(by='Date').tail(lookback_matches)
    away = data[union(data['HomeTeam']==awayteam, data['AwayTeam']==awayteam)]
    away = away[away['Date']<date].sort_values(by='Date').tail(lookback_matches)

    # if match history is too short, continue
    if len(home) < lookback_matches or len(away) < lookback_matches :
        for key in list(acc_hist.keys()) :
            acc_hist[key].append(np.nan)
        continue

    home_home_sum = np.sum(home[home['HomeTeam']==hometeam][['FTHG','HS','HST','HC','HF','HY','HR']])
    home_away_sum = np.sum(home[home['AwayTeam']==hometeam][['FTAG','AS','AST','AC','AF','AY','AR']])
    away_home_sum = np.sum(away[away['HomeTeam']==awayteam][['FTHG','HS','HST','HC','HF','HY','HR']])
    away_away_sum = np.sum(away[away['AwayTeam']==awayteam][['FTAG','AS','AST','AC','AF','AY','AR']])

    # append computation results to dictionary
    acc_hist['home_oppos_goals'].append((home_sum['FTHG'] + away_sum['FTAG']) / lookback_opp_matches)
    acc_hist['away_oppos_goals'].append((home_sum['FTAG'] + away_sum['FTHG']) / lookback_opp_matches)
    acc_hist['home_oppos_shots'].append((home_sum['HS'] + away_sum['AS']) / lookback_opp_matches)
    acc_hist['away_oppos_shots'].append((home_sum['AS'] + away_sum['HS']) / lookback_opp_matches)
    acc_hist['home_oppos_shotontarget'].append((home_sum['HST'] + away_sum['AST']) / lookback_opp_matches)
    acc_hist['away_oppos_shotontarget'].append((home_sum['AST'] + away_sum['HST']) / lookback_opp_matches)
    acc_hist['home_oppos_cornerkicks'].append((home_sum['HC'] + away_sum['AC']) / lookback_opp_matches)
    acc_hist['away_oppos_cornerkicks'].append((home_sum['AC'] + away_sum['HC']) / lookback_opp_matches)
    acc_hist['home_oppos_fouls'].append((home_sum['HF'] + away_sum['AF']) / lookback_opp_matches)
    acc_hist['away_oppos_fouls'].append((home_sum['AF'] + away_sum['HF']) / lookback_opp_matches)
    acc_hist['home_oppos_yellowcards'].append((home_sum['HY'] + away_sum['AY']) / lookback_opp_matches)
    acc_hist['away_oppos_yellowcards'].append((home_sum['AY'] + away_sum['HY']) / lookback_opp_matches)
    acc_hist['home_oppos_redcards'].append((home_sum['HR'] + away_sum['AR']) / lookback_opp_matches)
    acc_hist['away_oppos_redcards'].append((home_sum['AR'] + away_sum['HR']) / lookback_opp_matches)

    acc_hist['home_goals'].append((home_home_sum['FTHG'] + home_away_sum['FTAG']) / lookback_matches)
    acc_hist['away_goals'].append((away_home_sum['FTHG'] + away_away_sum['FTAG']) / lookback_matches)
    acc_hist['home_shots'].append((home_home_sum['HS'] + home_away_sum['AS']) / lookback_matches)
    acc_hist['away_shots'].append((away_home_sum['HS'] + away_away_sum['AS']) / lookback_matches)
    acc_hist['home_shotontarget'].append((home_home_sum['HST'] + home_away_sum['AST']) / lookback_matches)
    acc_hist['away_shotontarget'].append((away_home_sum['HST'] + away_away_sum['AST']) / lookback_matches)
    acc_hist['home_cornerkicks'].append((home_home_sum['HC'] + home_away_sum['AC']) / lookback_matches)
    acc_hist['away_cornerkicks'].append((away_home_sum['HC'] + away_away_sum['AC']) / lookback_matches)
    acc_hist['home_fouls'].append((home_home_sum['HF'] + home_away_sum['AF']) / lookback_matches)
    acc_hist['away_fouls'].append((away_home_sum['HF'] + away_away_sum['AF']) / lookback_matches)
    acc_hist['home_yellowcards'].append((home_home_sum['HY'] + home_away_sum['AY']) / lookback_matches)
    acc_hist['away_yellowcards'].append((away_home_sum['HY'] + away_away_sum['AY']) / lookback_matches)
    acc_hist['home_redcards'].append((home_home_sum['HR'] + home_away_sum['AR']) / lookback_matches)
    acc_hist['away_redcards'].append((away_home_sum['HR'] + away_away_sum['AR']) / lookback_matches)


    # count ratio of wins / draws / losses in the past N matches of Home vs Away
    res = []
    for r in history.iterrows() :
        if r[1]['HomeTeam'] == hometeam :
            res.append(r[1]['FTR'])
        else :
            if r[1]['FTR'] == 'A' :
                res.append('H')
            elif r[1]['FTR'] == 'H' :
                res.append('A')
            else :
                res.append('D')
    acc_hist['home_oppos_wins'].append(res.count('H') / lookback_opp_matches)
    acc_hist['home_oppos_draws'].append(res.count('D') / lookback_opp_matches)
    acc_hist['home_oppos_losses'].append(res.count('A') / lookback_opp_matches)


    # count ratio of wins / draws / losses in the past N matches
    res = []
    for r in home.iterrows() :
        if r[1]['HomeTeam'] == hometeam :
            res.append(r[1]['FTR'])
        else :
            if r[1]['FTR'] == 'A' :
                res.append('H')
            elif r[1]['FTR'] == 'H' :
                res.append('A')
            else :
                res.append('D')
    acc_hist['home_wins'].append(res.count('H') / lookback_matches)
    acc_hist['home_draws'].append(res.count('D') / lookback_matches)
    acc_hist['home_losses'].append(res.count('A') / lookback_matches)

    res = []
    for r in away.iterrows() :
        if r[1]['HomeTeam'] == awayteam :
            res.append(r[1]['FTR'])
        else :
            if r[1]['FTR'] == 'A' :
                res.append('H')
            elif r[1]['FTR'] == 'H' :
                res.append('A')
            else :
                res.append('D')
    acc_hist['away_wins'].append(res.count('H') / lookback_matches)
    acc_hist['away_draws'].append(res.count('D') / lookback_matches)
    acc_hist['away_losses'].append(res.count('A') / lookback_matches)

    acc_hist['wins'].append(acc_hist['home_wins'][-1] - acc_hist['away_wins'][-1])
    acc_hist['draws'].append(acc_hist['home_draws'][-1] - acc_hist['away_draws'][-1])
    acc_hist['losses'].append(acc_hist['home_losses'][-1] - acc_hist['away_losses'][-1])

    acc_hist['oppos_wins'].append(acc_hist['home_oppos_wins'][-1])
    acc_hist['oppos_draws'].append(acc_hist['home_oppos_draws'][-1])
    acc_hist['oppos_losses'].append(acc_hist['home_oppos_losses'][-1])

    acc_hist['goals'].append(acc_hist['home_goals'][-1] - acc_hist['away_goals'][-1])
    acc_hist['shots'].append(acc_hist['home_shots'][-1] - acc_hist['away_shots'][-1])
    acc_hist['shotontarget'].append(acc_hist['home_shotontarget'][-1] - acc_hist['away_shotontarget'][-1])
    acc_hist['cornerkicks'].append(acc_hist['home_cornerkicks'][-1] - acc_hist['away_cornerkicks'][-1])
    acc_hist['fouls'].append(acc_hist['home_fouls'][-1] - acc_hist['away_fouls'][-1])
    acc_hist['yellowcards'].append(acc_hist['home_yellowcards'][-1] - acc_hist['away_yellowcards'][-1])
    acc_hist['redcards'].append(acc_hist['home_redcards'][-1] - acc_hist['away_redcards'][-1])

    acc_hist['oppos_goals'].append(acc_hist['home_oppos_goals'][-1] - acc_hist['away_oppos_goals'][-1])
    acc_hist['oppos_shots'].append(acc_hist['home_oppos_shots'][-1] - acc_hist['away_oppos_shots'][-1])
    acc_hist['oppos_shotontarget'].append(acc_hist['home_oppos_shotontarget'][-1] - acc_hist['away_oppos_shotontarget'][-1])
    acc_hist['oppos_cornerkicks'].append(acc_hist['home_oppos_cornerkicks'][-1] - acc_hist['away_oppos_cornerkicks'][-1])
    acc_hist['oppos_fouls'].append(acc_hist['home_oppos_fouls'][-1] - acc_hist['away_oppos_fouls'][-1])
    acc_hist['oppos_yellowcards'].append(acc_hist['home_oppos_yellowcards'][-1] - acc_hist['away_oppos_yellowcards'][-1])
    acc_hist['oppos_redcards'].append(acc_hist['home_oppos_redcards'][-1] - acc_hist['away_oppos_redcards'][-1])

  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passkwargs)
  return reduction(axis=axis, out=out, **passk

In [67]:
data

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Referee,...,AC,HF,AF,HY,AY,HR,AR,Hodds,Dodds,Aodds
0,2009-08-15,Aston Villa,Wigan,0.0,2.0,A,0.0,1.0,A,M Clattenburg,...,6.0,15.0,14.0,2.0,2.0,0.0,0.0,1.680000,3.457143,5.128571
1,2009-08-15,Blackburn,Man City,0.0,2.0,A,0.0,1.0,A,M Dean,...,4.0,12.0,9.0,2.0,1.0,0.0,0.0,3.321429,3.207143,2.135714
2,2009-08-15,Bolton,Sunderland,0.0,1.0,A,0.0,1.0,A,A Marriner,...,7.0,16.0,10.0,2.0,1.0,0.0,0.0,2.221429,3.207143,3.142857
3,2009-08-15,Chelsea,Hull,2.0,1.0,H,1.0,1.0,D,A Wiley,...,4.0,13.0,15.0,1.0,2.0,0.0,0.0,1.175714,6.071429,16.428571
4,2009-08-15,Everton,Arsenal,1.0,6.0,A,0.0,3.0,A,M Halsey,...,9.0,11.0,13.0,0.0,0.0,0.0,0.0,3.068571,3.178571,2.278571
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5316,2023-05-28,Everton,Bournemouth,1.0,0.0,H,0.0,0.0,D,S Attwell,...,3.0,11.0,12.0,1.0,3.0,0.0,0.0,1.453333,4.460000,6.715000
5317,2023-05-28,Leeds,Tottenham,1.0,4.0,A,0.0,1.0,A,A Taylor,...,3.0,7.0,5.0,3.0,0.0,0.0,0.0,2.710000,3.738333,2.330000
5318,2023-05-28,Leicester,West Ham,2.0,1.0,H,1.0,0.0,H,S Hooper,...,5.0,8.0,10.0,1.0,1.0,0.0,0.0,1.918333,3.795000,3.565000
5319,2023-05-28,Man United,Fulham,2.0,1.0,H,1.0,1.0,D,R Jones,...,4.0,14.0,10.0,1.0,2.0,0.0,0.0,1.501667,4.463333,5.765000


In [68]:
acc_hist = pd.DataFrame(acc_hist)
acc_hist 

Unnamed: 0,home_wins,home_draws,home_losses,home_goals,home_oppos_goals,home_shots,home_oppos_shots,home_shotontarget,home_oppos_shotontarget,away_wins,...,fouls,yellowcards,redcards,oppos_goals,oppos_shots,oppos_shotontarget,oppos_cornerkicks,oppos_fouls,oppos_yellowcards,oppos_redcards
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,0.0,0.6,0.4,0.8,0.666667,11.8,14.000000,4.0,4.333333,0.4,...,2.6,0.4,0.2,-2.333333,0.000000,-3.000000,-0.333333,2.333333,1.000000,0.000000
5316,0.0,0.4,0.6,1.0,1.333333,11.8,16.333333,4.0,5.000000,0.2,...,2.2,0.6,0.0,-2.000000,2.333333,-2.666667,-2.000000,1.666667,1.666667,0.333333
5317,0.2,0.4,0.4,0.8,2.000000,9.0,12.333333,4.0,4.333333,0.2,...,2.2,0.2,0.0,0.333333,2.333333,0.000000,1.333333,2.000000,-1.000000,0.000000
5318,0.8,0.0,0.2,1.6,1.666667,18.2,14.000000,6.0,6.666667,0.4,...,-2.4,0.6,0.0,0.666667,2.666667,1.000000,1.333333,0.000000,-1.000000,0.000000


In [69]:

data = pd.concat([data, acc_hist], axis=1)
data.drop(columns=['FTHG','FTAG','HTHG','HTAG','HTR','Referee','HS','AS','HST','AST',
            'HC','AC','HF','AF','HY','AY','HR','AR', 
            'home_wins', 'home_draws', 'home_losses', 
            'home_goals', 'home_oppos_goals', 
            'home_shots', 'home_oppos_shots', 
            'home_shotontarget', 'home_oppos_shotontarget', 
            'away_wins', 'away_draws', 'away_losses', 
            'away_goals', 'away_oppos_goals', 
            'away_shots', 'away_oppos_shots', 
            'away_shotontarget', 'away_oppos_shotontarget', 
            'home_oppos_wins', 'home_oppos_draws', 'home_oppos_losses', 
            'home_fouls', 'home_yellowcards', 'home_redcards', 
            'home_cornerkicks', 'home_oppos_cornerkicks', 
            'home_oppos_fouls', 'home_oppos_yellowcards', 'home_oppos_redcards', 
            'away_fouls', 'away_yellowcards', 'away_redcards', 
            'away_cornerkicks','away_oppos_cornerkicks', 
            'away_oppos_fouls', 'away_oppos_yellowcards', 'away_oppos_redcards'
], inplace=True)




data.rename(columns={'FTR' : 'Result'}, inplace=True)
data.dropna(inplace=True)
data.to_csv('data_processed/E0.csv', index=False)

In [70]:
from sklearn.preprocessing import LabelEncoder

label_encoder_home = LabelEncoder()
label_encoder_away = LabelEncoder()

# Fit the encoders on the entire dataset
data['HomeTeam'] = label_encoder_home.fit_transform(data['HomeTeam'])
data['AwayTeam'] = label_encoder_away.fit_transform(data['AwayTeam'])

label_encoder = LabelEncoder()
data['Result'] = label_encoder.fit_transform(data['Result'])

In [71]:
data

Unnamed: 0,Date,HomeTeam,AwayTeam,Result,Hodds,Dodds,Aodds,wins,draws,losses,...,fouls,yellowcards,redcards,oppos_goals,oppos_shots,oppos_shotontarget,oppos_cornerkicks,oppos_fouls,oppos_yellowcards,oppos_redcards
552,2010-12-26,1,31,0,3.042857,3.285714,2.341429,-0.4,0.0,0.4,...,0.2,0.2,0.0,-0.333333,-11.666667,-9.666667,-5.333333,4.000000,0.333333,0.000000
553,2010-12-26,3,28,0,2.240000,3.214286,3.285714,-0.2,-0.2,0.4,...,2.0,1.0,0.0,-1.333333,-4.333333,-4.666667,0.000000,4.333333,0.333333,0.333333
555,2010-12-26,13,34,0,1.932857,3.328571,4.092857,0.0,0.2,-0.2,...,4.4,0.6,0.0,0.333333,-2.333333,-2.000000,1.000000,-4.666667,-1.000000,0.333333
556,2010-12-26,20,29,2,1.267143,5.321429,11.785714,0.2,0.0,-0.2,...,1.2,-0.4,0.0,0.333333,4.333333,3.000000,0.666667,-3.000000,-0.333333,-0.333333
558,2010-12-26,36,35,0,2.078571,3.242857,3.664286,0.2,-0.4,0.2,...,-3.2,0.4,-0.2,-1.000000,-14.000000,-9.666667,0.333333,4.666667,1.666667,0.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5315,2023-05-28,11,24,1,1.670000,4.013333,4.683333,-0.4,0.6,-0.2,...,2.6,0.4,0.2,-2.333333,0.000000,-3.000000,-0.333333,2.333333,1.000000,0.000000
5316,2023-05-28,12,5,2,1.453333,4.460000,6.715000,-0.2,0.2,0.0,...,2.2,0.6,0.0,-2.000000,2.333333,-2.666667,-2.000000,1.666667,1.666667,0.333333
5317,2023-05-28,16,31,0,2.710000,3.738333,2.330000,0.0,0.4,-0.4,...,2.2,0.2,0.0,0.333333,2.333333,0.000000,1.333333,2.000000,-1.000000,0.000000
5318,2023-05-28,17,34,2,1.918333,3.795000,3.565000,0.4,-0.2,-0.2,...,-2.4,0.6,0.0,0.666667,2.666667,1.000000,1.333333,0.000000,-1.000000,0.000000


# SPLIT PROCESSED DATA

In [72]:
test_years = [2022, 2023]
val_years = [2021]

In [73]:
test = data[pd.to_datetime(data['Date']).dt.year.apply(lambda x : x in test_years)]
val = data[pd.to_datetime(data['Date']).dt.year.apply(lambda x : x in val_years)]
train = data[pd.to_datetime(data['Date']).dt.year.apply(lambda x : x not in test_years and x not in val_years)]

train = train.drop(columns=['Date'])
val = val.drop(columns=['Date'])
test = test.drop(columns=['Date'])

In [74]:
test.to_csv('data_processed/test.csv', index=False)
val.to_csv('data_processed/val.csv', index=False)
train.to_csv('data_processed/train.csv', index=False)