In [2]:
# Import thư viện cần thiết
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests

In [None]:
output_dir = "data_raw"
os.makedirs(output_dir, exist_ok=True)

base_url = "https://www.football-data.co.uk/mmz4281/{}/E0.csv"
seasons = [f"{str(year)[-2:]}{str(year + 1)[-2:]}" for year in range(2015, 2024)]

for season in seasons:
    url = base_url.format(season)
    output_file = os.path.join(output_dir, f"E0_{season}.csv")
    
    print(f"Đang tải: {url}")
    
    try:
        response = requests.get(url)
        response.raise_for_status()  # Kiểm tra lỗi HTTP
        
        # Lưu file CSV
        with open(output_file, "wb") as f:
            f.write(response.content)
        print(f"Lưu thành công: {output_file}")
    
    except requests.exceptions.RequestException as e:
        print(f"Lỗi khi tải {url}: {e}")

print("Quá trình tải hoàn tất!")


In [6]:
output_dir = 'data_raw'
os.makedirs(output_dir, exist_ok=True)
base_url = "https://www.football-data.co.uk/mmz4281/{}/E0.csv"

for year in range(2015, 2024):
    season = f'{str(year - 1)[-2:]}{str(year)[-2:]}'
    url = base_url.format(season)
    output_file = os.path.join(output_dir, f'E0_{year}.csv')
        
    try:
        response = requests.get(url)
        response.raise_for_status()
        with open(output_file, 'wb') as f:
            f.write(response.content)
        print(f'successfully saved {output_file}')
    except requests.exceptions.RequestException as e:
        print(f'error loading {url}: {e}')

successfully saved data_raw\E0_2015.csv
successfully saved data_raw\E0_2016.csv
successfully saved data_raw\E0_2017.csv
successfully saved data_raw\E0_2018.csv
successfully saved data_raw\E0_2019.csv
successfully saved data_raw\E0_2020.csv
successfully saved data_raw\E0_2021.csv
successfully saved data_raw\E0_2022.csv
successfully saved data_raw\E0_2023.csv


In [None]:
output_dir = 'data_cleaned'
os.makedirs(output_dir, exist_ok=True)

seasons = [f"{str(year)[-2:]}{str(year + 1)[-2:]}" for year in range(2019, 2024)]

for season in seasons:
    df = pd.read_csv(f"data_raw/E0_{season}.csv")
    # Các cột giữ lại
    columns_to_keep = [
        'Date', 'HomeTeam', 'AwayTeam', 
        'AvgH', 'AvgD', 'AvgA', 
        'Avg>2.5', 'Avg<2.5', 
        'AvgAHH', 'AvgAHA'
    ]

    # Lọc dữ liệu với các cột quan trọng
    filtered_df = df[columns_to_keep]

    # Hiển thị dữ liệu sau khi loại bỏ cột không cần thiết
    print(df.head())
    filtered_df.to_csv(f"{output_dir}/E0_{season}.csv")
    



FileNotFoundError: [Errno 2] No such file or directory: 'data_raw/E0_1920.csv'

In [5]:
pip install utils

Collecting utils
  Downloading utils-1.0.2.tar.gz (13 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: utils
  Building wheel for utils (setup.py): started
  Building wheel for utils (setup.py): finished with status 'done'
  Created wheel for utils: filename=utils-1.0.2-py2.py3-none-any.whl size=13934 sha256=eeb6f07edf5d5f2306a437c691b66173b334228442db0401dccbe606393a3f61
  Stored in directory: c:\users\admin\appdata\local\pip\cache\wheels\b6\a1\81\1036477786ae0e17b522f6f5a838f9bc4288d1016fc5d0e1ec
Successfully built utils
Installing collected packages: utils
Successfully installed utils-1.0.2
Note: you may need to restart the kernel to use updated packages.


In [2]:
import functools
import numpy as np

def conjunction(*conditions):
    return functools.reduce(np.logical_and, conditions)

def union(*conditions):
    return functools.reduce(np.logical_or, conditions)

In [4]:
import pandas as pd
import numpy as np
import datetime
import pdb

class Dataset():
    def __init__(self, config):
        self.config = config

        self.load()


    def load(self):
        data = pd.read_csv(self.config.datadir+'epl.csv')
        self.raw = data.copy()

        # preprocess data
        data = self.preprocess(data)

        # split data
        train, test = self.split(data)
        self.train_set = train
        self.test_set = test


    def preprocess(self, data):
        print('preprocessing data...')

        # modify date format
        data['Date'] = data['Date'].apply(lambda x : datetime.datetime.strptime(x, '%d/%m/%y').strftime('%Y-%m-%d'))

        # average out betting odds
        data['Hodds'] = np.mean(data[['B365H','BWH','GBH','IWH','LBH','SBH','WHH','SJH','VCH','BSH']],axis=1)
        data['Dodds'] = np.mean(data[['B365D','BWD','GBD','IWD','LBD','SBD','WHD','SJD','VCD','BSD']],axis=1)
        data['Aodds'] = np.mean(data[['B365A','BWA','GBA','IWA','LBA','SBA','WHA','SJA','VCA','BSA']],axis=1)

        # filter columns - meta data @ http://www.football-data.co.uk/notes.txt
        use_col = ['Date','HomeTeam','AwayTeam','FTHG','FTAG','FTR','HTHG','HTAG','HTR','Referee','HS','AS','HST','AST',
                   'HC','AC','HF','AF','HY','AY','HR','AR','Hodds','Dodds','Aodds']
        data = data[use_col]

        # accumulate histories
        # : referenced http://andrew.carterlunn.co.uk/programming/2018/02/20/beating-the-bookmakers-with-tensorflow.html
        acc_hist = {'home_wins' : [], 'home_draws' : [], 'home_losses' : [], 'home_goals' : [], 'home_oppos_goals' : [],
                    'home_shots' : [], 'home_oppos_shots' : [], 'home_shotontarget' : [], 'home_oppos_shotontarget' : [],
                    'away_wins' : [], 'away_draws' : [], 'away_losses' : [], 'away_goals' : [], 'away_oppos_goals' : [],
                    'away_shots' : [], 'away_oppos_shots' : [], 'away_shotontarget' : [], 'away_oppos_shotontarget' : [],
                    'home_oppos_wins' : [], 'home_oppos_draws' : [], 'home_oppos_losses' : [],
                    'home_fouls' : [], 'home_yellowcards' : [], 'home_redcards' : [], 'home_cornerkicks' : [],
                    'home_oppos_cornerkicks' : [], 'home_oppos_fouls' : [], 'home_oppos_yellowcards' : [], 'home_oppos_redcards' : [],
                    'away_fouls' : [], 'away_yellowcards' : [], 'away_redcards' : [], 'away_cornerkicks' : [],
                    'away_oppos_cornerkicks' : [],'away_oppos_fouls' : [], 'away_oppos_yellowcards' : [], 'away_oppos_redcards' : []}
        d = 0
        for row in data.iterrows() :
            hometeam = row[1]['HomeTeam']
            awayteam = row[1]['AwayTeam']
            date = row[1]['Date']

            # filter matches with same playing teams
            temp1 = data[conjunction(data['HomeTeam']==hometeam, data['AwayTeam']==awayteam)]
            temp2 = data[conjunction(data['HomeTeam']==awayteam, data['AwayTeam']==hometeam)]
            temp = pd.concat([temp1, temp2], axis=0)
            history = temp[temp['Date']<date].sort_values(by='Date').tail(self.config.lookback_opp_matches)
            # if opponent history is too short, continue
            if len(history) < self.config.lookback_opp_matches :
                for key in list(acc_hist.keys()) :
                    acc_hist[key].append(np.nan)
                continue

            # compute average number of goals scored against opponent in the past N matches with the opponent
            home = history[history['HomeTeam'] == hometeam]
            away = history[history['AwayTeam'] == hometeam]
            home_sum = np.sum(home[['FTHG','FTAG','HS','AS','HST','AST','HC','AC','HF','AF','HY','AY','HR','AR']])
            away_sum = np.sum(away[['FTHG','FTAG','HS','AS','HST','AST','HC','AC','HF','AF','HY','AY','HR','AR']])


            # filter recent N matches of both home and away
            home = data[union(data['HomeTeam']==hometeam, data['AwayTeam']==hometeam)]
            home = home[home['Date']<date].sort_values(by='Date').tail(self.config.lookback_matches)
            away = data[union(data['HomeTeam']==awayteam, data['AwayTeam']==awayteam)]
            away = away[away['Date']<date].sort_values(by='Date').tail(self.config.lookback_matches)

            # if match history is too short, continue
            if len(home) < self.config.lookback_matches or len(away) < self.config.lookback_matches :
                for key in list(acc_hist.keys()) :
                    acc_hist[key].append(np.nan)
                continue

            home_home_sum = np.sum(home[home['HomeTeam']==hometeam][['FTHG','HS','HST','HC','HF','HY','HR']])
            home_away_sum = np.sum(home[home['AwayTeam']==hometeam][['FTAG','AS','AST','AC','AF','AY','AR']])
            away_home_sum = np.sum(away[away['HomeTeam']==awayteam][['FTHG','HS','HST','HC','HF','HY','HR']])
            away_away_sum = np.sum(away[away['AwayTeam']==awayteam][['FTAG','AS','AST','AC','AF','AY','AR']])

            # append computation results to dictionary
            acc_hist['home_oppos_goals'].append((home_sum['FTHG'] + away_sum['FTAG']) / self.config.lookback_opp_matches)
            acc_hist['away_oppos_goals'].append((home_sum['FTAG'] + away_sum['FTHG']) / self.config.lookback_opp_matches)
            acc_hist['home_oppos_shots'].append((home_sum['HS'] + away_sum['AS']) / self.config.lookback_opp_matches)
            acc_hist['away_oppos_shots'].append((home_sum['AS'] + away_sum['HS']) / self.config.lookback_opp_matches)
            acc_hist['home_oppos_shotontarget'].append((home_sum['HST'] + away_sum['AST']) / self.config.lookback_opp_matches)
            acc_hist['away_oppos_shotontarget'].append((home_sum['AST'] + away_sum['HST']) / self.config.lookback_opp_matches)
            acc_hist['home_oppos_cornerkicks'].append((home_sum['HC'] + away_sum['AC']) / self.config.lookback_opp_matches)
            acc_hist['away_oppos_cornerkicks'].append((home_sum['AC'] + away_sum['HC']) / self.config.lookback_opp_matches)
            acc_hist['home_oppos_fouls'].append((home_sum['HF'] + away_sum['AF']) / self.config.lookback_opp_matches)
            acc_hist['away_oppos_fouls'].append((home_sum['AF'] + away_sum['HF']) / self.config.lookback_opp_matches)
            acc_hist['home_oppos_yellowcards'].append((home_sum['HY'] + away_sum['AY']) / self.config.lookback_opp_matches)
            acc_hist['away_oppos_yellowcards'].append((home_sum['AY'] + away_sum['HY']) / self.config.lookback_opp_matches)
            acc_hist['home_oppos_redcards'].append((home_sum['HR'] + away_sum['AR']) / self.config.lookback_opp_matches)
            acc_hist['away_oppos_redcards'].append((home_sum['AR'] + away_sum['HR']) / self.config.lookback_opp_matches)

            acc_hist['home_goals'].append((home_home_sum['FTHG'] + home_away_sum['FTAG']) / self.config.lookback_matches)
            acc_hist['away_goals'].append((away_home_sum['FTHG'] + away_away_sum['FTAG']) / self.config.lookback_matches)
            acc_hist['home_shots'].append((home_home_sum['HS'] + home_away_sum['AS']) / self.config.lookback_matches)
            acc_hist['away_shots'].append((away_home_sum['HS'] + away_away_sum['AS']) / self.config.lookback_matches)
            acc_hist['home_shotontarget'].append((home_home_sum['HST'] + home_away_sum['AST']) / self.config.lookback_matches)
            acc_hist['away_shotontarget'].append((away_home_sum['HST'] + away_away_sum['AST']) / self.config.lookback_matches)
            acc_hist['home_cornerkicks'].append((home_home_sum['HC'] + home_away_sum['AC']) / self.config.lookback_matches)
            acc_hist['away_cornerkicks'].append((away_home_sum['HC'] + away_away_sum['AC']) / self.config.lookback_matches)
            acc_hist['home_fouls'].append((home_home_sum['HF'] + home_away_sum['AF']) / self.config.lookback_matches)
            acc_hist['away_fouls'].append((away_home_sum['HF'] + away_away_sum['AF']) / self.config.lookback_matches)
            acc_hist['home_yellowcards'].append((home_home_sum['HY'] + home_away_sum['AY']) / self.config.lookback_matches)
            acc_hist['away_yellowcards'].append((away_home_sum['HY'] + away_away_sum['AY']) / self.config.lookback_matches)
            acc_hist['home_redcards'].append((home_home_sum['HR'] + home_away_sum['AR']) / self.config.lookback_matches)
            acc_hist['away_redcards'].append((away_home_sum['HR'] + away_away_sum['AR']) / self.config.lookback_matches)


            # count ratio of wins / draws / losses in the past N matches of Home vs Away
            res = []
            for r in history.iterrows() :
                if r[1]['HomeTeam'] == hometeam :
                    res.append(r[1]['FTR'])
                else :
                    if r[1]['FTR'] == 'A' :
                        res.append('H')
                    elif r[1]['FTR'] == 'H' :
                        res.append('A')
                    else :
                        res.append('D')
            acc_hist['home_oppos_wins'].append(res.count('H') / self.config.lookback_opp_matches)
            acc_hist['home_oppos_draws'].append(res.count('D') / self.config.lookback_opp_matches)
            acc_hist['home_oppos_losses'].append(res.count('A') / self.config.lookback_opp_matches)


            # count ratio of wins / draws / losses in the past N matches
            res = []
            for r in home.iterrows() :
                if r[1]['HomeTeam'] == hometeam :
                    res.append(r[1]['FTR'])
                else :
                    if r[1]['FTR'] == 'A' :
                        res.append('H')
                    elif r[1]['FTR'] == 'H' :
                        res.append('A')
                    else :
                        res.append('D')
            acc_hist['home_wins'].append(res.count('H') / self.config.lookback_matches)
            acc_hist['home_draws'].append(res.count('D') / self.config.lookback_matches)
            acc_hist['home_losses'].append(res.count('A') / self.config.lookback_matches)

            res = []
            for r in away.iterrows() :
                if r[1]['HomeTeam'] == awayteam :
                    res.append(r[1]['FTR'])
                else :
                    if r[1]['FTR'] == 'A' :
                        res.append('H')
                    elif r[1]['FTR'] == 'H' :
                        res.append('A')
                    else :
                        res.append('D')
            acc_hist['away_wins'].append(res.count('H') / self.config.lookback_matches)
            acc_hist['away_draws'].append(res.count('D') / self.config.lookback_matches)
            acc_hist['away_losses'].append(res.count('A') / self.config.lookback_matches)

        acc_hist = pd.DataFrame(acc_hist)
        data = pd.concat([data, acc_hist], axis=1)
        data = data.dropna()

        return data

    def split(self, data):
        train = data[pd.to_datetime(data['Date']).dt.year.apply(lambda x : x not in self.config.test_years)]
        test = data[pd.to_datetime(data['Date']).dt.year.apply(lambda x : x in self.config.test_years)]
        return train, test


    def get_data_info(self):
        print('train set size : {}'.format(self.train_set.shape))
        print('test set size : {}'.format(self.test_set.shape))
        print('columns : \n {}'.format(list(self.train_set.columns)))
        print('data sample : \n {}'.format(self.train_set.head(10)))