In [1]:
import os
import numpy as np
import pandas as pd

from torch.utils.data import random_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset

## Original CSV file into merged CSV file

In [2]:
data = []
col = ["Date","HomeTeam","AwayTeam","FTR","B365H","B365D",
      "B365A","BWH","BWD","BWA","IWH","IWD","IWA","PSH","PSD","PSA","WHH","WHD","WHA",
      "VCH","VCD","VCA"]
for i in range(21,11,-1):
    temp = pd.read_csv(f"./data/EPL/{i}_{i+1}.csv")
    temp = temp[col]
    temp["Season1"] = i
    temp["Season2"] = i+1
    print(f"{i}/{i+1} season ok")
    data.append(temp)

21/22 season ok
20/21 season ok
19/20 season ok
18/19 season ok
17/18 season ok
16/17 season ok
15/16 season ok
14/15 season ok
13/14 season ok
12/13 season ok


In [3]:
print("data length : " + str(len(data)))
for i in range(len(data)) :
    print(f"data[{i}].shape = {data[i].shape}")

data length : 10
data[0].shape = (110, 24)
data[1].shape = (380, 24)
data[2].shape = (380, 24)
data[3].shape = (380, 24)
data[4].shape = (380, 24)
data[5].shape = (380, 24)
data[6].shape = (380, 24)
data[7].shape = (381, 24)
data[8].shape = (380, 24)
data[9].shape = (380, 24)


In [4]:
total = pd.concat(data)
print(f"total shape = {total.shape}")
total = total.dropna()
print(f"total after dropna shape = {total.shape}")
total = total.reset_index(drop=True)

total shape = (3531, 24)
total after dropna shape = (3528, 24)


In [5]:
total.columns.values

array(['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A',
       'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA',
       'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'Season1', 'Season2'],
      dtype=object)

In [6]:
team_name = total["HomeTeam"].unique()

## encode string data to numeric data

In [7]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

In [8]:
le.fit(team_name)
team_label = le.transform(team_name)
print(f"team name : {team_name}")
print(f"team_label : {team_label}")

team name : ['Brentford' 'Man United' 'Burnley' 'Chelsea' 'Everton' 'Leicester'
 'Watford' 'Norwich' 'Newcastle' 'Tottenham' 'Liverpool' 'Aston Villa'
 'Crystal Palace' 'Leeds' 'Man City' 'Brighton' 'Southampton' 'Wolves'
 'Arsenal' 'West Ham' 'Fulham' 'West Brom' 'Sheffield United'
 'Bournemouth' 'Huddersfield' 'Cardiff' 'Stoke' 'Swansea' 'Hull'
 'Middlesbrough' 'Sunderland' 'QPR' 'Reading' 'Wigan']
team_label : [ 3 17  5  7  9 14 29 20 19 28 15  1  8 13 16  4 24 33  0 31 10 30 23  2
 11  6 25 27 12 18 26 21 22 32]


In [9]:
dic = {}
for i in range(len(team_label)) :
    dic[team_name[i]] = team_label[i]

In [10]:
dic

{'Brentford': 3,
 'Man United': 17,
 'Burnley': 5,
 'Chelsea': 7,
 'Everton': 9,
 'Leicester': 14,
 'Watford': 29,
 'Norwich': 20,
 'Newcastle': 19,
 'Tottenham': 28,
 'Liverpool': 15,
 'Aston Villa': 1,
 'Crystal Palace': 8,
 'Leeds': 13,
 'Man City': 16,
 'Brighton': 4,
 'Southampton': 24,
 'Wolves': 33,
 'Arsenal': 0,
 'West Ham': 31,
 'Fulham': 10,
 'West Brom': 30,
 'Sheffield United': 23,
 'Bournemouth': 2,
 'Huddersfield': 11,
 'Cardiff': 6,
 'Stoke': 25,
 'Swansea': 27,
 'Hull': 12,
 'Middlesbrough': 18,
 'Sunderland': 26,
 'QPR': 21,
 'Reading': 22,
 'Wigan': 32}

In [11]:
total["FTR"].factorize()[0]

array([0, 0, 1, ..., 2, 0, 2])

In [12]:
total["labeled_FTR"] = total["FTR"].factorize()[0]

In [13]:
labeled_HT = []
for i in total["HomeTeam"] :
    labeled_HT.append(dic[i])

In [14]:
labeled_AT = []
for i in total["AwayTeam"] :
    labeled_AT.append(dic[i])

In [15]:
total["labeled_HomeTeam"] = labeled_HT
total["labeled_AwayTeam"] = labeled_AT

In [16]:
total.columns.values

array(['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A',
       'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA',
       'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'Season1', 'Season2',
       'labeled_FTR', 'labeled_HomeTeam', 'labeled_AwayTeam'],
      dtype=object)

In [17]:
total["Date"]

0       13/08/2021
1       14/08/2021
2       14/08/2021
3       14/08/2021
4       14/08/2021
           ...    
3523      19/05/13
3524      19/05/13
3525      19/05/13
3526      19/05/13
3527      19/05/13
Name: Date, Length: 3528, dtype: object

In [18]:
d = []
m = []

for t,i in enumerate(total["Date"]) :
    d.append(int(i[0:2]))
    m.append(int(i[3:5]))

In [19]:
total["Day"] = d
total["Month"] = m

In [20]:
total

Unnamed: 0,Date,HomeTeam,AwayTeam,FTR,B365H,B365D,B365A,BWH,BWD,BWA,...,VCH,VCD,VCA,Season1,Season2,labeled_FTR,labeled_HomeTeam,labeled_AwayTeam,Day,Month
0,13/08/2021,Brentford,Arsenal,H,4.00,3.40,1.95,4.00,3.50,1.95,...,4.10,3.40,2.00,21,22,0,3,0,13,8
1,14/08/2021,Man United,Leeds,H,1.53,4.50,5.75,1.53,4.50,5.75,...,1.55,4.40,6.00,21,22,0,17,13,14,8
2,14/08/2021,Burnley,Brighton,A,3.10,3.10,2.45,3.20,3.10,2.40,...,3.13,3.10,2.45,21,22,1,5,4,14,8
3,14/08/2021,Chelsea,Crystal Palace,H,1.25,5.75,13.00,1.28,5.75,10.50,...,1.25,5.75,13.00,21,22,0,7,8,14,8
4,14/08/2021,Everton,Southampton,H,1.90,3.50,4.00,1.95,3.50,3.90,...,1.95,3.40,4.10,21,22,0,9,24,14,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3523,19/05/13,Swansea,Fulham,A,1.80,3.80,4.75,1.72,3.75,4.50,...,1.83,3.80,4.60,12,13,1,27,10,19,5
3524,19/05/13,Tottenham,Sunderland,H,1.29,6.00,12.00,1.22,6.00,12.00,...,1.30,6.00,11.50,12,13,0,28,26,19,5
3525,19/05/13,West Brom,Man United,D,4.50,3.90,1.83,4.50,3.60,1.75,...,4.20,3.90,1.85,12,13,2,30,17,19,5
3526,19/05/13,West Ham,Reading,H,1.67,4.00,5.75,1.60,3.80,5.50,...,1.70,4.00,5.00,12,13,0,31,22,19,5


## Make useable csv file

In [21]:
if not os.path.isdir("./data/processed") :
    os.mkdir("./data/processed")

In [22]:
total = total[['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A',
       'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA',
       'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'Season1', 'Season2', 'labeled_HomeTeam', 
       'labeled_AwayTeam', 'Day','Month','labeled_FTR']]

In [23]:
total.to_csv("./data/processed/total.csv")

In [24]:
total.columns.values

array(['Date', 'HomeTeam', 'AwayTeam', 'FTR', 'B365H', 'B365D', 'B365A',
       'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA',
       'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'Season1', 'Season2',
       'labeled_HomeTeam', 'labeled_AwayTeam', 'Day', 'Month',
       'labeled_FTR'], dtype=object)

In [75]:
# numeric_col = ['B365H', 'B365D', 'B365A',
#        'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA',
#        'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'Season1', 'Season2', 
#        'labeled_HomeTeam', 'labeled_AwayTeam', 'Day','Month','labeled_FTR']
numeric_col = ['labeled_HomeTeam', 'labeled_AwayTeam','Month','B365H', 'B365D', 'B365A',
       'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD', 'PSA',
       'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'labeled_FTR']

In [76]:
numeric_total = total[numeric_col]

In [77]:
numeric_total.to_csv("./data/processed/numeric_total.csv")
print(numeric_total.shape)
print(numeric_total.columns.values)

(3528, 22)
['labeled_HomeTeam' 'labeled_AwayTeam' 'Month' 'B365H' 'B365D' 'B365A'
 'BWH' 'BWD' 'BWA' 'IWH' 'IWD' 'IWA' 'PSH' 'PSD' 'PSA' 'WHH' 'WHD' 'WHA'
 'VCH' 'VCD' 'VCA' 'labeled_FTR']


## Make train & test data

In [78]:
train_data = numeric_total.sample(frac=0.8)
test_data = numeric_total.drop(train_data.index)

test_data.shape

(706, 22)

In [79]:
train_data.to_csv("./data/processed/numeric_train.csv")
test_data.to_csv("./data/processed/numeric_test.csv")

In [80]:
train_data.iloc[0,:-1]
train_data.shape

(2822, 22)

### below code is useless just for practice

In [31]:
s = train_data.iloc[0,:-1].to_numpy(dtype=float)
s
ss = np.expand_dims(s, axis=1)
ss.dtype

dtype('float64')

In [32]:
from torchvision import transforms

trans = transforms.Compose([
                            transforms.ToTensor(),
                           ])
ss = trans(ss)

In [33]:
ss = ss.float()
ss.dtype

torch.float32

In [34]:
d = train_data.iloc[0,-1]
d = np.float64(d)
d.dtype

dtype('float64')

In [35]:
train_data = torch.from_numpy(np_train_data).float()

NameError: name 'np_train_data' is not defined

In [None]:
import torch

In [None]:
ss.type(torch.LongTensor).dtype

In [48]:
tmp = pd.read_csv(f"./data/EPL/19_20.csv")

In [49]:
tmp.shape

(380, 106)

In [50]:
tmp.columns.values

array(['Div', 'Date', 'Time', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG',
       'FTR', 'HTHG', 'HTAG', 'HTR', 'Referee', 'HS', 'AS', 'HST', 'AST',
       'HF', 'AF', 'HC', 'AC', 'HY', 'AY', 'HR', 'AR', 'B365H', 'B365D',
       'B365A', 'BWH', 'BWD', 'BWA', 'IWH', 'IWD', 'IWA', 'PSH', 'PSD',
       'PSA', 'WHH', 'WHD', 'WHA', 'VCH', 'VCD', 'VCA', 'MaxH', 'MaxD',
       'MaxA', 'AvgH', 'AvgD', 'AvgA', 'B365>2.5', 'B365<2.5', 'P>2.5',
       'P<2.5', 'Max>2.5', 'Max<2.5', 'Avg>2.5', 'Avg<2.5', 'AHh',
       'B365AHH', 'B365AHA', 'PAHH', 'PAHA', 'MaxAHH', 'MaxAHA', 'AvgAHH',
       'AvgAHA', 'B365CH', 'B365CD', 'B365CA', 'BWCH', 'BWCD', 'BWCA',
       'IWCH', 'IWCD', 'IWCA', 'PSCH', 'PSCD', 'PSCA', 'WHCH', 'WHCD',
       'WHCA', 'VCCH', 'VCCD', 'VCCA', 'MaxCH', 'MaxCD', 'MaxCA', 'AvgCH',
       'AvgCD', 'AvgCA', 'B365C>2.5', 'B365C<2.5', 'PC>2.5', 'PC<2.5',
       'MaxC>2.5', 'MaxC<2.5', 'AvgC>2.5', 'AvgC<2.5', 'AHCh', 'B365CAHH',
       'B365CAHA', 'PCAHH', 'PCAHA', 'MaxCAHH', 'MaxCAHA', 'Av