In [1]:
# for all the *.csv files in ../data/raw folder, read them and save them as a pandas dataframe
# then concat all the dataframes into one big dataframe
# finally save the big dataframe as a *.csv file in ../data/processed folder
import pandas as pd
import glob

# glob.glob returns paths in arbitrary, filesystem-dependent order. Sorting
# makes the row order of the combined frame (and therefore every seeded
# train/test split derived from it) identical on every machine.
raw_csv_paths = sorted(glob.glob("../data/raw/*.csv"))
if not raw_csv_paths:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which does not point at the real problem (wrong working directory or
    # missing download).
    raise FileNotFoundError("No *.csv files found in ../data/raw")

# One dataframe per season file, then a single frame with a fresh 0..n-1 index.
dfs = [pd.read_csv(csv_path) for csv_path in raw_csv_paths]
df_buli_raw = pd.concat(dfs, ignore_index=True)

In [2]:
# Normalise the 'Date' strings so that every value uses a four-digit year:
# 26/09/2020 stays as it is, 14/08/15 becomes 14/08/2015.
# Work on a copy so df_buli_raw is left untouched and this cell can be re-run.
df_buli_preprocessed = df_buli_raw.copy()


def _expand_two_digit_year(date_text):
    """Return ``date_text`` as dd/mm/yyyy, adding "20" in front of a two-digit year.

    Non-string values (for example NaN coming from an empty CSV row) are
    returned unchanged instead of raising AttributeError on ``.split``.
    """
    if not isinstance(date_text, str):
        return date_text
    year_part = date_text.split("/")[-1]
    if len(year_part) == 4:
        return date_text
    # Two-digit year: every such value is treated as belonging to the 2000s.
    return date_text[:-2] + "20" + date_text[-2:]


df_buli_preprocessed['Date'] = df_buli_preprocessed['Date'].apply(_expand_two_digit_year)

In [3]:
# Turn the dd/mm/yyyy strings of the 'Date' column into pandas datetime values.
parsed_match_dates = pd.to_datetime(
    df_buli_preprocessed['Date'],
    format='%d/%m/%Y',
)
df_buli_preprocessed['Date'] = parsed_match_dates

# Translation dict for columns: maps the short column codes of the raw CSV
# files to human-readable column names. It is passed to DataFrame.rename below.
#
# NOTE: some codes are aliases that map to the SAME readable name
# (FTHG/HG, FTAG/AG, FTR/Res, PSH/PH, PSD/PD, PSA/PA). If a dataframe contains
# both codes of such a pair, renaming produces two columns with an identical label.
translation_dict = {
    # --- match metadata and results ---
    "Div": "League Division",
    "Date": "Match Date (yyyy/mm/dd)",
    "Time": "Time of match kick off",
    "HomeTeam": "Home Team",
    "AwayTeam": "Away Team",
    "FTHG": "Full Time Home Team Goals",
    "HG": "Full Time Home Team Goals",
    "FTAG": "Full Time Away Team Goals",
    "AG": "Full Time Away Team Goals",
    "FTR": "Full Time Result (H=Home Win, D=Draw, A=Away Win)",
    "Res": "Full Time Result (H=Home Win, D=Draw, A=Away Win)",
    "HTHG": "Half Time Home Team Goals",
    "HTAG": "Half Time Away Team Goals",
    "HTR": "Half Time Result (H=Home Win, D=Draw, A=Away Win)",
    # --- match statistics ---
    "Attendance": "Crowd Attendance",
    "Referee": "Match Referee",
    "HS": "Home Team Shots",
    "AS": "Away Team Shots",
    "HST": "Home Team Shots on Target",
    "AST": "Away Team Shots on Target",
    "HHW": "Home Team Hit Woodwork",
    "AHW": "Away Team Hit Woodwork",
    "HC": "Home Team Corners",
    "AC": "Away Team Corners",
    "HF": "Home Team Fouls Committed",
    "AF": "Away Team Fouls Committed",
    "HFKC": "Home Team Free Kicks Conceded",
    "AFKC": "Away Team Free Kicks Conceded",
    "HO": "Home Team Offsides",
    "AO": "Away Team Offsides",
    "HY": "Home Team Yellow Cards",
    "AY": "Away Team Yellow Cards",
    "HR": "Home Team Red Cards",
    "AR": "Away Team Red Cards",
    "HBP": "Home Team Bookings Points (10 = yellow, 25 = red)",
    "ABP": "Away Team Bookings Points (10 = yellow, 25 = red)",
    # --- home / draw / away odds per bookmaker ---
    "B365H": "Bet365 home win odds",
    "B365D": "Bet365 draw odds",
    "B365A": "Bet365 away win odds",
    "BSH": "Blue Square home win odds",
    "BSD": "Blue Square draw odds",
    "BSA": "Blue Square away win odds",
    "BWH": "Bet&Win home win odds",
    "BWD": "Bet&Win draw odds",
    "BWA": "Bet&Win away win odds",
    "GBH": "Gamebookers home win odds",
    "GBD": "Gamebookers draw odds",
    "GBA": "Gamebookers away win odds",
    "IWH": "Interwetten home win odds",
    "IWD": "Interwetten draw odds",
    "IWA": "Interwetten away win odds",
    "LBH": "Ladbrokes home win odds",
    "LBD": "Ladbrokes draw odds",
    "LBA": "Ladbrokes away win odds",
    "PSH": "Pinnacle home win odds",
    "PH": "Pinnacle home win odds",
    "PSD": "Pinnacle draw odds",
    "PD": "Pinnacle draw odds",
    "PSA": "Pinnacle away win odds",
    "PA": "Pinnacle away win odds",
    "SOH": "Sporting Odds home win odds",
    "SOD": "Sporting Odds draw odds",
    "SOA": "Sporting Odds away win odds",
    "SBH": "Sportingbet home win odds",
    "SBD": "Sportingbet draw odds",
    "SBA": "Sportingbet away win odds",
    "SJH": "Stan James home win odds",
    "SJD": "Stan James draw odds",
    "SJA": "Stan James away win odds",
    "SYH": "Stanleybet home win odds",
    "SYD": "Stanleybet draw odds",
    "SYA": "Stanleybet away win odds",
    "VCH": "VC Bet home win odds",
    "VCD": "VC Bet draw odds",
    "VCA": "VC Bet away win odds",
    "WHH": "William Hill home win odds",
    "WHD": "William Hill draw odds",
    "WHA": "William Hill away win odds",
    # --- aggregated home / draw / away odds (Betbrain and market) ---
    "Bb1X2": "Number of BetBrain bookmakers used to calculate match odds averages and maximums",
    "BbMxH": "Betbrain maximum home win odds",
    "BbAvH": "Betbrain average home win odds",
    "BbMxD": "Betbrain maximum draw odds",
    "BbAvD": "Betbrain average draw odds",
    "BbMxA": "Betbrain maximum away win odds",
    "BbAvA": "Betbrain average away win odds",
    "MaxH": "Market maximum home win odds",
    "MaxD": "Market maximum draw win odds",
    "MaxA": "Market maximum away win odds",
    "AvgH": "Market average home win odds",
    "AvgD": "Market average draw win odds",
    "AvgA": "Market average away win odds",
    # --- over / under 2.5 goals (these readable names do not contain the word "odds") ---
    "BbOU": "Number of BetBrain bookmakers used to calculate over/under 2.5 goals averages and maximums",
    "BbMx>2.5": "Betbrain maximum over 2.5 goals",
    "BbAv>2.5": "Betbrain average over 2.5 goals",
    "BbMx<2.5": "Betbrain maximum under 2.5 goals",
    "BbAv<2.5": "Betbrain average under 2.5 goals",
    "GB>2.5": "Gamebookers over 2.5 goals",
    "GB<2.5": "Gamebookers under 2.5 goals",
    "B365>2.5": "Bet365 over 2.5 goals",
    "B365<2.5": "Bet365 under 2.5 goals",
    "P>2.5": "Pinnacle over 2.5 goals",
    "P<2.5": "Pinnacle under 2.5 goals",
    "Max>2.5": "Market maximum over 2.5 goals",
    "Max<2.5": "Market maximum under 2.5 goals",
    "Avg>2.5": "Market average over 2.5 goals",
    "Avg<2.5": "Market average under 2.5 goals",
    # --- Asian handicap ---
    "BbAH": "Number of BetBrain bookmakers used to calculate Asian handicap averages and maximums",
    "BbAHh": "Betbrain size of handicap (home team)",
    "AHh": "Market size of handicap (home team)",
    "BbMxAHH": "Betbrain maximum Asian handicap home team odds",
    "BbAvAHH": "Betbrain average Asian handicap home team odds",
    "BbMxAHA": "Betbrain maximum Asian handicap away team odds",
    "BbAvAHA": "Betbrain average Asian handicap away team odds",
    "GBAHH": "Gamebookers Asian handicap home team odds",
    "GBAHA": "Gamebookers Asian handicap away team odds",
    "GBAH": "Gamebookers size of handicap (home team)",
    "LBAHH": "Ladbrokes Asian handicap home team odds",
    "LBAHA": "Ladbrokes Asian handicap away team odds",
    "LBAH": "Ladbrokes size of handicap (home team)",
    "B365AHH": "Bet365 Asian handicap home team odds",
    "B365AHA": "Bet365 Asian handicap away team odds",
    "B365AH": "Bet365 size of handicap (home team)",
    "PAHH": "Pinnacle Asian handicap home team odds",
    "PAHA": "Pinnacle Asian handicap away team odds",
    "MaxAHH": "Market maximum Asian handicap home team odds",
    "MaxAHA": "Market maximum Asian handicap away team odds",
    "AvgAHH": "Market average Asian handicap home team odds",
    "AvgAHA": "Market average Asian handicap away team odds"
}
# Replace the short CSV column codes with their readable names.
df_buli_preprocessed = df_buli_preprocessed.rename(columns=translation_dict)

# translation_dict maps several alias codes (e.g. "FTHG" and "HG") to the same
# readable name. When both aliases exist in the concatenated data, the rename
# leaves two columns with one label, and df[label] then returns a DataFrame
# that pd.to_numeric cannot handle. Collapse such columns into a single one,
# taking the first non-missing value per row. This is a no-op when no label is
# duplicated.
duplicated_labels = df_buli_preprocessed.columns[df_buli_preprocessed.columns.duplicated()].unique()
for label in duplicated_labels:
    same_label_columns = df_buli_preprocessed.loc[:, label]
    merged_column = same_label_columns.iloc[:, 0]
    for position in range(1, same_label_columns.shape[1]):
        merged_column = merged_column.fillna(same_label_columns.iloc[:, position])
    # Re-insert the merged column where the first duplicate used to be.
    first_position = list(df_buli_preprocessed.columns).index(label)
    df_buli_preprocessed = df_buli_preprocessed.drop(columns=label)
    df_buli_preprocessed.insert(first_position, label, merged_column)

# Convert all the odds to float; values that cannot be parsed become NaN.
odds_columns = [col for col in df_buli_preprocessed.columns if "odds" in col]
for col in odds_columns:
    df_buli_preprocessed[col] = pd.to_numeric(df_buli_preprocessed[col], errors='coerce')
df_buli_preprocessed

Unnamed: 0,League Division,Match Date (yyyy/mm/dd),Home Team,Away Team,Full Time Home Team Goals,Full Time Away Team Goals,"Full Time Result (H=Home Win, D=Draw, A=Away Win)",Half Time Home Team Goals,Half Time Away Team Goals,"Half Time Result (H=Home Win, D=Draw, A=Away Win)",...,AvgCAHA,Stan James home win odds,Stan James draw odds,Stan James away win odds,Gamebookers home win odds,Gamebookers draw odds,Gamebookers away win odds,Blue Square home win odds,Blue Square draw odds,Blue Square away win odds
0,D1,2015-08-14,Bayern Munich,Hamburg,5,0,H,1,0,H,...,,,,,,,,,,
1,D1,2015-08-15,Augsburg,Hertha,0,1,A,0,0,D,...,,,,,,,,,,
2,D1,2015-08-15,Darmstadt,Hannover,2,2,D,1,0,H,...,,,,,,,,,,
3,D1,2015-08-15,Dortmund,M'gladbach,4,0,H,3,0,H,...,,,,,,,,,,
4,D1,2015-08-15,Leverkusen,Hoffenheim,2,1,H,1,1,D,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3613,D1,2020-06-27,Leverkusen,Mainz,1,0,H,1,0,H,...,1.89,,,,,,,,,
3614,D1,2020-06-27,M'gladbach,Hertha,2,1,H,1,0,H,...,1.95,,,,,,,,,
3615,D1,2020-06-27,Union Berlin,Fortuna Dusseldorf,3,0,H,1,0,H,...,2.07,,,,,,,,,
3616,D1,2020-06-27,Werder Bremen,FC Koln,6,1,H,3,0,H,...,1.98,,,,,,,,,


In [17]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Features: one-hot encoding of the 'Home Team' and 'Away Team' columns of
# df_buli_preprocessed.
encoder = OneHotEncoder()
team_name_columns = df_buli_preprocessed[['Home Team', 'Away Team']]
teams_encoded = encoder.fit_transform(team_name_columns)

# Targets: full-time goals of each side as plain arrays.
y_home_goals = df_buli_preprocessed['Full Time Home Team Goals'].to_numpy()
y_away_goals = df_buli_preprocessed['Full Time Away Team Goals'].to_numpy()

# Both targets go through the same split call as the features, so the three
# arrays are split in parallel (80 % train / 20 % test, fixed seed).
# Whether one two-output model or two separate models works better is left open.
(
    X_train, X_test,
    y_home_train, y_home_test,
    y_away_train, y_away_test,
) = train_test_split(
    teams_encoded,
    y_home_goals,
    y_away_goals,
    test_size=0.2,
    random_state=42,
)

# Depending on the chosen model, X_train / X_test may still need to be
# normalized or standardized.


In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split

# One-hot encode the 'Home Team' and 'Away Team' columns of df_buli_preprocessed.
team_dummies = pd.get_dummies(df_buli_preprocessed[['Home Team', 'Away Team']])

# The encoded team names are the only features. (The previous version first
# built a frame that also contained the match date and then immediately
# overwrote it; that dead assignment has been removed.)
X = team_dummies

# Targets: full-time goals of the home and the away side.
y_home_goals = df_buli_preprocessed['Full Time Home Team Goals']
y_away_goals = df_buli_preprocessed['Full Time Away Team Goals']

# Both targets side by side, so that one model can predict them together.
y = pd.concat([y_home_goals, y_away_goals], axis=1)

# Split the data into training and testing sets (80 % / 20 %, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
# Display the training features for a visual check.
X_train

Unnamed: 0,Home Team_Augsburg,Home Team_Bayern Munich,Home Team_Bielefeld,Home Team_Bochum,Home Team_Braunschweig,Home Team_Darmstadt,Home Team_Dortmund,Home Team_Ein Frankfurt,Home Team_FC Koln,Home Team_Fortuna Dusseldorf,...,Away Team_M'gladbach,Away Team_Mainz,Away Team_Nurnberg,Away Team_Paderborn,Away Team_RB Leipzig,Away Team_Schalke 04,Away Team_Stuttgart,Away Team_Union Berlin,Away Team_Werder Bremen,Away Team_Wolfsburg
1965,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1670,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
1320,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1018,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3153,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1294,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
860,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,False
3507,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False


In [29]:
# Display the training targets for a visual check.
y_train

Unnamed: 0,Full Time Home Team Goals,Full Time Away Team Goals
1965,1,0
1670,1,1
1320,1,3
1018,3,0
3153,2,2
...,...,...
1130,2,2
1294,1,1
860,1,1
3507,1,4


In [30]:
from keras.models import Sequential
from keras.layers import Dense
import numpy as np

# Local import: the explicit Input layer replaces the deprecated `input_dim`
# argument on the first Dense layer (which triggers a Keras UserWarning).
from keras.layers import Input

# Number of input features = number of one-hot columns in the training data.
input_features = X_train.shape[1]

# Model architecture: two hidden ReLU layers and a two-unit linear output
# (one unit per target column: home goals and away goals).
model = Sequential()
model.add(Input(shape=(input_features,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='linear'))  # linear output for regression

# Compile the model with mean squared error as the regression loss.
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model; 10 % of the training data is held out for validation.
model.fit(X_train, y_train, epochs=100, batch_size=10, validation_split=0.1)

# Evaluate the model on the test split.
loss = model.evaluate(X_test, y_test)


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 597us/step - loss: 2.4884 - val_loss: 1.5997
Epoch 2/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 432us/step - loss: 1.5015 - val_loss: 1.5946
Epoch 3/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 435us/step - loss: 1.4391 - val_loss: 1.5775
Epoch 4/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 399us/step - loss: 1.4520 - val_loss: 1.5772
Epoch 5/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 377us/step - loss: 1.4461 - val_loss: 1.5806
Epoch 6/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 381us/step - loss: 1.4595 - val_loss: 1.5910
Epoch 7/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 388us/step - loss: 1.3915 - val_loss: 1.5729
Epoch 8/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 400us/step - loss: 1.3940 - val_loss: 1.6082
Epoch 9/100
[1m261/261[0m 

In [31]:
import numpy as np

# One-hot columns of the fixture to predict: Dortmund (home) vs Bayern Munich (away).
home_column = 'Home Team_Dortmund'
away_column = 'Away Team_Bayern Munich'

# Assigning to a label that is not an existing column would silently append a
# new column and change the width of the model input, so fail loudly on a
# misspelled or unknown team instead.
missing_columns = [
    column for column in (home_column, away_column)
    if column not in team_dummies.columns
]
if missing_columns:
    raise KeyError(f"Unknown one-hot column(s): {missing_columns}")

# Single all-zero row with the same columns as team_dummies, then switch on
# the two columns of the fixture.
match_features = pd.DataFrame(0, index=np.arange(1), columns=team_dummies.columns)
match_features[home_column] = 1
match_features[away_column] = 1

# Plain NumPy array of shape (1, number of one-hot columns) for model.predict.
match_input = match_features.values


In [32]:
# Display the encoded fixture row that is fed to the model.
match_input

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [33]:
def _as_goal_count(raw_prediction):
    """Round a raw model output to a whole, non-negative number of goals.

    The output layer is linear, so a raw value can be below zero; formatting
    such a value with ``:.0f`` would print "-0" or "-1" goals.
    """
    return max(0, int(round(float(raw_prediction))))


# Predict the result; the model returns one row of [home_goals, away_goals].
predicted_scores = model.predict(match_input)

home_goals_predicted = predicted_scores[0][0]
away_goals_predicted = predicted_scores[0][1]

print(
    f"Predicted Score: Bor. Dortmund {_as_goal_count(home_goals_predicted)}"
    f" - {_as_goal_count(away_goals_predicted)} Bayern Munich"
)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Predicted Score: Bor. Dortmund 1 - 2 Bayern Munich


In [None]:
# Show the first training sample. When X_train is a DataFrame, X_train[0]
# is a column lookup for the label 0 and raises KeyError, so select the row by
# position; any other container is indexed directly.
X_train.iloc[0] if hasattr(X_train, "iloc") else X_train[0]

In [6]:
from keras.models import Sequential
from keras.layers import Dense
import numpy as np

# Local import: the explicit Input layer replaces the deprecated `input_dim`
# argument on the first Dense layer (which triggers a Keras UserWarning).
from keras.layers import Input

# Number of input features = number of columns in the encoded training data.
input_features = X_train.shape[1]

# Model architecture: two hidden ReLU layers and a two-unit linear output
# (one unit per target column: home goals and away goals).
model = Sequential()
model.add(Input(shape=(input_features,)))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='linear'))  # linear output for regression

# Compile the model with mean squared error as the regression loss.
model.compile(optimizer='adam', loss='mean_squared_error')

# Prepare the targets as a single array with 2 columns: home goals and away goals.
y_train = np.column_stack((y_home_train, y_away_train))
y_test = np.column_stack((y_home_test, y_away_test))

# X_train and the y_*_train arrays may come from different cells of this
# notebook; make sure they at least describe the same number of matches before
# training on them.
assert X_train.shape[0] == y_train.shape[0], "X_train and y_train have different row counts"
assert X_test.shape[0] == y_test.shape[0], "X_test and y_test have different row counts"

# Train the model; 10 % of the training data is held out for validation.
model.fit(X_train, y_train, epochs=100, batch_size=10, validation_split=0.1)

# Evaluate the model on the test split.
loss = model.evaluate(X_test, y_test)


Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 847us/step - loss: 2.3569 - val_loss: 1.6477
Epoch 2/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 499us/step - loss: 1.5708 - val_loss: 1.5813
Epoch 3/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 458us/step - loss: 1.4972 - val_loss: 1.5779
Epoch 4/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 502us/step - loss: 1.4313 - val_loss: 1.5817
Epoch 5/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.3926 - val_loss: 1.5891
Epoch 6/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 554us/step - loss: 1.4244 - val_loss: 1.6032
Epoch 7/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498us/step - loss: 1.4430 - val_loss: 1.5980
Epoch 8/100
[1m261/261[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 475us/step - loss: 1.3679 - val_loss: 1.6093
Epoch 9/100
[1m261/261[0m [

In [15]:
# Spot-check a single entry (row 0, column 4) of the encoded team matrix.
teams_encoded[0,4]

0.0