Importing the Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

Importing Match, Odds and Current Form Data

In [None]:
import pandas as pd


from google.colab import files
# Ask Colab for a file upload; the keys of the returned mapping are used as
# the names of the uploaded files.
uploaded = files.upload()

# Only the first uploaded file is read.
uploaded_names = list(uploaded.keys())
file_name = uploaded_names[0]
df = pd.read_csv(file_name)

# Preview(dataset)
print("Dataset preview:", df.head(), sep="\n")


Saving starting_dataset.csv to starting_dataset.csv
Dataset preview:
   Matches        Date   HomeTeam    AwayTeam  FTHG  FTAG FTR  MW  HTGS  ATGS  \
0        0  2002-08-17  Blackburn  Sunderland     0     0   D   1     0     0   
1        1  2002-08-17   Charlton     Chelsea     2     3   A   1     0     0   
2        2  2002-08-17    Everton   Tottenham     2     2   D   1     0     0   
3        3  2002-08-17     Fulham      Bolton     4     1   H   1     0     0   
4        4  2002-08-17      Leeds    Man City     3     0   H   1     0     0   

   ...  IWD  IWA    LBH   LBD   LBA    WHH    WHD    WHA  Home Win % (Before)  \
0  ...  3.1  3.8  1.615  3.25  5.00  1.660  3.465  4.500                  0.0   
1  ...  3.0  2.2  2.800  3.20  2.20  2.750  3.100  2.415                  0.0   
2  ...  3.0  2.7  2.250  3.20  2.75  2.300  3.255  2.750                  0.0   
3  ...  3.1  3.8  1.833  3.20  3.75  1.806  3.200  4.330                  0.0   
4  ...  3.2  4.2  1.615  3.50  4.50  1.

Calculating Head-To-Head Data

In [None]:
# Parse the match dates once so the loop can compare timestamps directly
# instead of re-parsing the Date column on every iteration.
df['Date'] = pd.to_datetime(df['Date'])

home_col = df['HomeTeam']
away_col = df['AwayTeam']
result_col = df['FTR']
date_col = df['Date']

# Head-to-head win percentages, one entry per row of df (in row order).
home_win_pct = []
away_win_pct = []

for home_team, away_team, match_date in zip(home_col, away_col, date_col):
    # Earlier meetings of the same two clubs, regardless of which one hosted.
    same_pairing = (
        ((home_col == home_team) & (away_col == away_team)) |
        ((home_col == away_team) & (away_col == home_team))
    )
    earlier = same_pairing & (date_col < match_date)
    total_matches = int(earlier.sum())

    if total_matches == 0:
        # No shared history before this match: both percentages stay at 0.0.
        home_win_pct.append(0.0)
        away_win_pct.append(0.0)
        continue

    # A side won an earlier meeting if it hosted and FTR == 'H',
    # or visited and FTR == 'A'.
    home_wins = (earlier & (
        ((home_col == home_team) & (result_col == 'H')) |
        ((away_col == home_team) & (result_col == 'A'))
    )).sum()
    away_wins = (earlier & (
        ((home_col == away_team) & (result_col == 'H')) |
        ((away_col == away_team) & (result_col == 'A'))
    )).sum()

    home_win_pct.append((home_wins / total_matches) * 100)
    away_win_pct.append((away_wins / total_matches) * 100)

# Write both columns in one go rather than cell by cell.
df['Home Win % (Before)'] = home_win_pct
df['Away Win % (Before)'] = away_win_pct


updated_file_name = "updated_final_dataset_with_win_percentages.csv"
df.to_csv(updated_file_name, index=False)
print(f"Updated dataset saved as '{updated_file_name}'.")

files.download(updated_file_name)


Updated dataset saved as 'updated_final_dataset_with_win_percentages.csv'.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Softmax Regression without bookie odds

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


file_path = 'updated_final_dataset_with_win_percentages.csv'
df = pd.read_csv(file_path)

# Encode the full-time result (FTR) as an integer class label.
df['FTR_encoded'] = df['FTR'].map({'H': 0, 'D': 1, 'A': 2})

# Convert the form strings (HTFormPtsStr, ATFormPtsStr) to point totals:
# 3 per 'W', 1 per 'D', 0 for anything else. Non-string cells (e.g. NaN read
# from an empty CSV field) score 0 instead of raising TypeError.
for form_column in ('HTFormPtsStr', 'ATFormPtsStr'):
    df[form_column] = df[form_column].apply(
        lambda x: sum(3 if ch == 'W' else 1 if ch == 'D' else 0 for ch in x) if isinstance(x, str) else 0
    )

# Define features
features = [
    'Home Win % (Before)', 'Away Win % (Before)',
    'HTFormPts', 'ATFormPts',
    'HTWinStreak3', 'HTWinStreak5', 'HTLossStreak3', 'HTLossStreak5',
    'ATWinStreak3', 'ATWinStreak5', 'ATLossStreak3', 'ATLossStreak5',
    'HTGC', 'ATGC',
    'HTFormPtsStr', 'ATFormPtsStr',  # Recent form strings
    'HomeTeamLP', 'AwayTeamLP',      # League positions
    'DiffLP',                        # Difference in league positions
    'HTP', 'ATP',                    # Team points
    'HTGD', 'ATGD',                  # Goal differences
    'DiffPts', 'DiffFormPts'         # Differences in points and form points
]

# target variable
target = 'FTR_encoded'


X = df[features]
y = df[target]

# Stratified split: 20% held out, class proportions preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize using statistics from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Softmax Regression model
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm against the installed version before upgrading.
softmax_model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=500,
    C=0.5,
    random_state=42
)
softmax_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_train = softmax_model.predict(X_train_scaled)
y_pred_test = softmax_model.predict(X_test_scaled)

# Evaluate model (accuracies expressed in percent)
train_accuracy = accuracy_score(y_train, y_pred_train) * 100
test_accuracy = accuracy_score(y_test, y_pred_test) * 100


# Human-readable names for the encoded classes present in y.
class_mapping = {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}
target_names = [class_mapping[cls] for cls in sorted(y.unique())]
classification_rep = classification_report(y_test, y_pred_test, target_names=target_names)

# Bookmaker accuracy calculation
def calculate_bookie_accuracy(df, bookies, index):
    """Return each bookmaker's favourite-pick accuracy (in percent) on the given rows.

    For every prefix in ``bookies`` the outcome with the lowest odds among
    ``<prefix>H``, ``<prefix>D`` and ``<prefix>A`` is taken as the bookmaker's
    prediction and compared with the ``FTR`` column.

    Rows where any of the three odds is missing are left out of that
    bookmaker's calculation instead of being scored as wrong predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the odds columns and ``FTR``.
    bookies : iterable of str
        Bookmaker column prefixes, e.g. ``'B365'``.
    index : index-like
        Row labels (passed to ``df.loc``) to evaluate.

    Returns
    -------
    dict
        Prefix -> accuracy in percent; NaN when no selected row has complete odds.
    """
    accuracies = {}
    for bookie in bookies:
        # Column order H, D, A matters: idxmin returns the first minimum on ties.
        outcome_by_column = {f"{bookie}H": 'H', f"{bookie}D": 'D', f"{bookie}A": 'A'}
        odds_columns = list(outcome_by_column)
        priced = df.loc[index, odds_columns + ['FTR']].dropna(subset=odds_columns)
        if priced.empty:
            accuracies[bookie] = float('nan')
            continue
        bookie_predictions = priced[odds_columns].idxmin(axis=1).map(outcome_by_column)
        # Calculate accuracy against the actual results (FTR)
        accuracies[bookie] = (bookie_predictions == priced['FTR']).mean() * 100
    return accuracies

# Bookmaker baselines on exactly the rows used for training and for testing.
bookmakers = ['B365', 'LB', 'IW', 'WH']
bookie_accuracies_train = calculate_bookie_accuracy(df, bookmakers, X_train.index)
bookie_accuracies_test = calculate_bookie_accuracy(df, bookmakers, X_test.index)


# Model metrics first, then one section per bookmaker baseline.
print(f"Training Accuracy (Softmax): {train_accuracy:.2f}%")
print(f"Testing Accuracy (Softmax): {test_accuracy:.2f}%")
print("\nClassification Report (Softmax):")
print(classification_rep)

for heading, accuracies in (
    ("\nBookmaker Accuracies (Training Data):", bookie_accuracies_train),
    ("\nBookmaker Accuracies (Test Data):", bookie_accuracies_test),
):
    print(heading)
    for bookie, accuracy in accuracies.items():
        print(f"{bookie}: {accuracy:.2f}%")



Training Accuracy (Softmax): 53.54%
Testing Accuracy (Softmax): 53.67%

Classification Report (Softmax):
              precision    recall  f1-score   support

    Home Win       0.56      0.82      0.66       557
        Draw       0.44      0.01      0.03       303
    Away Win       0.49      0.54      0.51       340

    accuracy                           0.54      1200
   macro avg       0.50      0.46      0.40      1200
weighted avg       0.51      0.54      0.46      1200


Bookmaker Accuracies (Training Data):
B365: 54.31%
LB: 54.40%
IW: 54.33%
WH: 52.79%

Bookmaker Accuracies (Test Data):
B365: 53.83%
LB: 53.83%
IW: 54.08%
WH: 52.50%


## Softmax Regression with 3 bookie odds

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


file_path = 'updated_final_dataset_with_win_percentages.csv'
df = pd.read_csv(file_path)

# Encode the target column (FTR) into numerical values
df['FTR_encoded'] = df['FTR'].map({'H': 0, 'D': 1, 'A': 2})

# Convert the form strings (HTFormPtsStr, ATFormPtsStr) to point totals:
# 3 per 'W', 1 per 'D', 0 for anything else. Non-string cells (e.g. NaN read
# from an empty CSV field) score 0 instead of raising TypeError.
for form_column in ('HTFormPtsStr', 'ATFormPtsStr'):
    df[form_column] = df[form_column].apply(
        lambda x: sum(3 if ch == 'W' else 1 if ch == 'D' else 0 for ch in x) if isinstance(x, str) else 0
    )

# Define features
features = [
    'Home Win % (Before)', 'Away Win % (Before)',
    'HTFormPts', 'ATFormPts',
    'HTWinStreak3', 'HTWinStreak5', 'HTLossStreak3', 'HTLossStreak5',
    'ATWinStreak3', 'ATWinStreak5', 'ATLossStreak3', 'ATLossStreak5',
    'HTFormPtsStr', 'ATFormPtsStr',  # Recent form strings converted to numeric
    'HomeTeamLP', 'AwayTeamLP',      # League positions
    'DiffLP',                        # Difference in league positions
    'IWH', 'IWD', 'IWA',             # IW bookmaker odds
    'B365H', 'B365D', 'B365A',       # B365 bookmaker odds
    'LBH', 'LBD', 'LBA',             # LB bookmaker odds
    'HTP', 'ATP',                    # Team points
    'HTGD', 'ATGD',                  # Goal differences
    'DiffPts', 'DiffFormPts'         # Differences in points and form points
]
target = 'FTR_encoded'


X = df[features]
y = df[target]

# Drop rows whose result could not be encoded, keeping X aligned with y.
y = y.dropna()
X = X.loc[y.index]

# Stratified split: 20% held out, class proportions preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize the features using statistics from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Softmax Regression model
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases;
# confirm against the installed version before upgrading.
softmax_model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=500,
    C=0.5,
    random_state=42
)
softmax_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_train = softmax_model.predict(X_train_scaled)
y_pred_test = softmax_model.predict(X_test_scaled)

# Evaluate the model (accuracies expressed in percent)
train_accuracy = accuracy_score(y_train, y_pred_train) * 100
test_accuracy = accuracy_score(y_test, y_pred_test) * 100


# Human-readable names for the encoded classes present in y.
class_mapping = {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}
target_names = [class_mapping[cls] for cls in sorted(y.unique())]
classification_rep = classification_report(y_test, y_pred_test, target_names=target_names)

# Bookmaker accuracy calculation
def calculate_bookie_accuracy(df, bookies, index):
    """Compute, per bookmaker, how often the lowest-odds outcome matched ``FTR``.

    The prediction for a bookmaker prefix is whichever of ``<prefix>H``,
    ``<prefix>D`` and ``<prefix>A`` carries the smallest odds on a row.
    Rows missing any of those three odds are skipped for that bookmaker
    rather than being counted as incorrect predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the odds columns and ``FTR``.
    bookies : iterable of str
        Bookmaker column prefixes, e.g. ``'IW'``.
    index : index-like
        Row labels (passed to ``df.loc``) to evaluate.

    Returns
    -------
    dict
        Prefix -> accuracy in percent; NaN when no selected row has complete odds.
    """
    accuracies = {}
    for bookie in bookies:
        # Determine the bookmaker's prediction. Column order H, D, A matters:
        # idxmin returns the first minimum when odds are tied.
        label_for = {f"{bookie}H": 'H', f"{bookie}D": 'D', f"{bookie}A": 'A'}
        odds_columns = list(label_for)
        complete_rows = df.loc[index, odds_columns + ['FTR']].dropna(subset=odds_columns)
        if complete_rows.empty:
            accuracies[bookie] = float('nan')
            continue
        bookie_predictions = complete_rows[odds_columns].idxmin(axis=1).map(label_for)
        # Calculate accuracy against the actual results (FTR)
        accuracies[bookie] = (bookie_predictions == complete_rows['FTR']).mean() * 100
    return accuracies

# Bookmakers whose odds are model features, and the one held back for comparison.
bookmakers_model = ['B365', 'IW', 'LB']
bookmakers_excluded = ['WH']

bookie_accuracies_train = calculate_bookie_accuracy(df, bookmakers_model, X_train.index)
bookie_accuracies_test = calculate_bookie_accuracy(df, bookmakers_model, X_test.index)
bookie_accuracies_excluded_test = calculate_bookie_accuracy(df, bookmakers_excluded, X_test.index)


# Model metrics first, then one section per bookmaker baseline.
print(f"Training Accuracy (Softmax): {train_accuracy:.2f}%")
print(f"Testing Accuracy (Softmax): {test_accuracy:.2f}%")
print("\nClassification Report (Softmax):")
print(classification_rep)

for heading, accuracies in (
    ("\nBookmaker Accuracies (Training Data - Included in Model):", bookie_accuracies_train),
    ("\nBookmaker Accuracies (Test Data - Included in Model):", bookie_accuracies_test),
    ("\nBookmaker Accuracies (Test Data - Excluded from Model):", bookie_accuracies_excluded_test),
):
    print(heading)
    for bookie, accuracy in accuracies.items():
        print(f"{bookie}: {accuracy:.2f}%")



Training Accuracy (Softmax): 54.69%
Testing Accuracy (Softmax): 53.50%

Classification Report (Softmax):
              precision    recall  f1-score   support

    Home Win       0.54      0.85      0.66       557
        Draw       0.38      0.01      0.02       303
    Away Win       0.51      0.48      0.50       340

    accuracy                           0.54      1200
   macro avg       0.48      0.45      0.39      1200
weighted avg       0.49      0.54      0.45      1200


Bookmaker Accuracies (Training Data - Included in Model):
B365: 54.31%
IW: 54.33%
LB: 54.40%

Bookmaker Accuracies (Test Data - Included in Model):
B365: 53.83%
IW: 54.08%
LB: 53.83%

Bookmaker Accuracies (Test Data - Excluded from Model):
WH: 52.50%


## SVM without bookie odds





In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


file_path = 'updated_final_dataset_with_win_percentages.csv'
df = pd.read_csv(file_path)

# Integer class labels for the full-time result (FTR).
df['FTR_encoded'] = df['FTR'].map({'H': 0, 'D': 1, 'A': 2})

# Turn each form string into a point total (3 per 'W', 1 per 'D');
# non-string cells count as 0.
for form_column in ('HTFormPtsStr', 'ATFormPtsStr'):
    df[form_column] = df[form_column].apply(
        lambda form: sum(3 if ch == 'W' else 1 if ch == 'D' else 0 for ch in form)
        if isinstance(form, str) else 0
    )


numeric_columns = [
    'Home Win % (Before)', 'Away Win % (Before)',
    'HTGS', 'ATGS', 'HTGC', 'ATGC',
    'HTFormPts', 'ATFormPts',
    'HTWinStreak3', 'HTWinStreak5', 'HTLossStreak3', 'HTLossStreak5',
    'ATWinStreak3', 'ATWinStreak5', 'ATLossStreak3', 'ATLossStreak5',
    'HTFormPtsStr', 'ATFormPtsStr',
    'HomeTeamLP', 'AwayTeamLP',
    'DiffLP',
    'HTP', 'ATP',
    'HTGD', 'ATGD',
    'DiffPts', 'DiffFormPts',
]

# Every remaining NaN in the frame (not only the feature columns) becomes 0.
df = df.fillna(0)

# Feature matrix and target vector.
X = df[numeric_columns]
y = df['FTR_encoded']

# Hold out 20% of the rows, keeping class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale with statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a linear-kernel SVM.
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Predictions for both partitions.
y_pred_train = svm_model.predict(X_train_scaled)
y_pred_test = svm_model.predict(X_test_scaled)

# Accuracies expressed in percent.
train_accuracy = 100 * accuracy_score(y_train, y_pred_train)
test_accuracy = 100 * accuracy_score(y_test, y_pred_test)


# Display names for the encoded classes present in y (train and test together).
unique_classes = sorted(y.unique())
class_mapping = {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}
target_names = [class_mapping[label] for label in unique_classes]

# Per-class precision/recall/F1 on the held-out rows.
classification_rep = classification_report(y_test, y_pred_test, target_names=target_names)

# Bookmaker accuracy calculation for both train and test sets
def calculate_bookie_accuracy(df_subset, bookies):
    """Return a dict mapping each bookmaker prefix to its accuracy in percent.

    A bookmaker's prediction for a row is the outcome ('H', 'D' or 'A') whose
    odds column (``<prefix>H``, ``<prefix>D``, ``<prefix>A``) holds the lowest
    value; accuracy is the share of rows where that matches ``FTR``.
    """
    def _accuracy_for(prefix):
        # Keep the H, D, A order: idxmin returns the first minimum on ties.
        label_of = {f"{prefix}{suffix}": suffix for suffix in ('H', 'D', 'A')}
        favourite = df_subset[list(label_of)].idxmin(axis=1).map(label_of)
        return (favourite == df_subset['FTR']).mean() * 100

    return {bookie: _accuracy_for(bookie) for bookie in bookies}


# Row labels of each partition, used to slice the full frame for the baselines.
train_indices = X_train.index
test_indices = X_test.index

bookmakers = ['B365', 'LB', 'IW', 'WH']
bookie_accuracies_train = calculate_bookie_accuracy(df.loc[train_indices], bookmakers)
bookie_accuracies_test = calculate_bookie_accuracy(df.loc[test_indices], bookmakers)


# Model metrics first, then one section per bookmaker baseline.
print(f"Training Accuracy (SVM): {train_accuracy:.2f}%")
print(f"Testing Accuracy (SVM): {test_accuracy:.2f}%")
print("\nClassification Report (SVM):")
print(classification_rep)

for heading, accuracies in (
    ("\nBookmaker Accuracies on Train Data:", bookie_accuracies_train),
    ("\nBookmaker Accuracies on Test Data:", bookie_accuracies_test),
):
    print(heading)
    for bookie, accuracy in accuracies.items():
        print(f"{bookie}: {accuracy:.2f}%")

Training Accuracy (SVM): 53.40%
Testing Accuracy (SVM): 53.67%

Classification Report (SVM):
              precision    recall  f1-score   support

    Home Win       0.56      0.83      0.66       557
        Draw       0.50      0.01      0.01       303
    Away Win       0.49      0.54      0.51       340

    accuracy                           0.54      1200
   macro avg       0.52      0.46      0.40      1200
weighted avg       0.52      0.54      0.46      1200


Bookmaker Accuracies on Train Data:
B365: 54.31%
LB: 54.40%
IW: 54.33%
WH: 52.79%

Bookmaker Accuracies on Test Data:
B365: 53.83%
LB: 53.83%
IW: 54.08%
WH: 52.50%


## SVM with 3 bookie odds

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


file_path = 'updated_final_dataset_with_win_percentages.csv'
df = pd.read_csv(file_path)

# Integer class labels for the full-time result (FTR).
df['FTR_encoded'] = df['FTR'].map({'H': 0, 'D': 1, 'A': 2})

# Turn each form string into a point total (3 per 'W', 1 per 'D');
# non-string cells count as 0.
for form_column in ('HTFormPtsStr', 'ATFormPtsStr'):
    df[form_column] = df[form_column].apply(
        lambda form: sum(3 if ch == 'W' else 1 if ch == 'D' else 0 for ch in form)
        if isinstance(form, str) else 0
    )

# Every remaining NaN in the frame (not only the feature columns) becomes 0.
df = df.fillna(0)

# Match statistics followed by the odds of the three bookmakers used as inputs.
features = [
    'Home Win % (Before)', 'Away Win % (Before)',
    'HTGS', 'ATGS', 'HTGC', 'ATGC',
    'HTFormPts', 'ATFormPts',
    'HTWinStreak3', 'HTWinStreak5', 'HTLossStreak3', 'HTLossStreak5',
    'ATWinStreak3', 'ATWinStreak5', 'ATLossStreak3', 'ATLossStreak5',
    'HTFormPtsStr', 'ATFormPtsStr',
    'HomeTeamLP', 'AwayTeamLP',
    'DiffLP',
    'HTP', 'ATP',
    'HTGD', 'ATGD',
    'DiffPts', 'DiffFormPts',
    'IWH', 'IWD', 'IWA',
    'B365H', 'B365D', 'B365A',
    'LBH', 'LBD', 'LBA',
]
target = 'FTR_encoded'

# Feature matrix and target vector.
X = df[features]
y = df[target]

# Hold out 20% of the rows, keeping class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Scale with statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a linear-kernel SVM.
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train_scaled, y_train)

# Predictions for both partitions.
y_pred_train = svm_model.predict(X_train_scaled)
y_pred_test = svm_model.predict(X_test_scaled)

# Accuracies expressed in percent.
train_accuracy = 100 * accuracy_score(y_train, y_pred_train)
test_accuracy = 100 * accuracy_score(y_test, y_pred_test)

# Display names for the encoded classes present in y (train and test together).
unique_classes = sorted(y.unique())
class_mapping = {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}
target_names = [class_mapping[label] for label in unique_classes]

# Per-class precision/recall/F1 on the held-out rows.
classification_rep = classification_report(y_test, y_pred_test, target_names=target_names)

# Bookmaker accuracy calculation for both train and test sets
def calculate_bookie_accuracy(df_subset, bookies):
    """Score each bookmaker's lowest-odds pick against ``FTR``, in percent.

    For a prefix such as ``'B365'`` the columns ``B365H``, ``B365D`` and
    ``B365A`` are compared row by row; the outcome with the smallest odds is
    the prediction. The returned dict maps each prefix to the percentage of
    rows in ``df_subset`` where that prediction equals ``FTR``.
    """
    accuracies = {}
    for bookie in bookies:
        # Column order H, D, A is kept so ties resolve the same way in idxmin.
        odds_columns = [f"{bookie}{outcome}" for outcome in "HDA"]
        column_to_outcome = dict(zip(odds_columns, "HDA"))
        favourites = df_subset[odds_columns].idxmin(axis=1)
        predicted = favourites.map(column_to_outcome)
        hit_rate = (predicted == df_subset['FTR']).mean()
        accuracies[bookie] = hit_rate * 100
    return accuracies

# Row labels of each partition, used to slice the full frame for the baselines.
train_indices = X_train.index
test_indices = X_test.index

# Bookmakers whose odds are model features, and the one that is not.
bookmakers_included = ['B365', 'IW', 'LB']
bookmakers_excluded = ['WH']

bookie_accuracies_train = calculate_bookie_accuracy(df.loc[train_indices], bookmakers_included + bookmakers_excluded)
bookie_accuracies_test = calculate_bookie_accuracy(df.loc[test_indices], bookmakers_included + bookmakers_excluded)


# Model metrics first, then one section per bookmaker baseline.
print(f"Training Accuracy (SVM): {train_accuracy:.2f}%")
print(f"Testing Accuracy (SVM): {test_accuracy:.2f}%")
print("\nClassification Report (SVM):")
print(classification_rep)

for heading, accuracies in (
    ("\nBookmaker Accuracies on Train Data:", bookie_accuracies_train),
    ("\nBookmaker Accuracies on Test Data:", bookie_accuracies_test),
):
    print(heading)
    for bookie, accuracy in accuracies.items():
        print(f"{bookie}: {accuracy:.2f}%")

Training Accuracy (SVM): 53.48%
Testing Accuracy (SVM): 52.50%

Classification Report (SVM):
              precision    recall  f1-score   support

    Home Win       0.51      0.92      0.66       557
        Draw       0.00      0.00      0.00       303
    Away Win       0.58      0.35      0.44       340

    accuracy                           0.53      1200
   macro avg       0.37      0.42      0.37      1200
weighted avg       0.40      0.53      0.43      1200


Bookmaker Accuracies on Train Data:
B365: 54.31%
IW: 54.33%
LB: 54.40%
WH: 52.79%

Bookmaker Accuracies on Test Data:
B365: 53.83%
IW: 54.08%
LB: 53.83%
WH: 52.50%


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Neural Network without bookie odds

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, classification_report


# Input declares the model's input shape explicitly (avoids the Keras warning
# raised when `input_dim` is passed to a layer); set_random_seed seeds the
# Python, NumPy and TensorFlow generators so reruns start from the same state.
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

SEED = 42
set_random_seed(SEED)

file_path = 'updated_final_dataset_with_win_percentages.csv'
df = pd.read_csv(file_path)

# Encode the target column (FTR) into numerical values
df['FTR_encoded'] = df['FTR'].map({'H': 0, 'D': 1, 'A': 2})

# Convert the form strings (HTFormPtsStr, ATFormPtsStr) to point totals:
# 3 per 'W', 1 per 'D'; non-string cells count as 0.
for form_column in ('HTFormPtsStr', 'ATFormPtsStr'):
    df[form_column] = df[form_column].apply(
        lambda x: sum(3 if ch == 'W' else 1 if ch == 'D' else 0 for ch in x) if isinstance(x, str) else 0
    )

# Every remaining NaN in the frame (not only the feature columns) becomes 0.
df.fillna(0, inplace=True)

# Define feature columns
features = [
    'Home Win % (Before)', 'Away Win % (Before)',
    'HTGS', 'ATGS', 'HTGC', 'ATGC',
    'HTFormPts', 'ATFormPts',
    'HTWinStreak3', 'HTWinStreak5', 'HTLossStreak3', 'HTLossStreak5',
    'ATWinStreak3', 'ATWinStreak5', 'ATLossStreak3', 'ATLossStreak5',
    'HTFormPtsStr', 'ATFormPtsStr',
    'HomeTeamLP', 'AwayTeamLP',
    'DiffLP',
    'HTP', 'ATP',
    'HTGD', 'ATGD',
    'DiffPts', 'DiffFormPts'
]
target = 'FTR_encoded'


X = df[features]
y = df[target]


# Stratified split: 20% held out, class proportions preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Standardize the features using statistics from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the target variable to categorical (one-hot) format for the DNN
y_train_categorical = to_categorical(y_train, num_classes=3)
y_test_categorical = to_categorical(y_test, num_classes=3)

# Define the DNN model
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.2),  # Prevent overfitting
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # Output layer for 3 classes
])


model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model; 20% of the training rows are set aside for validation.
history = model.fit(
    X_train_scaled, y_train_categorical,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Evaluate the model on the training set (accuracy converted to percent)
train_loss, train_accuracy = model.evaluate(X_train_scaled, y_train_categorical, verbose=0)
train_accuracy *= 100

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=0)
test_accuracy *= 100

# Generate predictions for the test set: the class with the highest probability.
y_pred_test_prob = model.predict(X_test_scaled)
y_pred_test = np.argmax(y_pred_test_prob, axis=1)

# Generate classification report
class_mapping = {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}
target_names = [class_mapping[cls] for cls in sorted(y.unique())]
classification_rep = classification_report(y_test, y_pred_test, target_names=target_names)

# Bookmaker accuracy calculation
def calculate_bookie_accuracy(df_subset, bookies):
    """Return a dict mapping each bookmaker prefix to its accuracy in percent.

    A bookmaker's prediction for a row is the outcome ('H', 'D' or 'A') whose
    odds column (``<prefix>H``, ``<prefix>D``, ``<prefix>A``) holds the lowest
    value; accuracy is the share of rows where that matches ``FTR``.
    """
    def _accuracy_for(prefix):
        # Keep the H, D, A order: idxmin returns the first minimum on ties.
        label_of = {f"{prefix}{suffix}": suffix for suffix in ('H', 'D', 'A')}
        favourite = df_subset[list(label_of)].idxmin(axis=1).map(label_of)
        return (favourite == df_subset['FTR']).mean() * 100

    return {bookie: _accuracy_for(bookie) for bookie in bookies}

# Bookmaker baselines on both partitions: slice the full frame by the row
# labels of the training and test splits.
train_indices = X_train.index
test_indices = X_test.index
bookmakers = ['B365', 'LB', 'IW', 'WH']

bookie_accuracies_train = calculate_bookie_accuracy(df.loc[train_indices], bookmakers)
bookie_accuracies_test = calculate_bookie_accuracy(df.loc[test_indices], bookmakers)


# Model metrics first, then one section per bookmaker baseline.
print(f"Training Accuracy (DNN): {train_accuracy:.2f}%")
print(f"Testing Accuracy (DNN): {test_accuracy:.2f}%")
print("\nClassification Report (DNN):")
print(classification_rep)

for heading, accuracies in (
    ("\nBookmaker Accuracies on Train Data:", bookie_accuracies_train),
    ("\nBookmaker Accuracies on Test Data:", bookie_accuracies_test),
):
    print(heading)
    for bookie, accuracy in accuracies.items():
        print(f"{bookie}: {accuracy:.2f}%")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.4867 - loss: 1.0248 - val_accuracy: 0.5188 - val_loss: 0.9921
Epoch 2/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5381 - loss: 0.9778 - val_accuracy: 0.5260 - val_loss: 0.9851
Epoch 3/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.5407 - loss: 0.9749 - val_accuracy: 0.5250 - val_loss: 0.9855
Epoch 4/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5216 - loss: 0.9768 - val_accuracy: 0.5323 - val_loss: 0.9889
Epoch 5/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5458 - loss: 0.9575 - val_accuracy: 0.5292 - val_loss: 0.9861
Epoch 6/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5366 - loss: 0.9668 - val_accuracy: 0.5281 - val_loss: 0.9770
Epoch 7/50
[1m120/120[0m [32m━━━━━━━

## Neural Network with bookie odds

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report


# Input declares the model's input shape explicitly (avoids the Keras warning
# raised when `input_dim` is passed to a layer); set_random_seed seeds the
# Python, NumPy and TensorFlow generators so reruns start from the same state.
from tensorflow.keras.layers import Input
from tensorflow.keras.utils import set_random_seed

SEED = 42
set_random_seed(SEED)

file_path = 'updated_final_dataset_with_win_percentages.csv'
df = pd.read_csv(file_path)

# Encode the target column (FTR) into numerical values
df['FTR_encoded'] = df['FTR'].map({'H': 0, 'D': 1, 'A': 2})

# Convert the form strings (HTFormPtsStr, ATFormPtsStr) to point totals:
# 3 per 'W', 1 per 'D'; non-string cells count as 0.
for form_column in ('HTFormPtsStr', 'ATFormPtsStr'):
    df[form_column] = df[form_column].apply(
        lambda x: sum(3 if ch == 'W' else 1 if ch == 'D' else 0 for ch in x) if isinstance(x, str) else 0
    )

# Every remaining NaN in the frame (not only the feature columns) becomes 0.
df.fillna(0, inplace=True)

# Define feature columns. The bookmaker odds were missing from this list, which
# made this cell identical to the "without bookie odds" network; they are added
# here using the same three bookmakers as the SVM-with-odds cell.
features = [
    'Home Win % (Before)', 'Away Win % (Before)',
    'HTGS', 'ATGS', 'HTGC', 'ATGC',
    'HTFormPts', 'ATFormPts',
    'HTWinStreak3', 'HTWinStreak5', 'HTLossStreak3', 'HTLossStreak5',
    'ATWinStreak3', 'ATWinStreak5', 'ATLossStreak3', 'ATLossStreak5',
    'HTFormPtsStr', 'ATFormPtsStr',
    'HomeTeamLP', 'AwayTeamLP',
    'DiffLP',
    'HTP', 'ATP',
    'HTGD', 'ATGD',
    'DiffPts', 'DiffFormPts',
    'IWH', 'IWD', 'IWA',
    'B365H', 'B365D', 'B365A',
    'LBH', 'LBD', 'LBA'
]
target = 'FTR_encoded'


X = df[features]
y = df[target]

# Stratified split to ensure all classes are represented
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Standardize the features using statistics from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert the target variable to categorical (one-hot) format for the DNN
y_train_categorical = to_categorical(y_train, num_classes=3)
y_test_categorical = to_categorical(y_test, num_classes=3)

# Define the DNN model
model = Sequential([
    Input(shape=(X_train_scaled.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.2),  # Prevent overfitting
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # Output layer for 3 classes
])


model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model; 20% of the training rows are set aside for validation.
history = model.fit(
    X_train_scaled, y_train_categorical,
    epochs=50,  # Increase or decrease based on performance
    batch_size=32,
    validation_split=0.2,
    verbose=1
)

# Evaluate the model on the training set (accuracy converted to percent)
train_loss, train_accuracy = model.evaluate(X_train_scaled, y_train_categorical, verbose=0)
train_accuracy *= 100

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test_scaled, y_test_categorical, verbose=0)
test_accuracy *= 100

# Generate predictions for the test set: the class with the highest probability.
y_pred_test_prob = model.predict(X_test_scaled)
y_pred_test = np.argmax(y_pred_test_prob, axis=1)

# Generate classification report
class_mapping = {0: 'Home Win', 1: 'Draw', 2: 'Away Win'}
target_names = [class_mapping[cls] for cls in sorted(y.unique())]
classification_rep = classification_report(y_test, y_pred_test, target_names=target_names)

# Bookmaker accuracy calculation
def calculate_bookie_accuracy(df_subset, bookies):
    """Score each bookmaker's lowest-odds pick against ``FTR``, in percent.

    For a prefix such as ``'B365'`` the columns ``B365H``, ``B365D`` and
    ``B365A`` are compared row by row; the outcome with the smallest odds is
    the prediction. The returned dict maps each prefix to the percentage of
    rows in ``df_subset`` where that prediction equals ``FTR``.
    """
    accuracies = {}
    for bookie in bookies:
        # Column order H, D, A is kept so ties resolve the same way in idxmin.
        odds_columns = [f"{bookie}{outcome}" for outcome in "HDA"]
        column_to_outcome = dict(zip(odds_columns, "HDA"))
        favourites = df_subset[odds_columns].idxmin(axis=1)
        predicted = favourites.map(column_to_outcome)
        hit_rate = (predicted == df_subset['FTR']).mean()
        accuracies[bookie] = hit_rate * 100
    return accuracies

# Bookmaker baselines on both partitions: slice the full frame by the row
# labels of the training and test splits.
train_indices = X_train.index
test_indices = X_test.index
bookmakers = ['B365', 'LB', 'IW', 'WH']

bookie_accuracies_train = calculate_bookie_accuracy(df.loc[train_indices], bookmakers)
bookie_accuracies_test = calculate_bookie_accuracy(df.loc[test_indices], bookmakers)


# Model metrics first, then one section per bookmaker baseline.
print(f"Training Accuracy (DNN): {train_accuracy:.2f}%")
print(f"Testing Accuracy (DNN): {test_accuracy:.2f}%")
print("\nClassification Report (DNN):")
print(classification_rep)

for heading, accuracies in (
    ("\nBookmaker Accuracies on Train Data:", bookie_accuracies_train),
    ("\nBookmaker Accuracies on Test Data:", bookie_accuracies_test),
):
    print(heading)
    for bookie, accuracy in accuracies.items():
        print(f"{bookie}: {accuracy:.2f}%")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.4899 - loss: 1.0270 - val_accuracy: 0.5323 - val_loss: 0.9786
Epoch 2/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5385 - loss: 0.9813 - val_accuracy: 0.5208 - val_loss: 0.9809
Epoch 3/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5332 - loss: 0.9733 - val_accuracy: 0.5156 - val_loss: 0.9813
Epoch 4/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5291 - loss: 0.9821 - val_accuracy: 0.5177 - val_loss: 0.9859
Epoch 5/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5442 - loss: 0.9694 - val_accuracy: 0.5240 - val_loss: 0.9810
Epoch 6/50
[1m120/120[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5170 - loss: 0.9848 - val_accuracy: 0.5229 - val_loss: 0.9853
Epoch 7/50
[1m120/120[0m [32m━━━━━━━