# Balance Data:

#### - Check whether the outcome classes are balanced:

In [4]:
import pandas as pd

# Read the preprocessed matches and keep a handle on the label column
data = pd.read_csv('Dataset/Preprocessed_dataset.csv')
outcomes = data['outcome']

# Tally the rows carrying each label: 'H', 'A' and 'D', in that order
num_home_wins, num_away_wins, num_draws = (
    (outcomes == label).sum() for label in ('H', 'A', 'D')
)

# Denominator for the percentages: only the three tallied labels are included
total = sum((num_home_wins, num_away_wins, num_draws))

# Share of each label, as a percentage of the tallied rows
percentage_home_wins, percentage_away_wins, percentage_draws = (
    (count / total) * 100
    for count in (num_home_wins, num_away_wins, num_draws)
)

# Report the raw counts first, then a separator, then the percentages
print("Number of Home Wins:", num_home_wins)
print("Number of Away Wins:", num_away_wins)
print("Number of Draws:", num_draws)
print("_")
print(
    "Percentage of Home Wins: {:.2f}%".format(percentage_home_wins),
    "Percentage of Away Wins: {:.2f}%".format(percentage_away_wins),
    "Percentage of Draws: {:.2f}%".format(percentage_draws),
    sep="\n",
)

Number of Home Wins: 429
Number of Away Wins: 302
Number of Draws: 169
_
Percentage of Home Wins: 47.67%
Percentage of Away Wins: 33.56%
Percentage of Draws: 18.78%


#### - Correcting the class imbalance (undersampling the larger classes):

In [3]:
import pandas as pd
from sklearn.utils import resample

# Load the dataset
data = pd.read_csv('Dataset/Preprocessed_dataset.csv')

# Access data for each outcome ('H' = home win, 'A' = away win, 'D' = draw)
home_wins_data = data[data['outcome'] == 'H']
away_wins_data = data[data['outcome'] == 'A']
draws_data = data[data['outcome'] == 'D']

# Determine the number of each outcome from the data itself, so the cell
# stays correct if the input CSV changes (no hard-coded counts).
num_home_wins = len(home_wins_data)
num_away_wins = len(away_wins_data)
num_draws = len(draws_data)

# Choose the number to resample to: the size of the smallest class
target_samples = min(num_home_wins, num_away_wins, num_draws)
if target_samples == 0:
    raise ValueError(
        "Cannot balance the dataset: at least one outcome class ('H', 'A' or 'D') has no rows."
    )


def _downsample(group, n_samples, random_state=42):
    """Return `group` reduced to `n_samples` rows by sampling without replacement.

    A group that already has `n_samples` rows or fewer is returned unchanged,
    so the smallest class keeps its original rows and row order.
    """
    if len(group) <= n_samples:
        return group
    return resample(group, replace=False, n_samples=n_samples, random_state=random_state)


# Resample the larger classes down to the size of the smallest class
home_wins_resampled = _downsample(home_wins_data, target_samples)
away_wins_resampled = _downsample(away_wins_data, target_samples)
draws_resampled = _downsample(draws_data, target_samples)

# Concatenate the resampled data back together (rows stay grouped by class: H, A, D)
final_data = pd.concat([home_wins_resampled, away_wins_resampled, draws_resampled])

# Save the balanced dataset
final_data.to_csv('Dataset/Balanced_Outcomes.csv', index=False)

# Print the final counts to verify balance
print("Final number of Home Wins:", len(final_data[final_data['outcome'] == 'H']))
print("Final number of Away Wins:", len(final_data[final_data['outcome'] == 'A']))
print("Final number of Draws:", len(final_data[final_data['outcome'] == 'D']))

Final number of Home Wins: 169
Final number of Away Wins: 169
Final number of Draws: 169


In [4]:
import pandas as pd

# Read the balanced file back in and pull out the label column
data = pd.read_csv('Dataset/Balanced_Outcomes.csv')
outcome_values = data['outcome']

# Tally the rows carrying each label: 'H', 'A' and 'D', in that order
num_home_wins, num_away_wins, num_draws = (
    (outcome_values == label).sum() for label in ('H', 'A', 'D')
)

# Denominator for the percentages: only the three tallied labels are included
total = sum((num_home_wins, num_away_wins, num_draws))

# Share of each label, as a percentage of the tallied rows
percentage_home_wins, percentage_away_wins, percentage_draws = (
    (count / total) * 100
    for count in (num_home_wins, num_away_wins, num_draws)
)

# Print the percentages, one per line
print(
    "Percentage of Home Wins: {:.2f}%".format(percentage_home_wins),
    "Percentage of Away Wins: {:.2f}%".format(percentage_away_wins),
    "Percentage of Draws: {:.2f}%".format(percentage_draws),
    sep="\n",
)

Percentage of Home Wins: 33.33%
Percentage of Away Wins: 33.33%
Percentage of Draws: 33.33%


In [5]:
import pandas as pd
from scipy import stats
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn import tree
# Load the balanced dataset and print it for a quick look
df = pd.read_csv('Dataset/Balanced_Outcomes.csv')
print(df)

     year  country  city  stage  home_team  away_team  home_score  away_score  \
0    2018        9    94     19         16         19         0.1         0.1   
1    1962        2   127      4         38         69         0.3         0.0   
2    1986        8    82     15         53         54         0.1         0.0   
3    1938        4    13     21          8         68         0.4         0.2   
4    2010       10    55     15         62         37         0.3         0.2   
..    ...      ...   ...    ...        ...        ...         ...         ...   
502  2018        9    33     17         41         60         0.2         0.2   
503  2018        9    57     10         67         43         0.2         0.2   
504  2018        9   129     10         35         54         0.1         0.1   
505  2018        9    85     11         20         27         0.0         0.0   
506  2018        9    94     14         69         15         0.2         0.2   

    outcome                

In [8]:
# Feature names: every column except the target. Selecting by name (rather
# than slicing off the last column) keeps 'outcome' out of X even if the
# column order of the CSV changes.
# NOTE(review): the printed frame lists home_score and away_score among the
# columns; the outcome is presumably determined by those two scores, which
# would leak the label into the features. Confirm whether they should stay.
fn = [col for col in df.columns if col != 'outcome']
X = df[fn]
y = df['outcome']
print(y)

0      H
1      H
2      H
3      H
4      H
      ..
502    D
503    D
504    D
505    D
506    D
Name: outcome, Length: 507, dtype: object
