## Cleaning Kaggle Dataset

The Kaggle dataset contains match data with relevant statistics for both the home and away teams.

In [1]:
import pandas as pd
import json

#### Load dataset

In [2]:
# Path to the raw Kaggle export, relative to this notebook.
kaggle_original_dataset_dir = "../original/Kaggle_Dataset.csv"

# Decode the CSV as ISO-8859-1 (Latin-1) while reading it into a DataFrame.
kaggle_df = pd.read_csv(
    filepath_or_buffer=kaggle_original_dataset_dir,
    encoding="ISO-8859-1",
)


#### Standardize the team names in the dataset

In [3]:
# Path to the JSON resource holding the team-name mapping.
standardized_team_names_dir = "../resources/standardized_team_names.json"

# Read the whole file and parse it; the result is later iterated with
# .items(), so it is expected to be a JSON object (dict).
with open(standardized_team_names_dir, mode="r") as names_file:
    standardized_team_names = json.loads(names_file.read())

In [4]:
def standardize_team_name(name, standardized_names):
    """Map a team name onto its standard spelling.

    Args:
        name: The team name to look up.
        standardized_names: Mapping of standard name -> container of known
            variations of that name.

    Returns:
        The first standard name (in mapping iteration order) whose variations
        contain ``name``; ``name`` itself, unchanged, when nothing matches.
    """
    # Lazily walk the mapping so the search stops at the first hit.
    candidates = (
        canonical
        for canonical, aliases in standardized_names.items()
        if name in aliases
    )
    return next(candidates, name)

In [5]:
# Run the same name standardization over both team columns; the mapping is
# forwarded to standardize_team_name as its second positional argument.
for team_column in ("HomeTeam", "AwayTeam"):
    kaggle_df[team_column] = kaggle_df[team_column].apply(
        standardize_team_name, args=(standardized_team_names,)
    )

#### Remove rows with missing data

In [6]:
# Snapshot the frame before cleaning: how many rows have at least one missing
# value, and how many rows there are in total.
nan_row_count = kaggle_df.isna().any(axis=1).sum()
total_rows = len(kaggle_df)

# Drop every row that contains at least one missing value.
kaggle_df = kaggle_df.dropna()

nan_row_count_after_drop = kaggle_df.isna().any(axis=1).sum()
total_rows_after_drop = len(kaggle_df)

# Sanity checks on the drop. A real exception type is raised here: raising a
# bare string is itself a TypeError and would mask the intended message.
if nan_row_count != total_rows - total_rows_after_drop:
    raise RuntimeError("Error: NaN values were not removed.")
elif nan_row_count_after_drop != 0:
    raise RuntimeError("Error: NaN values still present.")

# Guard the percentage against an empty input frame (total_rows == 0).
removed_percentage = nan_row_count / total_rows * 100 if total_rows else 0.0

print(f"Removed {nan_row_count} rows from dataset, reducing size by {removed_percentage:.2f}%")

Removed 2824 rows from dataset, reducing size by 25.41%


#### Save cleaned dataset

In [7]:
# Destination of the cleaned dataset, relative to this notebook.
kaggle_dataset_cleaned_dir = "../processed/Kaggle_Dataset_Cleaned.csv"

# Write the cleaned frame as CSV, leaving the DataFrame index out of the file.
kaggle_df.to_csv(
    path_or_buf=kaggle_dataset_cleaned_dir,
    index=False,
)