## Cleaning Guardian Dataset

The Guardian dataset contains match previews.

In [1]:
import pandas as pd
import chardet
import json

#### Load dataset

In [2]:
# Path to the raw Guardian CSV, relative to this notebook.
guardian_original_dataset_dir = "../original/Guardian_Dataset.csv"

# Let chardet guess the text encoding from the file's raw bytes, then pass
# that guess to pandas so the CSV is decoded with it.
with open(guardian_original_dataset_dir, mode='rb') as raw_file:
    detection = chardet.detect(raw_file.read())
encoding = detection['encoding']

guardian_df = pd.read_csv(guardian_original_dataset_dir, encoding=encoding)


#### Standardize the team names in the dataset

In [3]:
# JSON mapping used to normalise team names: each key is a standard name and
# its value holds the variations that should be rewritten to that name.
standardized_team_names_dir = "../resources/standardized_team_names.json"

# Decode explicitly as UTF-8 (the standard JSON encoding) so that non-ASCII
# team names load the same way regardless of the platform's default locale.
with open(standardized_team_names_dir, 'r', encoding='utf-8') as file:
    standardized_team_names = json.load(file)

In [4]:
def standardize_team_name(name, standardized_names):
    """Map a team name onto its standard spelling.

    Args:
        name: The team name as it appears in the dataset.
        standardized_names: Mapping of standard name -> container of known
            variations (anything that supports the ``in`` operator).

    Returns:
        The first standard name (in mapping iteration order) whose variations
        contain ``name``; ``name`` itself, unchanged, when nothing matches.
    """
    matching_standard_names = (
        canonical
        for canonical, aliases in standardized_names.items()
        if name in aliases
    )
    # Fall back to the input when no entry lists this name as a variation.
    return next(matching_standard_names, name)

In [5]:
# Rewrite both team columns using the same name mapping; the mapping is
# forwarded to standardize_team_name as its second positional argument.
for team_column in ('Home', 'Away'):
    guardian_df[team_column] = guardian_df[team_column].apply(
        standardize_team_name, args=(standardized_team_names,)
    )

#### Remove rows with missing data

In [6]:
# Count the rows with at least one missing value before dropping them, so the
# result of dropna() can be verified below.
nan_row_count = guardian_df.isna().any(axis=1).sum()
total_rows = len(guardian_df)

guardian_df = guardian_df.dropna()

nan_row_count_after_drop = guardian_df.isna().any(axis=1).sum()
total_rows_after_drop = len(guardian_df)

# Sanity checks: exactly the incomplete rows must be gone, and none may remain.
# `raise` requires an exception instance; raising a bare string would itself
# fail with a TypeError and hide the intended message.
if nan_row_count != total_rows - total_rows_after_drop:
    raise RuntimeError("Error: NaN values were not removed.")
elif nan_row_count_after_drop != 0:
    raise RuntimeError("Error: NaN values still present.")

# Guard the percentage against an empty dataset (avoids dividing by zero).
removed_percentage = nan_row_count / total_rows * 100 if total_rows else 0.0
print(f"Removed {nan_row_count} rows from dataset, reducing size by {removed_percentage:.2f}%")

Removed 1 rows from dataset, reducing size by 0.07%


#### Save cleaned dataset

In [7]:
# Write the cleaned frame to disk without the index column, using the same
# `encoding` value that was determined when the original file was loaded.
guardian_dataset_cleaned_dir = "../processed/Guardian_Dataset_Cleaned.csv"
guardian_df.to_csv(
    path_or_buf=guardian_dataset_cleaned_dir,
    encoding=encoding,
    index=False,
)