## Combining the Guardian and Kaggle Datasets

In [1]:
import pandas as pd
import chardet

#### Load datasets

In [2]:
# Locations of the two cleaned input CSVs, relative to this notebook.
guardian_dataset_cleaned_dir = "../processed/Guardian_Dataset_Cleaned.csv"
kaggle_dataset_cleaned_dir = "../processed/Kaggle_Dataset_Cleaned.csv"

# Detect the Guardian file's text encoding from its raw bytes. The detected
# value is kept in `encoding` because the save step reuses it when writing
# the combined CSV.
with open(guardian_dataset_cleaned_dir, 'rb') as raw_file:
    raw_bytes = raw_file.read()
encoding = chardet.detect(raw_bytes)['encoding']

# The Guardian CSV is decoded with the detected encoding; the Kaggle CSV is
# read with pandas' default encoding.
guardian_df = pd.read_csv(guardian_dataset_cleaned_dir, encoding=encoding)
kaggle_df = pd.read_csv(kaggle_dataset_cleaned_dir)

#### Combine datasets

In [3]:
# Inner-join the two sources on (home team, away team, season): only fixtures
# present in both datasets are kept. After the join the Kaggle team columns
# carry the same values as `Home`/`Away`, so they are dropped.
guardian_kaggle_combined_df = guardian_df.merge(
    kaggle_df,
    how='inner',
    left_on=['Home', 'Away', 'Season'],
    right_on=['HomeTeam', 'AwayTeam', 'Season'],
).drop(columns=['HomeTeam', 'AwayTeam'])

In [4]:
# Sanity-check the merged frame before it is saved.
# `nan_row_count` is the number of ROWS with at least one missing value
# (not the number of individual NaN cells).
nan_row_count = guardian_kaggle_combined_df.isna().any(axis=1).sum()
total_rows = len(guardian_kaggle_combined_df)

# `raise` requires an exception instance or class; raising a plain string
# fails with "TypeError: exceptions must derive from BaseException" and hides
# the intended message, so a ValueError is raised instead.
if nan_row_count > 0:
    raise ValueError(f"Dataset has {nan_row_count} rows containing NaN values")
elif total_rows == 0:
    raise ValueError("Dataset is empty")

print(f"Combined dataset containes {total_rows} rows")

Combined dataset containes 1338 rows


#### Save combined dataset

In [5]:
# Write the merged dataset next to the cleaned inputs, without the index
# column, using the encoding detected for the Guardian input file.
# NOTE(review): `encoding` was detected from the Guardian CSV only; confirm it
# can also represent every character coming from the Kaggle data.
guardian_kaggle_combined_dataset_dir = "../processed/Guardian_Kaggle_Combined_Dataset.csv"
guardian_kaggle_combined_df.to_csv(
    guardian_kaggle_combined_dataset_dir,
    encoding=encoding,
    index=False,
)