In [1]:
# Change directory one level up to import functions; the relative 'data/...'
# and 'conf/...' paths used later in the notebook are resolved from there.
import os

# Remember where the kernel started the first time this cell runs, so that
# re-running the cell always lands in the same parent folder instead of
# climbing one more directory level on every execution.
if '_NOTEBOOK_START_DIR' not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.dirname(_NOTEBOOK_START_DIR))

import warnings

# Suppress all warnings
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# raised by the cells below - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

In [5]:
# Load the four combined-season tables (goals and assists for each league).
# The files are read in the order listed and bound to the names on the left.
(
    premier_league_goals,
    premier_league_assists,
    championship_goals,
    championship_assists,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/premier_league_goals/combined_seasons/premier_league_goals.csv',
        'data/premier_league_assists/combined_seasons/premier_league_assists.csv',
        'data/championship_goals/combined_seasons/championship_goals.csv',
        'data/championship_assists/combined_seasons/championship_assists.csv',
    )
)


In [39]:
import yaml

# Define the file path
file_path = "conf/promoted_teams_by_season.yaml"

# Open and load the YAML file.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of depending on the locale's default encoding.
# safe_load is used, so only plain YAML data types are constructed.
with open(file_path, 'r', encoding='utf-8') as file:
    promoted_teams = yaml.safe_load(file)


In [52]:
# Display the combined Championship assists table loaded above
# (the rendered output below elides the middle rows).
championship_assists

Unnamed: 0,Player,Country,Team,Assists,Season,season_start
0,Carlton Cole,England,West Ham United,2,2011-2012,2011
1,James Coppinger,England,Doncaster Rovers,2,2011-2012,2011
2,Jimmy Kébé,France,Reading FC,2,2011-2012,2011
3,Chris Burke,Scotland,Birmingham City,1,2011-2012,2011
4,Kalifa Cissé,Mali,Bristol City,1,2011-2012,2011
...,...,...,...,...,...,...
3579,Duncan Watmore,England,Millwall FC,1,2023-2024,2023
3580,Luke Woolfenden,England,Ipswich Town,1,2023-2024,2023
3581,Callum Wright,England,Plymouth Argyle,1,2023-2024,2023
3582,Jerry Yates,England,Swansea City,1,2023-2024,2023


In [104]:
# Join PL Data
#
# Build one row per (Player, Season, Country, season_start) with total assists
# and goals, in a single chain:
#   1. full outer join of the assists and goals tables on the shared keys
#   2. missing Assists / Goals -> 0
#   3. keep only the final columns
#   4. missing Team -> 'Unknown', and force every Team entry to str
#   5. collapse players who played for multiple clubs in the season:
#      team names are de-duplicated, sorted and joined with ' / ',
#      assists and goals are summed
premier_league_merged = (
    premier_league_assists
    .merge(
        premier_league_goals,
        how='outer',
        on=['Player', 'Country', 'Team', 'Season', 'season_start'],
    )
    .assign(
        Assists=lambda joined: joined['Assists'].fillna(0),
        Goals=lambda joined: joined['Goals'].fillna(0),
    )
    .loc[:, ['Player', 'Country', 'Team', 'Assists', 'Season', 'season_start', 'Goals']]
    .assign(Team=lambda joined: joined['Team'].fillna('Unknown').astype(str))
    .groupby(['Player', 'Season', 'Country', 'season_start'])
    .agg({
        'Team': lambda clubs: ' / '.join(sorted(set(clubs))),
        'Assists': 'sum',
        'Goals': 'sum',
    })
    .reset_index()
)

In [105]:
# Join Championship Data

# Perform a full outer join of goals and assists on the shared key columns
championship_merged = pd.merge(championship_goals, championship_assists,
                     on=['Player', 'Country', 'Team', 'Season', 'season_start'],
                     how='outer')

# Fill missing values with 0.
# Assists are only filled for seasons starting in 2014 or later. The SAME mask
# must be used on both sides of the assignment: pandas aligns the right-hand
# side on the index, so selecting '> 2014' on the right while writing to
# '>= 2014' on the left would overwrite every 2014 row's Assists with NaN.
# NOTE(review): the reason for the 2014 cutoff is not visible here - confirm.
# NOTE(review): the 'sum' aggregation below skips NaN, so unfilled pre-2014
# gaps still end up as 0 after grouping - confirm that is intended.
assists_fill_mask = championship_merged['season_start'] >= 2014
championship_merged.loc[assists_fill_mask, 'Assists'] = (
    championship_merged.loc[assists_fill_mask, 'Assists'].fillna(0)
)
championship_merged['Goals'] = championship_merged['Goals'].fillna(0)


# Select final columns
championship_merged = championship_merged[['Player', 'Country', 'Team', 'Assists', 'Season', 'season_start', 'Goals']]

# Group by 'Player', 'Season', 'Country' and 'season_start'
# This is for players who played for multiple clubs in the season:
# team names are de-duplicated, sorted and joined with ' / ', while
# assists and goals are summed.
championship_merged = (championship_merged.groupby(['Player', 'Season', 'Country', 'season_start'])
          .agg({
              'Team': lambda x: ' / '.join(sorted(set(x))),
              'Assists': 'sum',
              'Goals': 'sum'
          })
          .reset_index())


In [106]:
# Merge premier league and championship
#
# Each Championship season is shifted forward by one year so that it lines up
# with the Premier League season that starts the following year.
championship_merged['lagged_season_start'] = championship_merged['season_start'] + 1

# Inner join: keep only players present in both tables for consecutive
# seasons. Columns that exist on both sides get a league suffix, and the
# helper join key is dropped from the result once the join is done.
result = (
    premier_league_merged
    .merge(
        championship_merged,
        how='inner',
        left_on=['Player', 'Country', 'season_start'],
        right_on=['Player', 'Country', 'lagged_season_start'],
        suffixes=('_premier_league', '_championship'),
    )
    .drop(columns='lagged_season_start')
)

In [110]:
# Check for dupes
# Count the rows of the joined table per (Player, Country, Premier League
# season start) and keep only the combinations that occur more than once.
duplicates_result = (
    result
    .groupby(['Player', 'Country', 'season_start_premier_league'])
    .size()
    .reset_index(name='count')
    .loc[lambda per_key: per_key['count'] > 1]
)
