In [1]:
import pandas as pd
from getpass import getuser
import os
import re


In [2]:
# Get the current user's name
user = getuser()

# Specify the full file path
file_path = fr'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\players_stats.xlsx'

# Import the Excel file as a DataFrame
players_stats = pd.read_excel(file_path)

# Display the DataFrame
players_stats


Unnamed: 0,Season,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,xAG,npxG+xAG,PrgC,PrgP,PrgR,xG_2,xAG_2,xG+xAG,npxG_2,npxG+xAG_2
0,2016-2017,Patrick van Aanholt,nl NED,DF,Crystal Palace,25.0,1990.0,11,8,714,...,,,,,,,,,,
1,2016-2017,Patrick van Aanholt,nl NED,DF,Sunderland,25.0,1990.0,21,20,1784,...,,,,,,,,,,
2,2016-2017,Charlie Adam,sct SCO,MF,Stoke City,30.0,1985.0,24,17,1491,...,,,,,,,,,,
3,2016-2017,Albert Adomah,gh GHA,"DF,FW",Middlesbrough,28.0,1987.0,2,1,122,...,,,,,,,,,,
4,2016-2017,Adrián,es ESP,GK,West Ham,29.0,1987.0,16,16,1440,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4898,2024-2025,Ashley Young,eng ENG,DF,Everton,39.0,1985.0,32,19,1875,...,2.3,2.6,24.0,90.0,32.0,0.01,0.11,0.12,0.01,0.12
4899,2024-2025,Illia Zabarnyi,ua UKR,DF,Bournemouth,21.0,2002.0,36,35,3109,...,0.7,2.0,27.0,140.0,4.0,0.04,0.02,0.06,0.04,0.06
4900,2024-2025,Oleksandr Zinchenko,ua UKR,"DF,MF",Arsenal,27.0,1996.0,15,5,527,...,0.2,0.8,10.0,48.0,14.0,0.11,0.03,0.14,0.11,0.14
4901,2024-2025,Joshua Zirkzee,nl NED,"FW,MF",Manchester Utd,23.0,2001.0,32,14,1402,...,1.5,6.3,14.0,44.0,69.0,0.31,0.10,0.41,0.31,0.41


In [3]:
players_stats = players_stats.drop('Matches', axis=1)

In [4]:
# replace all variable ending in _2 with _90min since they are referring to the stastic for 90 minutes 
new_columns = {col: col.replace('_2', '_90min') for col in players_stats.columns if col.endswith('_2')}
players_stats.rename(columns=new_columns, inplace=True)
print(players_stats.columns)


Index(['Season', 'Player', 'Nation', 'Pos', 'Squad', 'Age', 'Born', 'MP',
       'Starts', 'Min', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt',
       'CrdY', 'CrdR', 'Gls_90min', 'Ast_90min', 'G+A_90min', 'G-PK_90min',
       'G+A-PK', 'xG', 'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR',
       'xG_90min', 'xAG_90min', 'xG+xAG', 'npxG_90min', 'npxG+xAG_90min'],
      dtype='object')


In [5]:
# clean the Nation variable by remving the lowercase letters at the beginning of the string 
players_stats['Nation'] = players_stats['Nation'].str.replace(r'[a-z\s]', '', regex=True)
players_stats.head(20)

Unnamed: 0,Season,Player,Nation,Pos,Squad,Age,Born,MP,Starts,Min,...,xAG,npxG+xAG,PrgC,PrgP,PrgR,xG_90min,xAG_90min,xG+xAG,npxG_90min,npxG+xAG_90min
0,2016-2017,Patrick van Aanholt,NED,DF,Crystal Palace,25.0,1990.0,11,8,714,...,,,,,,,,,,
1,2016-2017,Patrick van Aanholt,NED,DF,Sunderland,25.0,1990.0,21,20,1784,...,,,,,,,,,,
2,2016-2017,Charlie Adam,SCO,MF,Stoke City,30.0,1985.0,24,17,1491,...,,,,,,,,,,
3,2016-2017,Albert Adomah,GHA,"DF,FW",Middlesbrough,28.0,1987.0,2,1,122,...,,,,,,,,,,
4,2016-2017,Adrián,ESP,GK,West Ham,29.0,1987.0,16,16,1440,...,,,,,,,,,,
5,2016-2017,Ibrahim Afellay,NED,"FW,MF",Stoke City,30.0,1986.0,12,3,384,...,,,,,,,,,,
6,2016-2017,Benik Afobe,COD,FW,Bournemouth,23.0,1993.0,31,14,1461,...,,,,,,,,,,
7,2016-2017,Sergio Agüero,ARG,FW,Manchester City,28.0,1988.0,31,25,2403,...,,,,,,,,,,
8,2016-2017,Dan Agyei,ENG,"FW,MF",Burnley,19.0,1997.0,3,0,25,...,,,,,,,,,,
9,2016-2017,Ola Aina,NGA,"DF,MF",Chelsea,19.0,1996.0,3,0,26,...,,,,,,,,,,


In [6]:
players_stats['Pos'] = players_stats['Pos'].str.split(',').str[0]


In [7]:
# Group by Player and Season, and count the number of unique squads
player_season_counts = players_stats.groupby(['Player', 'Season'])['Squad'].nunique().reset_index()

# Filter the counts where it is higher than one
duplicate_players = player_season_counts[player_season_counts['Squad'] > 1]

# Merge the original DataFrame with the duplicate players
merged_data = pd.merge(duplicate_players, players_stats, on=['Player', 'Season'], how='inner')

# Display the Player, Season, Squad information, and count where count is higher than 1
result = merged_data[['Player', 'Season', 'Squad_x']].rename(columns={'Squad_x': 'Squad'})
result['Count'] = result.groupby(['Player', 'Season'])['Squad'].transform('count')
result = result[result['Count'] > 1].reset_index(drop=True)

print(result)




                     Player     Season  Squad  Count
0              Aaron Lennon  2017-2018      2      2
1              Aaron Lennon  2017-2018      2      2
2          Adlène Guedioura  2016-2017      2      2
3          Adlène Guedioura  2016-2017      2      2
4    Ainsley Maitland-Niles  2020-2021      2      2
..                      ...        ...    ...    ...
189           Wesley Fofana  2022-2023      2      2
190         Yannick Bolasie  2016-2017      2      2
191         Yannick Bolasie  2016-2017      2      2
192           Younès Kaboul  2016-2017      2      2
193           Younès Kaboul  2016-2017      2      2

[194 rows x 4 columns]


In [8]:

# Replace NaN values with 0
players_stats.fillna(0, inplace=True)

# Convert 'MP' and 'Min' columns to numeric format
players_stats['MP'] = pd.to_numeric(players_stats['MP'], errors='coerce')
players_stats['Min'] = players_stats['Min'].str.replace(',', '')  # Remove commas
players_stats['Min'] = pd.to_numeric(players_stats['Min'], errors='coerce')  # Convert to numeric format

# Calculate the observation count for each player in each season
players_stats['ObservationCount'] = players_stats.groupby(['Player', 'Season'])['Player'].transform('count')

# Create a subset with players appearing more than once in a season
subset = players_stats[players_stats['ObservationCount'] > 1].copy()

# Perform the custom aggregation
grouped_data = subset.groupby(['Player', 'Season']).agg({
    'Age': 'first',
    'Born': 'first',
    'Squad': 'last',
    'Nation': 'first',
    'Pos': 'first',
    'ObservationCount': 'first',
    'MP': 'sum',
    'Min': 'sum',
    'Starts': 'sum',
    '90s': 'sum',
    'Gls': 'sum',
    'Ast': 'sum',
    'G+A': 'sum',
    'G-PK': 'sum',
    'PK': 'sum',
    'PKatt': 'sum',
    'CrdY': 'sum',
    'CrdR': 'sum',
    'Gls_90min': 'sum',
    'Ast_90min': 'sum',
    'G+A_90min': 'sum',
    'G-PK_90min': 'sum',
    'G+A-PK': 'sum',
    'xG': 'sum',
    'npxG': 'sum',
    'xAG': 'sum',
    'npxG+xAG': 'sum',
    'PrgC': 'sum',
    'PrgP': 'sum',
    'PrgR': 'sum',
    'xG_90min': 'sum',
    'xAG_90min': 'sum',
    'xG+xAG': 'sum',
    'npxG_90min': 'sum',
    'npxG+xAG_90min': 'sum'
    # Add other variables to sum here
}).reset_index()

# Specify the merge keys and update keys
merge_keys = ['Player', 'Season', 'Age', 'Born', 'Nation']
update_keys = ['Squad','MP', 'Min', 'Starts', '90s', 'Gls', 'Ast', 'G+A', 'G-PK', 'PK', 'PKatt',
               'CrdY', 'CrdR', 'Gls_90min', 'Ast_90min', 'G+A_90min', 'G-PK_90min',
               'G+A-PK', 'xG', 'npxG', 'xAG', 'npxG+xAG', 'PrgC', 'PrgP', 'PrgR',
               'xG_90min', 'xAG_90min', 'xG+xAG', 'npxG_90min', 'npxG+xAG_90min']


# Merge the original dataset with the aggregated data
merged_data = players_stats.merge(grouped_data, on=merge_keys, how='left', suffixes=('_x', '_y'))
for key in update_keys:
    merged_data[key + '_x'].update(merged_data[key + '_y'])
    merged_data.drop(columns=[key + '_y'], inplace=True)
    merged_data.rename(columns={key + '_x': key}, inplace=True)

# Drop columns ending with '_y'
merged_data.drop(merged_data.filter(regex='_y$').columns, axis=1, inplace=True)

# Remove duplicate rows
merged_data.drop_duplicates(inplace=True)

# Remove the '_x' suffix from Pos and ObservationCount columns
merged_data.rename(columns={'Pos_x': 'Pos', 'ObservationCount_x': 'ObservationCount'}, inplace=True)

# List of columns to exclude from renaming
exclude_columns = ['season', 'player', 'nation', 'pos', 'squad', 'born']

# Apply lowercase and suffix '_player' only to columns not in the exclude list
merged_data.columns = [
    (col.lower() + '_player' if col.lower() not in exclude_columns else col.lower()) 
    for col in merged_data.columns
]

# Export the merged dataset to an xlsx file
output_folder = fr'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data'
output_path = os.path.join(output_folder, 'players_agg.xlsx')
merged_data.to_excel(output_path, index=False)
