In [71]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from getpass import getuser

# Login name of the current user; it is interpolated into the data paths below
user = getuser()

# Locations of the three input workbooks
fpl_data_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\fpl_data.xlsx'
players_agg_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\players_agg.xlsx'
team_stats_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\teams_stats.xlsx'

# Load each workbook into its own DataFrame (same order as the paths above)
fpl_data, players_agg, team_stats = (
    pd.read_excel(workbook_path)
    for workbook_path in (fpl_data_path, players_agg_path, team_stats_path)
)

# Normalise every column label to lowercase so later lookups can use one spelling
for frame in (fpl_data, players_agg, team_stats):
    frame.columns = frame.columns.str.lower()

# Season labels from '2016-2017' through '2022-2023'
seasons = [f'{start_year}-{start_year + 1}' for start_year in range(2016, 2023)]

# Accumulators filled once per season: merged rows and the name matches behind them
final_merged_data, all_matched_goalkeepers = [], []

# Function to get the best fuzzy match for a player name, with a threshold of 83
def get_best_match(player_name, player_list, threshold=83):
    """Return the closest fuzzy match for ``player_name`` and its score.

    The candidates in ``player_list`` are scored with
    ``fuzz.token_sort_ratio`` and the best one is taken via
    ``process.extractOne``.

    Returns:
        ``(matched_name, score)`` when a best candidate exists and its score
        is at least ``threshold``; ``(None, None)`` otherwise.
    """
    best = process.extractOne(player_name, player_list, scorer=fuzz.token_sort_ratio)

    # No candidate came back at all
    if not best:
        return None, None

    matched_name, match_score = best[0], best[1]

    # Best candidate exists but is too dissimilar to be trusted
    if match_score < threshold:
        return None, None

    return matched_name, match_score

# Build one merged goalkeeper table per season and record the name matches used.
for season in seasons:
    # Restrict both sources to the current season; on the players_agg side keep
    # only rows with pos == 'GK' and mp_player between 10 and 38.
    fpl_season_data = fpl_data[fpl_data['season'] == season]
    players_agg_season_data = players_agg[
        (players_agg['season'] == season)
        & (players_agg['pos'] == 'GK')
        & (players_agg['mp_player'].between(10, 38))
    ]

    # Drop FPL rows in which every column other than 'player', 'season' and
    # 'selected_by_percent' equals zero.
    fpl_stat_columns = fpl_season_data.drop(columns=['player', 'season', 'selected_by_percent'])
    fpl_season_data = fpl_season_data[(fpl_stat_columns != 0).any(axis=1)]

    # Keep only rows at or above the season's 0.6 quantile of 'total_points'
    percentile_60 = fpl_season_data['total_points'].quantile(0.6)
    fpl_season_data = fpl_season_data[fpl_season_data['total_points'] >= percentile_60]

    # The candidate names do not change while matching, so build the list once
    # instead of once per players_agg row.
    fpl_player_names = fpl_season_data['player'].tolist()

    # Match every players_agg name against the FPL names; unmatched names are skipped
    matched_players = []
    for player in players_agg_season_data['player']:
        best_match, score = get_best_match(player, fpl_player_names)
        if best_match and score:  # Ensure we have a valid match and score
            matched_players.append((player, best_match, score))

    # Convert the matched players list to a DataFrame
    matched_players_df = pd.DataFrame(matched_players, columns=['player_agg', 'player_fpl', 'score'])

    # Keep a single match row per players_agg name
    # NOTE(review): several players_agg names may still map to the same FPL name,
    # which repeats that FPL row in the merge below -- confirm this is intended.
    matched_players_df = matched_players_df.drop_duplicates(subset=['player_agg'])

    # Store the matched goalkeepers for the season
    matched_players_df['season'] = season
    all_matched_goalkeepers.append(matched_players_df)

    # Attach the matched players_agg name to each FPL row. Only the three match
    # columns take part: carrying 'season' along would clash with the FPL
    # 'season' column and produce season_x / season_y.
    name_matches = matched_players_df[['player_agg', 'player_fpl', 'score']]
    merged_fpl = fpl_season_data.merge(name_matches, left_on='player', right_on='player_fpl')

    # From here on 'player' holds the players_agg spelling. Swapping the name
    # columns before the next merge keeps the suffixes from generating
    # 'player_fpl' / 'player_agg' labels that already exist, which yields
    # duplicate columns on older pandas and a MergeError on pandas 2.x.
    merged_fpl = merged_fpl.drop(columns=['player', 'player_fpl'])
    merged_fpl = merged_fpl.rename(columns={'player_agg': 'player'})

    # Both frames are already limited to this season, so joining on 'season' as
    # well selects the same rows while leaving one 'season' column in the result.
    merged_data = merged_fpl.merge(
        players_agg_season_data,
        on=['player', 'season'],
        suffixes=('_fpl', '_agg'),
    )

    # Append the season's merged data to the final list
    final_merged_data.append(merged_data)

# Stack the per-season frames into a single table with a fresh index
final_merged_data_df = pd.concat(final_merged_data, ignore_index=True)

# Report whether anything survived the filtering and merging steps
print(
    "final_merged_data_df is empty. Please check the filtering and merging steps."
    if final_merged_data_df.empty
    else "final_merged_data_df has data."
)

# Columns dropped from the final table; names that are absent are ignored
columns_to_remove = [
    'crdr_player',
    'gls_90min_player',
    'ast_90min_player',
    'g+a_90min_player',
    'g-pk_90min_player',
    'g+a-pk_player',
    'xg_player',
    'npxg_player',
    'xag_player',
    'npxg+xag_player',
    'prgc_player',
    'prgp_player',
    'prgr_player',
    'xg_90min_player',
    'xag_90min_player',
    'xg+xag_player',
    'npxg_90min_player',
    'npxg+xag_90min_player',
    'observationcount_player',
]
final_merged_data_df = final_merged_data_df.drop(columns=columns_to_remove, errors='ignore')

# Single table of every name match made across the seasons
all_matched_goalkeepers_df = pd.concat(all_matched_goalkeepers, ignore_index=True)

# Left-join the team statistics on season and squad, keeping every goalkeeper row
final_dataset = final_merged_data_df.merge(team_stats, on=['season', 'squad'], how='left')

# Report whether the joined table holds any rows
print(
    "final_dataset is empty. Please check the merging with team_stats."
    if final_dataset.empty
    else "final_dataset has data."
)

# Destination workbooks for the merged dataset and for the list of matches
output_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\goalkeeper_dataset.xlsx'
matched_goalkeepers_output_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\matched_goalkeepers.xlsx'

# Write both tables without the DataFrame index
final_dataset.to_excel(output_path, index=False)
all_matched_goalkeepers_df.to_excel(matched_goalkeepers_output_path, index=False)

print("Final goalkeeper dataset saved as 'goalkeeper_dataset.xlsx'.")
print("List of all matched goalkeepers saved as 'matched_goalkeepers.xlsx'.")


final_merged_data_df has data.
final_dataset has data.
Final goalkeeper dataset saved as 'goalkeeper_dataset.xlsx'.
List of all matched goalkeepers saved as 'matched_goalkeepers.xlsx'.


In [77]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from getpass import getuser

# Login name of the current user; it is interpolated into the data paths below
user = getuser()

# Locations of the three input workbooks
fpl_data_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\fpl_data.xlsx'
players_agg_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\players_agg.xlsx'
team_stats_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\teams_stats.xlsx'

# Load each workbook into its own DataFrame (same order as the paths above)
fpl_data, players_agg, team_stats = (
    pd.read_excel(workbook_path)
    for workbook_path in (fpl_data_path, players_agg_path, team_stats_path)
)

# Normalise every column label to lowercase so later lookups can use one spelling
for frame in (fpl_data, players_agg, team_stats):
    frame.columns = frame.columns.str.lower()

# Season labels from '2016-2017' through '2022-2023'
seasons = [f'{start_year}-{start_year + 1}' for start_year in range(2016, 2023)]

# Accumulators filled once per season: merged rows and the name matches behind them
final_merged_data, all_matched_goalkeepers = [], []

# Function to get the best fuzzy match for a player name, with a threshold of 83
def get_best_match(player_name, player_list, threshold=83):
    """Return the closest fuzzy match for ``player_name`` and its score.

    The candidates in ``player_list`` are scored with
    ``fuzz.token_sort_ratio`` and the best one is taken via
    ``process.extractOne``.

    Returns:
        ``(matched_name, score)`` when a best candidate exists and its score
        is at least ``threshold``; ``(None, None)`` otherwise.
    """
    best = process.extractOne(player_name, player_list, scorer=fuzz.token_sort_ratio)

    # No candidate came back at all
    if not best:
        return None, None

    matched_name, match_score = best[0], best[1]

    # Best candidate exists but is too dissimilar to be trusted
    if match_score < threshold:
        return None, None

    return matched_name, match_score

# Build one merged goalkeeper table per season and record the name matches used.
for season in seasons:
    # Restrict both sources to the current season; on the players_agg side keep
    # only rows with pos == 'GK' and mp_player between 10 and 38.
    fpl_season_data = fpl_data[fpl_data['season'] == season]
    players_agg_season_data = players_agg[
        (players_agg['season'] == season)
        & (players_agg['pos'] == 'GK')
        & (players_agg['mp_player'].between(10, 38))
    ]

    # Drop FPL rows in which every column other than 'player', 'season' and
    # 'selected_by_percent' equals zero.
    fpl_stat_columns = fpl_season_data.drop(columns=['player', 'season', 'selected_by_percent'])
    fpl_season_data = fpl_season_data[(fpl_stat_columns != 0).any(axis=1)]

    # Keep only rows at or above the season's 0.6 quantile of 'total_points'
    percentile_60 = fpl_season_data['total_points'].quantile(0.6)
    fpl_season_data = fpl_season_data[fpl_season_data['total_points'] >= percentile_60]

    # The candidate names do not change while matching, so build the list once
    # instead of once per players_agg row.
    fpl_player_names = fpl_season_data['player'].tolist()

    # Match every players_agg name against the FPL names; unmatched names are skipped
    matched_players = []
    for player in players_agg_season_data['player']:
        best_match, score = get_best_match(player, fpl_player_names)
        if best_match and score:  # Ensure we have a valid match and score
            matched_players.append((player, best_match, score))

    # Convert the matched players list to a DataFrame
    matched_players_df = pd.DataFrame(matched_players, columns=['player_agg', 'player_fpl', 'score'])

    # Keep a single match row per players_agg name
    # NOTE(review): several players_agg names may still map to the same FPL name,
    # which repeats that FPL row in the merge below -- confirm this is intended.
    matched_players_df = matched_players_df.drop_duplicates(subset=['player_agg'])

    # Store the matched goalkeepers for the season
    matched_players_df['season'] = season
    all_matched_goalkeepers.append(matched_players_df)

    # Attach the matched players_agg name to each FPL row. Only the three match
    # columns take part: carrying 'season' along would clash with the FPL
    # 'season' column and produce season_x / season_y.
    name_matches = matched_players_df[['player_agg', 'player_fpl', 'score']]
    merged_fpl = fpl_season_data.merge(name_matches, left_on='player', right_on='player_fpl')

    # From here on 'player' holds the players_agg spelling. Swapping the name
    # columns before the next merge keeps the suffixes from generating
    # 'player_fpl' / 'player_agg' labels that already exist, which yields
    # duplicate columns on older pandas and a MergeError on pandas 2.x.
    merged_fpl = merged_fpl.drop(columns=['player', 'player_fpl'])
    merged_fpl = merged_fpl.rename(columns={'player_agg': 'player'})

    # Both frames are already limited to this season, so joining on 'season' as
    # well selects the same rows while leaving one 'season' column in the result.
    merged_data = merged_fpl.merge(
        players_agg_season_data,
        on=['player', 'season'],
        suffixes=('_fpl', '_agg'),
    )

    # Expose the fuzzy-match score under the name used in the final column order
    merged_data.rename(columns={'score': 'matching_name_score'}, inplace=True)

    # Append the season's merged data to the final list
    final_merged_data.append(merged_data)

# Stack the per-season frames into a single table with a fresh index
final_merged_data_df = pd.concat(final_merged_data, ignore_index=True)

# Columns dropped from the final table; names that are absent are ignored
columns_to_remove = [
    'crdr_player',
    'gls_90min_player',
    'ast_90min_player',
    'g+a_90min_player',
    'g-pk_90min_player',
    'g+a-pk_player',
    'xg_player',
    'npxg_player',
    'xag_player',
    'npxg+xag_player',
    'prgc_player',
    'prgp_player',
    'prgr_player',
    'xg_90min_player',
    'xag_90min_player',
    'xg+xag_player',
    'npxg_90min_player',
    'npxg+xag_90min_player',
    'observationcount_player',
    'pos',
    'gls_player',
    'ast_player',
    'g+a_player',
    'g-pk_player',
    'pk_player',
    'pkatt_player',
    'starts_player',
]
final_merged_data_df = final_merged_data_df.drop(columns=columns_to_remove, errors='ignore')

# Where a column label occurs more than once, keep only its first occurrence
repeated_player_labels = final_merged_data_df.columns.duplicated()
final_merged_data_df = final_merged_data_df.loc[:, ~repeated_player_labels]

# Single table of every name match made across the seasons
all_matched_goalkeepers_df = pd.concat(all_matched_goalkeepers, ignore_index=True)

# Left-join the team statistics on season and squad, keeping every goalkeeper row
final_dataset = final_merged_data_df.merge(team_stats, on=['season', 'squad'], how='left')

# Same first-occurrence rule for any label repeated by the join
repeated_final_labels = final_dataset.columns.duplicated()
final_dataset = final_dataset.loc[:, ~repeated_final_labels]

# Column layout of the exported dataset; columns not listed here are left out
desired_order = [
    'season', 'matching_name_score', 'player', 'nation', 'squad', 'age_player', 'born', 'mp_player',
    'goals_scored', 'assists', 'total_points', 'minutes', 'goals_conceded', 'creativity', 'influence',
    'threat', 'bonus', 'bps', 'ict_index', 'clean_sheets', 'red_cards', 'yellow_cards', 'selected_by_percent',
    'min_player', '90s_player', 'crdy_player', 'num_players_team', 'avg_age_team', 'poss_team',
    'gls_team', 'ast_team', 'g+a_team', 'g-pk_team', 'pk_team', 'pkatt_team', 'crdy_team', 'crdr_team',
    'gls_90minutes_team', 'ast_90minutes_team', 'g+a_90minutes_team', 'g-pk_90minutes_team', 'g+a-pk_team',
    'xg_team', 'npxg_team', 'xag_team', 'npxg+xag_team', 'prgc_team', 'prgp_team', 'xg_90minutes_team',
    'xag_90minutes_team', 'xg+xag_team', 'npxg_90minutes_team', 'npxg+xag_90minutes_team',
]

# Select the columns in that order
final_dataset = final_dataset[desired_order]

# Destination workbooks for the merged dataset and for the list of matches
output_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\goalkeeper_dataset.xlsx'
matched_goalkeepers_output_path = rf'C:\Users\{user}\Documents\GitHub\dream-team-fpl-prediction\data\matched_goalkeepers.xlsx'

# Write both tables without the DataFrame index
final_dataset.to_excel(output_path, index=False)
all_matched_goalkeepers_df.to_excel(matched_goalkeepers_output_path, index=False)

print("Final goalkeeper dataset saved as 'goalkeeper_dataset.xlsx'.")
print("List of all matched goalkeepers saved as 'matched_goalkeepers.xlsx'.")


Final goalkeeper dataset saved as 'goalkeeper_dataset.xlsx'.
List of all matched goalkeepers saved as 'matched_goalkeepers.xlsx'.
