In [20]:
import pandas as pd
from unidecode import unidecode

# Load the players_agg.xlsx file
players_agg_path = r'C:\Users\aldi\Documents\GitHub\dream-team-fpl-prediction\data\players_agg.xlsx'
#players_agg_path = r'C:\Users\ALESSANDRO\Documents\GitHub\dream-team-fpl-prediction\data\players_agg.xlsx'
players_agg_df = pd.read_excel(players_agg_path)

# Load the fpl_data.xlsx file (an Excel workbook, hence read_excel rather than read_csv)
fpl_data_path = r'C:\Users\aldi\Documents\GitHub\dream-team-fpl-prediction\data\fpl_data.xlsx'
#fpl_data_path = r'C:\Users\ALESSANDRO\Documents\GitHub\dream-team-fpl-prediction\data\fpl_data.csv'
fpl_data_df = pd.read_excel(fpl_data_path)


def _to_ascii(value):
    """Transliterate a string to plain ASCII with unidecode.

    Non-string values (e.g. NaN coming from an empty cell) are returned
    unchanged, because unidecode only accepts strings and would raise on them.
    """
    return unidecode(value) if isinstance(value, str) else value


# Clean the 'Player' column in both frames so that accented and unaccented
# spellings of the same name compare equal in the join below
players_agg_df['Player'] = players_agg_df['Player'].apply(_to_ascii)
fpl_data_df['Player'] = fpl_data_df['Player'].apply(_to_ascii)

# Outer join on 'Player' and 'Season': rows present in only one of the two
# frames are kept, with NaN in the columns that come from the other frame
merged_df = pd.merge(players_agg_df, fpl_data_df, on=['Player', 'Season'], how='outer')

# Keep season 2016-2017 onwards with total_points greater than 0.
# NOTE(review): the season filter is a string comparison against '2016-2017';
# it assumes 'Season' is stored as 'YYYY-YYYY' text -- confirm.
matched_obs = merged_df[(merged_df['Season'] >= '2016-2017') & (merged_df['total_points'] > 0)]


In [21]:
# Rows from season 2016-2017 onwards whose 'total_points' is NaN
unmatched_obs = merged_df.loc[
    merged_df['total_points'].isna() & (merged_df['Season'] >= '2016-2017')
]

# Size of the matched set (total_points > 0) and number of NaN 'total_points' rows
count_total_points = len(matched_obs)
count_nan_total_points = int(unmatched_obs['total_points'].isna().sum())

print("Number of observations with 'total_points' > 0:", count_total_points)
print("Number of observations with NaN 'total_points':", count_nan_total_points)

Number of observations with 'total_points' > 0: 3667
Number of observations with NaN 'total_points': 764


In [22]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Keep season 2016-2017 onwards. The explicit .copy() makes filtered_df an
# independent frame, so the 'Player' update at the end of this cell writes to
# filtered_df itself instead of a slice of merged_df (which triggers pandas'
# SettingWithCopyWarning).
filtered_df = merged_df[merged_df['Season'] >= '2016-2017'].copy()

# Similarity threshold handed to fuzzywuzzy as score_cutoff
threshold = 70

# Maps a matched name variant -> the name it will be replaced with
matched_names = {}

# The candidate list does not change inside the loop, so build it once.
# Missing names are dropped: they can be neither split nor scored.
player_names = filtered_df['Player'].dropna().unique()

for name in player_names:
    # Skip empty names and names already recorded as a variant of another name
    if name in matched_names or name == '':
        continue

    # First token is treated as the first name, everything after it as the surname
    name_parts = name.split(' ')
    first_name = name_parts[0]
    surname = ' '.join(name_parts[1:])

    # Candidates similar to the first name and to the surname, scored separately
    potential_matches_first = process.extractBests(
        first_name, player_names, scorer=fuzz.token_set_ratio, score_cutoff=threshold)
    potential_matches_surname = process.extractBests(
        surname, player_names, scorer=fuzz.token_set_ratio, score_cutoff=threshold)

    # A candidate other than `name` that appears in BOTH result lists is
    # recorded as a variant of `name`; both lists must hold more than one entry.
    if len(potential_matches_first) > 1 and len(potential_matches_surname) > 1:
        surname_candidates = {hit[0] for hit in potential_matches_surname}
        for hit in potential_matches_first:
            candidate = hit[0]
            if candidate != name and candidate in surname_candidates:
                matched_names[candidate] = name

    # Substring rule: a similar candidate that literally contains `name` is
    # also recorded as a variant of `name`.
    # NOTE(review): a single-word name is a substring of every longer name that
    # contains it, so this rule can collapse different players into one name --
    # inspect matched_names before relying on the result.
    potential_matches_substring = process.extractBests(
        name, player_names, scorer=fuzz.token_set_ratio, score_cutoff=threshold)
    for hit in potential_matches_substring:
        candidate = hit[0]
        if candidate != name and name in candidate:
            matched_names[candidate] = name

# Update the 'Player' column with the merged names
filtered_df.loc[:, 'Player'] = filtered_df['Player'].replace(matched_names)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [24]:

# Rebuild merged_df from filtered_df, one slice of rows per unique player name.
# The slices are collected in a list and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a DataFrame inside a
# loop copies all previously accumulated rows on every iteration.
# NOTE: this overwrites the merged_df produced by the earlier merge cell.
player_frames = []

# Iterate over each unique name in the filtered DataFrame
for name in filtered_df['Player'].unique():
    name_rows = filtered_df[filtered_df['Player'] == name]

    if name in matched_names:
        # The name is itself recorded as a variant: emit its rows followed by
        # the rows of the name it maps to.
        # NOTE(review): rows of original_name are emitted here and again when
        # the loop reaches original_name itself, so they appear more than once
        # in the result -- verify whether that duplication is intended.
        original_name = matched_names[name]
        original_name_rows = filtered_df[filtered_df['Player'] == original_name]
        player_frames.extend([name_rows, original_name_rows])
    else:
        # No match recorded for this name: emit its rows as they are
        player_frames.append(name_rows)

# pd.concat raises on an empty list, so fall back to an empty DataFrame
merged_df = pd.concat(player_frames, ignore_index=True) if player_frames else pd.DataFrame()

# Print the merged_df DataFrame
print(merged_df)





         Season                             Player Nation  Pos  \
0     2016-2017                Patrick van Aanholt    NED   DF   
1     2017-2018                Patrick van Aanholt    NED   DF   
2     2018-2019                Patrick van Aanholt    NED   DF   
3     2019-2020                Patrick van Aanholt    NED   DF   
4     2020-2021                Patrick van Aanholt    NED   DF   
5     2016-2017                       Charlie Adam    SCO   MF   
6     2017-2018                       Charlie Adam    SCO   MF   
7     2016-2017                      Albert Adomah    GHA   MF   
8     2016-2017                             Adrian    ESP   GK   
9     2016-2017                             Adrian    JAM   DF   
10    2017-2018                             Adrian    ESP   GK   
11    2017-2018                             Adrian    JAM   DF   
12    2018-2019                             Adrian    JAM   DF   
13    2019-2020                             Adrian    ESP   GK   
14    2019

In [None]:
# matched_names returns a short list of all the names that should be matched.
# Information from the original datasets (fpl_data_df and players_agg_df) is not merged.
# Data cleaning and preprocessing:
    # Perform data cleaning steps to standardize the names and remove any unnecessary characters, accents, or special symbols.
    # This can help reduce noise and improve matching accuracy.

In [25]:
import os
# Save merged_df as 'merged_df.xlsx' inside the data folder, leaving out the index column
output_folder = r'C:\Users\aldi\Documents\GitHub\dream-team-fpl-prediction\data'
output_path = os.path.join(output_folder, 'merged_df.xlsx')
merged_df.to_excel(excel_writer=output_path, index=False)