# Scale the market values
### Scale the market values from past seasons using the inflation rate of each position

## Import the libraries and mount drive

In [None]:
import pandas as pd
import os
import glob
from google.colab import drive
from sklearn.preprocessing import MinMaxScaler
import numpy as np
# Mount Google Drive at /content/drive; every path used in this notebook lives under that mount point
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Set the inflation rates taken from the CIES website reports, and set the season multipliers

In [None]:
# Directories: the per-season input folder and the summary file both live
# directly under the merged-data output folder.
output_dir = "/content/drive/MyDrive/masters thesis ali alhaj hassan/data_scraping_and_preperation/merged_data"
data_dir = f"{output_dir}/data_per_season_per_position_merged"
summary_file = f"{output_dir}/player_summary.csv"

# Per-position yearly rate, used as r in (1 + r) ** multiplier
interest_rates = dict(GK=0.052, DF=0.125, MF=0.085, FW=0.082)

# Exponent applied to (1 + r) for each season; 2023_2024 is left unscaled (exponent 0)
season_multipliers = {
    '2020_2021': 3,
    '2021_2022': 2,
    '2022_2023': 1,
    '2023_2024': 0,
}

## Load the files and apply the inflation rule

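The inflation-adjusted value is calculated using the formula:

$$
\text{current\_value} = \text{old\_value} \times (1 + r)^{\text{multiplier}}
$$

where $r$ is the rate set for the player's position and the multiplier is the exponent set for the season of the data (0 for 2023/2024, up to 3 for 2020/2021).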


In [None]:



# One (initially empty) list of processed dataframes per position code
position_dfs = {code: [] for code in ("GK", "DF", "MF", "FW")}

# Accumulates the per-file player summary frames
player_summary = []

def smart_round(value):
    """Round *value* with a precision that depends on its magnitude.

    Values of at least 100 million are rounded to the nearest 10 million,
    at least 10 million to the nearest million, at least 1 million to the
    nearest 100 thousand, and everything else to the nearest 10 thousand.
    """
    # (inclusive lower bound, ndigits handed to round), largest tier first
    tiers = (
        (100_000_000, -7),
        (10_000_000, -6),
        (1_000_000, -5),
    )
    for lower_bound, ndigits in tiers:
        if value >= lower_bound:
            return round(value, ndigits)
    # Anything below one million falls through to the coarsest default
    return round(value, -4)




def convert_numeric_columns(df):
    """Convert the purely numeric columns of *df* to numeric dtypes, in place.

    A column is left untouched when:
      * any of its values, rendered as text, contains a letter (this also
        covers missing values, whose text form is "nan"), or
      * numeric conversion would fail for at least one value (e.g. "2020-21"),
        because coercing would silently replace real data with NaN.

    Otherwise the column is converted to a numeric dtype. It is cast to
    integers only when every value is a whole number, so fractional columns
    keep their decimals instead of being truncated.

    Parameters:
        df: the DataFrame to convert; it is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    for col in df.columns:
        column = df[col]

        # Check the content of the column, not its name
        as_text = column.astype(str)
        if as_text.map(lambda text: any(ch.isalpha() for ch in text)).any():
            continue  # Keep the column as is if it contains letters

        converted = pd.to_numeric(column, errors='coerce')
        if converted.isna().any():
            # At least one value is not a number: keep the original values
            # rather than overwrite them with NaN.
            continue

        # Only whole-number columns become integers; casting a fractional
        # column would throw the decimals away.
        if (converted % 1 == 0).all():
            converted = converted.astype(int)

        df[col] = converted

    return df

import pandas as pd
import numpy as np

import pandas as pd
import numpy as np

def _weighted_mean_by(df, key):
    """Return, per value of *key*, the mean MarketValue weighted by Matches Played."""
    def weighted_mean(group):
        weights = group['Matches Played']
        # np.average cannot normalise weights that sum to zero
        if weights.sum() > 0:
            return np.average(group['MarketValue'], weights=weights)
        return np.nan

    # Select only the columns the aggregation reads so that apply() does not
    # operate on the grouping column (deprecated pandas behaviour).
    return df.groupby(key)[['MarketValue', 'Matches Played']].apply(weighted_mean)


def _quantile_scores(values):
    """Turn group means into discrete 1-5 scores using the 10/30/60/80 % quantiles."""
    values = values.dropna()  # Groups without a usable mean get no score here
    if len(values) <= 1:
        # Quantiles are meaningless with fewer than two groups: use the middle score
        return pd.Series(3, index=values.index)

    cutoffs = values.quantile([0.1, 0.3, 0.6, 0.8]).to_numpy()
    # Number of cutoffs that are <= the value, plus one:
    # below the 10 % cutoff -> 1, ..., at or above the 80 % cutoff -> 5
    scores = np.searchsorted(cutoffs, values.to_numpy(), side='right') + 1
    return pd.Series(scores, index=values.index)


def compute_influence_scores(df):
    """Add 'team_score', 'league_score' and 'country_score' columns to *df*.

    For each of 'Squad', 'Source_League' and 'Nation', the mean 'MarketValue'
    weighted by 'Matches Played' is computed per group and mapped to a score
    from 1 to 5 according to where the group mean falls relative to the
    10 %, 30 %, 60 % and 80 % quantiles of all group means. Rows whose group
    has no score (e.g. no matches played in the whole group) get 3.

    Parameters:
        df: DataFrame to score; it is modified in place.

    Returns:
        The same DataFrame. It is returned unchanged when any of the columns
        'Squad', 'Source_League', 'Nation', 'Matches Played' or 'MarketValue'
        is missing.
    """
    needed = {'Squad', 'Source_League', 'Nation', 'Matches Played', 'MarketValue'}
    if not needed.issubset(df.columns):
        return df

    score_sources = (
        ('team_score', 'Squad'),
        ('league_score', 'Source_League'),
        ('country_score', 'Nation'),
    )
    for score_col, group_col in score_sources:
        scores = _quantile_scores(_weighted_mean_by(df, group_col))
        # Map scores back to the rows and default unscored groups to 3
        df[score_col] = df[group_col].map(scores).fillna(3).astype(int)

    return df





# Columns written to the player summary. A list (rather than a set) keeps the
# column order of the summary CSV identical from one run to the next.
summary_columns = ['Name', 'Best position', 'MarketValue', 'Best overall', 'Growth', 'Age',
                   'Matches Played', 'Sprint speed', 'foot', 'Nation',
                   'International reputation', 'Source_League', 'Season', 'team_score',
                   'Squad', 'league_score', 'country_score']
required_columns = set(summary_columns)

# Loop through each position folder
for position in position_dfs:
    position_path = os.path.join(data_dir, position)
    # glob returns files in arbitrary order; sort so the output row order is reproducible
    all_files = sorted(glob.glob(os.path.join(position_path, "*.csv")))

    for file in all_files:
        df = pd.read_csv(file)

        # Rename Age_x to Age if it exists
        if 'Age_x' in df.columns:
            df = df.rename(columns={'Age_x': 'Age'})

        # Extract season from filename: the last two "_"-separated parts of the stem
        filename = os.path.basename(file)
        season_parts = filename.replace(".csv", "").split("_")

        # Ensure season format is correct
        if len(season_parts) < 3:
            print(f"Skipping file {filename}: Unable to extract season correctly.")
            continue  # Skip this file if season extraction fails

        season = f"{season_parts[-2]}_{season_parts[-1]}"
        season_year = season_parts[-1]  # Second part of the season

        # Check if extracted season exists in the season_multipliers dictionary
        if season not in season_multipliers:
            print(f"Skipping file {filename}: Season '{season}' not found in season_multipliers.")
            continue

        # Add season column to the dataframe
        df['Season'] = season_year

        # Drop unnecessary columns; errors='ignore' skips the ones that are absent
        df = df.drop(columns=['Player', 'Age_y', 'ID', 'Best Position'], errors='ignore')

        if 'MarketValue' in df.columns:
            # Convert to numeric and replace unparseable/missing values with 0.
            # Assigning the result back (instead of a chained inplace fillna)
            # is required for the fill to take effect under pandas copy-on-write.
            df['MarketValue'] = pd.to_numeric(df['MarketValue'], errors='coerce').fillna(0)

            # Apply market value adjustments: value * (1 + rate) ** multiplier
            multiplier = season_multipliers[season]
            if position in interest_rates:
                df['MarketValue'] *= (1 + interest_rates[position]) ** multiplier

            # Apply smart rounding function
            df['MarketValue'] = df['MarketValue'].apply(smart_round).astype(int)

            # The scores are derived from MarketValue, so they are only
            # computed when that column is available.
            df = compute_influence_scores(df)
        else:
            print(f"Warning: 'MarketValue' column not found in {filename}")

        # Store processed dataframe per position
        position_dfs[position].append(df)

        # Collect player summary data
        if required_columns.issubset(df.columns):
            player_summary.append(df[summary_columns])
        else:
            missing_cols = required_columns - set(df.columns)
            print(f"Skipping player summary for {filename}: Missing columns {missing_cols}")

# Save files per position
for position, dfs in position_dfs.items():
    if dfs:
        position_df = pd.concat(dfs, ignore_index=True)
        position_df = position_df.drop_duplicates()
        position_df = convert_numeric_columns(position_df)
        output_file = os.path.join(output_dir, f"{position}.csv")
        position_df.to_csv(output_file, index=False)
        print(f"✅ File saved: {output_file}")

# Process and save player summary
if player_summary:
    # Ensure column names are unique in all DataFrames
    player_summary = [part.loc[:, ~part.columns.duplicated()] for part in player_summary]

    # Concatenate and remove duplicates
    summary_df = pd.concat(player_summary, ignore_index=True).drop_duplicates()
    summary_df = convert_numeric_columns(summary_df)
    # Save summary file
    summary_df.to_csv(summary_file, index=False)
    print(f"✅ Player summary saved: {summary_file}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['MarketValue'].fillna(0, inplace=True)  # Replace NaN with 0 to avoid errors
  team_weighted_mean_value = df.groupby('Squad').apply(lambda x: np.average(x['MarketValue'], weights=x['Matches Played']) if x['Matches Played'].sum() > 0 else np.nan)
  league_weighted_mean_value = df.groupby('Source_League').apply(lambda x: np.average(x['MarketValue'], weights=x['Matches Played']) if x['Matches Played'].sum() > 0 else np.nan)
  country_weighted_mean_value = df.groupby('Nation').apply(lambda x: np.average(x['MarketValue'], weights=x['Matches Played']) if x['Matches Played'].sum() > 0 else np.nan)
The behavior wi

✅ File saved: /content/drive/MyDrive/masters thesis ali alhaj hassan/data_scraping_and_preperation/merged_data/GK.csv
✅ File saved: /content/drive/MyDrive/masters thesis ali alhaj hassan/data_scraping_and_preperation/merged_data/DF.csv
✅ File saved: /content/drive/MyDrive/masters thesis ali alhaj hassan/data_scraping_and_preperation/merged_data/MF.csv
✅ File saved: /content/drive/MyDrive/masters thesis ali alhaj hassan/data_scraping_and_preperation/merged_data/FW.csv
✅ Player summary saved: /content/drive/MyDrive/masters thesis ali alhaj hassan/data_scraping_and_preperation/merged_data/player_summary.csv


In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
def compute_influence_scores(df):
    """Add min-max scaled 'team_score', 'league_score' and 'country_score' columns.

    Each score starts as the plain mean 'market_value' of the row's 'team',
    'league' or 'country' group, and the three score columns are then
    rescaled with a MinMaxScaler whose feature range is (0, 5).

    The DataFrame is modified in place and also returned.
    """
    # score column -> column whose groups provide the average market value
    score_sources = {
        'team_score': 'team',
        'league_score': 'league',
        'country_score': 'country',
    }

    for score_col, group_col in score_sources.items():
        group_means = df.groupby(group_col)['market_value'].mean()
        df[score_col] = df[group_col].map(group_means)

    # Normalize the raw averages to be between 0 and 5
    score_cols = list(score_sources)
    scaler = MinMaxScaler(feature_range=(0, 5))
    df[score_cols] = scaler.fit_transform(df[score_cols])

    return df