In [None]:
%pip install pandas
%pip install numpy
%pip install scikit-learn
%pip install category_encoders

import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_distances , cosine_similarity


# Notebook-wide pandas display settings: `None` removes the limit, so cell text,
# rows and columns are never truncated when a frame is rendered.
for display_option in ('display.max_colwidth', 'display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)



In [None]:
# Load the full player table; the relative path is resolved against the current working directory.
players = pd.read_csv("player-data-full.csv")

In [None]:
# Preview the first rows of the freshly loaded table.
players.head()

# Removing Unwanted features

###### These features will later be used as filters, so we store them in a separate dataset.
###### For example, the user will be able to set a maximum budget, age or wage when searching for the perfect fit


In [None]:
# Columns that are taken out of `players`. A copy of them, keyed by
# 'player_id', is kept in `filters` before they are removed.
columns_to_remove = [
    'image', 'description', 'real_face', 'club_logo', 'country_flag',
    'version', 'full_name',
    'club_id', 'club_league_id', 'country_id', 'country_league_id',
    'club_kit_number', 'country_kit_number',
    'club_league_name', 'club_rating', 'country_position',
    'club_contract_valid_until', 'club_name',
    'potential', 'value', 'wage', 'overall_rating',
    'international_reputation', 'release_clause',
    'body_type', 'specialities', 'club_position',
    'club_joined', 'country_name', 'country_league_name', 'country_rating',
]

# 'player_id' goes first so every row in `filters` can be matched back to `players`.
filter_columns = ['player_id', *columns_to_remove]

# Independent copy: later edits to `filters` must not touch `players`.
filters = players.loc[:, filter_columns].copy()

# `players` keeps 'player_id' but loses every column listed above.
# Re-running this cell raises KeyError because the columns are already gone.
players = players.drop(columns=columns_to_remove)


In [None]:
# Columns that are not needed as search filters and are therefore removed
# from `filters`. This rebinds `columns_to_remove` to a new list.
columns_to_remove = [
    'country_rating', 'country_league_name', 'club_joined', 'club_position',
    'specialities', 'body_type',
    'image', 'description', 'real_face', 'club_logo', 'country_flag',
    'version', 'full_name',
    'club_id', "country_id", 'club_league_id', 'country_league_id',
    'club_kit_number', 'country_kit_number',
    'club_rating', 'country_position',
    'international_reputation',
]

# errors='ignore' skips names that are not present, so the cell can be re-run safely.
filters = filters.drop(labels=columns_to_remove, axis='columns', errors='ignore')



In [None]:
def convert_to_numeric(value):
    """Convert a formatted money string such as '€1.5M' or '€250K' to a float.

    Parameters
    ----------
    value : Any
        A string that may contain a euro sign, thousands separators (commas),
        surrounding whitespace and a trailing 'K' (thousands) or 'M' (millions)
        suffix in either case. Non-string values are passed through.

    Returns
    -------
    float or the original object
        The numeric amount for strings; NaN for blank strings; the input
        unchanged when it is not a string (e.g. an existing number or NaN).

    Raises
    ------
    ValueError
        If a non-blank string cannot be parsed as a number.
    """
    if not isinstance(value, str):
        return value  # Already numeric (or missing): nothing to convert

    # Normalise: lower-case, strip the currency sign, separators and whitespace
    cleaned = value.lower().replace('€', '').replace(',', '').strip()
    if not cleaned:
        return float('nan')  # Blank cell -> missing value instead of a float('') crash

    # Only a *trailing* unit letter is treated as a multiplier
    multipliers = {'k': 1000, 'm': 1000000}
    suffix = cleaned[-1]
    if suffix in multipliers:
        return float(cleaned[:-1].strip()) * multipliers[suffix]

    return float(cleaned)  # Plain number without a unit suffix

# Columns of `filters` that hold formatted money strings; each one is
# converted element by element with convert_to_numeric.
currency_columns = ('value', 'wage', 'release_clause')
for column in currency_columns:
    converted = filters[column].apply(convert_to_numeric)
    filters[column] = converted


In [None]:
# Preview `filters` after the currency columns were converted to numbers.
filters.head()

# Feature Engineering

## 1. Creating the Age Feature

To make the dataset more suitable for analysis and modeling, we replaced the `dob` (date of birth) column with a new column called `age`.

While `dob` contains important information, it is stored in a **date format** that is not directly useful for most machine learning algorithms. Dates must typically be transformed into a numerical representation, and without context (like the current year), they are hard for models to interpret effectively.

By calculating the **age of each player** (measured against the date on which the notebook is run), we converted this temporal data into a **single, meaningful numeric value**. This simplifies the feature and allows the model to learn patterns based on age — a highly relevant factor in player performance and development potential.


In [None]:
from datetime import datetime
import pandas as pd

# Parse 'dob' into datetimes; values that cannot be parsed become NaT (errors='coerce')
players['dob'] = pd.to_datetime(players['dob'], errors='coerce')

# Reference date for the age calculation.
# NOTE(review): this is the wall-clock date at run time, so ages change whenever the
# notebook is re-run on a later day — confirm whether a fixed snapshot date is wanted.
today = datetime.today()


def _completed_years(born, on_date):
    """Year difference between the two dates, minus one if the birthday has not yet occurred in on_date's year."""
    birthday_pending = (on_date.month, on_date.day) < (born.month, born.day)
    return on_date.year - born.year - birthday_pending


players['age'] = players['dob'].apply(lambda born: _completed_years(born, today))

# 'dob' is no longer needed once 'age' exists
players = players.drop(columns='dob')

# Show the new 'age' column next to the player name
print(players[['name', 'age']].head())


In [None]:
# The join key must be present in both frames before 'age' can be transferred
assert all('player_id' in frame.columns for frame in (players, filters)), "player_id must be in both DataFrames"

# Left join keeps every row of `filters` and attaches the matching age
age_by_player = players[['player_id', 'age']]
filters = pd.merge(filters, age_by_player, on='player_id', how='left')

# 'age' now lives in `filters` only
players = players.drop(columns=['age'])

# Report the resulting column layout of both frames
print("✅ 'age' successfully moved to filters.")
print("filters columns:", filters.columns)
print("players columns:", players.columns)


In [None]:
# Preview `filters`, which now includes the merged 'age' column.
filters.head()

## 2. One-Hot Encoding the Positions Feature

The `positions` column in our dataset often includes **multiple positions per player**, such as `"CM, CDM"` or `"RW, ST"`. These entries represent the different roles a player is capable of performing on the field.

To make this information usable for machine learning models, we applied **one-hot encoding** to this feature. This technique transforms each unique position into a binary feature (1 if the player plays in that position, 0 otherwise).

By doing so, we created a set of new columns — one for each individual position — enabling the model to understand positional versatility. This is especially important in the context of **player flexibility**, which can be a critical indicator of value or role suitability in tactical systems.

This transformation ensures that players who can operate in multiple roles contribute more richly to the model, without losing any semantic detail.


In [None]:
# Import necessary library
import pandas as pd

# Define all possible positions in FIFA
fifa_positions = ['GK', 'CB', 'LB', 'RB', 'LWB', 'RWB',
                  'CDM', 'CM', 'CAM', 'LM', 'RM',
                  'LW', 'RW', 'ST', 'CF']

# Split every entry (e.g. "CM, CDM") into a set of exact position codes.
# Matching whole tokens matters: a plain substring test would flag 'LW' for
# a player listed as "LWB" (and 'RW' for "RWB").
# Missing / non-string entries become an empty set, i.e. all flags are 0.
position_sets = players['positions'].apply(
    lambda entry: {code.strip() for code in entry.split(',')} if isinstance(entry, str) else set()
)

# One binary column per position: 1 if the player is listed for it, else 0
for pos in fifa_positions:
    players[pos] = [int(pos in codes) for codes in position_sets]

# Drop the original 'positions' column as it's now redundant
players = players.drop(columns=['positions'])

# Display the first few rows
print(players.head())


## 3. Creating Four Main Playstyles

To manage the high cardinality of the `play_styles` column, we took a dimensionality reduction approach rather than applying one-hot encoding.

Originally, this column contained **around 50 unique playstyles**, each describing a specific attribute or behavior of a player. One-hot encoding these values would have dramatically increased the number of features in our dataset, adding significant noise and computational complexity, especially given that many of these playstyles have only marginal influence on model performance.

Instead, we consolidated them into **four broader, technically meaningful categories**, each representing a set of similar playstyles. This approach preserved important semantic information while drastically reducing dimensionality, resulting in **only four new binary features**. Each new column corresponds to one of the grouped playstyle categories and indicates whether a player fits within that group.

This method strikes a balance between data richness and model simplicity, contributing to more efficient learning and better generalization.


In [None]:
# Play-style groups: category name -> play styles that count towards it.
# A play style may appear in more than one group (e.g. "Aerial").
categories = {
    "Playmakers_Technicians": [
        "Incisive Pass", "Tiki Taka", "Press Proven", "Pinged Pass", "First Touch",
        "Flair", "Trivela", "Technical", "Solid Player",
    ],
    "Clinical_Finishers_GoalPoachers": [
        "Finesse Shot", "Power Shot", "Poacher", "Chip Shot", "Trivela",
        "Aerial", "Acrobatic", "Quick Step",
    ],
    "Defensive_Walls_Destroyers": [
        "Bruiser", "Intercept", "Slide Tackle", "Block", "Press Proven",
        "Relentless", "Aerial",
    ],
    "Physical_Speed_Based_Players": [
        "Rapid", "Quick Step", "Speedster", "Power Header", "Aerial", "Relentless",
    ],
}


def assign_binary_columns(play_styles):
    """Map a comma-separated play-style string to one 0/1 flag per category.

    Parameters
    ----------
    play_styles : str or missing value
        Comma-separated play styles; surrounding whitespace of each entry is ignored.

    Returns
    -------
    dict
        ``{category: 0 or 1}`` for every key of ``categories``; all zeros when
        ``play_styles`` is missing (``pd.isna``).
    """
    if pd.isna(play_styles):
        return dict.fromkeys(categories, 0)

    # Exact, whitespace-trimmed style names of this player
    player_styles = {token.strip() for token in play_styles.split(",")}

    flags = {}
    for category, keywords in categories.items():
        # 1 as soon as the player has at least one style of the group
        flags[category] = 0 if player_styles.isdisjoint(keywords) else 1
    return flags

# Step 1: one {category: flag} dict per player, computed from 'play_styles'
style_flag_dicts = players["play_styles"].apply(assign_binary_columns)

# Step 2: expand the dicts into one column per category
binary_columns = style_flag_dicts.apply(pd.Series)

# Step 3: append the flag columns and drop the raw 'play_styles' text
players = pd.concat([players, binary_columns], axis=1).drop(columns=['play_styles'])




## 4. One-Hot Encoding Work Rate and Preferred Foot Feature

To prepare the dataset for machine learning algorithms, we numerically encoded two categorical features: `preferred_foot` (binary encoding) and `work_rate` (ordinal encoding).

- The `preferred_foot` column, which originally had values `"Right"` or `"Left"`, was converted to binary format: Right → 1 and Left → 0.

- The `work_rate` column included combined values such as `"High/Medium"` to represent attacking and defensive work rates. We split this feature into two separate columns: `Attacking Work Rate` and `Defensive Work Rate`.

- Each new column was then numerically encoded using the following mapping: `Low = 0`, `Medium = 1`, and `High = 2`.

Finally, we dropped the original `work_rate` column after encoding, as its information was now captured in a machine-readable format.


In [None]:
# Binary-encode 'preferred_foot'; values outside the mapping become NaN
foot_codes = {'Right': 1, 'Left': 0}
players['preferred_foot'] = players['preferred_foot'].map(foot_codes)

# 'work_rate' looks like "High/Medium": the text before the slash goes to the
# attacking column, the text after it to the defensive column
work_rate_columns = ['Attacking Work Rate', 'Defensive Work Rate']
players[work_rate_columns] = players['work_rate'].str.split('/', expand=True)

# Ordinal codes for the work-rate levels
work_rate_mapping = {'Low': 0, 'Medium': 1, 'High': 2}
for rate_column in work_rate_columns:
    players[rate_column] = players[rate_column].map(work_rate_mapping)

# The combined text column is no longer needed
players = players.drop(columns=['work_rate'])

# Check result
print(players[['preferred_foot', *work_rate_columns]].head())

## 5. Removing Goalkeeper Features and PlayStyles Column

The dataset contains several columns related to **goalkeeper (GK) attributes**, such as `gk_reflexes` and `gk_diving`, which are not relevant to outfield players like attackers and midfielders. Including these features would introduce noise and reduce model accuracy for player roles unrelated to goalkeeping.

To address this, we removed all columns that contain `'gk'` in their names. This ensures the model is trained only on attributes meaningful to the playing positions under study.

Additionally, we dropped the `play_styles` column, which was already processed and represented in the dataset through four newly engineered features (as discussed earlier). Keeping it would have introduced redundancy.

This cleanup step streamlines the dataset and ensures that only relevant and non-redundant features are passed to the machine learning models.


In [None]:
# Every column whose name contains 'gk' (case-insensitive).
# NOTE(review): this test also matches the one-hot 'GK' position column created
# by the position encoding cell, so that flag is removed too — confirm this is intended.
gk_columns = [col for col in players.columns if 'gk' in col.lower()]

# 'play_styles' is listed as well; errors='ignore' makes this a no-op if it is already gone
players = players.drop(columns=gk_columns + ['play_styles'], errors='ignore')

# Display remaining columns to verify
print(players.columns)


# Modelling


## 1. Position-Based Splitting of the Dataset

To improve the accuracy and relevance of our player similarity model, we decided to split the main dataset into **smaller subsets based on player positions** (e.g., attackers, midfielders, defenders, goalkeepers).

This approach allows us to train and analyze each group separately, as **different positions require different skill sets and performance indicators**. For example:
- A forward’s similarity should be based on shooting, positioning, and pace.
- A midfielder would be evaluated using passing, vision, and stamina.
- A defender’s value comes from tackling, marking, and strength.

By splitting the dataset, we can **customize the feature selection and similarity logic** for each role, which leads to more meaningful and position-specific comparisons between players.


In [None]:

def _plays_any(*positions):
    """Boolean mask over `players`: True where at least one of the given one-hot position columns equals 1."""
    return players[list(positions)].eq(1).any(axis=1)


# Position-specific subsets. A player listed for several positions appears in
# several subsets (e.g. 'CF' counts for both attacking_mids and strikers).
center_backs = players[_plays_any('CB')]
full_backs = players[_plays_any('LB', 'RB', 'LWB', 'RWB')]
defensive_mids = players[_plays_any('CDM')]
midfielders = players[_plays_any('CM')]
# Previously written as `players['CAM'] == 1| (players['CF'] == 1)`: `|` binds tighter
# than `==`, so the CF condition was swallowed and CF players were never selected.
attacking_mids = players[_plays_any('CAM', 'CF')]
wingers = players[_plays_any('LW', 'RW')]
strikers = players[_plays_any('ST', 'CF')]

In [None]:
def _artifact_paths(stem):
    """Build the dataset / feature-list pickle paths that share the file stem `stem`."""
    return {
        'dataset_path': f'{stem}.pkl',
        'features_path': f'{stem}_features.pkl',
    }


# Position code -> file stem of the position-specific artifacts.
# NOTE(review): 'CM' points at the defensive_mids files, same as 'CDM' — confirm this
# is intended rather than a dedicated midfielders dataset.
_POSITION_FILE_STEMS = {
    'CAM': 'attack_mid',
    'LW': 'wingers',
    'RW': 'wingers',
    'ST': 'strikers',
    'CDM': 'defensive_mids',
    'CM': 'defensive_mids',
    'CB': 'center_backs',
    'LB': 'full_backs',
    'RB': 'full_backs',
    # Add more as needed
}

# Position code -> {'dataset_path': ..., 'features_path': ...}
position_data = {
    position: _artifact_paths(stem)
    for position, stem in _POSITION_FILE_STEMS.items()
}


In [None]:
# Export the defensive-midfielder subset to CSV without the index column.
# NOTE(review): the similarity search reads '.pkl' files, which no cell shown here
# writes — presumably they are produced elsewhere; confirm.
defensive_mids.to_csv('defensive_mids.csv', index=False)

In [None]:
def find_similar_players(input_name, top_n=10):
    """Print the ``top_n`` most similar players for every player whose name matches.

    NOTE(review): a later cell defines ``find_similar_players`` again with extra
    filter parameters; once that cell runs, it replaces this definition.

    Parameters
    ----------
    input_name : str
        Pattern searched in ``players['name']`` (case-insensitive). It is passed to
        ``str.contains``, which treats it as a regular expression by default.
    top_n : int, default 10
        Number of similar players printed per matching player.

    Returns
    -------
    None
        Results are printed. For each match the position-specific dataset and
        feature list are loaded from the paths in ``position_data``, the features
        are standardised, reduced with PCA (95 % explained variance) and ranked by
        cosine distance to the query player.
    """
    matches = players[players['name'].str.contains(input_name, case=False, na=False)]

    if matches.empty:
        print("❌ No matching player found.")
        return

    # One-hot position columns that have a dataset configured (loop-invariant)
    position_cols = [col for col in players.columns if col in position_data]

    for _, row in matches.iterrows():
        player_name = row['name']
        player_id = row['player_id']

        # First configured position (in column order) the player is flagged for
        player_position = next((pos for pos in position_cols if row[pos] == 1), None)
        if not player_position:
            print(f"⚠️ No known position found for {player_name}")
            continue

        print(f"\n🔍 Similar players to: {player_name} ({player_position})")

        # Load dataset and features.
        # SECURITY: unpickling executes arbitrary code — only load files you created yourself.
        paths = position_data[player_position]
        dataset = pd.read_pickle(paths['dataset_path'])
        with open(paths['features_path'], "rb") as f:
            features = list(pickle.load(f))

        if player_id not in dataset['player_id'].values:
            print("❌ Player not found in position-specific dataset.")
            continue

        # Rows with missing values are dropped; the index is reset so that row
        # labels coincide with the row positions of the PCA matrix below.
        df = dataset[['name', 'player_id'] + features].dropna().reset_index(drop=True)

        query_rows = np.flatnonzero((df['player_id'] == player_id).to_numpy())
        if query_rows.size == 0:
            # The player existed in the dataset but was removed by dropna()
            print("❌ Player has missing feature values and cannot be compared.")
            continue
        query_pos = int(query_rows[0])

        if len(df) < 2:
            print("⚠️ No other players available for comparison.")
            continue

        # PCA pipeline
        X_scaled = StandardScaler().fit_transform(df[features])
        X_pca = PCA(n_components=0.95).fit_transform(X_scaled)

        # Distances from the query player to everyone (a single row, not the full N x N matrix)
        distances = cosine_distances(X_pca[query_pos:query_pos + 1], X_pca)[0]

        # Closest first; the query player is excluded explicitly rather than
        # assuming it is the first entry after sorting
        ranked = [cand for cand in np.argsort(distances, kind='stable') if cand != query_pos][:top_n]

        for rank, cand in enumerate(ranked, 1):
            print(f"{rank}. {df['name'].iloc[cand]} — Similarity Score: {1 - distances[cand]:.4f}")


# Similarity Search with Filters

In [None]:
import pandas as pd
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_distances

def find_similar_players(input_name, top_n=10, max_wage=None, max_age=None, max_value=None,
                          max_release_clause=None, club_name=None, club_league_name=None,
                          country_name=None, min_overall_rating=None):
    """Print the most similar players for each name match, restricted by optional filters.

    Parameters
    ----------
    input_name : str
        Pattern searched in ``players['name']`` (case-insensitive). It is passed to
        ``str.contains``, which treats it as a regular expression by default.
    top_n : int, default 10
        Number of similar players printed per matching player.
    max_wage, max_value, max_release_clause : number, optional
        Upper bounds; a candidate is rejected when its value is greater than the bound.
    max_age : number, optional
        Upper bound on age; converted with ``int()`` before comparing.
    club_name, club_league_name, country_name : str, optional
        Exact-match filters; a candidate is rejected when its value differs.
    min_overall_rating : number, optional
        Lower bound; a candidate is rejected when its rating is smaller.

    Filter values are read from the ``filters`` table (first row per ``player_id``).
    A candidate with no row in ``filters`` passes every filter. A missing (NaN)
    numeric value passes the bound filters; a missing text value fails the
    exact-match filters.

    Returns
    -------
    None
        Results are printed.
    """
    matches = players[players['name'].str.contains(input_name, case=False, na=False)]

    if matches.empty:
        print("❌ No matching player found.")
        return

    # One-hot position columns that have a dataset configured (loop-invariant)
    position_cols = [col for col in players.columns if col in position_data]

    # One filter row per player, indexed by player_id, so filter values can be
    # aligned with all candidates at once instead of scanning `filters` per candidate
    filter_lookup = filters.drop_duplicates(subset='player_id').set_index('player_id')

    upper_bounds = (('wage', max_wage), ('value', max_value), ('release_clause', max_release_clause))
    exact_values = (('club_name', club_name), ('club_league_name', club_league_name),
                    ('country_name', country_name))

    for _, row in matches.iterrows():
        player_name = row['name']
        player_id = row['player_id']

        # First configured position (in column order) the player is flagged for
        player_position = next((pos for pos in position_cols if row[pos] == 1), None)
        if not player_position:
            print(f"⚠️ No known position found for {player_name}")
            continue

        print(f"\n🔍 Similar players to: {player_name} ({player_position})")

        # Load dataset and features.
        # SECURITY: unpickling executes arbitrary code — only load files you created yourself.
        paths = position_data[player_position]
        dataset = pd.read_pickle(paths['dataset_path'])
        with open(paths['features_path'], "rb") as f:
            features = list(pickle.load(f))

        if player_id not in dataset['player_id'].values:
            print("❌ Player not found in position-specific dataset.")
            continue

        # Rows with missing values are dropped; the index is reset so that row
        # labels coincide with the row positions of the PCA matrix below.
        df = dataset[['name', 'player_id'] + features].dropna().reset_index(drop=True)

        query_rows = np.flatnonzero((df['player_id'] == player_id).to_numpy())
        if query_rows.size == 0:
            # The player existed in the dataset but was removed by dropna()
            print("❌ Player has missing feature values and cannot be compared.")
            continue
        query_pos = int(query_rows[0])

        if len(df) < 2:
            print("⚠️ No players meet the filter criteria.")
            continue

        # PCA pipeline
        X_scaled = StandardScaler().fit_transform(df[features])
        X_pca = PCA(n_components=0.95).fit_transform(X_scaled)

        # Similarity of every row to the query player, computed in one call
        scores = 1 - cosine_distances(X_pca[query_pos:query_pos + 1], X_pca)[0]

        # Filter values aligned row by row with `df`
        info = filter_lookup.reindex(df['player_id'].to_numpy())
        has_info = df['player_id'].isin(filter_lookup.index).to_numpy()

        rejected = np.zeros(len(df), dtype=bool)
        for column, bound in upper_bounds:
            if bound is not None:
                rejected |= (info[column] > bound).to_numpy()      # NaN compares False -> kept
        if max_age is not None:
            rejected |= (info['age'] > int(max_age)).to_numpy()    # NaN age no longer raises
        for column, wanted in exact_values:
            if wanted is not None:
                rejected |= (info[column] != wanted).to_numpy()
        if min_overall_rating is not None:
            rejected |= (info['overall_rating'] < min_overall_rating).to_numpy()

        # Never suggest the query player; candidates without filter data pass
        is_other = (df['player_id'] != player_id).to_numpy()
        keep = is_other & ~(rejected & has_info)

        # Sort by similarity (highest first); ties keep dataset order
        eligible = sorted(np.flatnonzero(keep), key=lambda cand: scores[cand], reverse=True)

        if eligible:
            print("\nTop eligible players:")
            for rank, cand in enumerate(eligible[:top_n], 1):
                print(f"{rank}. {df['name'].iloc[cand]} — Similarity Score: {scores[cand]:.4f}")
        else:
            print("⚠️ No players meet the filter criteria.")


In [None]:
# Show a preview only: 'display.max_rows' is set to None at the top of the notebook,
# so a bare `players` would render every row of the table.
players.head()

In [None]:
# Example query: every player whose name contains "declan" (case-insensitive), no filters applied.
find_similar_players("declan")

In [None]:
# Inspect the first 20 rows of the filter table used by the search above.
filters.head(20)