# Ask meaningful questions and answer using collected data

## Import necessary packages

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler

In [21]:
# Drop the rows whose value in `column` is an IQR outlier
def remove_outliers_iqr(df, column):
    """Return the rows of `df` whose `column` value lies inside the IQR fences.

    The fences are [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], both ends inclusive,
    where Q1 and Q3 are the 25th and 75th percentiles of `df[column]`.
    Rows where `column` is NaN fail the range check and are dropped too.
    `df` itself is not modified.
    """
    values = df[column]
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    whisker = 1.5 * (third_quartile - first_quartile)
    inside_fences = values.between(first_quartile - whisker, third_quartile + whisker)
    return df[inside_fences]

## Read raw data from csv file

In [22]:
# Load the transformed player dataset.
df = pd.read_csv('players_transformed.csv')

# Drop the unnecessary first column (selected by position, not by name).
# NOTE(review): presumably the row index saved along with the CSV - confirm.
df = df.drop(columns=df.columns[0])

# preview the first rows
display(df.head())

# size of the data
print("Size of data: ", df.shape)

Unnamed: 0,name,age,nationality,club,position,height,weight,foot,total_matches,total_goals,...,shot_accuracy,pass_completion_rate,cross_completion_rate,dribble_success_rate,tackles,interception,market_value,titles,injuries,general_position
0,Ardian Ismajli,28,Albania,Empoli,Defender,185,76,Right,227,5,...,0.0,82.93,0.0,0.0,8,16,5.0,0,3,Defender
1,Berat Djimsiti,31,Albania,Atalanta,Defender,190,83,Right,472,15,...,0.0,88.32,0.0,100.0,21,16,10.0,2,3,Defender
2,Elseid Hysaj,30,Albania,Lazio,Defender,182,75,Right,487,6,...,0.0,86.67,0.0,0.0,1,0,2.5,1,2,Defender
3,Ivan Balliu,32,Albania,Rayo Vallecano,Defender,172,63,Right,421,3,...,0.0,76.73,0.0,66.67,10,6,2.0,2,19,Defender
4,Kristjan Asllani,22,Albania,Inter Milan,Midfielder,175,63,Right,107,6,...,50.0,90.57,50.0,0.0,4,0,18.0,6,1,Midfielder


Size of data:  (2092, 24)


## Question 1
> Can we identify young players (e.g., under 23) who have high efficiency and are undervalued in the market compared to their peers?

### Purpose
To identify promising young players who deliver exceptional performance relative to their market value, making them attractive targets for clubs seeking high-value talent on a budget.

### Relevant attributes
- `age`
- `market_value`
- `total_goals`
- `total_assists`
- `shot_accuracy`
- `dribble_success_rate`

### Filter Players Under 23 and Define Efficiency Metrics

Efficiency is calculated as a weighted sum: each relevant statistic is multiplied by its weight and the products are added together. The weights sum to 1 for ease of comparison.

In [23]:
# Weight of each statistic in the efficiency score (the weights sum to 1).
# Keeping metric names and weights together means they cannot drift apart.
EFFICIENCY_WEIGHTS = {
    'total_goals': 0.3,
    'total_assists': 0.2,
    'shot_accuracy': 0.4,
    'dribble_success_rate': 0.1,
}
metrics = list(EFFICIENCY_WEIGHTS)

young_players = df[df['age'] < 23]

# remove outliers, one metric at a time (each pass filters the rows kept by the previous one)
for column_name in metrics:
    young_players = remove_outliers_iqr(young_players, column_name)

# Work on an independent copy: the frame above comes from boolean-mask slicing,
# and assigning columns into it raises SettingWithCopyWarning.
young_players = young_players.copy()

# Yeo-Johnson power transform of each metric; the fitted lambda is not used afterwards.
# NOTE(review): scipy's yeojohnson does not standardize its output, so the
# transformed metrics are not on a common scale and the weights below are not
# directly comparable across metrics - confirm this is intended.
for column_name in metrics:
    transformed, _fitted_lambda = stats.yeojohnson(young_players[column_name])
    young_players[column_name] = transformed

# Weighted sum of the transformed metrics.
young_players['efficiency'] = sum(
    young_players[column_name] * weight
    for column_name, weight in EFFICIENCY_WEIGHTS.items()
)

### Compare Market Value

Identify players who are "undervalued" by comparing their efficiency to their market value. A player is "undervalued" when their market value is lower than the average market value of peers who have a similar efficiency value (within ±5%).

In [24]:
# Peers are the players whose efficiency lies within +/-5% of the player's own.
PEER_TOLERANCE = 0.05


def find_undervalued_players(players, tolerance=PEER_TOLERANCE):
    """Return one record per player priced below the average of their peers.

    A peer of a player is any *other* row of `players` whose 'efficiency' lies
    within `tolerance` (as a fraction, e.g. 0.05 for +/-5%) of the player's own
    efficiency, bounds inclusive. Players without any peer are skipped.

    Parameters
    ----------
    players : DataFrame with 'name', 'age', 'efficiency' and 'market_value' columns.
    tolerance : relative half-width of the efficiency window.

    Returns
    -------
    list of dict with keys 'name', 'age', 'efficiency', 'market_value'
    and 'avg_peer_value' (mean market value of the player's peers).
    """
    records = []
    for index, player in players.iterrows():
        efficiency = player['efficiency']

        # sorted() keeps the window well-formed even for a negative efficiency,
        # where multiplying by (1 - tolerance) gives the larger of the two values.
        lower_bound, upper_bound = sorted(
            (efficiency * (1 - tolerance), efficiency * (1 + tolerance))
        )

        # Everyone else inside the window; the player themselves is excluded.
        is_peer = (
            players['efficiency'].between(lower_bound, upper_bound)
            & (players.index != index)
        )
        peers = players[is_peer]
        if peers.empty:
            continue

        avg_peer_value = peers['market_value'].mean()

        # Undervalued: priced below the average market value of the peers.
        if player['market_value'] < avg_peer_value:
            records.append({
                'name': player['name'],
                'age': player['age'],
                'efficiency': efficiency,
                'market_value': player['market_value'],
                'avg_peer_value': avg_peer_value
            })
    return records


undervalued_players = find_undervalued_players(young_players)

# Explicit columns keep the frame usable when no player qualifies
# (an empty record list would otherwise give a frame with no columns).
undervalued_df = pd.DataFrame(
    undervalued_players,
    columns=['name', 'age', 'efficiency', 'market_value', 'avg_peer_value'],
)

# Rank undervalued players by how far below their peers they are priced
undervalued_df['value_gap'] = undervalued_df['avg_peer_value'] - undervalued_df['market_value']
undervalued_df = undervalued_df.sort_values(by='value_gap', ascending=False)

display(undervalued_df)

Unnamed: 0,name,age,efficiency,market_value,avg_peer_value,value_gap
180,German Valera Karabinaite,22,4.181303,2.0,25.388889,23.388889
114,Mikael Egill Ellertsson,22,12.056186,2.0,23.500000,21.500000
179,Gerard Fernandez Castellano,22,12.623599,3.0,23.736842,20.736842
110,Lasso Coulibaly,22,12.106713,1.5,21.869565,20.369565
82,Timothee Pembele,22,4.447339,2.5,22.777778,20.277778
...,...,...,...,...,...,...
208,Selvi Clua,19,8.246067,7.0,7.011290,0.011290
189,Israel Dominguez Velasco,21,8.246067,7.0,7.011290,0.011290
161,Umit Akdag,20,8.246067,7.0,7.011290,0.011290
176,Alberto Risco,19,8.246067,7.0,7.011290,0.011290
