# Ask meaningful questions and answer using collected data

## Import necessary packages

In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler

## Read raw data from csv file

In [44]:
df = pd.DataFrame()
df = pd.read_csv('players_transformed.csv')

# drop unecessary column
df = df.drop(df.columns[0], axis=1)

# test output
display(df.head())

# size of the data
print("Size of data: ", df.shape)

Unnamed: 0,name,age,nationality,club,position,height,weight,foot,total_matches,total_goals,...,shot_accuracy,pass_completion_rate,cross_completion_rate,dribble_success_rate,tackles,interception,market_value,titles,injuries,general_position
0,Ardian Ismajli,28,Albania,Empoli,Defender,185,76,Right,227,5,...,0.0,82.93,0.0,0.0,8,16,5.0,0,3,Defender
1,Berat Djimsiti,31,Albania,Atalanta,Defender,190,83,Right,472,15,...,0.0,88.32,0.0,100.0,21,16,10.0,2,3,Defender
2,Elseid Hysaj,30,Albania,Lazio,Defender,182,75,Right,487,6,...,0.0,86.67,0.0,0.0,1,0,2.5,1,2,Defender
3,Ivan Balliu,32,Albania,Rayo Vallecano,Defender,172,63,Right,421,3,...,0.0,76.73,0.0,66.67,10,6,2.0,2,19,Defender
4,Kristjan Asllani,22,Albania,Inter Milan,Midfielder,175,63,Right,107,6,...,50.0,90.57,50.0,0.0,4,0,18.0,6,1,Midfielder


Size of data:  (2092, 24)


## Question 1
> Can we identify young players (e.g., under 23) who have high efficiency and are undervalued in the market compared to their peers?

### Purpose
To identify promising young players who deliver exceptional performance relative to their market value, making them attractive targets for clubs seeking high-value talent on a budget.

### Relevant attributes
- `age`
- `market_value`
- `total_goals`
- `total_assists`
- `shot_accuracy`
- `dribble_success_rate`

### Filter Players Under 23 and Define Efficiency Metrics

Efficiency is calculated by adding the products of the relevant statistics and their corresponding weight. 1 is chosen as the total weight for ease of comparision.

In [45]:
young_players = df[df['age'] < 23]

metrics = ['total_goals', 'total_assists', 'shot_accuracy', 'dribble_success_rate']
# normalize the data
for column_name in metrics:
        col_transformed, col_lambda  = stats.yeojohnson(young_players[column_name])
        young_players[column_name] = col_transformed

young_players['efficiency'] = (
    young_players['total_goals'] * 0.3 +
    young_players['total_assists'] * 0.2 +
    young_players['shot_accuracy'] * 0.4 +
    young_players['dribble_success_rate'] * 0.1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  young_players[column_name] = col_transformed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  young_players['efficiency'] = (


### Compare Market Value

Identify players who are "undervalued" by comparing their efficiency to their market value. A player is "undervalued" when they are priced lower than the average market value of peers who has similar efficiency value (within 5%)

In [46]:
undervalued_players = []

for index, player in young_players.iterrows():
    efficiency = player['efficiency']
    
    # Define the efficiency range (10%)
    lower_bound = efficiency * 0.95
    upper_bound = efficiency * 1.05
    
    # Find peers within this range
    peers = young_players[(young_players['efficiency'] >= lower_bound) & 
                          (young_players['efficiency'] <= upper_bound) &
                          (young_players.index != index)]  # Exclude the player themselves
    
    # Calculate the average market value of peers
    if not peers.empty:
        avg_peer_value = peers['market_value'].mean()
        
        # Check if the player is undervalued
        if player['market_value'] < avg_peer_value:
            undervalued_players.append({
                'name': player['name'],
                'age': player['age'],
                'efficiency': efficiency,
                'market_value': player['market_value'],
                'avg_peer_value': avg_peer_value
            })

# Convert the results into a DataFrame
undervalued_df = pd.DataFrame(undervalued_players)

# Rank Undervalued Players
undervalued_df['value_gap'] = undervalued_df['avg_peer_value'] - undervalued_df['market_value']
undervalued_df = undervalued_df.sort_values(by='value_gap', ascending=False)

display(undervalued_df)

Unnamed: 0,name,age,efficiency,market_value,avg_peer_value,value_gap
28,Mahamadou Nagida,19,9.461167,0.50,46.600000,46.100000
74,El Chadaille Bitshiabu,19,9.461167,12.00,45.162500,33.162500
22,Stanis Idumbo Muzambo,19,9.049879,0.80,33.922222,33.122222
210,Kamory Doumbia,21,9.033112,5.00,33.688889,28.688889
312,Alan Matturro,20,8.300677,4.00,31.269737,27.269737
...,...,...,...,...,...,...
254,Adam Aznou Ben Cheikh,18,4.779276,7.00,7.324528,0.324528
186,Francesco Camarda,16,5.192622,7.00,7.283750,0.283750
117,Saimon Nadelia Bouabre,18,5.192622,7.00,7.283750,0.283750
109,Ousmane Toure,19,0.814997,5.75,5.773077,0.023077
