# Ask meaningful questions and answer using collected data

## Import necessary packages

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler

## Read raw data from csv file

In [7]:
df = pd.DataFrame()
df = pd.read_csv('players_transformed.csv')

# drop unecessary column
df = df.drop(df.columns[0], axis=1)

# test output
display(df.head())

# size of the data
print("Size of data: ", df.shape)

Unnamed: 0,name,age,nationality,club,height,weight,foot,total_matches,total_goals,total_assists,total_yellow,total_red,pass_completion_rate,dribble_success_rate,tackles,interception,market_value,titles,injuries,general_position
0,Ivan Balliu,32,Albania,Rayo Vallecano,172,63,Right,421,3,14,91,2,76.73,66.67,10,6,2.0,2,19,Defender
1,Marash Kumbulla,24,Albania,RCD Espanyol,191,78,Right,132,6,14,32,2,86.24,33.33,12,11,4.5,1,2,Defender
2,Abderrahman Rebbach,26,Algeria,Deportivo Alavés,176,75,Right,154,34,4,32,1,72.09,27.27,0,0,0.8,0,0,Forward
3,Farid El Melali,27,Algeria,Angers SCO,168,65,Right,157,19,8,11,0,82.0,54.29,10,3,1.5,1,12,Forward
4,Haris Belkebla,30,Algeria,Angers SCO,177,68,Right,323,9,2,57,3,87.0,25.0,9,5,1.5,0,6,Midfielder


Size of data:  (1057, 20)


## Question 1
> Can we identify young players (e.g., under 23) who have high efficiency and are undervalued in the market compared to their peers?

### Purpose
To identify promising young players who deliver exceptional performance relative to their market value, making them attractive targets for clubs seeking high-value talent on a budget.

### Relevant attributes
- `age`
- `market_value`
- `total_goals`
- `total_assists`
- `dribble_success_rate`

### Filter Players Under 23 and Define Efficiency Metrics

Efficiency is calculated by adding the products of the relevant statistics and their corresponding weight. 1 is chosen as the total weight for ease of comparision.

In [8]:
young_players = df[df['age'] < 23]

metrics = ['total_goals', 'total_assists', 'dribble_success_rate']

# normalize the data
for column_name in metrics:
        col_transformed, col_lambda  = stats.yeojohnson(young_players[column_name])
        young_players[column_name] = col_transformed

young_players['efficiency'] = (
    young_players['total_goals'] * 0.5 +
    young_players['total_assists'] * 0.3 +
    young_players['dribble_success_rate'] * 0.2
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  young_players[column_name] = col_transformed
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  young_players['efficiency'] = (


### Compare Market Value

Identify players who are "undervalued" by comparing their efficiency to their market value. A player is "undervalued" when they are priced lower than the average market value of peers who has similar efficiency value (within 5%)

In [9]:
undervalued_players = []

for index, player in young_players.iterrows():
    efficiency = player['efficiency']
    
    # Define the efficiency range (10%)
    lower_bound = efficiency * 0.95
    upper_bound = efficiency * 1.05
    
    # Find peers within this range
    peers = young_players[(young_players['efficiency'] >= lower_bound) & 
                          (young_players['efficiency'] <= upper_bound) &
                          (young_players.index != index)]  # Exclude the player themselves
    
    # Calculate the average market value of peers
    if not peers.empty:
        avg_peer_value = peers['market_value'].mean()
        
        # Check if the player is undervalued
        if player['market_value'] < avg_peer_value:
            undervalued_players.append({
                'name': player['name'],
                'age': player['age'],
                'efficiency': efficiency,
                'market_value': player['market_value'],
                'avg_peer_value': avg_peer_value
            })

# Convert the results into a DataFrame
undervalued_df = pd.DataFrame(undervalued_players)

# Rank Undervalued Players
undervalued_df['value_gap'] = undervalued_df['avg_peer_value'] - undervalued_df['market_value']
undervalued_df = undervalued_df.sort_values(by='value_gap', ascending=False)

display(undervalued_df)

Unnamed: 0,name,age,efficiency,market_value,avg_peer_value,value_gap
71,Othmane Maamma,19,2.714655,0.3,15.000000,14.700000
31,Omari Forson,20,9.966614,2.0,13.000000,11.000000
61,Louis Mouton,22,7.161689,0.6,11.160000,10.560000
87,Yllan Okou,21,5.812407,0.9,10.783333,9.883333
170,Cesar Tarrega,22,9.107556,4.0,13.850000,9.850000
...,...,...,...,...,...,...
209,Fabian Rieder,22,7.415049,8.0,8.161111,0.161111
38,Andy Diouf,21,6.636553,9.0,9.068333,0.068333
76,Sael Kumbedi,19,3.746791,5.0,5.039189,0.039189
131,Kassoum Ouattara,20,5.344896,8.0,8.026364,0.026364
