In [3]:
import asyncio
import json
import numpy as np
import pandas as pd
import aiohttp
import nest_asyncio
import warnings
warnings.filterwarnings('ignore')
nest_asyncio.apply()

from understatapi import UnderstatClient

understat = UnderstatClient()

In [4]:
leagues = ['EPL','Bundesliga','Ligue_1','Serie_A','La_Liga']
years = np.arange(2014,2025)
#leagues = ['la_liga']
df_players = pd.DataFrame()
for league_w in leagues:
    for year in years:
        df_league_season = pd.DataFrame(understat.league(league=league_w).get_player_data(season=str(year)))
        df_league_season['season'] = year 
        df_players = pd.concat([df_players,df_league_season])


In [5]:
df_players_copy = df_players.copy()
int_cols =  ['id' ,'games', 'time', 'goals', 'assists','shots', 'key_passes', 'yellow_cards', 'red_cards',
        'npg']
float_cols = ['xG', 'xA','npxG', 'xGChain', 'xGBuildup']

df_players[float_cols] = df_players[float_cols].astype('float')
df_players[int_cols] = df_players[int_cols].astype('uint32')

df_players_time = df_players[df_players['time'] >= 1000]

P90_cols = ['goals', 'assists','shots', 'key_passes','npg','xG', 'xA','npxG', 'xGChain', 'xGBuildup']
percent = list(np.char.add('Percentile_', P90_cols))
p90_pre = list(np.char.add(P90_cols,'P90'))
for i in range(0,len(P90_cols)):
    df_players_time[p90_pre[i]] = (df_players_time[P90_cols[i]]/df_players_time['time']) * 90
    df_players_time[percent[i]] = df_players_time.groupby('season')[p90_pre[i]].rank(pct=True)*100

In [6]:
id_cols = ['id','player_name','team_title','season']
value_cols = set(df_players_time.columns) - set(id_cols)
df_players_unpivot = pd.melt(df_players_time,id_vars = id_cols, value_vars = value_cols,   var_name = 'Measure', 
       value_name = 'value')

In [42]:
playmaking_cols = ['Percentile_assists','Percentile_key_passes','Percentile_xA','Percentile_xGChain']
id_play = id_cols + playmaking_cols 
df_playmaking = df_players_time[id_play].copy()
df_playmaking2024 = df_playmaking[df_playmaking['season'] == 2020].copy()
df_playmaking2024['avg_assists'] = df_playmaking2024[playmaking_cols].sum(axis=1)/len(playmaking_cols)
df_playmaking2024['rank_play'] = df_playmaking2024['avg_assists'].rank(ascending = False)
df_playmaking2024.sort_values(by = ['rank_play']).to_csv('playmaking.csv')

In [45]:
goals_cols = ['Percentile_goals','Percentile_shots','Percentile_npxG',]
id_play = id_cols + goals_cols 
df_goals = df_players_time[id_play].copy()
df_goals2024 = df_goals[df_goals['season'] == 2024].copy()
df_goals2024['avg_assists'] = df_goals2024[goals_cols].sum(axis=1)/len(goals_cols)
df_goals2024['rank_play'] = df_goals2024['avg_assists'].rank(ascending = False)
df_goals2024.sort_values(by = ['rank_play']).to_csv('goals.csv')

In [77]:

conditions = [
    df_shots['h_a'] == 'h',
    df_shots['h_a'] == 'a']
choices = [df_shots['h_team'], df_shots['a_team']]
df_shots['team'] = np.select(conditions, choices)

oppconditions = [
    df_shots['h_a'] == 'h',
    df_shots['h_a'] == 'a']
oppchoices = [df_shots['a_team'], df_shots['h_team']]
df_shots['opp_team'] = np.select(oppconditions, oppchoices)
#df_shots.to_csv('df_shots.csv',index = False)

In [28]:
df_radar = pd.read_csv('radar_data.csv')
measure_df = pd.DataFrame(df_players_unpivot['Measure'].unique(),columns = ['Name'])
measure_radar_df = pd.merge(df_radar,measure_df,how="cross")


In [34]:
cols_to_drop = ['Exclude Grid Line','Line Radius','Line Radius Previous','Number of Records','Point Modified']
measure_radar_df.drop(columns = cols_to_drop,inplace= True)

In [35]:
df_players_unpivot.to_csv('players_data.csv',index = False,encoding = 'utf_8_sig')
measure_radar_df.to_csv('radar_final_data.csv',index = False,encoding = 'utf_8_sig')
#df_shots.to_csv('shots_data.csv',index = False,encoding = 'utf_8_sig')

In [15]:
percentiles = df_players_time[df_players_time['season'] == 2024][percent]
data = percentiles.copy()
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score
import matplotlib.pyplot as plt
"""
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=200, alpha=0.75)
plt.show()
"""
"""
inertia = []
for k in range(1, 10):
    kmeans = KMeans(n_clusters=k).fit(data)
    inertia.append(kmeans.inertia_)
plt.plot(range(1, 10), inertia, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()
"""
kmeans = KMeans(n_clusters=4).fit(data)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_
pd.DataFrame(centroids).T

Unnamed: 0,0,1,2,3
0,24.647686,76.697159,79.1105,39.093332
1,25.355578,82.42728,57.185101,49.593952
2,20.51001,78.729596,79.62596,41.480369
3,21.203612,84.403827,62.263748,48.998209
4,25.240099,75.995099,78.897178,39.086858
5,21.013202,77.791494,80.21934,41.162419
6,20.131655,86.853976,61.817597,48.865891
7,21.186566,77.432822,80.411911,41.081692
8,17.411142,85.366548,59.845404,53.636589
9,32.994632,74.416428,25.930484,67.818019
