In [31]:
# import necessary packages
import requests
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import json

In [32]:
# initialize api key to .txt file
api_key_path = 'soccer_api.txt'
with open(api_key_path, 'r') as file:
    api_key = file.read().strip()

In [33]:
# setup api according to documentation
headers = {
    'x-rapidapi-host': 'v3.football.api-sports.io',
    'x-rapidapi-key' : api_key
}

# setup function to extract data from the api
def get_api_data(endpoint):
    url = f'https://v3.football.api-sports.io/{endpoint}'
    response = requests.get(url, headers=headers)
    return response.json()


In [34]:
# example of api call to retrieve leagues
leagues = get_api_data('/leagues')
with open ('leagues.json', 'w') as f:
    json.dump(leagues, f, indent=2)
    

EPL 2023-'24

In [35]:
# lets start with '23-'24 season in the EPL
# initialize league id and season
league_id = 39
season = 2023

# setup response protocal for teams
teams_response = get_api_data(f'/teams?league={league_id}&season={season}')
teams = teams_response['response']


for team in teams:
    print(f"{team['team']['name']} - ID: {team['team']['id']}")


Manchester United - ID: 33
Newcastle - ID: 34
Bournemouth - ID: 35
Fulham - ID: 36
Wolves - ID: 39
Liverpool - ID: 40
Arsenal - ID: 42
Burnley - ID: 44
Everton - ID: 45
Tottenham - ID: 47
West Ham - ID: 48
Chelsea - ID: 49
Manchester City - ID: 50
Brighton - ID: 51
Crystal Palace - ID: 52
Brentford - ID: 55
Sheffield Utd - ID: 62
Nottingham Forest - ID: 65
Aston Villa - ID: 66
Luton - ID: 1359


In [36]:
# now lets take a look at the players for a specific team
# arsenal
arsenal_id = 42
# run the api call on arsenal team_id
arsenal_response = get_api_data(f'/players?team={arsenal_id}&season=2023&page=1')
arsenal_players = arsenal_response['response']

# use a sample player to see all player metrics availble to us
sample_player = arsenal_players[0]
print('Player Keys: ')
print(sample_player['player'].keys())

print('Statistics Keys:')
print(sample_player['statistics'][0].keys())

print('Nested Metric Categories in statistics[0]:')
for key, value in sample_player['statistics'][0].items():
    if isinstance(value, dict):
        print(f"{key}: {list(value.keys())}")

Player Keys: 
dict_keys(['id', 'name', 'firstname', 'lastname', 'age', 'birth', 'nationality', 'height', 'weight', 'injured', 'photo'])
Statistics Keys:
dict_keys(['team', 'league', 'games', 'substitutes', 'shots', 'goals', 'passes', 'tackles', 'duels', 'dribbles', 'fouls', 'cards', 'penalty'])
Nested Metric Categories in statistics[0]:
team: ['id', 'name', 'logo']
league: ['id', 'name', 'country', 'logo', 'flag', 'season']
games: ['appearences', 'lineups', 'minutes', 'number', 'position', 'rating', 'captain']
substitutes: ['in', 'out', 'bench']
shots: ['total', 'on']
goals: ['total', 'conceded', 'assists', 'saves']
passes: ['total', 'key', 'accuracy']
tackles: ['total', 'blocks', 'interceptions']
duels: ['total', 'won']
dribbles: ['attempts', 'success', 'past']
fouls: ['drawn', 'committed']
cards: ['yellow', 'yellowred', 'red']
penalty: ['won', 'commited', 'scored', 'missed', 'saved']


In [37]:
# now that we have a good idea of the data structure, lets conduct simple clustering analysis
# lets create a dataframe of all players and their statistics

arsenal_flat = []
for player in arsenal_players:
    p = player['player']
    stats = player['statistics'][0]

    row = {
        'name': p['name'],
        'age': p['age'],
        'position': stats['games']['position'],
        'minutes': stats['games']['minutes'],
        'appearances': stats['games']['appearences'],
        'rating': float(stats['games']['rating']) if stats['games']['rating'] else None,

        'goals': stats['goals']['total'],
        'assists': stats['goals']['assists'],
        'shots': stats['shots']['total'],
        'shots_on_target': stats['shots']['on'],

        'passes': stats['passes']['total'],
        'key_passes': stats['passes']['key'],
        'pass_accuracy': int(stats['passes']['accuracy']) if stats['passes']['accuracy'] else None,

        'tackles': stats['tackles']['total'],
        'interceptions': stats['tackles']['interceptions'],
        'blocks': stats['tackles']['blocks'],

        'duels': stats['duels']['total'],
        'duels_won': stats['duels']['won'],

        'dribbles': stats['dribbles']['attempts'],
        'dribbles_success': stats['dribbles']['success'],

        'fouls_committed': stats['fouls']['committed'],
        'fouls_drawn': stats['fouls']['drawn'],

        'yellow_cards': stats['cards']['yellow'],
        'red_cards': stats['cards']['red']
    }
    
    arsenal_flat.append(row)
# convert to pandas df
df_arsenal = pd.DataFrame(arsenal_flat)
df_arsenal.head()

Unnamed: 0,name,age,position,minutes,appearances,rating,goals,assists,shots,shots_on_target,...,interceptions,blocks,duels,duels_won,dribbles,dribbles_success,fouls_committed,fouls_drawn,yellow_cards,red_cards
0,Cédric Soares,34,Defender,59.0,3.0,6.7,0.0,,,,...,,,2.0,1.0,,,,1.0,0.0,0.0
1,R. Holding,30,Defender,0.0,0.0,,0.0,,,,...,,,,,,,,,0.0,0.0
2,Mohamed Elneny,33,Midfielder,25.0,3.0,7.0,0.0,,2.0,1.0,...,1.0,,4.0,1.0,,,1.0,,0.0,0.0
3,R. Rúnarsson,30,Goalkeeper,0.0,0.0,,0.0,,,,...,,,,,,,,,0.0,0.0
4,Fábio Vieira,25,Midfielder,291.0,11.0,6.96,1.0,2.0,6.0,2.0,...,2.0,,34.0,16.0,8.0,2.0,8.0,10.0,0.0,1.0


In [38]:
# lets make a copy of the df and apply a playing time qualifier
# lets define our features for clustering
features = ['rating', 'goals', 'assists', 'shots', 'shots_on_target', 
     'passes', 'key_passes', 'pass_accuracy', 'tackles', 
     'interceptions', 'blocks', 'duels', 'duels_won', 'dribbles', 
     'dribbles_success', 'fouls_committed', 'fouls_drawn', 
     'yellow_cards', 'red_cards']

# lets clean up the data by standardizing and filling NA with 0
X = df_cluster[features].fillna(0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


ValueError: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by StandardScaler.

In [None]:
kmeans = KMeans(n_clusters=3, random_state=1234)
df_cluster['cluster'] = kmneans.fit_predict(X_scaled)


In [None]:
# visualize with PCA
from sklearn.decomposition import PCA  
import matplotlib.pyplot as plt
pca = PCA(n_components=2)   
components = pca.fit_transform(X_scaled)

plt.figure(figsize=(8, 6))
plt.scatter(components[:, 0], components[:, 1], c=df_cluster['cluster'], cmap='viridis', alpha=0.5) 
for i, name in enumerate(df_cluster['name']):
    plt.text(components[i, 0], components[i, 1], name, fontsize=9)
plt.title('Clustering of Arsenal Players with PCA')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True)
plt.show()

NameError: name 'X_scaled' is not defined