In [1]:
import numpy
import pandas
import scipy.spatial
from sklearn import preprocessing
from sklearn.decomposition import PCA

## Load and process data

Data come from [Fifa 20 player database](https://www.kaggle.com/stefanoleone992/fifa-20-complete-player-dataset#players_20.csv). While it is quite difficult to gather a huge player base with features (mainly stats) to define players, Fifa player base can be a good starting point.

In [92]:
data = pandas.read_csv("data/players_20.csv")

In [93]:
def process_data(data):
    features = ["weight_kg", "height_cm", "pace", "shooting","passing","dribbling","defending","physic","gk_diving","gk_handling","gk_kicking","gk_reflexes","gk_speed","gk_positioning","attacking_crossing","attacking_finishing","attacking_heading_accuracy","attacking_short_passing","attacking_volleys","skill_dribbling","skill_curve","skill_fk_accuracy","skill_long_passing","skill_ball_control","movement_acceleration","movement_sprint_speed","movement_agility","movement_reactions","movement_balance","power_shot_power","power_jumping","power_stamina","power_strength","power_long_shots","mentality_aggression","mentality_interceptions","mentality_positioning","mentality_vision","mentality_penalties","mentality_composure","defending_marking","defending_standing_tackle","defending_sliding_tackle","goalkeeping_diving","goalkeeping_handling","goalkeeping_kicking","goalkeeping_positioning","goalkeeping_reflexes"]
    data[features] = data[features].div(data.overall, axis=0)
    imputer = preprocessing.Imputer(missing_values='NaN', strategy='mean', axis=0)
    data[features] = imputer.fit_transform(data[features])
    data[features] = data[features].apply(preprocessing.scale)
    pca = PCA(n_components=2)
    pca.fit(data[features])
    data["component_1"], data["component_2"] = pca.transform(data[features])[:,0], pca.transform(data[features])[:,1]
    print(pca.explained_variance_ratio_)
    return data.copy()[["short_name", "player_positions"] + features]

In [94]:
player_vectors = process_data(data)
player_vectors.head()
player_vectors.to_csv("data/processed_data.csv", index=False)

[0.38188257 0.17108246]


Applying PCA here is not that good. We could up the number of component, but only to reduce the overall data dimension (can't be used to 2D plot...)

## Vector & Similarity functions

In [95]:
def get_player_vector(data, name):
    vector_keys = ["weight_kg", "height_cm", "pace", "shooting","passing","dribbling","defending","physic","gk_diving","gk_handling","gk_kicking","gk_reflexes","gk_speed","gk_positioning","attacking_crossing","attacking_finishing","attacking_heading_accuracy","attacking_short_passing","attacking_volleys","skill_dribbling","skill_curve","skill_fk_accuracy","skill_long_passing","skill_ball_control","movement_acceleration","movement_sprint_speed","movement_agility","movement_reactions","movement_balance","power_shot_power","power_jumping","power_stamina","power_strength","power_long_shots","mentality_aggression","mentality_interceptions","mentality_positioning","mentality_vision","mentality_penalties","mentality_composure","defending_marking","defending_standing_tackle","defending_sliding_tackle","goalkeeping_diving","goalkeeping_handling","goalkeeping_kicking","goalkeeping_positioning","goalkeeping_reflexes"]
    return data[data["short_name"]==name][vector_keys].get_values()

In [172]:
def find_similar_players(data, player, distance_callback=scipy.spatial.distance.cosine, n=2):
    vector_keys = ["weight_kg", "height_cm", "pace", "shooting","passing","dribbling","defending","physic","gk_diving","gk_handling","gk_kicking","gk_reflexes","gk_speed","gk_positioning","attacking_crossing","attacking_finishing","attacking_heading_accuracy","attacking_short_passing","attacking_volleys","skill_dribbling","skill_curve","skill_fk_accuracy","skill_long_passing","skill_ball_control","movement_acceleration","movement_sprint_speed","movement_agility","movement_reactions","movement_balance","power_shot_power","power_jumping","power_stamina","power_strength","power_long_shots","mentality_aggression","mentality_interceptions","mentality_positioning","mentality_vision","mentality_penalties","mentality_composure","defending_marking","defending_standing_tackle","defending_sliding_tackle","goalkeeping_diving","goalkeeping_handling","goalkeeping_kicking","goalkeeping_positioning","goalkeeping_reflexes"]
    if type(player)==str:
        main_player_vector = get_player_vector(data, player)
    else:
        main_player_vector = player
    for index, row in data.iterrows():
        player_vector = row[vector_keys].get_values()
        player_vector = [float(x) for x in player_vector]
        distance = distance_callback(main_player_vector, player_vector)
        data.at[index, "distance"] = distance
    return data.sort_values("distance", ascending = True).head(n)

In [173]:
def gradient_embedding(data, name1, name2, alpha):
    player1_vector = get_player_vector(data, name1)
    player2_vector = get_player_vector(data, name2)
    interpolated_vector = alpha * player1_vector + (1 - alpha) * player2_vector
    return interpolated_vector

In [174]:
def interpolated_players(data, name1, name2, alpha_range=10):
    df = data.drop(data[data.short_name == name1].index)
    df = df.drop(df[df.short_name == name2].index)
    alphas = numpy.linspace(1, 0, alpha_range, endpoint=False)
    print(alphas)
    players = []
    for a in alphas:
        vector = gradient_embedding(data, name1, name2, a)
        player = find_similar_players(df, vector, n=10).iloc[[1]]
        players.append(player.short_name.get_values())
        print(player.short_name.get_values()[0])
    return players

## Tests & Samples

* https://datascience.stackexchange.com/questions/27726/when-to-use-cosine-simlarity-over-euclidean-similarity

In first tries, I applied euclidean distance to compute player similarities. However, it might be better to use cosine similarity, especially to capture similarity between young and more experimented players. For example : 
* Player 1 has 100 pace, 100 shooting
* Player 2 has 50 pace, 50 shooting
* Player 3 has 90 pace, 40 shooting

By cosine similarity, player 1 and player 2 are more similar. By euclidean similarity, player 3 is more similar to player 1.

In [171]:
player_1 = [100, 100]
player_2 = [50, 50]
player_3 = [90, 40]

print(scipy.spatial.distance.cosine(player_1, player_2))
print(scipy.spatial.distance.euclidean(player_1, player_2))

print(scipy.spatial.distance.cosine(player_1, player_3))
print(scipy.spatial.distance.euclidean(player_1, player_3))

0.0
70.71067811865476
0.06665439379694049
60.8276253029822


In [141]:
find_similar_players(player_vectors, "Sergio Ramos", scipy.spatial.distance.cosine, 10)

Unnamed: 0,short_name,player_positions,weight_kg,height_cm,pace,shooting,passing,dribbling,defending,physic,...,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,distance
18,Sergio Ramos,CB,-1.50761,-2.247088,-1.25278,-0.1192,-0.153693,-1.029554,0.924432,-0.19696,...,0.484407,0.864895,0.822857,1.037406,-0.477462,-0.613367,-0.573738,-0.655184,-0.477622,0.0
47,J. Vertonghen,CB,-1.059596,-1.910438,-1.739395,-0.348008,-0.082407,-0.967211,1.027368,-0.583607,...,0.648072,1.099904,0.894186,1.038236,-0.678116,-0.517537,-0.564669,-0.428834,-0.633608,0.101107
216,David Luiz,CB,-0.741792,-1.57476,-1.677447,-0.009761,0.288512,-0.833511,0.97218,-0.421869,...,0.362891,0.853656,0.894186,0.92385,-0.444645,-0.404187,-0.498214,-0.633474,-0.314609,0.103109
59,S. Umtiti,CB,-1.836197,-2.09042,-1.290993,-0.299731,-0.518997,-1.181139,1.027368,-0.209855,...,0.557336,1.060602,0.968003,0.963923,-0.29079,-0.512425,-0.33315,-0.42271,-0.293838,0.105853
633,F. Schär,CB,-0.560694,-1.245676,-1.391443,0.290163,0.353372,-0.579072,1.085351,-0.243751,...,1.003667,1.064166,0.974544,0.879245,-0.559478,-0.521433,-0.425029,-0.472168,-0.237951,0.115591
42,Casemiro,CDM,-1.212954,-2.057852,-1.805314,0.245115,0.021601,-0.967211,0.922066,0.361767,...,0.648072,0.901085,0.894186,0.964352,-0.38259,-0.341685,-0.250763,-0.428834,-0.425783,0.12416
497,L. Perrin,CB,-0.982916,-1.621599,-2.164572,-0.343066,0.025501,-0.989113,1.084626,-0.238546,...,0.908183,0.890647,1.052892,1.081643,-0.655952,-0.622499,-0.383052,-0.43048,-0.291736,0.125906
45,Thiago Silva,CB,-1.366313,-2.131559,-1.805314,-0.941132,-0.290422,-0.967211,1.027368,-0.678145,...,0.82148,1.06014,0.93067,0.92741,-0.551462,-0.429611,-0.744044,-0.560502,-0.508913,0.126307
287,D. De Rossi,CDM,-0.901561,-1.681227,-1.765923,-0.024925,0.276546,-0.849687,1.194953,-0.328818,...,1.828087,1.357883,0.894186,0.922905,-0.483501,-0.630661,-0.349747,-0.489851,-0.527668,0.127656
391,Fernando,"CDM, CM",-1.394713,-1.631989,-1.573377,-0.111247,0.152573,-0.657156,1.027368,0.274232,...,0.442553,0.977667,0.972559,1.001294,-0.886079,-0.909947,-0.920096,-0.908365,-0.879919,0.13786


In [144]:
similar = gradient_embedding(player_vectors, "Santi Cazorla", "P. Pogba", 0.5)

In [145]:
find_similar_players(player_vectors, similar, scipy.spatial.distance.cosine, 10)

Unnamed: 0,short_name,player_positions,weight_kg,height_cm,pace,shooting,passing,dribbling,defending,physic,...,mentality_composure,defending_marking,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,distance
416,Y. Tielemans,"CM, CDM",-1.650027,-1.821989,-1.376017,1.090317,1.495909,0.281212,0.340269,-1.163831,...,1.191054,0.414973,0.616449,0.519201,-0.655952,-0.574689,-0.480587,-0.287291,-0.38214,0.037505
290,Santi Cazorla,"CM, CDM, LM",-2.36595,-2.306837,-1.975738,0.604365,1.600741,0.596211,0.077719,-2.234553,...,1.460126,0.429739,0.158716,0.060624,-0.662671,-0.537374,-0.730372,-0.629547,-0.263071,0.038951
146,I. Gündoğan,"CM, CDM",-1.300588,-2.005204,-1.386258,0.551092,1.269692,0.492932,0.372988,-1.100187,...,1.177584,0.649721,0.516313,0.350851,-0.45044,-0.319704,-0.736036,-0.728221,-0.36492,0.048097
689,A. Lallana,CM,-1.489582,-1.65153,-1.028472,0.653044,1.269692,0.814534,0.041657,-1.28486,...,1.003667,0.232148,0.211144,0.309684,-0.652465,-0.230943,-0.276875,-0.568835,-0.421048,0.052395
452,Campaña,"CM, CDM",-1.816804,-1.701755,-1.519391,0.731971,1.382801,0.175351,0.626561,-0.649784,...,1.002473,0.804161,0.695802,0.519201,-0.564128,-0.43126,-0.675657,-0.621398,-0.472543,0.052705
703,J. Wilshere,"CM, CAM, CDM",-1.911804,-1.895043,-1.246255,0.435315,1.384232,0.707333,0.09964,-1.180749,...,1.003667,0.275938,0.291502,0.431732,-0.652465,-0.230943,-0.22749,-0.520501,-0.421048,0.055461
918,C. Grenier,"CM, CDM",-1.496078,-1.230066,-1.92194,1.013135,1.501709,0.275783,0.087746,-0.776318,...,0.908183,0.39945,0.120999,-0.193638,-0.272174,-0.417775,-0.468083,-0.417018,-0.646397,0.055698
480,Manu Trigueros,"CM, CDM",-1.39986,-1.741833,-1.591078,0.731971,1.043476,0.175351,0.454786,-1.26664,...,0.531022,0.544702,0.616449,0.278154,-0.518215,-0.287831,-0.383052,-0.621398,-0.562947,0.057652
912,Adrien Silva,"CM, CDM",-1.752659,-1.68223,-1.62784,0.939628,1.385701,0.275783,0.557557,-1.198101,...,0.811475,0.222042,0.812798,0.754067,-0.225084,-0.662953,-0.568118,-0.465971,-0.600036,0.057805
213,E. Banega,"CDM, CM, CAM",-1.947413,-2.154204,-1.677447,0.542868,1.378712,0.390898,0.696236,-1.115523,...,0.635537,0.895336,0.741215,0.807683,-0.35614,-0.588513,-0.639228,-0.633474,-0.488881,0.058412


In [175]:
interpolated_players(player_vectors, "Santi Cazorla", "P. Pogba")

[1.  0.9 0.8 0.7 0.6 0.5 0.4 0.3 0.2 0.1]
M. Pjanić
M. Pjanić
I. Gündoğan
Y. Tielemans
I. Gündoğan
I. Gündoğan
Campaña
Campaña
Campaña
V. Birsa


[array(['M. Pjanić'], dtype=object),
 array(['M. Pjanić'], dtype=object),
 array(['I. Gündoğan'], dtype=object),
 array(['Y. Tielemans'], dtype=object),
 array(['I. Gündoğan'], dtype=object),
 array(['I. Gündoğan'], dtype=object),
 array(['Campaña'], dtype=object),
 array(['Campaña'], dtype=object),
 array(['Campaña'], dtype=object),
 array(['V. Birsa'], dtype=object)]