In [1]:
# import 
import pandas as pd

# read data
df = pd.read_csv("player_raw_data.csv")

# feature engineering

# dummy variable that replaces the sofifa_id
df["id"] = df.index
# expanding the WorkRate to Attack and Def, 
# where Work_Rate is = (H/M/L) / (H/M/L), the first entry is attacking, second is defending
df[["atk_workrate", "def_workrate"]] = df["work_rate"].str.split("/", expand=True)

# defining feature types
categorical_feats = ["country", "club", 
                     "best_position"]
# ordinal features
# And their according levels
atk_workrate_levels = ["High", "Medium", "Low"]
def_workrate_levels = [" High", " Medium", " Low"]
ordinal_feats = ["atk_workrate", "def_workrate"]

# binary features
binary_feats = ["preferred_foot"]

# to be dropped feats
drop_feats = ["name", "first_name", "last_name", "work_rate", "id"] 

# numeric_feats
numeric_feats = list(set(df.columns) 
                     - set(categorical_feats) 
                     - set(drop_feats) 
                     - set(binary_feats)
                     - set(ordinal_feats))


assert set(categorical_feats + ordinal_feats + binary_feats + drop_feats + numeric_feats) == set(df.columns)

In [2]:
# imports for preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

numeric_transformer = make_pipeline(StandardScaler())
categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse=False)
)

ordinal_transformer = make_pipeline(
    OrdinalEncoder(categories=[atk_workrate_levels, def_workrate_levels], dtype=int)
)

binary_transformer = make_pipeline(
    OneHotEncoder(drop="if_binary", dtype=int)
)
ct = make_column_transformer(
    (numeric_transformer, numeric_feats),
    (categorical_transformer, categorical_feats),
    (ordinal_transformer, ordinal_feats),
    (binary_transformer, binary_feats),
    ("drop", drop_feats)
)
ct.fit(df)

# get new feature names

In [3]:
# New column names after transformation
#ct.named_transformers_["pipeline-1"].get_feature_names_out() 
ohe_columns = ct.named_transformers_["pipeline-2"].get_feature_names_out().tolist()

bin_columns = ct.named_transformers_["pipeline-4"].get_feature_names_out().tolist()
feature_names = numeric_feats + ohe_columns + bin_columns + ordinal_feats

# creating transformed data to do clustering
transformed = pd.DataFrame(data=ct.fit_transform(df), columns=feature_names)

# all_cols
feature_cols = transformed.columns.tolist()
transformed.head()


Unnamed: 0,shot_power,agility,balance,sprint_speed,overall,reactions,total_stats,interceptions,penalties,defensive_awareness,...,best_position_LW,best_position_LWB,best_position_RB,best_position_RM,best_position_RW,best_position_RWB,best_position_ST,preferred_foot_Right,atk_workrate,def_workrate
0,1.064155,1.750972,1.414297,0.403726,0.177734,0.015906,0.461885,-1.147999,1.339003,-0.575684,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
1,-1.35038,-1.188725,0.071204,0.067651,-0.816021,-0.86219,-0.804958,0.39016,-0.945426,0.651783,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0.039807,0.383671,0.566028,0.269296,-0.567582,-0.959756,-0.175017,-0.907662,-0.057037,-0.281092,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0
3,0.698316,0.520401,0.495338,0.672586,0.674612,0.40617,0.987415,0.918903,0.196789,0.700882,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.545535,-1.667281,-1.413267,-3.091454,-0.567582,-1.447587,-2.27714,-1.532539,-2.278009,-1.852249,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


### KNearestNeighbor


In [4]:
from sklearn.neighbors import NearestNeighbors

In [5]:
# Fit a NearestNeightbors first 
# K could be 11 for total of players in a one team at match
# K could also be 15, since there are 15 unique positions
K_recomendations = 14
nbrs = NearestNeighbors(n_neighbors=K_recomendations + 1, metric="cosine", algorithm="brute").fit(transformed)
dist, rank = nbrs.kneighbors(transformed)

In [6]:
similar_df = pd.DataFrame(columns=[f"rank_{i} " for i in range(1, K_recomendations + 1)],
                          index=df["id"].values,
                          data=rank[:,1:])
dist_df = pd.DataFrame(columns=[f"rank_{i} " for i in range(1, K_recomendations + 1)],
                          index=df["id"].values,
                          data=dist[:,1:])

# Assgining the name of similar player to each rank
for cols in list(similar_df):
    tg_col = similar_df[cols]
    new_value = df["id"].iloc[tg_col].tolist()
    similar_df[cols] = new_value

In [7]:
similar_df.head()

Unnamed: 0,rank_1,rank_2,rank_3,rank_4,rank_5,rank_6,rank_7,rank_8,rank_9,rank_10,rank_11,rank_12,rank_13,rank_14
0,299,2237,5958,2209,9558,9173,2302,2647,8663,540,4513,3512,5214,3359
1,493,8960,9204,6246,4462,3137,3028,942,8136,4474,5913,74,2693,2800
2,1486,2662,3136,7412,2177,984,3316,7026,2960,5845,386,5862,2062,4970
3,7940,8494,6674,4782,9644,710,4182,8669,7985,8064,6683,6146,6172,6075
4,3882,1861,7523,9004,5217,4049,6647,8944,1534,3757,9964,8135,121,3960


In [8]:
# # Note these distances are always in monotonicallly increasing order,
# # since k nearest neighbors uses cosine distance to classify closeness of two ojects
# # in this case, players, wehre the smaller is the distance is better
dist_df.head()

Unnamed: 0,rank_1,rank_2,rank_3,rank_4,rank_5,rank_6,rank_7,rank_8,rank_9,rank_10,rank_11,rank_12,rank_13,rank_14
0,0.167262,0.177537,0.192105,0.199396,0.2035,0.205997,0.2121,0.218656,0.218784,0.220201,0.221201,0.222496,0.223998,0.225419
1,0.126701,0.133642,0.137792,0.140418,0.142463,0.142511,0.145103,0.147447,0.148232,0.150271,0.15341,0.15417,0.158459,0.158637
2,0.275572,0.303469,0.305354,0.305382,0.32088,0.321873,0.324428,0.330979,0.334027,0.335312,0.335499,0.335896,0.338983,0.340639
3,0.174796,0.204383,0.204447,0.205279,0.208919,0.20916,0.214225,0.214908,0.216875,0.219779,0.226405,0.227506,0.228518,0.228862
4,0.025261,0.025469,0.025812,0.026836,0.027295,0.027928,0.028383,0.028544,0.028581,0.028935,0.029101,0.029176,0.029246,0.029389


In [46]:
# Use this function to get similar player

# import 

# plotly needs to be added to requirements
# plotly==5.9.0
# ipywidgets as well
# jupyter-dash as well
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
import plotly.io as pio
pio.renderers.default = "colab"
# 
def similar_player(similar_df, dist_df, player_id, **last_name):
    player_id = int(player_id)
    player_name = df.query(f"id == {player_id}")["name"].values[0]
    
    id_name_dict = df[['id','name']].set_index('id')['name'].to_dict()
    ## Bar chart 
    # Might need to change here
    # Reversing order?
    Xaxis = dist_df.loc[player_id].values[::-1]
    Yaxis = similar_df.loc[player_id].map(id_name_dict).values[::-1]
    fig = go.Figure(go.Bar(
               x=Xaxis,
               y=Yaxis,
               orientation='h'))
    fig.update_layout(title_text='Players similar to  " '+str(player_name)+' "')
    fig.show()



In [81]:
# Checks
messi = df.query("last_name == 'Messi'").id.values[0]
ronaldo = df.query("name.str.contains('Ronaldo')").iloc[0].id
neymar = df.query("name.str.contains('Neymar')").id.values[0]

for id in [messi, ronaldo, neymar]:
    similar_player(similar_df, dist_df, id)

In [73]:
df.query("best_position == 'ST'").sort_values(by="overall",ascending=False)

Unnamed: 0,name,first_name,last_name,country,age,overall,potential,club,best_position,value,...,defensive_awareness,standing_tackle,sliding_tackle,diving,handling,kicking,reflexes,id,atk_workrate,def_workrate
1444,Robert Lewandowski,Robert,Lewandowski,Poland,32,92,92,FC Bayern München,ST,119500000,...,35,42,19,15,6,12,10,1444,High,Medium
1432,C. Ronaldo dos Santos Aveiro,C.,Aveiro,Portugal,36,91,91,Manchester United,ST,45000000,...,24,32,24,7,11,15,11,1432,High,Medium
1406,Kylian Mbappé,Kylian,Mbappé,France,22,91,95,Paris Saint-Germain,ST,194000000,...,26,34,32,13,5,7,6,1406,High,Low
3815,Harry Kane,Harry,Kane,England,27,89,89,Tottenham Hotspur,ST,112000000,...,50,36,38,8,10,11,11,3815,High,High
1395,Erling Haaland,Erling,Haaland,Norway,20,88,94,Borussia Dortmund,ST,143500000,...,44,53,29,7,14,13,7,1395,High,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4504,Feng Wei,Feng,Wei,China PR,18,47,60,Guangzhou City,ST,110000,...,13,18,12,10,11,7,10,4504,Medium,Medium
5827,Youzu He,Youzu,He,China PR,22,47,52,Shijiazhuang Ever Bright F.C.,ST,70000,...,20,15,16,12,11,11,9,5827,Medium,Medium
6278,Parmanjan Kyum,Parmanjan,Kyum,China PR,20,47,52,Henan Songshan Longmen FC,ST,70000,...,11,18,17,12,11,9,6,6278,Medium,Low
3003,Gurkirat Singh,Gurkirat,Singh,India,17,47,62,Mumbai City FC,ST,110000,...,44,46,45,6,10,10,13,3003,High,Medium


In [52]:
similar_player(similar_df, dist_df, 1414)