In [1]:
import pandas as pd

In [2]:
# This data should come from the scraper after pipeline of processing
df = pd.read_csv("../data/raw/player_raw_data.csv")

# feature engineering

# dummy variable that replaces the sofifa_id
df["id"] = df.index
# expanding the WorkRate to Attack and Def, 
# where Work_Rate is = (H/M/L) / (H/M/L), the first entry is attacking, second is defending
df[["atk_workrate", "def_workrate"]] = df["work_rate"].str.split("/", expand=True)

processed = df

## Class

In [3]:
# imports for preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

class model(object):
    def __init__(self):
        self._data = processed

    def _prepare(self):
        # defining feature types
        categorical_feats = ["country", "club",
                             "best_position"]
        # ordinal features
        # And their according levels
        atk_workrate_levels = ["High", "Medium", "Low"]
        def_workrate_levels = [" High", " Medium", " Low"]
        ordinal_feats = ["atk_workrate", "def_workrate"]

        # binary features
        binary_feats = ["preferred_foot"]

        # to be dropped feats
        drop_feats = ["name", "first_name", "last_name", "work_rate", "id"] 

        # numeric_feats
        numeric_feats = list(set(df.columns)
                             - set(categorical_feats)
                             - set(drop_feats)
                             - set(binary_feats)
                             - set(ordinal_feats))
        assert set(categorical_feats + ordinal_feats + binary_feats
                   + drop_feats + numeric_feats) == set(df.columns)
        numeric_transformer = make_pipeline(StandardScaler())
        categorical_transformer = make_pipeline(
            OneHotEncoder(handle_unknown="ignore", sparse=False)
        )

        ordinal_transformer = make_pipeline(
            OrdinalEncoder(categories=[atk_workrate_levels, def_workrate_levels],
                           dtype=int)
        )

        binary_transformer = make_pipeline(
            OneHotEncoder(drop="if_binary", dtype=int)
        )
        ct = make_column_transformer(
            (numeric_transformer, numeric_feats),
            (categorical_transformer, categorical_feats),
            (ordinal_transformer, ordinal_feats),
            (binary_transformer, binary_feats),
            ("drop", drop_feats)
        )
        return ct.fit(df)

    def prepare(self):
        return self._prepare()

    @property
    def data(self):
        return self._data

## Others


In [4]:
# import 
import pandas as pd

# read data
df = pd.read_csv("../data/raw/player_raw_data.csv")

# feature engineering

# dummy variable that replaces the sofifa_id
df["id"] = df.index
# expanding the WorkRate to Attack and Def, 
# where Work_Rate is = (H/M/L) / (H/M/L), the first entry is attacking, second is defending
df[["atk_workrate", "def_workrate"]] = df["work_rate"].str.split("/", expand=True)
df["def_workrate"] = df["def_workrate"].replace(" ", "")
# defining feature types
categorical_feats = ["country", "club", 
                     "best_position"]
# ordinal features
# And their according levels
atk_workrate_levels = ["High", "Medium", "Low"]
def_workrate_levels = [" High", " Medium", " Low"]
ordinal_feats = ["atk_workrate", "def_workrate"]

# binary features
binary_feats = ["preferred_foot"]

# to be dropped feats
drop_feats = ["name", "first_name", "last_name", "work_rate", "id"] 

# numeric_feats
numeric_feats = list(set(df.columns) 
                     - set(categorical_feats) 
                     - set(drop_feats) 
                     - set(binary_feats)
                     - set(ordinal_feats))


assert set(categorical_feats + ordinal_feats + binary_feats + drop_feats + numeric_feats) == set(df.columns)

In [21]:
df.head()

Unnamed: 0,name,first_name,last_name,country,age,overall,potential,club,best_position,value,...,defensive_awareness,standing_tackle,sliding_tackle,diving,handling,kicking,reflexes,id,atk_workrate,def_workrate
0,Rúben Daniel Fonseca Macedo,Rúben,Macedo,Portugal,25,65,68,Clube Sport Marítimo,RW,950000,...,32,25,21,6,11,14,13,0,Medium,Medium
1,Naif Almas,Naif,Almas,Saudi Arabia,21,57,69,Al Fayha,CB,375000,...,57,57,54,12,15,10,12,1,Medium,Medium
2,Rakan Al Shamlan,Rakan,Shamlan,Saudi Arabia,22,59,68,Al Batin,LW,500000,...,38,43,45,13,6,7,14,2,Medium,Low
3,Erick Wiemberg,Erick,Wiemberg,Chile,27,69,70,Unión La Calera,LB,1500000,...,58,66,64,12,13,7,9,3,High,Medium
4,Nicolás Forastiero,Nicolás,Forastiero,Argentina,22,59,69,Argentinos Juniors,GK,450000,...,6,10,12,56,64,64,61,4,Medium,Medium


In [5]:
# imports for preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

numeric_transformer = make_pipeline(StandardScaler())
categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse=False)
)

ordinal_transformer = make_pipeline(
    OrdinalEncoder(categories=[atk_workrate_levels, def_workrate_levels], dtype=int)
)

binary_transformer = make_pipeline(
    OneHotEncoder(drop="if_binary", dtype=int)
)
ct = make_column_transformer(
    (numeric_transformer, numeric_feats),
    (categorical_transformer, categorical_feats),
    (ordinal_transformer, ordinal_feats),
    (binary_transformer, binary_feats),
    ("drop", drop_feats)
)
ct.fit(df)

# get new feature names

In [6]:
# New column names after transformation
#ct.named_transformers_["pipeline-1"].get_feature_names_out() 
ohe_columns = ct.named_transformers_["pipeline-2"].get_feature_names_out().tolist()

bin_columns = ct.named_transformers_["pipeline-4"].get_feature_names_out().tolist()
feature_names = numeric_feats + ohe_columns + bin_columns + ordinal_feats

# creating transformed data to do clustering
transformed = pd.DataFrame(data=ct.fit_transform(df), columns=feature_names)

# all_cols
feature_cols = transformed.columns.tolist()
transformed.head()


Unnamed: 0,height,reflexes,wage,volleys,jumping,aggression,skill_move,curve,age,positioning,...,best_position_LW,best_position_LWB,best_position_RB,best_position_RM,best_position_RW,best_position_RWB,best_position_ST,preferred_foot_Right,atk_workrate,def_workrate
0,-1.551996,-0.154182,-0.252903,0.629456,0.648062,-0.404052,0.896031,1.508647,0.670645,-0.468141,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0
1,-0.549606,-0.214823,-0.296032,-1.034153,1.522845,0.622119,-0.416201,-1.096001,-0.304918,-0.017855,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,-0.836003,-0.093542,-0.252903,-0.058934,-1.801331,-0.524778,-0.416201,-0.190036,-0.061027,-0.210835,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0
3,-0.406408,-0.396745,-0.296032,-0.575226,0.823019,0.923934,-0.416201,0.489437,1.158427,-0.082181,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.166386,2.756575,-0.339161,-2.009372,-0.576634,-1.55095,-1.728434,-2.001965,-0.061027,2.683865,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


### KNearestNeighbor


In [7]:
from sklearn.neighbors import NearestNeighbors

In [15]:
# Fit a NearestNeightbors first 
# K could be 11 for total of players in a one team at match
# K could also be 15, since there are 15 unique positions
K_recomendations = 5
nbrs = NearestNeighbors(n_neighbors=K_recomendations + 1, metric="cosine", algorithm="brute").fit(transformed)
dist, rank = nbrs.kneighbors(transformed)

In [16]:
similar_df = pd.DataFrame(columns=[f"rank_{i} " for i in range(1, K_recomendations + 1)],
                          index=df["id"].values,
                          data=rank[:,1:])
dist_df = pd.DataFrame(columns=[f"rank_{i} " for i in range(1, K_recomendations + 1)],
                          index=df["id"].values,
                          data=dist[:,1:])

# Assgining the name of similar player to each rank
for cols in list(similar_df):
    tg_col = similar_df[cols]
    new_value = df["id"].iloc[tg_col].tolist()
    similar_df[cols] = new_value

In [17]:
similar_df.head()

Unnamed: 0,rank_1,rank_2,rank_3,rank_4,rank_5
0,299,2237,5958,2209,9558
1,493,8960,9204,6246,4462
2,1486,2662,3136,7412,2177
3,7940,8494,6674,4782,9644
4,3882,1861,7523,9004,5217


In [18]:
# # Note these distances are always in monotonicallly increasing order,
# # since k nearest neighbors uses cosine distance to classify closeness of two ojects
# # in this case, players, wehre the smaller is the distance is better
dist_df.head()

Unnamed: 0,rank_1,rank_2,rank_3,rank_4,rank_5
0,0.167262,0.177537,0.192105,0.199396,0.2035
1,0.126701,0.133642,0.137792,0.140418,0.142463
2,0.275572,0.303469,0.305354,0.305382,0.32088
3,0.174796,0.204383,0.204447,0.205279,0.208919
4,0.025261,0.025469,0.025812,0.026836,0.027295


In [19]:
# Use this function to get similar player

# import 

# plotly needs to be added to requirements
# plotly==5.9.0
# ipywidgets as well
# jupyter-dash as well
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
import plotly.io as pio
pio.renderers.default = "colab"


def similar_player(similar_df, dist_df, player_id, **last_name):
    player_id = int(player_id)
    player_name = df.query(f"id == {player_id}")["name"].values[0]

    id_name_dict = df[['id', 'name']].set_index('id')['name'].to_dict()
    ## Bar chart 
    # Might need to change here
    # Reversing order?
    Xaxis = dist_df.loc[player_id].values[::-1]
    Yaxis = similar_df.loc[player_id].map(id_name_dict).values[::-1]
    fig = go.Figure(go.Bar(
               x=Xaxis,
               y=Yaxis,
               orientation='h'))
    fig.update_layout(title_text='Players similar to  " '+str(player_name)+' "')
    fig.show()



In [20]:
# Checks
messi = df.query("last_name == 'Messi'").id.values[0]
ronaldo = df.query("name.str.contains('Ronaldo')").iloc[0].id
neymar = df.query("name.str.contains('Neymar')").id.values[0]

similar_player(similar_df, dist_df, messi)
#for id in [messi, ronaldo, neymar]:
#    similar_player(similar_df, dist_df, id)

In [14]:
df.query("best_position == 'ST'").sort_values(by="overall",ascending=False)

Unnamed: 0,name,first_name,last_name,country,age,overall,potential,club,best_position,value,...,defensive_awareness,standing_tackle,sliding_tackle,diving,handling,kicking,reflexes,id,atk_workrate,def_workrate
1444,Robert Lewandowski,Robert,Lewandowski,Poland,32,92,92,FC Bayern München,ST,119500000,...,35,42,19,15,6,12,10,1444,High,Medium
1432,C. Ronaldo dos Santos Aveiro,C.,Aveiro,Portugal,36,91,91,Manchester United,ST,45000000,...,24,32,24,7,11,15,11,1432,High,Medium
1406,Kylian Mbappé,Kylian,Mbappé,France,22,91,95,Paris Saint-Germain,ST,194000000,...,26,34,32,13,5,7,6,1406,High,Low
3815,Harry Kane,Harry,Kane,England,27,89,89,Tottenham Hotspur,ST,112000000,...,50,36,38,8,10,11,11,3815,High,High
1395,Erling Haaland,Erling,Haaland,Norway,20,88,94,Borussia Dortmund,ST,143500000,...,44,53,29,7,14,13,7,1395,High,Medium
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4504,Feng Wei,Feng,Wei,China PR,18,47,60,Guangzhou City,ST,110000,...,13,18,12,10,11,7,10,4504,Medium,Medium
5827,Youzu He,Youzu,He,China PR,22,47,52,Shijiazhuang Ever Bright F.C.,ST,70000,...,20,15,16,12,11,11,9,5827,Medium,Medium
6278,Parmanjan Kyum,Parmanjan,Kyum,China PR,20,47,52,Henan Songshan Longmen FC,ST,70000,...,11,18,17,12,11,9,6,6278,Medium,Low
3003,Gurkirat Singh,Gurkirat,Singh,India,17,47,62,Mumbai City FC,ST,110000,...,44,46,45,6,10,10,13,3003,High,Medium
