In [2]:
import pandas as pd

In [3]:
# This data should come from the scraper after pipeline of processing
df = pd.read_csv("../data/raw/player_raw_data.csv")

# feature engineering

# dummy variable that replaces the sofifa_id
df["id"] = df.index
# expanding the WorkRate to Attack and Def, 
# where Work_Rate is = (H/M/L) / (H/M/L), the first entry is attacking, second is defending
df[["atk_workrate", "def_workrate"]] = df["work_rate"].str.split("/", expand=True)

# processed = df

## Class

In [4]:
# imports for preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

class model(object):
    def __init__(self):
        self._data = processed

    def _prepare(self):
        # defining feature types
        categorical_feats = ["country", "club",
                             "best_position"]
        # ordinal features
        # And their according levels
        atk_workrate_levels = ["High", "Medium", "Low"]
        def_workrate_levels = [" High", " Medium", " Low"]
        ordinal_feats = ["atk_workrate", "def_workrate"]

        # binary features
        binary_feats = ["preferred_foot"]

        # to be dropped feats
        drop_feats = ["name", "first_name", "last_name", "work_rate", "id"] 

        # numeric_feats
        numeric_feats = list(set(df.columns)
                             - set(categorical_feats)
                             - set(drop_feats)
                             - set(binary_feats)
                             - set(ordinal_feats))
        assert set(categorical_feats + ordinal_feats + binary_feats
                   + drop_feats + numeric_feats) == set(df.columns)
        numeric_transformer = make_pipeline(StandardScaler())
        categorical_transformer = make_pipeline(
            OneHotEncoder(handle_unknown="ignore", sparse=False)
        )

        ordinal_transformer = make_pipeline(
            OrdinalEncoder(categories=[atk_workrate_levels, def_workrate_levels],
                           dtype=int)
        )

        binary_transformer = make_pipeline(
            OneHotEncoder(drop="if_binary", dtype=int)
        )
        ct = make_column_transformer(
            (numeric_transformer, numeric_feats),
            (categorical_transformer, categorical_feats),
            (ordinal_transformer, ordinal_feats),
            (binary_transformer, binary_feats),
            ("drop", drop_feats)
        )
        return ct.fit(df)

    def prepare(self):
        return self._prepare()

    @property
    def data(self):
        return self._data

## Others


In [4]:
# read data
df = pd.read_csv("../data/raw/player_raw_data.csv")

# feature engineering

# dummy variable that replaces the sofifa_id
df["id"] = df.index
# expanding the WorkRate to Attack and Def, 
# where Work_Rate is = (H/M/L) / (H/M/L), the first entry is attacking, second is defending
df[["atk_workrate", "def_workrate"]] = df["work_rate"].str.split("/", expand=True)
df["def_workrate"] = df["def_workrate"].replace(" ", "")
# defining feature types
categorical_feats = ["country", "club", 
                     "best_position"]
# ordinal features
# And their according levels
atk_workrate_levels = ["High", "Medium", "Low"]
def_workrate_levels = [" High", " Medium", " Low"]
ordinal_feats = ["atk_workrate", "def_workrate"]

# binary features
binary_feats = ["preferred_foot"]

# to be dropped feats
drop_feats = ["name", "first_name", "last_name", "work_rate", "id"] 

# numeric_feats
numeric_feats = list(set(df.columns) 
                     - set(categorical_feats) 
                     - set(drop_feats) 
                     - set(binary_feats)
                     - set(ordinal_feats))


assert set(categorical_feats + ordinal_feats + binary_feats + drop_feats + numeric_feats) == set(df.columns)

In [5]:
numeric_feats

['overall',
 'crossing',
 'fk_accuracy',
 'curve',
 'weight',
 'reactions',
 'defensive_awareness',
 'height',
 'long_passing',
 'shot_power',
 'value',
 'acceleration',
 'heading_accuracy',
 'wage',
 'jumping',
 'dribbling',
 'short_passing',
 'reflexes',
 'total_stats',
 'agility',
 'aggression',
 'handling',
 'positioning',
 'age',
 'sliding_tackle',
 'ball_control',
 'strength',
 'diving',
 'balance',
 'sprint_speed',
 'stamina',
 'potential',
 'vision',
 'finishing',
 'composure',
 'skill_move',
 'volleys',
 'standing_tackle',
 'weak_foot',
 'interceptions',
 'penalties',
 'kicking',
 'long_shots']

In [5]:
df.head()

Unnamed: 0,name,first_name,last_name,country,age,overall,potential,club,best_position,value,...,defensive_awareness,standing_tackle,sliding_tackle,diving,handling,kicking,reflexes,id,atk_workrate,def_workrate
0,Patrick Vroegh,Patrick,Vroegh,Netherlands,21,64,73,Vitesse,CM,1300000,...,55,61,55,10,7,5,11,0,Medium,Medium
1,Adam Senior,Adam,Senior,England,19,57,67,Bolton Wanderers,CB,325000,...,53,56,55,9,13,14,7,1,Medium,Medium
2,Finlay Lockett,Finlay,Lockett,England,18,54,68,Bolton Wanderers,LW,250000,...,27,29,35,9,6,12,6,2,Medium,Medium
3,Sergio Lozano Lluch,Sergio,Lluch,Spain,22,62,71,Villarreal CF,CAM,900000,...,54,59,52,10,7,12,10,3,Medium,Medium
4,Sam Sanna,Sam,Sanna,France,22,61,70,Toulouse Football Club,RB,700000,...,63,65,66,6,6,6,10,4,Medium,Medium


In [10]:
# imports for preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.neighbors import NearestNeighbors

numeric_transformer = make_pipeline(StandardScaler())
categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse=False)
)

ordinal_transformer = make_pipeline(
    OrdinalEncoder(categories=[atk_workrate_levels, def_workrate_levels], dtype=int)
)

binary_transformer = make_pipeline(
    OneHotEncoder(drop="if_binary", dtype=int)
)
ct = make_column_transformer(
    (numeric_transformer, numeric_feats),
    (categorical_transformer, categorical_feats),
    (ordinal_transformer, ordinal_feats),
    (binary_transformer, binary_feats),
    ("drop", drop_feats)
)
ct.fit(df)

# get new feature names

In [7]:
# New column names after transformation
#ct.named_transformers_["pipeline-1"].get_feature_names_out() 
ohe_columns = ct.named_transformers_["pipeline-2"].get_feature_names_out().tolist()

bin_columns = ct.named_transformers_["pipeline-4"].get_feature_names_out().tolist()
feature_names = numeric_feats + ohe_columns + bin_columns + ordinal_feats

# creating transformed data to do clustering
transformed = pd.DataFrame(data=ct.fit_transform(df), columns=feature_names)

# all_cols
feature_cols = transformed.columns.tolist()
transformed.head()


Unnamed: 0,curve,jumping,stamina,handling,diving,ball_control,weak_foot,defensive_awareness,age,acceleration,...,best_position_LW,best_position_LWB,best_position_RB,best_position_RM,best_position_RW,best_position_RWB,best_position_ST,preferred_foot_Right,atk_workrate,def_workrate
0,0.629457,-0.597288,0.213661,-0.535763,-0.345071,0.579212,0.144635,0.545222,-0.369163,-0.125496,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,-0.82557,0.950177,-0.413758,-0.159611,-0.405365,-1.116013,-1.357197,0.447276,-0.818834,-0.125496,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,-0.489795,-1.28505,-0.413758,-0.598455,-0.405365,-0.180716,-1.357197,-0.826026,-1.043669,0.599207,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,0.853308,-0.683258,-0.037307,-0.535763,-0.345071,0.696125,0.144635,0.496249,-0.144328,-0.125496,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,0.349644,-1.542961,-1.103918,-0.598455,-0.586247,0.4623,0.144635,0.937008,-0.144328,-0.191378,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


### KNearestNeighbor


In [105]:
# from sklean.neighbors import NearestNeigbors
import os
class KnnRecommender:
    def __init__(self, path_data):
        """
        Requires path to load the data and use it to fit to model

        Parameters
        ----------
        path_data: str, data file path (could be local or on cloud)
        """
        self.path_data = path_data
        self.model = NearestNeighbors()
        self._data = None

    def set_model_params(self, n_neighbors=10, algorithm="brute",
                         metric="cosine", n_jobs=-1):
        """
        Set model params for sklearn.neighbors.NearestNeighbors

        Parameters
        ----------
        n_neighbors: int, default to 10

        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'} default to "brute"

        metric: string or callable, default to 'cosine' here

        n_jobs: int or None, default to -1
        """
        self.set_model_params(**{
            'n_neighbors': n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs': n_jobs})

    def prep_data(self):
        """
        Prepare data for the recommender
        """
        # read data
        df_full = pd.read_csv(os.path.join(self.path_data))
        self._data = df_full

    @property
    def data(self):
        return self._data

In [106]:
recom = KnnRecommender(path_data=path)
path = "../data/raw/player_raw_data.csv"

In [11]:
# Fit a NearestNeightbors first 
# K could be 10 for total of players in a one team at match except the player itself
# K could also be 14, since there are 15 unique positions
K_recomendations = 5
nbrs = NearestNeighbors(n_neighbors=K_recomendations + 1, metric="cosine", algorithm="brute").fit(transformed)
dist, rank = nbrs.kneighbors(transformed)

In [12]:
similar_df = pd.DataFrame(columns=[f"rank_{i} " for i in range(1, K_recomendations + 1)],
                          index=df["id"].values,
                          data=rank[:,1:])
dist_df = pd.DataFrame(columns=[f"rank_{i} " for i in range(1, K_recomendations + 1)],
                          index=df["id"].values,
                          data=dist[:,1:])

# Assgining the name of similar player to each rank
for cols in list(similar_df):
    tg_col = similar_df[cols]
    new_value = df["id"].iloc[tg_col].tolist()
    similar_df[cols] = new_value

In [13]:
similar_df.head()

Unnamed: 0,rank_1,rank_2,rank_3,rank_4,rank_5
0,4622,2525,11677,2210,7060
1,239,5112,11013,6628,1815
2,10345,3559,3552,6846,4611
3,1831,2147,1706,275,9790
4,2296,10218,275,11582,3938


In [14]:
# # Note these distances are always in monotonicallly increasing order,
# # since k nearest neighbors uses cosine distance to classify closeness of two ojects
# # in this case, players, wehre the smaller is the distance is better
dist_df.head()

Unnamed: 0,rank_1,rank_2,rank_3,rank_4,rank_5
0,0.210047,0.248764,0.249511,0.259342,0.26388
1,0.102824,0.107187,0.112312,0.113496,0.113862
2,0.142874,0.171771,0.18352,0.184633,0.195285
3,0.187842,0.229879,0.233663,0.235297,0.240273
4,0.180648,0.227457,0.241358,0.249612,0.250615


In [15]:
# Use this function to get similar player

# import 

# plotly needs to be added to requirements
# plotly==5.9.0
# ipywidgets as well
# jupyter-dash as well
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
import plotly.io as pio
pio.renderers.default = "colab"


def similar_player(similar_df, dist_df, player_id, **last_name):
    player_ids = similar_df.loc[int(player_id)]
    # player_name = df.query(f"id == {player_id}")["name"].values[0]

    # id_name_dict = df[['id', 'name']].set_index('id')['name'].to_dict()

    # Dataframe display
    display_col = ["name", "age", "overall", "potential", "value"] + categorical_feats + binary_feats
    display_df = pd.DataFrame({"id": player_ids}).merge(df[["id"]+display_col], how="left", on="id")[display_col]
    return display_df

#     # Bar chart 
#     # Might need to change here
#     # Reversing order?

#     Xaxis = dist_df.loc[player_id].values[::-1]
#     Yaxis = player_ids.map(id_name_dict).values[::-1]
#     fig = go.Figure(go.Bar(
#                x=Xaxis,
#                y=Yaxis,
#                orientation='h'))
#     fig.update_layout(title_text='Players similar to  " '+str(player_name)+' "')
#     fig.show()

In [16]:
# Checks
messi = df.query("last_name == 'Messi'").id.values[0]
ronaldo = df.query("name.str.contains('Ronaldo')").iloc[0].id
neymar = df.query("name.str.contains('Neymar')").id.values[0]

similar_player(similar_df, dist_df, messi)
# for id in [messi, ronaldo, neymar]:
#    similar_player(similar_df, dist_df, id)

Unnamed: 0,name,age,overall,potential,value,country,club,best_position,preferred_foot
0,Riyad Mahrez,30,86,86,65500000,Algeria,Manchester City,RW,Left
1,Antoine Griezmann,30,85,85,53000000,France,Atlético de Madrid,ST,Left
2,Memphis Depay,27,85,85,58500000,Netherlands,FC Barcelona,CF,Right
3,Pierre-Emerick Aubameyang,32,85,85,43500000,Gabon,FC Barcelona,ST,Right
4,Raheem Sterling,26,87,87,87000000,England,Manchester City,LW,Right


In [17]:
def make_recommendation(first_name, last_name, **id):
    p = df.query(f"first_name == '{first_name}' & last_name == '{last_name}'")
    return similar_player(similar_df, dist_df, p.id.values[0])

In [24]:
first = "Antoine"
last = "Griezmann"
query_df = make_recommendation(first, last)
query_df

Unnamed: 0,name,age,overall,potential,value,country,club,best_position,preferred_foot
0,Lionel Messi,34,92,92,69500000,Argentina,Paris Saint-Germain,RW,Left
1,Roberto Firmino Barbosa de Oliveira,29,85,85,54000000,Brazil,Liverpool,CF,Right
2,Memphis Depay,27,85,85,58500000,Netherlands,FC Barcelona,CF,Right
3,Jamie Vardy,34,86,86,33000000,England,Leicester City,ST,Right
4,Gabriel Fernando de Jesus,24,83,86,49500000,Brazil,Manchester City,RW,Right
