In [2]:
import pandas as pd

In [5]:
# This data should come from the scraper after pipeline of processing
df = pd.read_csv("../data/raw/player_raw_data.csv")

# feature engineering

# dummy variable that replaces the sofifa_id
df["id"] = df.index
# expanding the WorkRate to Attack and Def, 
# where Work_Rate is = (H/M/L) / (H/M/L), the first entry is attacking, second is defending
df[["atk_workrate", "def_workrate"]] = df["work_rate"].str.split("/", expand=True)

processed = df

## Class

In [4]:
# imports for preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

class model(object):
    def __init__(self):
        self._data = processed

    def _prepare(self):
        # defining feature types
        categorical_feats = ["country", "club",
                             "best_position"]
        # ordinal features
        # And their according levels
        atk_workrate_levels = ["High", "Medium", "Low"]
        def_workrate_levels = [" High", " Medium", " Low"]
        ordinal_feats = ["atk_workrate", "def_workrate"]

        # binary features
        binary_feats = ["preferred_foot"]

        # to be dropped feats
        drop_feats = ["name", "first_name", "last_name", "work_rate", "id"] 

        # numeric_feats
        numeric_feats = list(set(df.columns)
                             - set(categorical_feats)
                             - set(drop_feats)
                             - set(binary_feats)
                             - set(ordinal_feats))
        assert set(categorical_feats + ordinal_feats + binary_feats
                   + drop_feats + numeric_feats) == set(df.columns)
        numeric_transformer = make_pipeline(StandardScaler())
        categorical_transformer = make_pipeline(
            OneHotEncoder(handle_unknown="ignore", sparse=False)
        )

        ordinal_transformer = make_pipeline(
            OrdinalEncoder(categories=[atk_workrate_levels, def_workrate_levels],
                           dtype=int)
        )

        binary_transformer = make_pipeline(
            OneHotEncoder(drop="if_binary", dtype=int)
        )
        ct = make_column_transformer(
            (numeric_transformer, numeric_feats),
            (categorical_transformer, categorical_feats),
            (ordinal_transformer, ordinal_feats),
            (binary_transformer, binary_feats),
            ("drop", drop_feats)
        )
        return ct.fit(df)

    def prepare(self):
        return self._prepare()

    @property
    def data(self):
        return self._data

## Others


In [6]:
# import 
import pandas as pd

# read data
df = pd.read_csv("../data/raw/player_raw_data.csv")

# feature engineering

# dummy variable that replaces the sofifa_id
df["id"] = df.index
# expanding the WorkRate to Attack and Def, 
# where Work_Rate is = (H/M/L) / (H/M/L), the first entry is attacking, second is defending
df[["atk_workrate", "def_workrate"]] = df["work_rate"].str.split("/", expand=True)
df["def_workrate"] = df["def_workrate"].replace(" ", "")
# defining feature types
categorical_feats = ["country", "club", 
                     "best_position"]
# ordinal features
# And their according levels
atk_workrate_levels = ["High", "Medium", "Low"]
def_workrate_levels = [" High", " Medium", " Low"]
ordinal_feats = ["atk_workrate", "def_workrate"]

# binary features
binary_feats = ["preferred_foot"]

# to be dropped feats
drop_feats = ["name", "first_name", "last_name", "work_rate", "id"] 

# numeric_feats
numeric_feats = list(set(df.columns) 
                     - set(categorical_feats) 
                     - set(drop_feats) 
                     - set(binary_feats)
                     - set(ordinal_feats))


assert set(categorical_feats + ordinal_feats + binary_feats + drop_feats + numeric_feats) == set(df.columns)

In [21]:
df.head()

Unnamed: 0,name,first_name,last_name,country,age,overall,potential,club,best_position,value,...,defensive_awareness,standing_tackle,sliding_tackle,diving,handling,kicking,reflexes,id,atk_workrate,def_workrate
0,Rúben Daniel Fonseca Macedo,Rúben,Macedo,Portugal,25,65,68,Clube Sport Marítimo,RW,950000,...,32,25,21,6,11,14,13,0,Medium,Medium
1,Naif Almas,Naif,Almas,Saudi Arabia,21,57,69,Al Fayha,CB,375000,...,57,57,54,12,15,10,12,1,Medium,Medium
2,Rakan Al Shamlan,Rakan,Shamlan,Saudi Arabia,22,59,68,Al Batin,LW,500000,...,38,43,45,13,6,7,14,2,Medium,Low
3,Erick Wiemberg,Erick,Wiemberg,Chile,27,69,70,Unión La Calera,LB,1500000,...,58,66,64,12,13,7,9,3,High,Medium
4,Nicolás Forastiero,Nicolás,Forastiero,Argentina,22,59,69,Argentinos Juniors,GK,450000,...,6,10,12,56,64,64,61,4,Medium,Medium


In [7]:
# imports for preprocessing
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

numeric_transformer = make_pipeline(StandardScaler())
categorical_transformer = make_pipeline(
    OneHotEncoder(handle_unknown="ignore", sparse=False)
)

ordinal_transformer = make_pipeline(
    OrdinalEncoder(categories=[atk_workrate_levels, def_workrate_levels], dtype=int)
)

binary_transformer = make_pipeline(
    OneHotEncoder(drop="if_binary", dtype=int)
)
ct = make_column_transformer(
    (numeric_transformer, numeric_feats),
    (categorical_transformer, categorical_feats),
    (ordinal_transformer, ordinal_feats),
    (binary_transformer, binary_feats),
    ("drop", drop_feats)
)
ct.fit(df)

# get new feature names

In [8]:
# New column names after transformation
#ct.named_transformers_["pipeline-1"].get_feature_names_out() 
ohe_columns = ct.named_transformers_["pipeline-2"].get_feature_names_out().tolist()

bin_columns = ct.named_transformers_["pipeline-4"].get_feature_names_out().tolist()
feature_names = numeric_feats + ohe_columns + bin_columns + ordinal_feats

# creating transformed data to do clustering
transformed = pd.DataFrame(data=ct.fit_transform(df), columns=feature_names)

# all_cols
feature_cols = transformed.columns.tolist()
transformed.head()


Unnamed: 0,shot_power,kicking,reflexes,agility,stamina,long_passing,sliding_tackle,fk_accuracy,weight,ball_control,...,best_position_LW,best_position_LWB,best_position_RB,best_position_RM,best_position_RW,best_position_RWB,best_position_ST,preferred_foot_Right,atk_workrate,def_workrate
0,0.579793,-0.663211,-0.283362,0.295787,0.213661,1.018823,0.52094,0.934567,-1.015698,0.579212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
1,-1.696649,-0.091337,-0.519755,-0.523034,-0.413758,-0.810603,0.52094,-1.165538,-0.597662,-1.116013,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,-0.374844,-0.21842,-0.578853,0.159317,-0.413758,-0.67993,-0.456218,-0.565508,-1.015698,-0.180716,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
3,-0.154543,-0.21842,-0.34246,0.364022,-0.037307,0.822813,0.374366,0.634552,-0.597662,0.696125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
4,0.50636,-0.599669,-0.34246,0.091082,-1.103918,0.88815,1.058377,0.754558,-1.433733,0.4623,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0


### KNearestNeighbor


In [24]:
df.columns.tolist()

['name',
 'first_name',
 'last_name',
 'country',
 'age',
 'overall',
 'potential',
 'club',
 'best_position',
 'value',
 'wage',
 'total_stats',
 'preferred_foot',
 'weak_foot',
 'skill_move',
 'work_rate',
 'height',
 'weight',
 'crossing',
 'finishing',
 'heading_accuracy',
 'short_passing',
 'volleys',
 'dribbling',
 'curve',
 'fk_accuracy',
 'long_passing',
 'ball_control',
 'acceleration',
 'sprint_speed',
 'agility',
 'reactions',
 'balance',
 'shot_power',
 'jumping',
 'stamina',
 'strength',
 'long_shots',
 'aggression',
 'interceptions',
 'positioning',
 'vision',
 'penalties',
 'composure',
 'defensive_awareness',
 'standing_tackle',
 'sliding_tackle',
 'diving',
 'handling',
 'kicking',
 'reflexes',
 'id',
 'atk_workrate',
 'def_workrate']

In [9]:
from sklearn.neighbors import NearestNeighbors

In [105]:
# from sklean.neighbors import NearestNeigbors
import os
class KnnRecommender:
    def __init__(self, path_data):
        """
        Requires path to load the data and use it to fit to model

        Parameters
        ----------
        path_data: str, data file path (could be local or on cloud)
        """
        self.path_data = path_data
        self.model = NearestNeighbors()
        self._data = None

    def set_model_params(self, n_neighbors=10, algorithm="brute",
                         metric="cosine", n_jobs=-1):
        """
        Set model params for sklearn.neighbors.NearestNeighbors

        Parameters
        ----------
        n_neighbors: int, default to 10

        algorithm: {'auto', 'ball_tree', 'kd_tree', 'brute'} default to "brute"

        metric: string or callable, default to 'cosine' here

        n_jobs: int or None, default to -1
        """
        self.set_model_params(**{
            'n_neighbors': n_neighbors,
            'algorithm': algorithm,
            'metric': metric,
            'n_jobs': n_jobs})

    def prep_data(self):
        """
        Prepare data for the recommender
        """
        # read data
        df_full = pd.read_csv(os.path.join(self.path_data))
        self._data = df_full

    @property
    def data(self):
        return self._data

In [106]:
recom = KnnRecommender(path_data=path)

In [96]:
path = "../data/raw/player_raw_data.csv"

In [10]:
# Fit a NearestNeightbors first 
# K could be 10 for total of players in a one team at match except the player itself
# K could also be 14, since there are 15 unique positions
K_recomendations = 5
nbrs = NearestNeighbors(n_neighbors=K_recomendations + 1, metric="cosine", algorithm="brute").fit(transformed)
dist, rank = nbrs.kneighbors(transformed)

In [11]:
similar_df = pd.DataFrame(columns=[f"rank_{i} " for i in range(1, K_recomendations + 1)],
                          index=df["id"].values,
                          data=rank[:,1:])
dist_df = pd.DataFrame(columns=[f"rank_{i} " for i in range(1, K_recomendations + 1)],
                          index=df["id"].values,
                          data=dist[:,1:])

# Assgining the name of similar player to each rank
for cols in list(similar_df):
    tg_col = similar_df[cols]
    new_value = df["id"].iloc[tg_col].tolist()
    similar_df[cols] = new_value

In [41]:
similar_df.head()

Unnamed: 0,rank_1,rank_2,rank_3,rank_4
0,299,2237,5958,2209
1,493,8960,9204,6246
2,1486,2662,3136,7412
3,7940,8494,6674,4782
4,3882,1861,7523,9004


In [42]:
# # Note these distances are always in monotonicallly increasing order,
# # since k nearest neighbors uses cosine distance to classify closeness of two ojects
# # in this case, players, wehre the smaller is the distance is better
dist_df.head()

Unnamed: 0,rank_1,rank_2,rank_3,rank_4
0,0.167262,0.177537,0.192105,0.199396
1,0.126701,0.133642,0.137792,0.140418
2,0.275572,0.303469,0.305354,0.305382
3,0.174796,0.204383,0.204447,0.205279
4,0.025261,0.025469,0.025812,0.026836


In [12]:
# Use this function to get similar player

# import 

# plotly needs to be added to requirements
# plotly==5.9.0
# ipywidgets as well
# jupyter-dash as well
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
import plotly.io as pio
pio.renderers.default = "colab"


def similar_player(similar_df, dist_df, player_id, **last_name):
    player_ids = similar_df.loc[int(player_id)]
    # player_name = df.query(f"id == {player_id}")["name"].values[0]

    # id_name_dict = df[['id', 'name']].set_index('id')['name'].to_dict()

    # Dataframe display
    display_col = ["name", "age", "overall", "potential", "value"] + categorical_feats + binary_feats
    display_df = pd.DataFrame({"id": player_ids}).merge(df[["id"]+display_col], how="left", on="id")[display_col]
    return display_df

#     # Bar chart 
#     # Might need to change here
#     # Reversing order?

#     Xaxis = dist_df.loc[player_id].values[::-1]
#     Yaxis = player_ids.map(id_name_dict).values[::-1]
#     fig = go.Figure(go.Bar(
#                x=Xaxis,
#                y=Yaxis,
#                orientation='h'))
#     fig.update_layout(title_text='Players similar to  " '+str(player_name)+' "')
#     fig.show()

In [17]:
# Checks
messi = df.query("last_name == 'Messi'").id.values[0]
ronaldo = df.query("name.str.contains('Ronaldo')").iloc[0].id
neymar = df.query("name.str.contains('Neymar')").id.values[0]

similar_player(similar_df, dist_df, messi)
# for id in [messi, ronaldo, neymar]:
#    similar_player(similar_df, dist_df, id)

Unnamed: 0,name,age,overall,potential,value,country,club,best_position,preferred_foot
0,Riyad Mahrez,30,86,86,65500000,Algeria,Manchester City,RW,Left
1,Antoine Griezmann,30,85,85,53000000,France,Atlético de Madrid,ST,Left
2,Memphis Depay,27,85,85,58500000,Netherlands,FC Barcelona,CF,Right
3,Pierre-Emerick Aubameyang,32,85,85,43500000,Gabon,FC Barcelona,ST,Right
4,Raheem Sterling,26,87,87,87000000,England,Manchester City,LW,Right


In [18]:
def make_recommendation(first_name, last_name, **id):
    p = df.query(f"first_name == '{first_name}' & last_name == '{last_name}'")
    return similar_player(similar_df, dist_df, p.id.values[0])

In [19]:
messi_df = make_recommendation("Lionel", "Messi")

In [20]:
messi_df

Unnamed: 0,name,age,overall,potential,value,country,club,best_position,preferred_foot
0,Riyad Mahrez,30,86,86,65500000,Algeria,Manchester City,RW,Left
1,Antoine Griezmann,30,85,85,53000000,France,Atlético de Madrid,ST,Left
2,Memphis Depay,27,85,85,58500000,Netherlands,FC Barcelona,CF,Right
3,Pierre-Emerick Aubameyang,32,85,85,43500000,Gabon,FC Barcelona,ST,Right
4,Raheem Sterling,26,87,87,87000000,England,Manchester City,LW,Right


In [1]:
messi_recomendations = pd.to_csv(make_recommendation("Lionel", "Messi"), "messi.csv")

NameError: name 'pd' is not defined