In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

pd.set_option("display.max_columns", 100)
pd.set_option("display.max_rows", 30)


In [61]:
df_orig = pd.read_csv('../Data/players.csv')

In [62]:
df_orig.head()

Unnamed: 0,int_player_id,str_player_name,str_positions,dt_date_of_birth,int_height,int_weight,int_overall_rating,int_potential_rating,str_best_position,int_best_overall_rating,int_value,int_wage,int_team_id,str_nationality,int_crossing,int_finishing,int_heading_accuracy,int_short_passing,int_volleys,int_defensive_awareness,int_standing_tackle,int_sliding_tackle,int_diving,int_handling,int_kicking,int_gk_positioning,int_reflexes,int_aggression,int_interceptions,int_positioning,int_vision,int_penalties,int_composure,int_acceleration,int_sprint_speed,int_agility,int_reactions,int_balance,int_shot_power,int_jumping,int_stamina,int_strength,int_long_shots,str_preferred_foot,int_weak_foot,int_skill_moves,int_international_reputations,str_work_rate,str_body_type,int_dribbling,int_curve,int_fk_accuracy,int_long_passing,int_ball_control,str_player_speciality,str_trait
0,1,Lionel Andrés Messi Cuccittini,"RW, ST, CF",1987-06-24,170,72,93,93,RW,93,103500000,560000,5.0,Argentina,85,95,70,91,88,32,35,24,6,11,15,14,8,44,40,93,95,75,96,91,80,91,94,95,86,68,72,69,94,Left,4,4,5,Medium/ Low,Unique,96,93,94,91,96,"['Dribbler', 'Distance Shooter', 'FK Specialis...","['Finesse Shot', 'Long Shot Taker (AI)', 'Spee..."
1,2,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",1985-02-05,187,83,92,92,ST,92,63000000,220000,6.0,Portugal,84,95,90,82,86,28,32,24,7,11,15,14,11,63,29,95,82,84,95,87,91,87,95,71,94,95,84,78,93,Right,4,5,5,High/ Low,Unique,88,81,76,77,92,"['Aerial Threat', 'Dribbler', 'Distance Shoote...","['Power Free-Kick', 'Flair', 'Long Shot Taker ..."
2,3,Jan Oblak,GK,1993-01-07,188,87,91,93,GK,91,120000000,125000,8.0,Slovenia,13,11,15,43,13,27,12,18,87,92,78,90,90,34,19,11,65,11,68,43,60,67,88,49,59,78,41,78,12,Right,3,1,3,Medium/ Medium,Unique,12,13,14,40,30,,"['GK Long Throw', 'Comes For Crosses']"
3,4,Kevin De Bruyne,"CAM, CM",1991-06-28,181,70,91,91,CAM,91,129000000,370000,2.0,Belgium,94,82,55,94,82,68,65,53,15,13,5,10,13,76,66,88,94,84,91,77,76,78,91,76,91,63,89,74,91,Right,5,4,4,High/ High,Unique,88,85,83,93,92,"['Dribbler', 'Playmaker\xa0', 'Engine', 'Dista...","['Injury Prone', 'Leadership', 'Early Crosser'..."
4,5,Neymar da Silva Santos Júnior,"LW, CAM",1992-02-05,175,68,91,91,LW,91,132000000,270000,7.0,Brazil,85,87,62,87,87,35,30,29,9,9,15,15,11,51,36,87,90,92,93,94,89,96,91,83,80,62,81,50,84,Right,5,5,5,High/ Medium,Unique,95,88,89,81,95,"['Speedster', 'Dribbler', 'Playmaker\xa0', 'FK...","['Injury Prone', 'Flair', 'Speed Dribbler (AI)..."


In [63]:
df_orig.columns=[i.replace('int_','').replace('str_','').replace('dt_','') for i in df_orig.columns]
df_orig.columns

Index(['player_id', 'player_name', 'positions', 'date_of_birth', 'height',
       'weight', 'overall_rating', 'potential_rating', 'best_position',
       'best_overall_rating', 'value', 'wage', 'team_id', 'nationality',
       'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
       'defensive_awareness', 'standing_tackle', 'sliding_tackle', 'diving',
       'handling', 'kicking', 'gk_positioning', 'reflexes', 'aggression',
       'interceptions', 'positioning', 'vision', 'penalties', 'composure',
       'acceleration', 'sprspeed', 'agility', 'reactions', 'balance',
       'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
       'preferred_foot', 'weak_foot', 'skill_moves',
       'international_reputations', 'work_rate', 'body_type', 'dribbling',
       'curve', 'fk_accuracy', 'long_passing', 'ball_control',
       'player_speciality', 'trait'],
      dtype='object')

### Pre-processing to be done on columns
-  Drop unnecessary columns (such as date_of_birth,potential_rating,best_overall_rating,etc.)
- 'work-rate' : remove spaces in string<br>
- 'positions' : (Target variable)          
> remove spaces <br>
> convert the comma seperated string into list <br>
> split them into columns <br>
-  Add them all to a Custom Transformer and pass it to preprocessing pipeline


In [116]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomTarget(BaseEstimator, TransformerMixin):
    def __init__(self,columns_to_drop=[]):
        self.drop_col=columns_to_drop

    def fit(self, X, y=None):
        return self

    def transform(self, X,y):
        X['work_rate']=X['work_rate'].apply(lambda x : x.replace(' ',''))
        
        self.all_positions = list(X['best_position'].unique())
        target=pd.DataFrame(y.copy())
        target['positions']=target['positions'].apply(lambda x : x.replace(' ','').split(','))
        for i in self.all_positions:
            target[i]=target['positions'].apply(lambda x: int(i in x))
        target.drop(columns='positions',inplace=True)
        target = np.array(target)
        
        if len(self.drop_col)!=0:
            X.drop(columns=self.drop_col,inplace=True)
        
            
        return X,target
    


In [117]:
X.shape,y.shape

((19002, 55), (19002,))

In [118]:
columns_to_drop = ['player_id', 'player_name','date_of_birth','overall_rating', 
                   'potential_rating','best_position','value', 'wage', 'team_id',
                   'nationality','weak_foot', 'skill_moves','international_reputations',
                   'body_type','player_speciality', 'trait']

custom_transformer = CustomTarget(columns_to_drop=columns_to_drop)
pipeline = Pipeline([('custom_transformer',custom_transformer)])

In [123]:
pipeline.fit_transform(X)

TypeError: transform() missing 1 required positional argument: 'y'

In [64]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer


In [65]:
# Custom transformer
def transform_work_rate(df):
    df['work_rate']=df['work_rate'].apply(lambda x : x.replace(' ',''))
    return df

def transform_positions(df):
    all_positions = list(df['best_position'].unique())
    df['positions']=df['positions'].apply(lambda x : x.replace(' ','').split(','))
    for i in all_positions:
        df[i]=df['positions'].apply(lambda x: int(i in x))
    return df

work_rate_transformer = FunctionTransformer(transform_work_rate)
positions_transformer = FunctionTransformer(transform_positions)


# Drop transformer
drop_col=['player_id', 'player_name','date_of_birth','overall_rating', 'potential_rating','best_position',
          'value', 'wage', 'team_id', 'nationality','weak_foot', 'skill_moves','international_reputations',
          'body_type','player_speciality', 'trait']

drop_transformer = ColumnTransformer(transformers = [('drop_columns','drop',drop_col)],
                                     remainder='passthrough') 


In [67]:
pipeline = Pipeline([
                       ('work_rate',work_rate_transformer),
                       ('positions',positions_transformer),
                       ('drop_column',drop_transformer)
])

In [72]:
df = df_orig.copy()
df.head()

Unnamed: 0,player_id,player_name,positions,date_of_birth,height,weight,overall_rating,potential_rating,best_position,best_overall_rating,value,wage,team_id,nationality,crossing,finishing,heading_accuracy,short_passing,volleys,defensive_awareness,standing_tackle,sliding_tackle,diving,handling,kicking,gk_positioning,reflexes,aggression,interceptions,positioning,vision,penalties,composure,acceleration,sprspeed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,preferred_foot,weak_foot,skill_moves,international_reputations,work_rate,body_type,dribbling,curve,fk_accuracy,long_passing,ball_control,player_speciality,trait
0,1,Lionel Andrés Messi Cuccittini,"RW, ST, CF",1987-06-24,170,72,93,93,RW,93,103500000,560000,5.0,Argentina,85,95,70,91,88,32,35,24,6,11,15,14,8,44,40,93,95,75,96,91,80,91,94,95,86,68,72,69,94,Left,4,4,5,Medium/ Low,Unique,96,93,94,91,96,"['Dribbler', 'Distance Shooter', 'FK Specialis...","['Finesse Shot', 'Long Shot Taker (AI)', 'Spee..."
1,2,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",1985-02-05,187,83,92,92,ST,92,63000000,220000,6.0,Portugal,84,95,90,82,86,28,32,24,7,11,15,14,11,63,29,95,82,84,95,87,91,87,95,71,94,95,84,78,93,Right,4,5,5,High/ Low,Unique,88,81,76,77,92,"['Aerial Threat', 'Dribbler', 'Distance Shoote...","['Power Free-Kick', 'Flair', 'Long Shot Taker ..."
2,3,Jan Oblak,GK,1993-01-07,188,87,91,93,GK,91,120000000,125000,8.0,Slovenia,13,11,15,43,13,27,12,18,87,92,78,90,90,34,19,11,65,11,68,43,60,67,88,49,59,78,41,78,12,Right,3,1,3,Medium/ Medium,Unique,12,13,14,40,30,,"['GK Long Throw', 'Comes For Crosses']"
3,4,Kevin De Bruyne,"CAM, CM",1991-06-28,181,70,91,91,CAM,91,129000000,370000,2.0,Belgium,94,82,55,94,82,68,65,53,15,13,5,10,13,76,66,88,94,84,91,77,76,78,91,76,91,63,89,74,91,Right,5,4,4,High/ High,Unique,88,85,83,93,92,"['Dribbler', 'Playmaker\xa0', 'Engine', 'Dista...","['Injury Prone', 'Leadership', 'Early Crosser'..."
4,5,Neymar da Silva Santos Júnior,"LW, CAM",1992-02-05,175,68,91,91,LW,91,132000000,270000,7.0,Brazil,85,87,62,87,87,35,30,29,9,9,15,15,11,51,36,87,90,92,93,94,89,96,91,83,80,62,81,50,84,Right,5,5,5,High/ Medium,Unique,95,88,89,81,95,"['Speedster', 'Dribbler', 'Playmaker\xa0', 'FK...","['Injury Prone', 'Flair', 'Speed Dribbler (AI)..."


In [73]:
df = pipeline.fit_transform(df)

In [45]:
df_orig2 = df_orig.copy()
df_orig2['work_rate']=df_orig2['work_rate'].apply(lambda x : x.replace(' ',''))
df_orig2['positions']=df_orig2['positions'].apply(lambda x : x.replace(' ','').split(','))

df=df_orig2.copy()
df=df[['height',
       'weight', 'overall_rating', 
       'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
       'defensive_awareness', 'standing_tackle', 'sliding_tackle', 'diving',
       'handling', 'kicking', 'gk_positioning', 'reflexes', 'aggression',
       'interceptions', 'positioning', 'vision', 'penalties', 'composure',
       'acceleration', 'sprspeed', 'agility', 'reactions', 'balance',
       'shot_power', 'jumping', 'stamina', 'strength','long_shots',
       'work_rate', 'dribbling',
       'curve', 'fk_accuracy', 'long_passing', 'ball_control',
       'positions']]
df.columns

Index(['height', 'weight', 'overall_rating', 'crossing', 'finishing',
       'heading_accuracy', 'short_passing', 'volleys', 'defensive_awareness',
       'standing_tackle', 'sliding_tackle', 'diving', 'handling', 'kicking',
       'gk_positioning', 'reflexes', 'aggression', 'interceptions',
       'positioning', 'vision', 'penalties', 'composure', 'acceleration',
       'sprspeed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping',
       'stamina', 'strength', 'long_shots', 'work_rate', 'dribbling', 'curve',
       'fk_accuracy', 'long_passing', 'ball_control', 'positions'],
      dtype='object')

> **EDA**

### Multilabel Classification

In [None]:
# Pipeline
from sklearn.pipeline import Pipeline

# preprocessor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# estimator
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier



In [None]:
df.head()

In [None]:
# Numerical Features
num_col = numerical_features.copy()

num_steps = [
             ('impute',SimpleImputer(strategy='mean')),
             ('scaler', StandardScaler()),
            ]

num_pipeline = Pipeline(num_steps)

In [None]:
# Categorical features
cat_col = ['work_rate','']

cat_steps = [
             ('impute',SimpleImputer(strategy='constant')),
             ('encoder',OneHotEncoder())
            ]
cat_pipeline = Pipeline(cat_steps)

### LOGISTIC REGRESSION

In [None]:
X=df[numerical_features].copy()
temp=df['work_rate'].copy()
t=pd.get_dummies(temp,drop_first=True)
X=X.join(t)

y_orig=df['best_position'].copy()
mapping = {i:j for i,j in zip(y_orig.unique(),range(y_orig.nunique()))}
reverse_mapping={value:key for key, value in mapping.items()}
y=y_orig.map(mapping).copy()
y=np.array(y).reshape(-1,1)
# y=pd.get_dummies(y_orig,drop_first=True)

In [None]:
y

In [None]:
X.shape,y.shape

In [None]:
X.head()

In [None]:
numerical_features

In [None]:
dtc = DecisionTreeClassifier(max_depth = 2).fit(X, y)

dtc.fit(X,y)

In [None]:
y_pred = dtc.predict(X)

In [None]:
cm = confusion_matrix(y, y_pred)

In [None]:
y_pred

In [None]:
cm

In [None]:
y_pred_name = pd.Series(y_pred).map(reverse_mapping)

In [None]:
y_orig,y_pred_name

In [None]:
df_orig = pd.read_csv('../Data/players.csv')
df_orig.columns=[i.replace('int_','').replace('str_','').replace('dt_','') for i in df_orig.columns]

temp = df_orig[['height','weight', 'overall_rating', 
       'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
       'defensive_awareness', 'standing_tackle', 'sliding_tackle', 'diving',
       'handling', 'kicking', 'gk_positioning', 'reflexes', 'aggression',
       'interceptions', 'positioning', 'vision', 'penalties', 'composure',
       'acceleration', 'sprspeed', 'agility', 'reactions', 'balance',
       'shot_power', 'jumping', 'stamina', 'strength','long_shots',
       'work_rate', 'dribbling',
       'curve', 'fk_accuracy', 'long_passing', 'ball_control',
       'positions']].copy()
temp['positions'] = temp['positions'].apply(lambda x: x.replace(' ',''))
temp['positions'] = temp['positions'].apply(lambda x: x.split(','))
temp['work_rate'] = temp['work_rate'].apply(lambda x: x.replace(' ',''))

all_positions = list(df_orig['best_position'].unique())

In [None]:
temp

In [None]:
all_positions,len(all_positions)

In [None]:
temp['work_rate'].unique()

In [None]:
for i in all_positions:
    temp[i]=temp['positions'].apply(lambda x: int(i in x))

In [None]:
temp1=temp[[i for i in temp.columns if i not in ['positions','work_rate']]].copy()

In [None]:
X=temp1[['height', 'weight', 'overall_rating', 'crossing', 'finishing',
       'heading_accuracy', 'short_passing', 'volleys', 'defensive_awareness',
       'standing_tackle', 'sliding_tackle', 'diving', 'handling', 'kicking',
       'gk_positioning', 'reflexes', 'aggression', 'interceptions',
       'positioning', 'vision', 'penalties', 'composure', 'acceleration',
       'sprspeed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping',
       'stamina', 'strength', 'long_shots',  'dribbling', 'curve',
       'fk_accuracy', 'long_passing', 'ball_control' ]].copy()
y=temp1[['RW', 'ST', 'GK', 'CAM',
       'LW', 'CB', 'CDM', 'CF', 'LM', 'CM', 'RB', 'LB', 'RM', 'LWB', 'RWB']].copy()

In [None]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.linear_model import LogisticRegression


In [None]:
inp=np.array(X.iloc[0,:])
inp

In [None]:
classifier = ClassifierChain(LogisticRegression(C=1))
classifier.fit(X, y)
predictions = classifier.predict(X)

In [None]:
np.array(predictions.todense())

In [None]:
np.array(y)


In [None]:
np.array(y.iloc[0,:])

In [None]:
import pandas_profiling

In [6]:
df_orig

Unnamed: 0,player_id,player_name,positions,date_of_birth,height,weight,overall_rating,potential_rating,best_position,best_overall_rating,value,wage,team_id,nationality,crossing,finishing,heading_accuracy,short_passing,volleys,defensive_awareness,standing_tackle,sliding_tackle,diving,handling,kicking,gk_positioning,reflexes,aggression,interceptions,positioning,vision,penalties,composure,acceleration,sprspeed,agility,reactions,balance,shot_power,jumping,stamina,strength,long_shots,preferred_foot,weak_foot,skill_moves,international_reputations,work_rate,body_type,dribbling,curve,fk_accuracy,long_passing,ball_control,player_speciality,trait
0,1,Lionel Andrés Messi Cuccittini,"RW, ST, CF",1987-06-24,170,72,93,93,RW,93,103500000,560000,5.0,Argentina,85,95,70,91,88,32,35,24,6,11,15,14,8,44,40,93,95,75,96,91,80,91,94,95,86,68,72,69,94,Left,4,4,5,Medium/ Low,Unique,96,93,94,91,96,"['Dribbler', 'Distance Shooter', 'FK Specialis...","['Finesse Shot', 'Long Shot Taker (AI)', 'Spee..."
1,2,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",1985-02-05,187,83,92,92,ST,92,63000000,220000,6.0,Portugal,84,95,90,82,86,28,32,24,7,11,15,14,11,63,29,95,82,84,95,87,91,87,95,71,94,95,84,78,93,Right,4,5,5,High/ Low,Unique,88,81,76,77,92,"['Aerial Threat', 'Dribbler', 'Distance Shoote...","['Power Free-Kick', 'Flair', 'Long Shot Taker ..."
2,3,Jan Oblak,GK,1993-01-07,188,87,91,93,GK,91,120000000,125000,8.0,Slovenia,13,11,15,43,13,27,12,18,87,92,78,90,90,34,19,11,65,11,68,43,60,67,88,49,59,78,41,78,12,Right,3,1,3,Medium/ Medium,Unique,12,13,14,40,30,,"['GK Long Throw', 'Comes For Crosses']"
3,4,Kevin De Bruyne,"CAM, CM",1991-06-28,181,70,91,91,CAM,91,129000000,370000,2.0,Belgium,94,82,55,94,82,68,65,53,15,13,5,10,13,76,66,88,94,84,91,77,76,78,91,76,91,63,89,74,91,Right,5,4,4,High/ High,Unique,88,85,83,93,92,"['Dribbler', 'Playmaker\xa0', 'Engine', 'Dista...","['Injury Prone', 'Leadership', 'Early Crosser'..."
4,5,Neymar da Silva Santos Júnior,"LW, CAM",1992-02-05,175,68,91,91,LW,91,132000000,270000,7.0,Brazil,85,87,62,87,87,35,30,29,9,9,15,15,11,51,36,87,90,92,93,94,89,96,91,83,80,62,81,50,84,Right,5,5,5,High/ Medium,Unique,95,88,89,81,95,"['Speedster', 'Dribbler', 'Playmaker\xa0', 'FK...","['Injury Prone', 'Flair', 'Speed Dribbler (AI)..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18997,18998,张梦炫,CB,1999-04-26,177,70,47,52,CB,49,70000,1000,595.0,China PR,23,16,45,31,26,45,56,47,12,13,8,14,6,44,50,25,25,35,40,60,56,45,48,70,35,65,55,45,23,Right,2,2,1,Low/ Low,Normal (170-185),19,21,22,25,24,,
18998,18999,Vani Da Silva,ST,2003-03-30,171,58,47,67,CAM,51,130000,500,646.0,England,27,47,36,45,47,18,11,13,11,13,9,9,6,33,11,43,49,50,45,69,70,67,53,80,50,50,55,35,41,Right,2,2,1,Medium/ Medium,Lean (170-185),53,43,31,37,44,,
18999,19000,夏奥,CB,1999-02-11,178,66,47,55,CB,49,100000,1000,409.0,China PR,23,26,43,26,27,45,52,50,7,8,5,14,11,48,50,28,28,38,44,68,60,69,46,51,36,57,54,50,24,Right,2,2,1,Medium/ Medium,Lean (170-185),27,23,21,29,42,,
19000,19001,Ben Hough,CM,2003-06-22,175,65,47,67,CAM,51,130000,500,646.0,England,38,42,40,56,35,32,44,40,12,10,9,6,8,40,23,47,47,36,38,63,64,61,51,66,48,58,43,47,30,Right,2,2,1,Medium/ Medium,Lean (170-185),46,40,35,50,48,,
