In [2]:
#Import preprocessing libraries
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline as ImblearnPipeline
from sklearn.metrics import accuracy_score

In [3]:
#Loading the datasets
df = pd.read_csv('eda.csv')

In [4]:
#Dropping the potential column as a players potential cannot be known till he excels in a certain position as well as the interntional 
#reputation column as it should not be a determinant for a players best position
df = df.drop(columns = ['Potential', 'International Reputation'])

In [5]:
df['Weak Foot Rating']

0        4
1        4
2        4
3        5
4        4
        ..
18534    3
18535    3
18536    3
18537    3
18538    3
Name: Weak Foot Rating, Length: 18539, dtype: int64

In [6]:
#Info about datasets
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18539 entries, 0 to 18538
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Overall             18539 non-null  int64 
 1   Age                 18539 non-null  int64 
 2   Height(in cm)       18539 non-null  int64 
 3   Weight(in kg)       18539 non-null  int64 
 4   Weak Foot Rating    18539 non-null  int64 
 5   Skill Moves         18539 non-null  int64 
 6   Pace Total          18539 non-null  int64 
 7   Shooting Total      18539 non-null  int64 
 8   Passing Total       18539 non-null  int64 
 9   Dribbling Total     18539 non-null  int64 
 10  Defending Total     18539 non-null  int64 
 11  Physicality Total   18539 non-null  int64 
 12  Crossing            18539 non-null  int64 
 13  Finishing           18539 non-null  int64 
 14  Heading Accuracy    18539 non-null  int64 
 15  Short Passing       18539 non-null  int64 
 16  Curve               18

## Preprocessing

In [7]:
#Copying my datsframe
data =df.copy()

In [8]:
#Splitting the features and target columns
X = data.drop('Best Position', axis = 1)
y = data['Best Position']
data.columns

Index(['Overall', 'Age', 'Height(in cm)', 'Weight(in kg)', 'Weak Foot Rating',
       'Skill Moves', 'Pace Total', 'Shooting Total', 'Passing Total',
       'Dribbling Total', 'Defending Total', 'Physicality Total', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Curve',
       'Acceleration', 'Agility', 'Reactions', 'Balance', 'Shot Power',
       'Jumping', 'Stamina', 'Strength', 'Aggression', 'Vision', 'Penalties',
       'Composure', ' GoalkeeperKicking', 'Nationality', 'Preferred Foot',
       'Best Position'],
      dtype='object')

In [9]:
#Splitting numerical and categorical features
num_features = ['Overall', 'Age', 'Height(in cm)', 'Weight(in kg)', 'Weak Foot Rating',
       'Skill Moves', 'Pace Total', 'Shooting Total', 'Passing Total',
       'Dribbling Total', 'Defending Total', 'Physicality Total', 'Crossing',
       'Finishing', 'Heading Accuracy', 'Short Passing', 'Curve',
       'Acceleration', 'Agility', 'Reactions', 'Balance', 'Shot Power',
       'Jumping', 'Stamina', 'Strength', 'Aggression', 'Vision', 'Penalties',
       'Composure', ' GoalkeeperKicking']
cat_features  = ['Nationality', 'Preferred Foot']

In [10]:
#Creating a pipeline for transforming both categorical and numerical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [11]:
preprocessor = ColumnTransformer(transformers= [
    ('num', numeric_transformer, num_features),
    ('cat', categorical_transformer, cat_features)
    ],
      remainder = 'passthrough'                       
    )

In [15]:
#Classifying certain positions into 4 main categories Defender, Midfielder, Forward and Goalkeeper
y.value_counts()
defends = ['CB', 'RB', 'LB', 'RWB', 'LWB'] #Defenders
midfield = ['CAM', 'RM', 'CDM', 'CM','LM'] #Midfielders
forward = ['ST', 'RW', 'LW', 'CF'] #Forwards
keeper = ['GK'] #Goalkeeper
y_precise = []
for i in y:
    if i in defends:
        y_precise.append(i.replace(i, 'Defender'))
    elif i in midfield:
        y_precise.append(i.replace(i, 'Midfielder'))
    elif i in forward:
        y_precise.append(i.replace(i, 'Forward'))
    else:
        y_precise.append(i.replace(i, 'Goalkeeper'))

In [21]:
#NEw target column
y_prec = pd.Series(y_precise, name = 'Best Position')

In [23]:
#Total number for each of the four player position categories
y_prec.value_counts()

Midfielder    7058
Defender      6273
Forward       3147
Goalkeeper    2061
Name: Best Position, dtype: int64

In [25]:
#Transforming my target column for model training
target = {'Midfielder':0,
         'Defender':1,
         'Forward':2,
         'Goalkeeper': 3,
         }
y_trans = []
for i in y_prec:
    y_trans.append(target[i])

In [26]:
#target column for model training
y_tran = pd.Series(y_trans, name = 'Best Position')

## Model Training

In [28]:
#Importing models for training data
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.svm import SVC

In [29]:
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'XG Boost': XGBClassifier(),
    'SVM': SVC(kernel='linear')
}

list(models.keys())

['Logistic Regression', 'Decision Tree', 'Random Forest', 'XG Boost', 'SVM']

In [30]:
for i,classifier in enumerate(list(models.values())):
    
    resampling_pipeline = ImblearnPipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier)
    ])
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_tran, test_size=0.3, random_state=42)
    
    model = resampling_pipeline
    model.fit(X_train, y_train) #train model
    
    #Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    #Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')
    model_train_precision = precision_score(y_train, y_train_pred, average='weighted')
    model_train_recall = recall_score(y_train, y_train_pred, average='weighted')
    #model_train_rocauc_score = roc_auc_score(y_train, y_train_pred, average='weighted', multi_class='ovr')

    
    #Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')
    model_test_precision = precision_score(y_test, y_test_pred, average='weighted')
    model_test_recall = recall_score(y_test, y_test_pred, average='weighted')
    #model_test_rocauc_score = roc_auc_score(y_test, y_test_pred, average='weighted')
                                            
    print(list(models.keys())[i])
    
    print('Model peformance for Training set')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print("- F1 score: {:.4f}".format(model_train_f1))
                                            
    print("- Precision: {:.4f}".format(model_train_precision))
    print("- Recall: {:.4f}".format(model_train_recall))
    #print("- Roc Auc Score: {:.4f}".format(model_train_rocauc_score))
                                            
                                            
                                            
    print('---------------------------------------')
                                            
    print('Model performance for Test set')
    print("- Accuracy: {:.4f}".format(model_test_accuracy))
    print("- F1 score: {:.4f}".format(model_test_f1))
    print("- Precision: {:.4f}".format(model_test_precision))
    print("- Recall: {:.4f}".format(model_test_recall))
    #print("- Roc Auc Score: {:.4f}".format(model_test_rocauc_score))
                                            
                                            
    print('='*35)
    print('\n')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression
Model peformance for Training set
- Accuracy: 0.9224
- F1 score: 0.9224
- Precision: 0.9225
- Recall: 0.9224
---------------------------------------
Model performance for Test set
- Accuracy: 0.9157
- F1 score: 0.9156
- Precision: 0.9162
- Recall: 0.9157


Decision Tree
Model peformance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
---------------------------------------
Model performance for Test set
- Accuracy: 0.8637
- F1 score: 0.8638
- Precision: 0.8639
- Recall: 0.8637


Random Forest
Model peformance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
---------------------------------------
Model performance for Test set
- Accuracy: 0.9047
- F1 score: 0.9046
- Precision: 0.9084
- Recall: 0.9047


XG Boost
Model peformance for Training set
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
---------------------------------------
Model performance for T

## Hyperparameter Tuning

In [31]:
#Hyper parameter training for XGboost

#Parameters to be used for tuning XGboost model
params = {
    "learning_rate"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
    "max_depth"        : [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight" : [1, 3, 5, 7 ],
    "gamma"            : [0.0, 0.1, 0.2 , 0.3, 0.4],
    "colsample_bytree" : [0.3, 0.4, 0.5 , 0.7]
}

In [32]:
#Importing the randomized search that helps to select the best parameters
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [None]:
classifier=XGBClassifier()
random_search=RandomizedSearchCV(classifier,param_distributions=params,n_iter=5,scoring='roc_auc',n_jobs=-1,cv=5,verbose=3)
boss_tuning = ImblearnPipeline([
    ('preprocessor', preprocessor),
    ('tuning', random_search)
])
boss_tuning.fit(X_train,y_train)