# Classifying players from Premier League by the level of their teams

## Import basic libraries

In [7]:
import pandas as pd
import numpy as np

## Import FIFA 21 dataset

In [8]:
df = pd.read_csv("/Users/alexitzu23/Downloads/archive/players_21.csv")
df.head() #showing the top 5 players in FIFA 21 by overall rating

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club_name,...,lwb,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,33,1987-06-24,170,72,Argentina,FC Barcelona,...,66+3,65+3,65+3,65+3,66+3,62+3,52+3,52+3,52+3,62+3
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,35,1985-02-05,187,83,Portugal,Juventus,...,65+3,61+3,61+3,61+3,65+3,61+3,54+3,54+3,54+3,61+3
2,200389,https://sofifa.com/player/200389/jan-oblak/210002,J. Oblak,Jan Oblak,27,1993-01-07,188,87,Slovenia,Atlético Madrid,...,32+3,36+3,36+3,36+3,32+3,32+3,33+3,33+3,33+3,32+3
3,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,31,1988-08-21,184,80,Poland,FC Bayern München,...,64+3,65+3,65+3,65+3,64+3,61+3,60+3,60+3,60+3,61+3
4,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,28,1992-02-05,175,68,Brazil,Paris Saint-Germain,...,67+3,62+3,62+3,62+3,67+3,62+3,49+3,49+3,49+3,62+3


## Data Cleaning

In [9]:
# extract players' names and cont. variables
df = df[['club_name', 'overall', 'potential', 'value_eur', 'wage_eur', 'international_reputation',]]

# replace null values with mean
df = df.fillna(df.mean())

# selecting top 4, mid 4, and bottom 4 clubs
df_clubs = df[(df.club_name =='Liverpool') | (df.club_name =='Manchester City') | (df.club_name =='Manchester United') | (df.club_name == 'Chelsea') |
              (df.club_name =='Sheffield United') | (df.club_name =='Burnley') | (df.club_name =='Southampton') | (df.club_name == 'Everton') |
              (df.club_name == 'Aston Villa') |(df.club_name == 'Bournemouth') | (df.club_name == 'Watford') | (df.club_name == 'Norwich City')]

# categorizing by top 4, mid 4, and bottom 4
df_clubs.club_name = df_clubs.club_name.replace({'Liverpool':'top 4', 'Manchester City':'top 4', 'Manchester United':'top 4', 'Chelsea':'top 4',
                                       'Sheffield United':'mid 4', 'Burnley':'mid 4', 'Southampton':'mid 4', 'Everton':'mid 4',
                                       'Aston Villa':'bottom 4', 'Bournemouth':'bottom 4', 'Watford':'bottom 4', 'Norwich City':'bottom 4'})

df_clubs.club_name.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


top 4       132
mid 4       127
bottom 4    123
Name: club_name, dtype: int64

## Train & Test split

In [10]:
# X - all features except the club , y - clubs
X = df_clubs.iloc[:, 1:].values
y = df_clubs.club_name.values

# 80/20 train & test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 1)

# standardize data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

# scale both the train and the test set
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Execute logistic regression

In [11]:
from sklearn.linear_model import LogisticRegression

# initialize the logistic regression classifier
lr = LogisticRegression(random_state = 1, max_iter = 500)

# train the model
lr.fit(X_train, y_train)

# get prediction
y_pred = lr.predict(X_test)

## Model evaluation

In [12]:
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

print("-----------Confusion matrix-----------")
print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=True))
print("--------------------------------------")

print("Test set accuracy: {:.2f}".format(accuracy_score(y_test, y_pred)))
print("=======================================")

# 10 fold CV
acc = cross_val_score(lr, X_test, y_test, cv=10)
print("\n10-fold CV accuracy for each fold\n {}".format(acc))
print("\n--------------------------------------")
print("10-fold CV Average Accuracy: {:.2f}".format(acc.mean()))

-----------Confusion matrix-----------
Predicted  bottom 4  mid 4  top 4  All
True                                  
bottom 4         14      7      1   22
mid 4            10     16      6   32
top 4             1      4     18   23
All              25     27     25   77
--------------------------------------
Test set accuracy: 0.62

10-fold CV accuracy for each fold
 [0.875      0.75       0.75       0.875      0.5        0.75
 0.625      0.57142857 0.42857143 0.71428571]

--------------------------------------
10-fold CV Average Accuracy: 0.68
