# FPL Machine Learning: Binary Classification problem

In [419]:
#Fantasy Premier League machine learning project applying what I learnt while studying the Iris dataset.
#The aim will be to produce a model which can predict whether a given player has performed above or below average.
#I will be using the data set 'cleaned_players.csv' by Vaastav which I scraped from github.
#

In [420]:
#First lets import our dependancies and have a look at the data.
import pandas as pd
import numpy as np
import io
import requests
url = 'https://raw.githubusercontent.com/vaastav/Fantasy-Premier-League/master/data/2019-20/cleaned_players.csv'
read_data = requests.get(url).content
print(read_data)


b"first_name,second_name,goals_scored,assists,total_points,minutes,goals_conceded,creativity,influence,threat,bonus,bps,ict_index,clean_sheets,red_cards,yellow_cards,selected_by_percent,now_cost\r\nShkodran,Mustafi,0,2,19,440,7,13.2,128.8,79.0,1,101,22.2,1,0,0,0.3,51\r\nH\xc3\xa9ctor,Beller\xc3\xadn,1,0,17,442,8,28.1,103.8,28.0,3,81,16.1,1,0,2,0.8,54\r\nSead,Kolasinac,0,2,34,1086,18,156.3,190.0,67.0,1,219,41.2,2,0,3,0.5,52\r\nAinsley,Maitland-Niles,0,2,36,1210,19,155.4,261.8,37.0,3,216,44.9,3,1,3,2.5,46\r\nSokratis,Papastathopoulos,2,0,52,1606,25,31.8,431.6,106.0,5,286,57.1,3,0,5,1.4,49\r\nNacho,Monreal,0,0,10,270,4,40.8,47.2,12.0,1,54,10.0,1,0,0,0.2,50\r\nLaurent,Koscielny,0,0,0,0,0,0.0,0.0,0.0,0,0,0.0,0,0,0,0.1,50\r\nKonstantinos,Mavropanos,0,0,0,0,0,0.0,0.0,0.0,0,0,0.0,0,0,0,0.0,44\r\nCarl,Jenkinson,0,0,0,0,0,0.0,0.0,0.0,0,0,0.0,0,0,0,0.0,45\r\nRob,Holding,0,0,2,85,3,0.4,13.4,2.0,0,13,1.6,0,0,0,0.1,45\r\nPierre-Emerick,Aubameyang,14,3,129,2033,31,335.1,616.2,883.0,24,484,183.3,5,1,3

In [421]:
#Looks a bit messy! Lets clean it up quickly and have a look at the first 5 names, notice they are ordered by team.
address = pd.read_csv(io.StringIO(read_data.decode('utf-8')))
print(address.head())


  first_name       second_name  goals_scored  assists  total_points  minutes  \
0   Shkodran           Mustafi             0        2            19      440   
1     Héctor          Bellerín             1        0            17      442   
2       Sead         Kolasinac             0        2            34     1086   
3    Ainsley    Maitland-Niles             0        2            36     1210   
4   Sokratis  Papastathopoulos             2        0            52     1606   

   goals_conceded  creativity  influence  threat  bonus  bps  ict_index  \
0               7        13.2      128.8    79.0      1  101       22.2   
1               8        28.1      103.8    28.0      3   81       16.1   
2              18       156.3      190.0    67.0      1  219       41.2   
3              19       155.4      261.8    37.0      3  216       44.9   
4              25        31.8      431.6   106.0      5  286       57.1   

   clean_sheets  red_cards  yellow_cards  selected_by_percent  now_c

In [422]:
#Removing any players who have not played a single minute this season.
address = address[address.minutes != 0]
address

Unnamed: 0,first_name,second_name,goals_scored,assists,total_points,minutes,goals_conceded,creativity,influence,threat,bonus,bps,ict_index,clean_sheets,red_cards,yellow_cards,selected_by_percent,now_cost
0,Shkodran,Mustafi,0,2,19,440,7,13.2,128.8,79.0,1,101,22.2,1,0,0,0.3,51
1,Héctor,Bellerín,1,0,17,442,8,28.1,103.8,28.0,3,81,16.1,1,0,2,0.8,54
2,Sead,Kolasinac,0,2,34,1086,18,156.3,190.0,67.0,1,219,41.2,2,0,3,0.5,52
3,Ainsley,Maitland-Niles,0,2,36,1210,19,155.4,261.8,37.0,3,216,44.9,3,1,3,2.5,46
4,Sokratis,Papastathopoulos,2,0,52,1606,25,31.8,431.6,106.0,5,286,57.1,3,0,5,1.4,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
611,Leander,Dendoncker,2,0,62,1998,29,93.1,389.2,190.0,6,309,67.2,4,0,2,9.8,44
612,Jesús,Vallejo Lázaro,0,0,1,162,6,7.0,35.6,0.0,0,22,4.3,0,0,1,0.2,46
613,Patrick,Cutrone,2,0,23,292,6,35.6,85.2,155.0,0,69,27.5,1,0,0,0.5,55
614,Pedro,Lomba Neto,2,2,37,617,8,132.9,147.4,190.0,0,126,46.9,0,0,2,0.2,50


In [423]:
#I will remove total points earned and minutes played, and then try to predict the points per minute score of each player using what is left.
#This should work well as a binary classification problem because much of this information will be highly indicative of how many points a player has earned.
#If a player has scored a lot of goals, he likely has a lot of points etc.


In [424]:
#Calculating the points per minute.
I = range(0,len(address))
J = list(I)
for i in I:
    J[i] = address.iloc[i]['total_points'] / address.iloc[i]['minutes']
address.insert(18, 'points_per_minute', J)
print(address.head())

  first_name       second_name  goals_scored  assists  total_points  minutes  \
0   Shkodran           Mustafi             0        2            19      440   
1     Héctor          Bellerín             1        0            17      442   
2       Sead         Kolasinac             0        2            34     1086   
3    Ainsley    Maitland-Niles             0        2            36     1210   
4   Sokratis  Papastathopoulos             2        0            52     1606   

   goals_conceded  creativity  influence  threat  bonus  bps  ict_index  \
0               7        13.2      128.8    79.0      1  101       22.2   
1               8        28.1      103.8    28.0      3   81       16.1   
2              18       156.3      190.0    67.0      1  219       41.2   
3              19       155.4      261.8    37.0      3  216       44.9   
4              25        31.8      431.6   106.0      5  286       57.1   

   clean_sheets  red_cards  yellow_cards  selected_by_percent  now_c

In [425]:
#we now have a column to our dataset containing the points per minute score of each player, now lets find the mean.
total_ppm = address['points_per_minute'].sum()

avg_ppm = total_ppm / len(address)
        
print(avg_ppm)
#

0.05035836240509932


In [426]:
#Now lets add a column telling us if a player is above or below average.
#Call this column 'T' and enter 0 if the player is below average, and 1 if he is above or equal to the average.
K = range(0,len(address))
T = list(K)
for i in K:
    if address.iloc[i]['points_per_minute'] / avg_ppm >= 1 :
        T[i] = 1
    else:
        T[i] = 0
address.insert(19, 'T', T)

print(address.head())
#

  first_name       second_name  goals_scored  assists  total_points  minutes  \
0   Shkodran           Mustafi             0        2            19      440   
1     Héctor          Bellerín             1        0            17      442   
2       Sead         Kolasinac             0        2            34     1086   
3    Ainsley    Maitland-Niles             0        2            36     1210   
4   Sokratis  Papastathopoulos             2        0            52     1606   

   goals_conceded  creativity  influence  threat  bonus  bps  ict_index  \
0               7        13.2      128.8    79.0      1  101       22.2   
1               8        28.1      103.8    28.0      3   81       16.1   
2              18       156.3      190.0    67.0      1  219       41.2   
3              19       155.4      261.8    37.0      3  216       44.9   
4              25        31.8      431.6   106.0      5  286       57.1   

   clean_sheets  red_cards  yellow_cards  selected_by_percent  now_c

In [427]:
#Now lets remove the relevent columns, so the dataset contains no actual data on points earned.
address_1 = address.drop(['total_points', 'minutes', 'points_per_minute', 'T'], 1)
print(address_1.head())

#This is the dataset we use to make the model, T is our target. This cell is mainly just for visual clarity.
#

  first_name       second_name  goals_scored  assists  goals_conceded  \
0   Shkodran           Mustafi             0        2               7   
1     Héctor          Bellerín             1        0               8   
2       Sead         Kolasinac             0        2              18   
3    Ainsley    Maitland-Niles             0        2              19   
4   Sokratis  Papastathopoulos             2        0              25   

   creativity  influence  threat  bonus  bps  ict_index  clean_sheets  \
0        13.2      128.8    79.0      1  101       22.2             1   
1        28.1      103.8    28.0      3   81       16.1             1   
2       156.3      190.0    67.0      1  219       41.2             2   
3       155.4      261.8    37.0      3  216       44.9             3   
4        31.8      431.6   106.0      5  286       57.1             3   

   red_cards  yellow_cards  selected_by_percent  now_cost  
0          0             0                  0.3        51  
1 

In [428]:
#spliting the data into training and testing, roughly 80% will be training, 20% testing.

msk = np.random.rand(len(address)) < 0.8

train = address[msk]   #Splitting the whole data
test = address[~msk]


#removing the unwanted columns.
#We drop the names because they are strings and the model cant use them, we can use the index to identify players later
data_train = train.drop(['total_points', 'minutes','points_per_minute', 'T','first_name', 'second_name'], 1)   
data_test = test.drop(['total_points', 'minutes','points_per_minute', 'T', 'first_name', 'second_name'], 1)

T_train_1 = address[msk][['T']]
T_test_1 = address[~msk][['T']]
#Need these to be a series format in order for use of LogisticRegression().
T_train = T_train_1['T']
T_test = T_test_1['T']         

In [429]:
#Testing our data is formatted correctly:
type(T_train)  



pandas.core.series.Series

In [430]:
#It is a series as required. Now lets see how many of our samples are reserved for testing.
len(data_test)


98

In [431]:
#Great! now lets have a look at the dataset we have created for training:
data_train


Unnamed: 0,goals_scored,assists,goals_conceded,creativity,influence,threat,bonus,bps,ict_index,clean_sheets,red_cards,yellow_cards,selected_by_percent,now_cost
0,0,2,7,13.2,128.8,79.0,1,101,22.2,1,0,0,0.3,51
1,1,0,8,28.1,103.8,28.0,3,81,16.1,1,0,2,0.8,54
2,0,2,18,156.3,190.0,67.0,1,219,41.2,2,0,3,0.5,52
3,0,2,19,155.4,261.8,37.0,3,216,44.9,3,1,3,2.5,46
4,2,0,25,31.8,431.6,106.0,5,286,57.1,3,0,5,1.4,49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
607,0,1,2,43.5,6.2,12.0,0,31,6.2,0,0,0,0.1,47
608,4,7,28,555.7,588.0,454.0,14,437,159.9,4,0,0,19.9,58
612,0,0,6,7.0,35.6,0.0,0,22,4.3,0,0,1,0.2,46
613,2,0,6,35.6,85.2,155.0,0,69,27.5,1,0,0,0.5,55


# Logistic Regression Model

In [432]:
#I think we are ready to import the model and get going.
from sklearn.linear_model import LogisticRegression
log_reg_model = LogisticRegression()
log_reg_model.fit(data_train,T_train)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [433]:
#Testing the model:

log_reg_model_predictions = log_reg_model.predict(data_test)

#Lets have a look at the predictions against the real data:
print('T')
print(T_test)
print()
print('Predicted T')
print(log_reg_model_predictions)


T
11     0
39     0
41     1
47     1
55     0
      ..
581    0
596    0
600    1
611    0
614    1
Name: T, Length: 98, dtype: int64

Predicted T
[1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0
 1 1 1 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 1]


In [434]:
#By inspection we can see already that the first five players have been predicted correctly.
#We can use the following metrics to test the model mathematically.

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

#Checking the precision of our model:

print(classification_report(T_test, log_reg_model_predictions))


              precision    recall  f1-score   support

           0       0.88      0.93      0.91        72
           1       0.77      0.65      0.71        26

    accuracy                           0.86        98
   macro avg       0.83      0.79      0.81        98
weighted avg       0.85      0.86      0.85        98



In [435]:
#The model predicts to an accuracy of 85%! This is a decent score, now lets make a function to make a prediction for an individual.


In [436]:
#For this, lets train the model on the full data set, to hopefully improve our accuracy a little more.
full_data = address.drop(['total_points', 'minutes','first_name','points_per_minute', 'T', 'second_name'], 1)
full_predictions = log_reg_model.predict(full_data)
#Lets have a look at them.
print(full_predictions)

[1 0 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1
 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 0
 0 1 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 1 1 0 1 0 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1
 1 1 1 1 1 0 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0
 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 0
 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 1 1 1]


In [437]:
#Notice the long string of '1's in about the middle row, I suspect that may be where we reach the Liverpool team.


In [457]:
#Lets write the function:
def performance_LR(fn, sn):
    i = address.loc[address['first_name'] == fn].index[0]
    j = address.loc[address['second_name'] == sn].index[0]    #Finding the index from the name.
    if i == j:
        if full_predictions[i] == 1:       #Referring to the above.
            print('Player has above average points per minute')
        else:
            print('Player has below average points per minute')
    else:
        print('First name index does not match the last, likely there are multiple players with that name.')

In [458]:
#Lets test it using the first player, Mustafi, who somehow has an above average points per minute score.
performance_LR('Shkodran', 'Mustafi')

Player has above average points per minute


In [459]:
#At least the model can predict correctly, lets try another one:
performance_LR('Sead', 'Kolasinac')

Player has below average points per minute


In [441]:
#Another correct prediction!

# K Nearest Neighbours

In [442]:
#Now we will use a knn simple predictive model, again imported from sklearn.

from sklearn.neighbors import KNeighborsClassifier

#For simplicity we will start with K=1.
knn_1 = KNeighborsClassifier(n_neighbors = 1)

In [443]:
#We can use the same split as before:
knn_1.fit(data_train,T_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [444]:
#Lets test it on Mustafi again.
#Extracting his row as a new dataframe.
mustafi_test = address.loc[address['second_name'] == 'Mustafi']
#Removing the required columns.
mustafi_test_1 = mustafi_test.drop(['total_points', 'minutes', 'first_name', 'second_name', 'points_per_minute', 'T'], 1) 
#Calling the model, should return '1'.
knn_1.predict(mustafi_test_1)[0]

0

In [445]:
#Good, now lets use the testing set.
knn_1_predictions = knn_1.predict(data_test)
print(knn_1_predictions)

[1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 1 1 0 0 0 0
 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 0
 1 1 0 0 1 0 0 1 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1]


In [446]:
#Comparing by inspection again, to the actual data:
print(T_test)

11     0
39     0
41     1
47     1
55     0
      ..
581    0
596    0
600    1
611    0
614    1
Name: T, Length: 98, dtype: int64


In [447]:
#Again we see the first 5 values are predicted correctly. Lets evaluate using the classification report again.
print(classification_report(T_test, knn_1_predictions))

              precision    recall  f1-score   support

           0       0.88      0.82      0.85        72
           1       0.58      0.69      0.63        26

    accuracy                           0.79        98
   macro avg       0.73      0.76      0.74        98
weighted avg       0.80      0.79      0.79        98



In [448]:
#This is less accurate than the logistic regression model, but still not too bad with almost 80% accuracy.
#Lets see what happens if we try again with k = 3.

knn_3 = KNeighborsClassifier(n_neighbors = 3)

knn_3.fit(data_train,T_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [449]:
#Mustafi test:
mustafi_test = address.loc[address['second_name'] == 'Mustafi']
#Removing the required columns.
mustafi_test_1 = mustafi_test.drop(['total_points', 'minutes','first_name', 'second_name', 'points_per_minute', 'T'], 1) 
#Calling the model, should return '1'.
knn_3.predict(mustafi_test_1)[0]

0

In [450]:
knn_3_predictions = knn_3.predict(data_test)
print(knn_3_predictions)

[0 0 1 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0
 1 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0
 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0]


In [451]:
print(classification_report(T_test, knn_3_predictions))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89        72
           1       0.69      0.69      0.69        26

    accuracy                           0.84        98
   macro avg       0.79      0.79      0.79        98
weighted avg       0.84      0.84      0.84        98



In [452]:
#Our accuracy has improved marginally, lets try k =5.
knn_5 = KNeighborsClassifier(n_neighbors = 5)

knn_5.fit(data_train,T_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [453]:
knn_5_predictions = knn_5.predict(data_test)
print(classification_report(T_test, knn_5_predictions))

              precision    recall  f1-score   support

           0       0.89      0.88      0.88        72
           1       0.67      0.69      0.68        26

    accuracy                           0.83        98
   macro avg       0.78      0.78      0.78        98
weighted avg       0.83      0.83      0.83        98



In [None]:
#This time we are less accurate, lets say k = 3 was a good model.