# Football Tournament Prediction Model

## Preparing the data

### Importing Pandas and Numpy library

In [19]:
import pandas as pd
import numpy as np

### Importing the dataset

In [2]:
data = pd.read_csv('../dataset/dataset.csv')

### Printing the first 5 rows of the dataset

In [3]:
data.head(5)

Unnamed: 0,Name,Jersey Number,Club,Position,Nationality,Age,Appearances,Wins,Losses,Goals,...,Punches,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks,Yellow cards,Red cards,Fouls,Offsides
0,Bernd Leno,1.0,Arsenal,Goalkeeper,Germany,28.0,64,28,16,0,...,34.0,26.0,17.0,28.0,375.0,489.0,2,0,0,
1,Matt Macey,33.0,Arsenal,Goalkeeper,England,26.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,
2,Rúnar Alex Rúnarsson,13.0,Arsenal,Goalkeeper,Iceland,25.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,
3,Héctor Bellerín,2.0,Arsenal,Defender,Spain,25.0,160,90,37,7,...,,,,,,,23,0,125,8.0
4,Kieran Tierney,3.0,Arsenal,Defender,Scotland,23.0,16,7,5,1,...,,,,,,,2,0,9,0.0


### Replacing all the NULL values

In [4]:
data.fillna(0, inplace=True)

In [5]:
data.head(5)

Unnamed: 0,Name,Jersey Number,Club,Position,Nationality,Age,Appearances,Wins,Losses,Goals,...,Punches,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks,Yellow cards,Red cards,Fouls,Offsides
0,Bernd Leno,1.0,Arsenal,Goalkeeper,Germany,28.0,64,28,16,0,...,34.0,26.0,17.0,28.0,375.0,489.0,2,0,0,0.0
1,Matt Macey,33.0,Arsenal,Goalkeeper,England,26.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0
2,Rúnar Alex Rúnarsson,13.0,Arsenal,Goalkeeper,Iceland,25.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0
3,Héctor Bellerín,2.0,Arsenal,Defender,Spain,25.0,160,90,37,7,...,0.0,0.0,0.0,0.0,0.0,0.0,23,0,125,8.0
4,Kieran Tierney,3.0,Arsenal,Defender,Scotland,23.0,16,7,5,1,...,0.0,0.0,0.0,0.0,0.0,0.0,2,0,9,0.0


We are replacing all NULL values with zeroes to maintain data consistency in datatypes and to make the predictions accurate

### Listing all the columns of our dataset

In [6]:
data.columns.tolist()

['Name',
 'Jersey Number',
 'Club',
 'Position',
 'Nationality',
 'Age',
 'Appearances',
 'Wins',
 'Losses',
 'Goals',
 'Goals per match',
 'Headed goals',
 'Goals with right foot',
 'Goals with left foot',
 'Penalties scored',
 'Freekicks scored',
 'Shots',
 'Shots on target',
 'Shooting accuracy %',
 'Hit woodwork',
 'Big chances missed',
 'Clean sheets',
 'Goals conceded',
 'Tackles',
 'Tackle success %',
 'Last man tackles',
 'Blocked shots',
 'Interceptions',
 'Clearances',
 'Headed Clearance',
 'Clearances off line',
 'Recoveries',
 'Duels won',
 'Duels lost',
 'Successful 50/50s',
 'Aerial battles won',
 'Aerial battles lost',
 'Own goals',
 'Errors leading to goal',
 'Assists',
 'Passes',
 'Passes per match',
 'Big chances created',
 'Crosses',
 'Cross accuracy %',
 'Through balls',
 'Accurate long balls',
 'Saves',
 'Penalties saved',
 'Punches',
 'High Claims',
 'Catches',
 'Sweeper clearances',
 'Throw outs',
 'Goal Kicks',
 'Yellow cards',
 'Red cards',
 'Fouls',
 'Offsides

## Creating the prediction model

### Importing important libraries

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

### Choosing the features 

Here we will choose the features from the dataset that are appropriate to predict the goals scored by a player.

To do so, we will create a variable named 'features' that holds all the columns of the dataset that we want.

In [8]:
features = ['Age', 'Appearances', 'Wins', 'Goals per match', 'Headed goals', 'Goals with right foot', 'Goals with left foot', 'Penalties scored', 'Freekicks scored', 'Shots', 'Shots on target',]

### Choosing the target

As we are looking to predict the goals scored by each player, 'Goals' will be our target

In [9]:
target = ['Goals']

### Creating the features and target variables

In [10]:
X = data[features]
y = data[target]

### Splitting the training and testing sets

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Creating the Decision Tree Model

In [12]:
model = DecisionTreeRegressor(random_state=42)

### Training the model

In [13]:
model.fit(X_train, y_train)

### Making the predictions

In [14]:
y_pred_test = model.predict(X_test)
y_pred_train = model.predict(X_train)

## Calculating MSE and accuracy of the model

### Mean Squared Error (MSE)

In [32]:
mse = mean_squared_error(y_test, y_pred_test)

In [33]:
print("Mean Squared Error (MSE): ", mse)

Mean Squared Error (MSE):  8.817391304347826


### Accuracy

In [36]:
accuracy = accuracy_score(y_test, y_pred_test)

In [44]:
print("Accuracy: ", accuracy)

Accuracy:  0.7217391304347827


#### Accuracy in %

In [48]:
accuracy_perc = (accuracy/1)*100

In [49]:
accuracy_perc = format(accuracy_perc, '.2f')

In [52]:
print("Accuracy %: ", accuracy_perc, '%')

Accuracy %:  72.17 %


## Analyzing the predictions

In [15]:
print(y_pred_test)
print(y_pred_train)

[ 0.  0. 25.  1.  0.  0.  3. 46.  0.  0.  3.  0.  0.  0.  1.  0.  0.  1.
  1.  1.  1.  0.  0.  8.  2. 33.  0.  0.  0.  1.  0.  1. 12.  8.  0.  0.
  8. 13.  3.  0. 12.  1. 33. 14. 14. 15.  2.  5. 23.  0.  0.  0.  0. 12.
  1.  7.  0.  0.  0.  0.  0.  0.  4.  2.  0.  0. 21.  2.  0.  0.  0.  2.
 15.  0.  0.  0.  0.  0.  0.  2.  6.  8.  2.  0.  0.  0.  5.  0.  0.  0.
  6.  2.  0.  0. 50.  0. 23.  0.  0.  0. 22. 75.  2.  2.  0.  1. 39.  0.
  1. 12. 14.  0.  4. 13.  0.]
[  0.   0.   1.   0.   1.  15.   0.   0.   0.   4.  25.   5.   0.   2.
   0.   0.   0.  25.   7.   0.   0.  16.   0.   4.   0.   0.   0.   5.
   0.   0.  12.   2.   0.   8.  61.   1.   5.   0.   0.   0.   0.   9.
   1.   0.   0.   3.   0.  14.   7.   0.   0.   4.   0.   0.   6.  41.
   6.  15.   0.   1.  31.   1.   0.  22.  53.   6.   2.   0.   6.   1.
   1.   0.   2.   0.  13.   0.   0.   0.  30.   0.   4.   0.   3.   1.
   5.   0.   8.   7.   2.  10.   3.   7.   3.   0.   0.   0.   0.   0.
   3.   0.   0.  41.   0.   0.  11.

In [17]:
y_pred_test.shape

(115,)

In [18]:
y_pred_train.shape

(456,)

In [21]:
predictions = np.concatenate((y_pred_test, y_pred_train))

In [22]:
print(predictions)

[  0.   0.  25.   1.   0.   0.   3.  46.   0.   0.   3.   0.   0.   0.
   1.   0.   0.   1.   1.   1.   1.   0.   0.   8.   2.  33.   0.   0.
   0.   1.   0.   1.  12.   8.   0.   0.   8.  13.   3.   0.  12.   1.
  33.  14.  14.  15.   2.   5.  23.   0.   0.   0.   0.  12.   1.   7.
   0.   0.   0.   0.   0.   0.   4.   2.   0.   0.  21.   2.   0.   0.
   0.   2.  15.   0.   0.   0.   0.   0.   0.   2.   6.   8.   2.   0.
   0.   0.   5.   0.   0.   0.   6.   2.   0.   0.  50.   0.  23.   0.
   0.   0.  22.  75.   2.   2.   0.   1.  39.   0.   1.  12.  14.   0.
   4.  13.   0.   0.   0.   1.   0.   1.  15.   0.   0.   0.   4.  25.
   5.   0.   2.   0.   0.   0.  25.   7.   0.   0.  16.   0.   4.   0.
   0.   0.   5.   0.   0.  12.   2.   0.   8.  61.   1.   5.   0.   0.
   0.   0.   9.   1.   0.   0.   3.   0.  14.   7.   0.   0.   4.   0.
   0.   6.  41.   6.  15.   0.   1.  31.   1.   0.  22.  53.   6.   2.
   0.   6.   1.   1.   0.   2.   0.  13.   0.   0.   0.  30.   0.   4.
   0. 

### Putting the predictions into the dataset itself

In [23]:
data['Predicted Goals'] = predictions

In [24]:
data.head(5)

Unnamed: 0,Name,Jersey Number,Club,Position,Nationality,Age,Appearances,Wins,Losses,Goals,...,High Claims,Catches,Sweeper clearances,Throw outs,Goal Kicks,Yellow cards,Red cards,Fouls,Offsides,Predicted Goals
0,Bernd Leno,1.0,Arsenal,Goalkeeper,Germany,28.0,64,28,16,0,...,26.0,17.0,28.0,375.0,489.0,2,0,0,0.0,0.0
1,Matt Macey,33.0,Arsenal,Goalkeeper,England,26.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0.0
2,Rúnar Alex Rúnarsson,13.0,Arsenal,Goalkeeper,Iceland,25.0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,25.0
3,Héctor Bellerín,2.0,Arsenal,Defender,Spain,25.0,160,90,37,7,...,0.0,0.0,0.0,0.0,0.0,23,0,125,8.0,1.0
4,Kieran Tierney,3.0,Arsenal,Defender,Scotland,23.0,16,7,5,1,...,0.0,0.0,0.0,0.0,0.0,2,0,9,0.0,0.0
