In [11]:
import pandas as pd
import numpy as np
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score     

In [12]:
# Read the FIFA 2018 per-team match statistics from disk and preview the frame.
fifa_data = pd.read_csv(filepath_or_buffer="data/fifa_dataset/FIFA_2018_Statistics.csv")
fifa_data

Unnamed: 0,Date,Team,Opponent,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,...,Yellow Card,Yellow & Red,Red,Man of the Match,1st Goal,Round,PSO,Goals in PSO,Own goals,Own goal Time
0,14-06-2018,Russia,Saudi Arabia,5,40,13,7,3,3,6,...,0,0,0,Yes,12.0,Group Stage,No,0,,
1,14-06-2018,Saudi Arabia,Russia,0,60,6,0,3,3,2,...,0,0,0,No,,Group Stage,No,0,,
2,15-06-2018,Egypt,Uruguay,0,43,8,3,3,2,0,...,2,0,0,No,,Group Stage,No,0,,
3,15-06-2018,Uruguay,Egypt,1,57,14,4,6,4,5,...,0,0,0,Yes,89.0,Group Stage,No,0,,
4,15-06-2018,Morocco,Iran,0,64,13,3,6,4,5,...,1,0,0,No,,Group Stage,No,0,1.0,90.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,11-07-2018,England,Croatia,1,46,11,1,6,4,4,...,1,0,0,No,5.0,Semi- Finals,No,0,,
124,14-07-2018,Belgium,England,2,43,12,4,3,5,4,...,1,0,0,Yes,4.0,3rd Place,No,0,,
125,14-07-2018,England,Belgium,0,57,15,5,7,3,5,...,2,0,0,No,,3rd Place,No,0,,
126,15-07-2018,France,Croatia,4,39,8,6,1,1,2,...,2,0,0,Yes,18.0,Final,No,0,1.0,18.0


In [13]:
# Candidate model features: every column whose dtype is exactly int64.
# (The original whitelist listed np.int64 twice; a single comparison selects
# the same columns and makes the intent unambiguous.)
feature_names = [
    column
    for column, column_dtype in fifa_data.dtypes.items()
    if column_dtype == np.int64
]
feature_names

['Goal Scored',
 'Ball Possession %',
 'Attempts',
 'On-Target',
 'Off-Target',
 'Blocked',
 'Corners',
 'Offsides',
 'Free Kicks',
 'Saves',
 'Pass Accuracy %',
 'Passes',
 'Distance Covered (Kms)',
 'Fouls Committed',
 'Yellow Card',
 'Yellow & Red',
 'Red',
 'Goals in PSO']

In [14]:
# Columns of the raw frame that were not picked up as features.
set(fifa_data.columns).difference(feature_names)

{'1st Goal',
 'Date',
 'Man of the Match',
 'Opponent',
 'Own goal Time',
 'Own goals',
 'PSO',
 'Round',
 'Team'}

In [15]:
# Load data and split
fifa_data = pd.read_csv("data/fifa_dataset/FIFA_2018_Statistics.csv")

# Only the int64 columns are used as features; every other column (text
# columns and columns that are not stored as int64) is left out of the model.
feature_names = [
    column
    for column, column_dtype in fifa_data.dtypes.items()
    if column_dtype == np.int64
]

# Take an explicit copy of the selected columns so that encoding the target
# below writes to an independent frame instead of a slice of the raw data
# (avoids pandas' SettingWithCopyWarning).
fifa_data = fifa_data[feature_names + ["Man of the Match"]].copy()

# Encode the target: "Yes" -> 1, anything else (including missing) -> 0.
fifa_data["Man of the Match"] = (fifa_data["Man of the Match"] == "Yes").astype("int64")

fifa_train, fifa_test = train_test_split(fifa_data, test_size=0.2, random_state=42)
# Sanity check: the split must partition the cleaned frame without losing rows.
assert len(fifa_train) + len(fifa_test) == len(fifa_data)

# Persist the cleaned splits so other notebooks/scripts can reuse them.
fifa_train.to_parquet("data/fifa_dataset/train_cleaned.parquet")
fifa_test.to_parquet("data/fifa_dataset/test_cleaned.parquet")

# Separate feature matrices from the target vector for each split.
fifa_x_train = fifa_train[feature_names]
fifa_x_test = fifa_test[feature_names]

fifa_y_train = fifa_train["Man of the Match"]
fifa_y_test = fifa_test["Man of the Match"]

In [16]:
# Display the training split: the feature columns plus the encoded
# "Man of the Match" target.
fifa_train

Unnamed: 0,Goal Scored,Ball Possession %,Attempts,On-Target,Off-Target,Blocked,Corners,Offsides,Free Kicks,Saves,Pass Accuracy %,Passes,Distance Covered (Kms),Fouls Committed,Yellow Card,Yellow & Red,Red,Goals in PSO,Man of the Match
70,1,32,8,2,5,1,1,1,12,3,69,226,93,16,2,0,0,0,0
78,1,41,17,6,8,3,10,1,12,0,73,324,103,10,3,0,0,0,1
47,0,34,4,0,4,0,1,3,14,7,73,271,109,11,1,0,0,0,0
0,5,40,13,7,3,3,6,3,11,0,78,306,118,22,0,0,0,0,1
12,0,52,18,6,7,5,3,5,21,2,85,394,104,10,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,3,56,24,8,10,6,10,1,10,3,87,621,108,13,0,0,0,0,1
14,2,54,11,2,7,2,6,2,17,2,84,462,104,20,2,0,0,0,1
92,1,36,9,4,4,1,0,4,20,5,77,284,102,18,3,0,0,0,0
51,2,58,20,5,8,7,7,3,17,2,87,547,112,12,1,0,0,0,1


In [17]:
# Fit a random forest with a fixed seed on the training split, then score its
# predictions on the held-out test split.
fifa_model = RandomForestClassifier(random_state=42).fit(fifa_x_train, fifa_y_train)
pre_trained_predictions = fifa_model.predict(fifa_x_test)
pre_trained_accuracy = accuracy_score(
    y_true=fifa_y_test,
    y_pred=pre_trained_predictions,
)
print("Pre-Trained Accuracy:", pre_trained_accuracy)

Pre-Trained Accuracy: 0.6538461538461539


In [18]:
# Serialise the fitted classifier to disk.
with open("data/fifa_dataset/RF.pkl", mode="wb") as model_file:
    pickle.dump(fifa_model, model_file)

In [19]:
# One human-readable description per model feature, listed in the same order
# as the feature columns of the training frame.
feature_desc = [
    'Number of goals scored by the team during the match.',
    'Percentage of ball possession by the team during the match.',
    'Number of attempts or shots taken by the team.',
    'Number of shots that were on target.',
    'Number of shots that went off target.',
    'Number of shots that were blocked by the opponent.',
    'Number of corner kicks taken by the team.',
    'Number of times the team was caught offside.',
    'Number of free kicks taken by the team.',
    "Number of saves made by the team's goalkeeper.",
    'Percentage of passes that successfully reached a teammate.',
    'Total number of passes made by the team.',
    "Total distance covered by the team's players during the match, in kilometers.",
    'Number of fouls committed by the team.',
    'Number of yellow cards received by the team.',
    'Number of yellow-red cards received by the team.',
    'Number of red cards received by the team.',
    'Number of goals scored by the team during the penalty shootout.'
]

# Take the feature names and their averages from the same Series so every
# average is guaranteed to be paired with the column it was computed from
# (previously names came from the test frame and averages from the train frame).
feature_means = fifa_x_train.mean()

# The descriptions are matched to features by position, so a length mismatch
# means the list above is out of date with the feature columns.
assert len(feature_desc) == len(feature_means), (
    f"feature_desc has {len(feature_desc)} entries but there are "
    f"{len(feature_means)} feature columns"
)

feature_desc_df = pd.DataFrame({
    "feature_name": feature_means.index.to_list(),
    "feature_average": feature_means.to_list(),
    "feature_desc": feature_desc,
})

# Free-text context describing the dataset, the target and the prediction task.
dataset_description="The dataset contains information about the football matches during the world cup. Every instance is a match played by a team and holds info about how well the team performed "
target_description="The target variable is whether the team received (1) or did not receive (0) the Man of the Match award"
task_description="Predict whether a team will win the Man of the Match award given all the other features"

# Bundle the text descriptions together with the per-feature summary table.
dataset_info = {
    "dataset_description": dataset_description,
    "target_description": target_description,
    "task_description": task_description,
    "feature_description": feature_desc_df,
}

# Persist the bundle next to the dataset files.
with open('data/fifa_dataset/dataset_info', 'wb') as f:
    pickle.dump(dataset_info, f)

In [20]:
# Display the per-feature summary table (name, training-set average, description).
feature_desc_df

Unnamed: 0,feature_name,feature_average,feature_desc
0,Goal Scored,1.303922,Number of goals scored by the team during the ...
1,Ball Possession %,50.107843,Percentage of ball possession by the team duri...
2,Attempts,12.333333,Number of attempts or shots taken by the team.
3,On-Target,3.784314,Number of shots that were on target.
4,Off-Target,5.176471,Number of shots that went off target.
5,Blocked,3.313725,Number of shots that were blocked by the oppon...
6,Corners,4.754902,Number of corner kicks taken by the team.
7,Offsides,1.382353,Number of times the team was caught offside.
8,Free Kicks,15.147059,Number of free kicks taken by the team.
9,Saves,2.529412,Number of saves made by the team's goalkeeper.
