In [11]:
import pandas as pd
import numpy as np
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.special import inv_boxcox

In [12]:
fifa_data = pd.read_csv('data_mid_bootcamp_project_FIFA_MoneyBall-master/fifa21_male2.csv')
fifa_data.columns

Index(['ID', 'Name', 'Age', 'OVA', 'Nationality', 'Club', 'BOV', 'BP',
       'Position', 'Player Photo',
       ...
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'GK', 'Gender'],
      dtype='object', length=107)

## 1. choosing the columns we want

In [14]:
data = fifa_data[['ID',
 'Name',
 'Age',
 'OVA',
 'BOV',
 'POT',
 'Growth',
 'Value',
 'Total Stats',
 'Base Stats',
 'Composure','Stamina','Strength','Reactions','Power']]
data





Unnamed: 0,ID,Name,Age,OVA,BOV,POT,Growth,Value,Total Stats,Base Stats,Composure,Stamina,Strength,Reactions,Power
0,2,G. Pasquale,33,69,71,69,0,€625K,1929,408,,69,68,69,347
1,16,Luis García,37,71,70,71,0,€600K,1906,385,79.0,64,60,65,324
2,27,J. Cole,33,71,71,71,0,€1.1M,1770,354,,29,56,59,284
3,36,D. Yorke,36,68,70,82,14,€0,1348,369,,51,66,55,239
4,41,Iniesta,36,81,82,81,0,€5.5M,2014,420,89.0,58,62,75,297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17120,259088,A. Medioub,22,64,66,69,5,€550K,1367,302,46.0,59,74,48,279
17121,259090,W. Rickard,18,56,58,66,10,€130K,1529,319,46.0,53,57,54,271
17122,259091,C. Barrett,18,56,56,67,11,€130K,1430,303,43.0,53,54,59,233
17123,259101,J. Gazibegović,20,62,62,73,11,€475K,1579,335,54.0,57,48,56,261


# Standardizing

In [15]:
def cleaning_0(data): 
    cols = [col_name.lower().replace(' ', '_') for col_name in data.columns]
    data.columns = cols
    question1 = input("Are there columns to drop? (Y/N)")
    while question1 == "Y":
        which_one = input("Enter column to drop (None: script runs further):")
        if which_one != "None":
            data = data.drop([which_one], axis=1)
            question2 = input("Are there more columns to drop? (Y/N)")
            if question2 == "Y":
                question1 = question2
            elif question2 == "N":
                print("No more columns dropped.")
                break
            else:
                print("Error: input must be Y or N!")
                question2 = input("Are there more columns to drop? (Y/N)")   
        elif which_one == "None":
            break   
    if question1 == "N":
        print("No columns dropped.")
    elif question1 != "Y" and question1 != "N":
        print("Error: input must be Y or N!")
        question1 = input("Are there columns to drop? (Y/N)")
        while question1 == "Y":
            which_one = input("Enter column to drop (None: continue without dropping or random key: repeat question):")
            if which_one == "None":
                break
            elif which_one != "None":
                data = data.drop([which_one], axis=1)
                question2 = input("Are there more columns to drop? (Y/N)")
                if question2 == "Y":
                    question1 = question2
                elif question2 == "N":
                    print("No more columns dropped.")
                    break
                else:
                    print("Error: input must be Y or N!")
                    question2 = input("Are there more columns to drop? (Y/N)")
        if question1 == "N":
            print("No columns dropped.")
        elif question1 != "Y" and question1 != "N":
            ("Error: input must be Y or N! To drop columns re-run function. Standardisation nevertheless complete")
            question1 = input("Are there columns to drop? (Y/N)")
    ordered_columns = sorted(data.columns, reverse=False)
    data = data[ordered_columns]
    return data

In [16]:
cleaning_0(data)

No columns dropped.


Unnamed: 0,age,base_stats,bov,composure,growth,id,name,ova,pot,power,reactions,stamina,strength,total_stats,value
0,33,408,71,,0,2,G. Pasquale,69,69,347,69,69,68,1929,€625K
1,37,385,70,79.0,0,16,Luis García,71,71,324,65,64,60,1906,€600K
2,33,354,71,,0,27,J. Cole,71,71,284,59,29,56,1770,€1.1M
3,36,369,70,,14,36,D. Yorke,68,82,239,55,51,66,1348,€0
4,36,420,82,89.0,0,41,Iniesta,81,81,297,75,58,62,2014,€5.5M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17120,22,302,66,46.0,5,259088,A. Medioub,64,69,279,48,59,74,1367,€550K
17121,18,319,58,46.0,10,259090,W. Rickard,56,66,271,54,53,57,1529,€130K
17122,18,303,56,43.0,11,259091,C. Barrett,56,67,233,59,53,54,1430,€130K
17123,20,335,62,54.0,11,259101,J. Gazibegović,62,73,261,56,57,48,1579,€475K


## Dealing with Null values

In [17]:
data.isnull().sum()


id               0
name             0
age              0
ova              0
bov              0
pot              0
growth           0
value            0
total_stats      0
base_stats       0
composure      423
stamina          0
strength         0
reactions        0
power            0
dtype: int64

In [23]:
data["composure"] = data["composure"].isna().replace('',data['composure'].mean())
data['composure']

0         NaN
1        79.0
2         NaN
3         NaN
4        89.0
         ... 
17120    46.0
17121    46.0
17122    43.0
17123    54.0
17124    57.0
Name: composure, Length: 17125, dtype: float64

In [31]:
data['composure'][data.composure.isna()]

Series([], Name: composure, dtype: float64)

In [30]:
data['composure'][data.composure.isna()]=data['composure'].median() # replace NaN with median as it is where the values are closer to the center


# begin exploring