In [1]:
# forked from https://www.kaggle.com/roshansharma/fifa-2019-data-analysis-and-visualization

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import warnings
warnings.filterwarnings("ignore")

from _utils.u_constant import PATH_ROOT

# Directory that holds this notebook's dataset; "data.csv" is read from it below.
# NOTE(review): plain string concatenation — assumes PATH_ROOT already ends with
# a path separator; confirm against _utils.u_constant.
path = PATH_ROOT + "Code projects/Python/kaggle/Fifa2019/"

In [80]:
# Read the raw player table and discard the "Unnamed: 0" column
# (presumably a row index that was written into the CSV — verify against the file).
data = pd.read_csv(path + "data.csv").drop(columns=["Unnamed: 0"])
print(data.shape)
data.head()

(18207, 88)


Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,https://cdn.sofifa.org/teams/2/light/241.png,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,https://cdn.sofifa.org/teams/2/light/45.png,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,https://cdn.sofifa.org/teams/2/light/73.png,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,https://cdn.sofifa.org/teams/2/light/11.png,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,https://cdn.sofifa.org/teams/2/light/10.png,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


In [81]:
# Count missing values per column and show the ten columns with the most gaps.
(
    data.isna()
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

Loaned From    16943
LWB             2085
LM              2085
CB              2085
LCB             2085
LB              2085
RWB             2085
RDM             2085
CDM             2085
LDM             2085
dtype: int64

In [96]:
# Display the column labels of `data`.
data.columns

Index(['ID', 'Name', 'Age', 'Photo', 'Nationality', 'Flag', 'Overall',
       'Potential', 'Club', 'Club Logo', 'Value', 'Wage', 'Special',
       'Preferred Foot', 'International Reputation', 'Weak Foot',
       'Skill Moves', 'Work Rate', 'Body Type', 'Real Face', 'Position',
       'Jersey Number', 'Joined', 'Loaned From', 'Contract Valid Until',
       'Height', 'Weight', 'LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
       'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
       'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'Crossing',
       'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
       'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
       'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
       'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
       'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
       'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiv

In [None]:

# General player attributes: identity, age, pay, footedness and reputation.
base_columns = [
    'Name', 'Age', 'Value', 'Wage',
    'Preferred Foot', 'Weak Foot', 'International Reputation',
]

# Per-skill columns, ending with the goalkeeper-specific (GK*) ones.
ability_columns = [
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
    'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
    'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
    'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
    'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle',
    'GKDiving', 'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes',
]

# Columns named after position codes, from 'LS' through 'RB'.
position_columns = [
    'LS', 'ST', 'RS',
    'LW', 'LF', 'CF', 'RF', 'RW',
    'LAM', 'CAM', 'RAM',
    'LM', 'LCM', 'CM', 'RCM', 'RM',
    'LWB', 'LDM', 'CDM', 'RDM', 'RWB',
    'LB', 'LCB', 'CB', 'RCB', 'RB',
]


## 1. 特征处理：变换与缺失值处理

### 1.1 特征转换

In [82]:
# Unit conversion: feet'inches => cm
def transform_ft(x):
    """Convert a height written as ``feet'inches`` (e.g. ``"5'7"``) to whole centimetres.

    Args:
        x: Height string with feet and inches separated by a single quote,
            or a missing value (NaN / None).

    Returns:
        The height in centimetres rounded to the nearest integer, or ``x``
        unchanged when it is missing.

    Raises:
        ValueError: If the string does not split into exactly two integers.
    """
    # pd.isna rather than `x is np.nan`: an identity test only matches the
    # np.nan singleton, so any other NaN float (or None) would reach .split().
    if pd.isna(x):
        return x
    ft, in_ = map(int, x.split("'"))
    return round((ft * 12 + in_) * 2.54)

# Unit conversion: lbs => kg
def transform_lbs(x):
    """Convert a weight written like ``"159lbs"`` to whole kilograms.

    Args:
        x: Weight string whose last three characters are the unit suffix,
            or a missing value (NaN / None).

    Returns:
        The weight in kilograms rounded to the nearest integer, or ``x``
        unchanged when it is missing.

    Raises:
        ValueError: If the text before the three-character suffix is not an integer.
    """
    # pd.isna rather than `x is np.nan`: an identity test only matches the
    # np.nan singleton, so any other NaN float (or None) would reach the slice.
    if pd.isna(x):
        return x
    lbs = int(x[:-3])  # drop the trailing three-character unit suffix
    return round(lbs * 0.45359)
        
# Extract the number: €xK => x (k€)
def transform_wage(x):
    """Strip the currency symbol and optional ``K`` suffix from a wage string.

    ``"€565K"`` becomes ``565`` and ``"€0"`` becomes ``0``. A value without the
    ``K`` suffix is returned as the bare integer after the first character,
    with no rescaling.

    Args:
        x: Wage string whose first character is the currency symbol,
            or a missing value (NaN / None).

    Returns:
        The integer amount, or ``x`` unchanged when it is missing.

    Raises:
        ValueError: If the remaining text is not an integer (e.g. a decimal
            or a different suffix).
    """
    # pd.isna rather than `x is np.nan`: an identity test only matches the
    # np.nan singleton, so any other NaN float (or None) would reach indexing.
    if pd.isna(x):
        return x
    if x[-1] == "K":
        return int(x[1:-1])
    return int(x[1:])

In [84]:
# Replace each raw text column with its converted values, one converter per column.
for _name, _convert in (
    ("Height", transform_ft),
    ("Weight", transform_lbs),
    ("Wage", transform_wage),
):
    data[_name] = data[_name].apply(_convert)

### 1.2 缺失值处理

In [6]:
# Fill with the mean
def fill_with_mean(column):
    """Replace missing values in ``data[column]`` with that column's mean.

    Works on the notebook-level ``data`` frame and returns nothing.

    Args:
        column: Label of a numeric column in ``data``.
    """
    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # data[column]: that form acts on a temporary Series and leaves `data`
    # untouched when pandas Copy-on-Write is active.
    data[column] = data[column].fillna(data[column].mean())

# Columns whose gaps are filled with the column mean.
columns = [
    "ShortPassing", "Volleys", "Dribbling", "Curve", "FKAccuracy",
    "LongPassing", "BallControl", "HeadingAccuracy", "Finishing", "Crossing",
]
for column in columns:
    fill_with_mean(column)

In [None]:
# Fill using domain knowledge
# Each filled column is assigned back to `data`: calling fillna(inplace=True)
# on data[col] acts on a temporary Series and does not update `data` when
# pandas Copy-on-Write is active.
data["Contract Valid Until"] = data["Contract Valid Until"].fillna("2019")
data["Loaned From"] = data["Loaned From"].fillna("None")
data["Club"] = data["Club"].fillna("No Club")

In [None]:
# Fill with the median
def fill_with_median(column):
    """Replace missing values in ``data[column]`` with that column's median.

    Works on the notebook-level ``data`` frame and returns nothing.

    Args:
        column: Label of a numeric column in ``data``.
    """
    # Assign the filled Series back instead of calling fillna(inplace=True) on
    # data[column]: that form acts on a temporary Series and leaves `data`
    # untouched when pandas Copy-on-Write is active.
    data[column] = data[column].fillna(data[column].median())
# Columns meant to be filled with their median.
# NOTE(review): no visible cell calls fill_with_median on this list — confirm
# whether the loop is missing.
# Values noted by the author: [180cm, 75kg, 2]
columns = [
    "Height",
    "Weight",
    "Skill Moves",
]

In [None]:
# Fill with the mode
def fill_with_mode(column):
    """Replace missing values in ``data[column]`` with that column's most frequent value.

    Works on the notebook-level ``data`` frame and returns nothing. When several
    values tie, the first entry returned by ``Series.mode()`` is used. A column
    with no non-missing values is left unchanged.

    Args:
        column: Label of a column in ``data``.
    """
    modes = data[column].mode()
    if modes.empty:
        # Nothing to fill with: every value in the column is missing.
        return
    # Series.mode() returns a Series; passing it to fillna directly aligns on
    # the index and fills only rows whose label matches (in practice row 0).
    # Use the first mode as a scalar, and assign back so `data` is updated
    # even when pandas Copy-on-Write is active.
    data[column] = data[column].fillna(modes.iloc[0])

# Columns meant to be filled with their most frequent value.
# NOTE(review): no visible cell calls fill_with_mode on this list — confirm
# whether the loop is missing.
# Values noted by the author: ["Jul 1, 2018", 8, "Normal", "Medium/ Medium", 3, "Right", 1]
columns = [
    "Joined",
    "Jersey Number",
    "Body Type",
    "Work Rate",
    "Weak Foot",
    "Preferred Foot",
    "International Reputation",
]

In [None]:
# Preview `data` after the steps above (`darta` was an undefined name and raised NameError).
data.head()