# Pandas Exercises
### By: Jingyu Li

In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

## 1. Pandas walkthrough: FIFA 20 complete player dataset

#### **Context & Acknowledgements**
The dataset contains the player data for the Career Mode of FIFA 20. It was downloaded from https://www.kaggle.com/stefanoleone992/fifa-20-complete-player-dataset. The original data was scraped from the publicly available website https://sofifa.com.

#### **Dataset Introduction**
The dataset has been loaded. All columns are self-explanatory.

In [2]:
# Load the FIFA 20 player table; the path is relative to the notebook's working directory.
fifa20_path = r'data/FIFA20_player.csv'
df = pd.read_csv(fifa20_path)

In [3]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,id,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club,overall_score,value_eur,wage_eur,team_position
0,158023,L. Messi,Lionel Andrés Messi Cuccittini,32.0,6/24/1987,170.0,72.0,Argentina,FC Barcelona,94.0,95500000.0,565000.0,RW
1,20801,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34.0,2/5/1985,187.0,83.0,Portugal,Juventus,93.0,58500000.0,405000.0,LW
2,190871,Neymar Jr,Neymar da Silva Santos Junior,27.0,2/5/1992,175.0,68.0,Brazil,Paris Saint-Germain,92.0,105500000.0,290000.0,CAM
3,200389,J. Oblak,Jan Oblak,26.0,1/7/1993,188.0,87.0,Slovenia,Atlético Madrid,91.0,77500000.0,125000.0,GK
4,183277,E. Hazard,Eden Hazard,28.0,1/7/1991,175.0,74.0,Belgium,Real Madrid,91.0,90000000.0,470000.0,LW


### 1.1 Handling Missing Data

In [4]:
# Q1: count the number of missing values in each column
# Note: for questions of modifying the DataFrame (e.g. impute missing value, sort the value),
# either modify "df" inplace or assign the new dataframe to "df"
missing_mask = df.isnull()
missing_mask.sum(axis=0)

id                 0
short_name         2
long_name          2
age                1
dob                2
height_cm          2
weight_kg          3
nationality        4
club               3
overall_score     15
value_eur          2
wage_eur           1
team_position    251
dtype: int64

In [5]:
# Q2: add a new category "unknown" into feature team_position
# Every row whose position is missing is labelled 'unknown'; other rows are untouched.
position_missing = df['team_position'].isnull()
df.loc[position_missing, 'team_position'] = 'unknown'

In [6]:
# Q3: impute missing values in value_eur and wage_eur by median of the column
# Each column is filled with its own median, computed before the fill.
for money_col in ('value_eur', 'wage_eur'):
    column_median = df[money_col].median()
    df[money_col] = df[money_col].fillna(column_median)

In [7]:
# Q4: impute missing values in overall_score by mean of all players' score in the same club
# transform('mean') yields one value per row, aligned to df's own index, so it can be
# used directly as the fill value. (groupby(...).apply(...) prepends the group key to
# the index on newer pandas, which breaks the assignment back into df.)
club_mean_score = df.groupby('club')['overall_score'].transform('mean')
# Only rows whose score is missing are changed; every existing score is kept as-is,
# including rows whose club is itself missing.
df['overall_score'] = df['overall_score'].fillna(club_mean_score)

In [8]:
# Q5: drop the rows with missing value
shape_before = df.shape
print('before dropping na:', shape_before)
# how='any' (the default): a row is removed as soon as one of its fields is missing.
df.dropna(axis=0, how='any', inplace=True)
shape_after = df.shape
print('after dropping na:', shape_after)

before dropping na: (18280, 13)
after dropping na: (18276, 13)


### 1.2 Data Transformation

In [9]:
# Q6: check if there is any duplicate row in the dataset. If yes, remove the duplicate row
# duplicated() flags every repeat of an earlier row (the first occurrence stays False).
check_duplicate = df.duplicated(keep='first')
# Show only the flagged rows.
check_duplicate[check_duplicate]

18278    True
18279    True
dtype: bool

In [10]:
# Remove the repeated rows found above, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)
shape_deduped = df.shape
print('after dropping duplicates:', shape_deduped)

after dropping duplicates: (18274, 13)


In [11]:
# Q7: transform nationality into uppercase
upper_nationality = df['nationality'].str.upper()
df = df.assign(nationality=upper_nationality)

In [12]:
# Q8: transform the format of club name
# example: Real Madrid --> Real_Madrid
# regex=False: the space is a literal character, not a pattern. Stating it keeps the
# result independent of the pandas version's default for `regex`.
df['club'] = df['club'].str.replace(' ', '_', regex=False)

In [13]:
# Q9: create a feature age_bin based on age
# the bins are (0,18], (18,22], (22,28], (28, 35], (35,100] and name each bin as b1, b2, b3, b4, b5
edges = [0, 18, 22, 28, 35, 100]
age_labels = ['b1', 'b2', 'b3', 'b4', 'b5']
# right=True (the default) closes every interval on its right edge, as the bins above require.
df['age_bin'] = pd.cut(df['age'], bins=edges, right=True, labels=age_labels)

In [14]:
# Q10: count the frequency of each age_bin
# age_bin is categorical; observed=False states explicitly that every bin is reported
# (an empty bin shows a count of 0) instead of relying on the version-dependent default.
df.groupby('age_bin', observed=False)['id'].count()

age_bin
b1     880
b2    5076
b3    7653
b4    4372
b5     293
Name: id, dtype: int64

In [15]:
# Q11: create a feature height_bin based on height_cm, and cut into quantiles (0, 0.1, 0.2, ..., 0.9, 1)
# Decile boundaries 0.0, 0.1, ..., 1.0, each computed as k/10.
quantiles = np.array([k / 10 for k in range(11)])
df['height_bin'] = pd.qcut(df['height_cm'], q=quantiles)

In [16]:
# Q12: create dummy variables based on team_position in the DataFrame
position_col = df['team_position']
# One indicator column per distinct position, named after the position itself.
dm = pd.get_dummies(position_col)
# dm shares df's index, so the index-aligned join appends the indicators row for row.
df = df.join(dm, how='left')

In [17]:
# Q13: calculate BMI for each player: BMI = weight (in kg) / height (in m)**2
height_m = df['height_cm'] / 100  # centimetres -> metres
df['bmi'] = df['weight_kg'] / (height_m ** 2)

In [18]:
# Q14: create a variable: standard_bmi which equals 1 if BMI in [18,24] else 0
# between() is inclusive on both ends by default; the boolean flag is stored as 0/1.
in_standard_range = df['bmi'].between(18, 24)
df['standard_bmi'] = in_standard_range.astype(int)

### 1.3 Data Wrangling

In [19]:
# Q15: sort the DataFrame according to weight_kg (high to low) and height_cm (if weights are same, low to high)
sort_keys = ['weight_kg', 'height_cm']
sort_ascending = [False, True]  # one flag per key, in the same order as sort_keys
df = df.sort_values(by=sort_keys, ascending=sort_ascending)

In [20]:
# Q16: sort the DataFrame generated in Q15 according to row index (low to high)
df = df.sort_index(axis='index', ascending=True)

In [21]:
# Load the FIFA 19 player table (used for the year-over-year comparison) and
# print its column summary.
fifa19_path = r'data/FIFA19_player.csv'
data19 = pd.read_csv(fifa19_path)
data19.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 4 columns):
sofifa_id        17770 non-null int64
short_name       17770 non-null object
weight_kg        17770 non-null int64
overall_score    17770 non-null int64
dtypes: int64(3), object(1)
memory usage: 555.4+ KB


In [22]:
# Reshape the FIFA 19 table to long format: one row per (player, variable) pair,
# ordered by player id, with a fresh 0..n-1 index.
long_form = pd.melt(data19, id_vars=['sofifa_id', 'short_name'])
practice = long_form.sort_values(by=['sofifa_id']).reset_index(drop=True)
practice.head()

Unnamed: 0,sofifa_id,short_name,variable,value
0,164,G. Pinzi,overall_score,70
1,164,G. Pinzi,weight_kg,76
2,657,D. Vaughan,overall_score,66
3,657,D. Vaughan,weight_kg,70
4,768,Felipe,overall_score,75


In [23]:
# Q17: the DataFrame "practice" is in "long" format, transform it into 'wide' format (reverse operation of pd.melt)
# then merge the players' weight_kg and overall_score into the DataFrame of FIFA2020
# Each distinct entry of 'variable' becomes its own column, keyed by player id and name.
wide_form = practice.pivot_table(
    values='value',
    index=['sofifa_id', 'short_name'],
    columns='variable',
)
# Turn the (sofifa_id, short_name) index levels back into ordinary columns.
practice = wide_form.reset_index()
practice.head()

variable,sofifa_id,short_name,overall_score,weight_kg
0,164,G. Pinzi,70,76
1,657,D. Vaughan,66,70
2,768,Felipe,75,75
3,1179,G. Buffon,88,92
4,2147,M. Stekelenburg,73,92


In [24]:
# Left-join the 2019 figures onto the 2020 table by player id. Columns present in both
# tables keep their name for 2020 and get the '_2019' suffix for 2019.
df = df.merge(
    practice,
    how='left',
    left_on='id',
    right_on='sofifa_id',
    suffixes=('', '_2019'),
)
# The 2019 id and name duplicate information already held in df.
df = df.drop(columns=['sofifa_id', 'short_name_2019'])

In [25]:
# Q18: create a variable: score_change which equals
# - "increase" if overall_score/overall_score_2019 - 1 >= 2%
# - "decrease" if overall_score/overall_score_2019 - 1 <= -2%
# - otherwise "equal"

# then count the number of players in each category
def change(x):
    """Classify a player's year-over-year change in overall score.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'overall_score' and 'overall_score_2019'.

    Returns
    -------
    str
        'increase' if the relative change is at least +2%, 'decrease' if it is
        at most -2%, otherwise 'equal'.
    """
    val = x['overall_score']/x['overall_score_2019']-1
    if val >= 0.02:
        return 'increase'
    if val <= -0.02:
        return 'decrease'
    # Reached for changes inside the +/-2% band, and also when val is NaN
    # (a NaN fails both comparisons above).
    return 'equal'

# Label every player row by row, then count how many players fall in each category.
score_labels = df.apply(change, axis='columns')
df['score_change'] = score_labels
df.groupby('score_change')['id'].count()

score_change
decrease     1499
equal       12975
increase     3800
Name: id, dtype: int64

### 1.4 Data Aggregation

In [26]:
# Q19: calculate the average players' overall_score of each club
club_avg_score = df.groupby('club')['overall_score'].mean()
# Best-rated clubs first.
club_avg_score.sort_values(ascending=False)

club
FC_Bayern_München         81.304348
Real_Madrid               80.121212
Juventus                  80.060606
Uruguay                   78.608696
FC_Barcelona              78.363636
Netherlands               78.000000
Colombia                  78.000000
Mexico                    78.000000
Bayer_04_Leverkusen       77.280000
Chelsea                   77.060606
Manchester_City           77.000000
Napoli                    76.870968
Manchester_United         76.848485
Tottenham_Hotspur         76.484848
Atlético_Madrid           76.181818
Milan                     76.172414
Turkey                    76.000000
Lazio                     75.939394
Paris_Saint-Germain       75.909091
SL_Benfica                75.833333
Inter                     75.666667
Sweden                    75.666667
FC_Porto                  75.500000
Liverpool                 75.437500
Sevilla_FC                75.424242
Valencia_CF               75.393939
Borussia_Dortmund         75.322581
Everton                

In [27]:
# Q20: calculate the median value_eur for each team_position and age_bin (group by the two columns)
# then pivot the index of age_bin to column index
# age_bin is categorical; observed=False states explicitly that every age bin is kept
# (combinations with no players come out as NaN) instead of relying on the
# version-dependent default.
res = df.groupby(['team_position', 'age_bin'], observed=False)['value_eur'].median()
# unstack() moves the innermost index level (age_bin) into the columns.
res.unstack()

age_bin,b1,b2,b3,b4,b5
team_position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CAM,625000.0,1250000.0,1900000.0,2350000.0,600000.0
CB,400000.0,325000.0,950000.0,650000.0,100000.0
CDM,375000.0,1300000.0,1300000.0,1800000.0,267500.0
CF,1500000.0,,9000000.0,6000000.0,
CM,327500.0,950000.0,1400000.0,1800000.0,575000.0
GK,110000.0,850000.0,1000000.0,1100000.0,240000.0
LAM,,3750000.0,1750000.0,2500000.0,
LB,375000.0,750000.0,900000.0,825000.0,100000.0
LCB,250000.0,850000.0,1100000.0,875000.0,195000.0
LCM,425000.0,1200000.0,1350000.0,1000000.0,155000.0


In [28]:
# Q21: calculate the following value for each team_position: 
# - wage_eur: median, max()-min()
# - BMI: standard deviation, max, min, max()-min()
# - age: mean

# then change the column index to one level and find a proper way to rename the column index, e.g. wage_eur_median
def get_range(x):
    """Return the spread of ``x``: its largest value minus its smallest value.

    ``x`` must provide ``max()`` and ``min()`` (e.g. a pandas Series). The function
    name is also what pandas uses as the column label when this is passed to ``agg``.
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest
# Statistics requested per column; a callable is labelled by its function name.
agg_method = {
    'wage_eur': ['median', get_range],
    'bmi': ['std', 'max', 'min', get_range],
    'age': ['mean'],
}

res = df.groupby('team_position').agg(agg_method)
# Flatten the two-level column index: ('wage_eur', 'median') -> 'wage_eur_median'.
cols = ['_'.join(pair) for pair in res.columns.tolist()]
res.columns = cols
res

Unnamed: 0_level_0,wage_eur_median,wage_eur_get_range,bmi_std,bmi_max,bmi_min,bmi_get_range,age_mean
team_position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CAM,5000.0,289000.0,1.328729,27.471707,18.991965,8.479742,26.585209
CB,4000.0,87000.0,1.248867,26.878678,20.452885,6.425793,28.51
CDM,6000.0,299000.0,1.230307,27.01686,18.312854,8.704005,27.187845
CF,21000.0,283000.0,1.750004,26.332889,20.061728,6.27116,27.428571
CM,4000.0,88000.0,1.217424,25.503616,19.736301,5.767315,27.5
GK,4000.0,249000.0,1.415017,27.95976,19.0,8.95976,28.506042
LAM,7000.0,31000.0,1.148966,24.858365,21.049818,3.808547,27.217391
LB,4000.0,239000.0,1.310762,27.33564,19.445795,7.889845,26.355357
LCB,4000.0,299000.0,1.239606,27.774423,19.576333,8.19809,27.575758
LCM,4000.0,329000.0,1.335658,27.143037,18.587892,8.555145,26.085158


In [29]:
# Q22: find the 5 most valuable players (by value_eur) in each team_position

# the following solution assumes there are no ties
def top_n(df, n=5, col='value_eur'):
    """Return the ``n`` rows of ``df`` with the largest values in ``col``.

    Only the 'short_name' and ``col`` columns are returned, sorted from the
    highest value to the lowest. Exactly ``n`` rows are kept (fewer if ``df`` is
    shorter), so rows tied with the n-th value may be cut off.
    """
    subset = df[['short_name', col]]
    ordered = subset.sort_values(by=col, ascending=False)
    return ordered.head(n)

# Apply top_n separately to the players of each position.
by_position = df.groupby('team_position')
by_position.apply(top_n)

Unnamed: 0_level_0,Unnamed: 1_level_0,short_name,value_eur
team_position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CAM,2,Neymar Jr,105500000.0
CAM,37,M. Reus,56000000.0
CAM,113,D. Alli,41500000.0
CAM,97,A. Gómez,34000000.0
CAM,247,S. Bergwijn,30000000.0
CB,126,S. de Vrij,31500000.0
CB,231,F. Acerbi,19000000.0
CB,573,I. Konaté,18000000.0
CB,395,N. Nkoulou,15000000.0
CB,474,J. Guilavogui,14500000.0


In [30]:
# the following solution returns all tied players
def top_n(df, n=5, col='value_eur'):
    """Return every row of ``df`` whose ``col`` value is among the ``n`` highest distinct values.

    Rows are ranked with a dense, descending rank, so tied rows share a rank and
    are all kept; the result may therefore hold more than ``n`` rows. Only
    'short_name' and ``col`` are returned, sorted from the highest value down.
    """
    within_top = df[col].rank(method='dense', ascending=False) <= n
    selected = df.loc[within_top, ['short_name', col]]
    return selected.sort_values(by=col, ascending=False)

# Apply the tie-aware top_n separately to the players of each position.
by_position = df.groupby('team_position')
by_position.apply(top_n)

Unnamed: 0_level_0,Unnamed: 1_level_0,short_name,value_eur
team_position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CAM,2,Neymar Jr,105500000.0
CAM,37,M. Reus,56000000.0
CAM,113,D. Alli,41500000.0
CAM,97,A. Gómez,34000000.0
CAM,247,S. Bergwijn,30000000.0
CB,126,S. de Vrij,31500000.0
CB,231,F. Acerbi,19000000.0
CB,573,I. Konaté,18000000.0
CB,395,N. Nkoulou,15000000.0
CB,474,J. Guilavogui,14500000.0


In [31]:
# Q23: calculate the weighted average overall_score of each age_bin
# Draw one random weight per row of df from a uniform distribution between 0 and 1.
# The array has shape (number of rows, 1) and is stored as the single column
# 'personal_weight'.
# NOTE(review): no random seed is set anywhere in the visible notebook, so the
# weights, and the weighted averages computed from them, presumably differ on every run.
df['personal_weight'] = np.random.uniform(0,1,(df.shape[0],1)) # don't remove

def weighted_avg(x):
    """Return the average of 'overall_score' weighted by 'personal_weight'.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns 'overall_score' and 'personal_weight'.

    Returns
    -------
    float
        sum(score * weight) / sum(weight) over the rows where both values are
        present; NaN when no such row exists or the weights sum to zero.
    """
    # A row lacking either value must leave both the numerator and the denominator.
    # Otherwise its weight would still be counted and drag the average down.
    valid = x[['overall_score', 'personal_weight']].dropna()
    total_weight = valid['personal_weight'].sum()
    if total_weight == 0:
        # Empty group (or all-zero weights): the average is undefined.
        return np.nan
    return (valid['overall_score'] * valid['personal_weight']).sum() / total_weight

# age_bin is categorical; observed=False states explicitly that every bin is reported
# instead of relying on the version-dependent default.
df.groupby('age_bin', observed=False).apply(weighted_avg)

age_bin
b1    56.889927
b2    62.418924
b3    67.945452
b4    69.524772
b5    68.470300
dtype: float64

In [32]:
# Q24: calculate the correlation coefficients between height_cm and weight_kg for each team_position
def get_corr(x, col1='height_cm', col2='weight_kg'):
    """Return the correlation coefficient between columns ``col1`` and ``col2`` of ``x``.

    The value is read from the off-diagonal entry of the 2x2 matrix produced by
    ``np.corrcoef``.
    """
    first = x[col1]
    second = x[col2]
    corr_matrix = np.corrcoef(first, second)
    return corr_matrix[0, 1]

# Height/weight correlation computed separately within each position.
by_position = df.groupby('team_position')
by_position.apply(get_corr)

team_position
CAM        0.698120
CB         0.625767
CDM        0.748559
CF         0.580608
CM         0.674450
GK         0.536432
LAM        0.791669
LB         0.638993
LCB        0.641555
LCM        0.704943
LDM        0.688433
LF         0.749908
LM         0.679681
LS         0.732917
LW         0.696017
LWB        0.620648
RAM        0.462912
RB         0.648462
RCB        0.567246
RCM        0.682704
RDM        0.630627
RES        0.769691
RF         0.733584
RM         0.712920
RS         0.752568
RW         0.738628
RWB        0.584878
ST         0.664731
SUB        0.766860
unknown    0.801373
dtype: float64