In [1]:
import pandas as pd
import numpy as np

In [2]:
# columns used by the demos below, in the order they should be displayed
PLAYER_COLUMNS = ['long_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality', 'club']

# read the csv file: load only the needed columns and use short_name as the
# index directly, instead of loading every column and mutating the frame
# with set_index(..., inplace=True)
df = pd.read_csv(
    'players_20.csv',
    index_col='short_name',
    usecols=['short_name', *PLAYER_COLUMNS],
)

# usecols keeps the file's column order, so reselect to pin the display order
df = df[PLAYER_COLUMNS]

# display the data frame
df.head()

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid


## .sample() 

In [3]:
# extract 10 random elements from the "nationality" column
# (random_state fixes the seed so the same rows are drawn on every run)
df['nationality'].sample(n=10, random_state=99)

short_name
R. Alvarado               Mexico
Qiao Wei                China PR
E. Paredes                 Chile
K. Stoyanov             Bulgaria
M. Chudý                Slovakia
D. Sundaram                India
R. Gagliardini             Italy
G. Debeljuh              Croatia
Kim Seung Yong    Korea Republic
M. Hansen                Denmark
Name: nationality, dtype: object

In [4]:
# extract a random 10% sample of the data frame's rows (frac=0.1)
fraction = df.sample(frac=0.1, random_state=99)
# number of rows that were drawn
len(fraction)

1828

In [5]:
# upsample: draw more rows than the data frame contains (frac > 1)
# Note: the replace parameter has to be True when frac > 1, i.e. rows are drawn with replacement
bigger_fraction = df.sample(frac=2, replace=True, random_state=99)
# number of rows that were drawn
len(bigger_fraction)

36556

## .apply() 

In [6]:
# use a numpy function and apply it to a series
# NOTE: this overwrites the 'age' column with its square root, so re-running
# this cell takes the square root of the already transformed values again
df['age'] = df['age'].apply(np.sqrt)

In [7]:
# create custom function and apply it to a data frame
def calculate_bmi(row):
    """Return the body mass index for one player record.

    Parameters
    ----------
    row : mapping-like
        Must provide 'weight_kg' and 'height_cm' entries (e.g. a data frame row).

    Returns
    -------
    The weight in kilograms divided by the squared height in metres.
    """
    height_m = row['height_cm'] / 100  # centimetres -> metres
    return row['weight_kg'] / height_m ** 2

# axis=1 calls calculate_bmi once per row; the results are stored in the new 'bmi' column
df['bmi'] = df.apply(calculate_bmi, axis=1)

In [8]:
# preview the first three rows ('age' now holds the square root, 'bmi' is the new column)
df.head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club,bmi
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
L. Messi,Lionel Andrés Messi Cuccittini,5.656854,1987-06-24,170,72,Argentina,FC Barcelona,24.913495
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,5.830952,1985-02-05,187,83,Portugal,Juventus,23.735308
Neymar Jr,Neymar da Silva Santos Junior,5.196152,1992-02-05,175,68,Brazil,Paris Saint-Germain,22.204082


### .apply() + lambda 

In [9]:
# use a lambda function to get the year of the "dob" series

# change the column's data type to datetime so each value exposes .year
df['dob'] = df['dob'].astype('datetime64[ns]')
# get the year of each date (showing only the first three results)
df['dob'].apply(lambda x:x.year).head(3)

short_name
L. Messi             1987
Cristiano Ronaldo    1985
Neymar Jr            1992
Name: dob, dtype: int64

In [10]:
# alternative with the .dt accessor (no lambda needed)
df['dob'].dt.year.head(3)

short_name
L. Messi             1987
Cristiano Ronaldo    1985
Neymar Jr            1992
Name: dob, dtype: int32

In [11]:
# same BMI computation as calculate_bmi, written as an inline lambda.
# Assign the complete result: slicing it with .head(3) before the assignment
# would fill only the first three index labels and leave every other row NaN
# after index alignment. Use df.head(3) to preview instead.
df['bmi_lambda'] = df.apply(lambda x: x['weight_kg'] / ((x['height_cm'] / 100) ** 2), axis=1)

In [12]:
# preview the first three rows, including the 'bmi_lambda' column
df.head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club,bmi,bmi_lambda
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
L. Messi,Lionel Andrés Messi Cuccittini,5.656854,1987-06-24,170,72,Argentina,FC Barcelona,24.913495,24.913495
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,5.830952,1985-02-05,187,83,Portugal,Juventus,23.735308,23.735308
Neymar Jr,Neymar da Silva Santos Junior,5.196152,1992-02-05,175,68,Brazil,Paris Saint-Germain,22.204082,22.204082


## .copy() 

In [13]:
# deep=True by default - makes a deep copy that has its own copy of the data
df_deep_copy = df.copy()

In [14]:
# preview the first three rows of the copy
df_deep_copy.head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club,bmi,bmi_lambda
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
L. Messi,Lionel Andrés Messi Cuccittini,5.656854,1987-06-24,170,72,Argentina,FC Barcelona,24.913495,24.913495
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,5.830952,1985-02-05,187,83,Portugal,Juventus,23.735308,23.735308
Neymar Jr,Neymar da Silva Santos Junior,5.196152,1992-02-05,175,68,Brazil,Paris Saint-Germain,22.204082,22.204082


In [20]:
# update a value in the original data frame (the deep copy was taken before this change)
df.loc['L. Messi', 'height_cm'] = 180

In [21]:
# compare the updated cell in the original with the same cell in the deep copy
# (False means the change did not propagate to the deep copy)
df.loc['L. Messi', 'height_cm'] == df_deep_copy.loc['L. Messi', 'height_cm']

False

In [22]:
# deep=False - makes a shallow copy that references the original's data instead of copying it
# NOTE(review): with pandas Copy-on-Write enabled (announced as the default from
# pandas 3.0), later changes to df are presumably no longer visible through the
# shallow copy - confirm against the installed pandas version
df_shallow_copy = df.copy(deep=False)

In [23]:
# update a value in the original data frame after the shallow copy was taken
df.loc['Cristiano Ronaldo', 'height_cm'] = 200

In [24]:
# compare the updated cell in the original with the same cell in the shallow copy
# (True means the shallow copy sees the change made to the original)
df.loc['Cristiano Ronaldo', 'height_cm'] == df_shallow_copy.loc['Cristiano Ronaldo', 'height_cm']

True