In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the columns this notebook uses and index the rows by short name.
# `usecols` skips parsing the remaining CSV columns and `index_col` replaces the
# separate in-place `set_index` call, so re-running this cell always rebuilds
# `df` from the file in one step.
PLAYER_COLUMNS = ['long_name', 'age', 'dob', 'height_cm', 'weight_kg', 'nationality', 'club']
df = pd.read_csv(
    'players_20.csv',
    usecols=['short_name'] + PLAYER_COLUMNS,
    index_col='short_name',
)
# reorder explicitly: read_csv keeps the file's column order, not the order given in usecols
df = df[PLAYER_COLUMNS]

In [3]:
# preview the first three rows of the selected columns
df.head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain


## .sample() 

Draw a random sample of rows from a DataFrame (or of values from a Series).

In [6]:
# draw a random 20% sample (frac=0.2) of the "nationality" column; random_state makes the draw repeatable
df['nationality'].sample(frac=0.2, random_state=99)

short_name
R. Alvarado              Mexico
Qiao Wei               China PR
E. Paredes                Chile
K. Stoyanov            Bulgaria
M. Chudý               Slovakia
                     ...       
Pedro León                Spain
M. Sykes       Northern Ireland
Y. Fofana                France
A. Micai                  Italy
David Bruno            Portugal
Name: nationality, Length: 3656, dtype: object

In [7]:
# draw a random 20% sample (frac=0.2) of the dataframe's rows and count how many rows were drawn
fraction = df.sample(frac=0.2, random_state=99)
len(fraction)

3656

In [8]:
# upsample: frac > 1 returns more rows than the original frame (frac=2 -> twice as many).
# Note: the replace parameter has to be True whenever frac > 1.
bigger_fraction = df.sample(frac=2, replace=True, random_state=99)
len(bigger_fraction)

36556

## .query() 

Filter the rows of a DataFrame with a boolean expression written as a string.

In [11]:
# select players older than 34 (the condition is passed to query as a string)
df.query('age>34').head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Z. Ibrahimović,Zlatan Ibrahimović,37,1981-10-03,195,95,Sweden,LA Galaxy
Pepe,Képler Laveran Lima Ferreira,36,1983-02-26,188,81,Portugal,FC Porto
G. Buffon,Gianluigi Buffon,41,1978-01-28,192,92,Italy,Juventus


In [12]:
# equivalent boolean slicing: index the frame with a boolean mask built from the "age" column
df[df['age']>34].head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Z. Ibrahimović,Zlatan Ibrahimović,37,1981-10-03,195,95,Sweden,LA Galaxy
Pepe,Képler Laveran Lima Ferreira,36,1983-02-26,188,81,Portugal,FC Porto
G. Buffon,Gianluigi Buffon,41,1978-01-28,192,92,Italy,Juventus


In [13]:
# select players older than 34 whose nationality is Italy (both conditions in one query string)
df.query("age>34 and nationality=='Italy'").head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G. Buffon,Gianluigi Buffon,41,1978-01-28,192,92,Italy,Juventus
F. Quagliarella,Fabio Quagliarella,36,1983-01-31,180,79,Italy,Sampdoria
D. De Rossi,Daniele De Rossi,35,1983-07-24,184,83,Italy,Boca Juniors


In [14]:
# equivalent boolean slicing: each comparison is parenthesised and the two masks are combined with &
df[(df['age'] > 34) & (df['nationality'] == 'Italy')].head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
G. Buffon,Gianluigi Buffon,41,1978-01-28,192,92,Italy,Juventus
F. Quagliarella,Fabio Quagliarella,36,1983-01-31,180,79,Italy,Sampdoria
D. De Rossi,Daniele De Rossi,35,1983-07-24,184,83,Italy,Boca Juniors


In [15]:
# add a not operator to the first example: keeps the players that are NOT older than 34
df.query('not age>34').head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain


In [16]:
# equivalent boolean slicing: ~ inverts the boolean mask
df[~(df['age'] > 34)].head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain


In [17]:
# convert height from centimetres to metres inside the query and select players taller than 1.8 m
df.query('height_cm/100>1.8').head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
K. De Bruyne,Kevin De Bruyne,28,1991-06-28,181,70,Belgium,Manchester City


In [18]:
# equivalent boolean slicing: use the same strict `>` comparison as the query.
# (The negated form `~(height < 1.8)` means `>= 1.8`, which would also keep
# players who are exactly 1.80 m tall and is therefore not equivalent.)
df[(df['height_cm'] / 100) > 1.8].head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
K. De Bruyne,Kevin De Bruyne,28,1991-06-28,181,70,Belgium,Manchester City


In [19]:
# goal of the next cells: select players that were born after 1990

# first check the data types to see how the "dob" column is stored
df.dtypes

long_name      object
age             int64
dob            object
height_cm       int64
weight_kg       int64
nationality    object
club           object
dtype: object

In [20]:
# convert the "dob" column to datetime type, then display the dtypes to confirm the change
df['dob'] = df['dob'].astype('datetime64[ns]')
df.dtypes

long_name              object
age                     int64
dob            datetime64[ns]
height_cm               int64
weight_kg               int64
nationality            object
club                   object
dtype: object

In [21]:
# query on the birth year through the .dt accessor: keeps rows whose "dob" year is greater than 1990
df.query('dob.dt.year>1990').head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid


In [22]:
# equivalent boolean slicing using the same .dt.year comparison
df[df['dob'].dt.year > 1990].head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain
J. Oblak,Jan Oblak,26,1993-01-07,188,87,Slovenia,Atlético Madrid
E. Hazard,Eden Hazard,28,1991-01-07,175,74,Belgium,Real Madrid


## .apply() 

In [24]:
# pass a numpy function to apply: np.sqrt is applied to the "age" series
df['age'].apply(np.sqrt).head(3)

short_name
L. Messi             5.656854
Cristiano Ronaldo    5.830952
Neymar Jr            5.196152
Name: age, dtype: float64

In [25]:
# create your own function and apply it to a dataframe
def calculate_bmi(row):
    """Return the body-mass index for one player record.

    `row` is indexed by label and must provide 'weight_kg' and 'height_cm'.
    The height is converted from centimetres to metres before it is squared.
    """
    height_m = row['height_cm'] / 100
    return row['weight_kg'] / (height_m ** 2)

# axis=1 hands each row to calculate_bmi, producing one BMI value per player
df.apply(calculate_bmi, axis=1).head(3)

short_name
L. Messi             24.913495
Cristiano Ronaldo    23.735308
Neymar Jr            22.204082
dtype: float64

### .apply() + lambda 

In [27]:
# use a lambda function to convert the "height_cm" series from centimetres to metres
df['height_cm'].apply(lambda x:x/100).head(3)

short_name
L. Messi             1.70
Cristiano Ronaldo    1.87
Neymar Jr            1.75
Name: height_cm, dtype: float64

In [28]:
# alternative without apply: divide the whole series by 100 in one expression
(df['height_cm'] / 100).head(3)

short_name
L. Messi             1.70
Cristiano Ronaldo    1.87
Neymar Jr            1.75
Name: height_cm, dtype: float64

In [29]:
# use a lambda function to convert each value of the "long_name" series to upper case
df['long_name'].apply(lambda x:x.upper()).head(3)

short_name
L. Messi                  LIONEL ANDRÉS MESSI CUCCITTINI
Cristiano Ronaldo    CRISTIANO RONALDO DOS SANTOS AVEIRO
Neymar Jr                  NEYMAR DA SILVA SANTOS JUNIOR
Name: long_name, dtype: object

In [30]:
# alternative with the str accessor, which upper-cases the series without a lambda
df['long_name'].str.upper().head(3)

short_name
L. Messi                  LIONEL ANDRÉS MESSI CUCCITTINI
Cristiano Ronaldo    CRISTIANO RONALDO DOS SANTOS AVEIRO
Neymar Jr                  NEYMAR DA SILVA SANTOS JUNIOR
Name: long_name, dtype: object

In [31]:
# make sure "dob" is datetime typed (an earlier cell already performs this conversion,
# so here the dtype stays datetime64[ns])
df['dob'] = df['dob'].astype('datetime64[ns]')

# use a lambda function to get the year of each value in the "dob" series
df['dob'].apply(lambda x:x.year).head(3)

short_name
L. Messi             1987
Cristiano Ronaldo    1985
Neymar Jr            1992
Name: dob, dtype: int64

In [32]:
# alternative with the dt accessor
df['dob'].dt.year.head(3)

short_name
L. Messi             1987
Cristiano Ronaldo    1985
Neymar Jr            1992
Name: dob, dtype: int32

In [33]:
# wrap calculate_bmi in a lambda and apply it row by row (axis=1) to calculate the BMI
df.apply(lambda row: calculate_bmi(row), axis=1).head(3)

short_name
L. Messi             24.913495
Cristiano Ronaldo    23.735308
Neymar Jr            22.204082
dtype: float64

In [34]:
# same BMI calculation written inline in the lambda: weight in kg divided by squared height in metres
df.apply(lambda row: row['weight_kg'] / ((row['height_cm'] / 100) ** 2), axis=1).head(3)

short_name
L. Messi             24.913495
Cristiano Ronaldo    23.735308
Neymar Jr            22.204082
dtype: float64

## .copy() 

Make a copy of a Data Frame.

In [37]:
# deep=True is the default: modifications to the data or indices of the copy
# will not be reflected in the original object
df_deep_copy = df.copy()

In [38]:
# preview the first three rows of the deep copy
df_deep_copy.head(3)

Unnamed: 0_level_0,long_name,age,dob,height_cm,weight_kg,nationality,club
short_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
L. Messi,Lionel Andrés Messi Cuccittini,32,1987-06-24,170,72,Argentina,FC Barcelona
Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,34,1985-02-05,187,83,Portugal,Juventus
Neymar Jr,Neymar da Silva Santos Junior,27,1992-02-05,175,68,Brazil,Paris Saint-Germain


In [39]:
# update a value in the original dataframe (the deep copy is not touched by this statement)
df.loc['L. Messi', 'height_cm'] = 180

In [40]:
# compare the updated cell in the original with the same cell in the deep copy
df.loc['L. Messi', 'height_cm'] == df_deep_copy.loc['L. Messi', 'height_cm']

False

In [76]:
# deep=False makes a shallow copy: changes to the data of the original are
# reflected in the shallow copy and vice versa.
# NOTE(review): this sharing presumably requires pandas' Copy-on-Write mode to be off — confirm for the pandas version in use
df_shallow_copy = df.copy(deep=False)

In [78]:
# update a value in the original dataframe after the shallow copy was taken
df.loc['Cristiano Ronaldo', 'height_cm'] = 200

In [80]:
# compare the updated cell in the original with the same cell in the shallow copy
df.loc['Cristiano Ronaldo', 'height_cm'] == df_shallow_copy.loc['Cristiano Ronaldo', 'height_cm']

True