In [1]:
import sys
import numpy as np
import scipy.stats
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

np.random.seed(123)

# plot style
plt.style.use("seaborn-v0_8")
# print(plt.style.available)

# the "R4" palette
_colours = ["#000000f0", "#DF536Bf0", "#61D04Ff0", "#2297E6f0",
            "#28E2E5f0", "#CD0BBCf0", "#F5C710f0", "#999999f0"]

_linestyles = ["solid", "dashed", "dashdot", "dotted"]

# A different plotting style for each plotted line
plt.rcParams["axes.prop_cycle"] = plt.cycler(color=_colours, linestyle=_linestyles*2)
plt.rcParams["patch.facecolor"] = _colours[0]

# Python version
print(f"sys: {sys.version}")
print(f"numpy: {np.__version__}")
print(f"scipy: {scipy.__version__}")
print(f"pandas: {pd.__version__}")
print(f"matplotlib: {matplotlib.__version__}")
print(f"seaborns {sns.__version__}")

sys: 3.9.5 (tags/v3.9.5:0a7dcbd, May  3 2021, 17:27:52) [MSC v.1928 64 bit (AMD64)]
numpy: 1.24.2
scipy: 1.10.1
pandas: 1.5.3
matplotlib: 3.7.1
seaborns 0.12.2


# Creating DataFrames

In [2]:
df = pd.DataFrame(
    np.random.rand(4, 3),
    columns=["a", "b", "c"]
)

df

Unnamed: 0,a,b,c
0,0.696469,0.286139,0.226851
1,0.551315,0.719469,0.423106
2,0.980764,0.68483,0.480932
3,0.392118,0.343178,0.72905


In [3]:
df = pd.DataFrame(dict(
    a = np.round(np.random.rand(5), 2),
    b = [1, 2.5, np.nan, 4, np.nan],
    c = [True, True, False, False, True],
    d = ["A", "B", "C", None, "E"],
    e = ["spam", "spam", "bacon", "spam", "eggs"],
    f = np.array([
        "2021-01-01", "2022-02-02", "2023-03-03", "2024-04-04", "2025-05-05"],
        dtype="datetime64[D]"),
    g = [["spam"], ["bacon", "spam"], None, ["eggs", "bacon", "spam"], ["ham"]],
))

df

Unnamed: 0,a,b,c,d,e,f,g
0,0.44,1.0,True,A,spam,2021-01-01,[spam]
1,0.06,2.5,True,B,spam,2022-02-02,"[bacon, spam]"
2,0.4,,False,C,bacon,2023-03-03,
3,0.74,4.0,False,,spam,2024-04-04,"[eggs, bacon, spam]"
4,0.18,,True,E,eggs,2025-05-05,[ham]


In [4]:
print(f"Shape: {df.shape}")
print(f"\ndtype:\n{df.dtypes}")

Shape: (5, 7)

dtype:
a           float64
b           float64
c              bool
d            object
e            object
f    datetime64[ns]
g            object
dtype: object


# DataFrames from data

In [5]:
body = pd.read_csv("data/nhanes_adult_female_bmx_2020.csv", comment="#")

In [6]:
body.head()

Unnamed: 0,BMXWT,BMXHT,BMXARML,BMXLEG,BMXARMC,BMXHIP,BMXWAIST
0,97.1,160.2,34.7,40.8,35.8,126.1,117.9
1,91.1,152.7,33.5,33.0,38.5,125.5,103.1
2,73.0,161.2,37.4,38.0,31.8,106.2,92.0
3,61.7,157.4,38.0,34.7,29.0,101.0,90.5
4,55.4,154.6,34.6,34.0,28.3,92.5,73.2


In [7]:
print(f"Shape: {body.shape}")
print(f"\ndtype:\n{body.dtypes}")

Shape: (4221, 7)

dtype:
BMXWT       float64
BMXHT       float64
BMXARML     float64
BMXLEG      float64
BMXARMC     float64
BMXHIP      float64
BMXWAIST    float64
dtype: object


In [8]:
body.mean()

BMXWT        77.403791
BMXHT       160.136792
BMXARML      36.031035
BMXLEG       37.157972
BMXARMC      32.710329
BMXHIP      109.178370
BMXWAIST     98.488107
dtype: float64

# Series

In [9]:
s = body.BMXWT
print(f"Shape: {s.shape}")
print(f"\nHead:\n{s.head()}")

Shape: (4221,)

Head:
0    97.1
1    91.1
2    73.0
3    61.7
4    55.4
Name: BMXWT, dtype: float64


In [10]:
s = body.loc[:, "BMXWT"]
print(f"Shape: {s.shape}")
print(f"\nHead:\n{s.head()}")

Shape: (4221,)

Head:
0    97.1
1    91.1
2    73.0
3    61.7
4    55.4
Name: BMXWT, dtype: float64


In [11]:
print(f".mean method: {s.mean()}")
print(f"numpy function: {np.mean(s)}")

.mean method: 77.40379057095475
numpy function: 77.40379057095475


In [12]:
print(f"Shape: {s.shape}")
print(f"dtype: {s.dtype}")
print(f"Values: {s.values}")
print(f"Name: {s.name}")

Shape: (4221,)
dtype: float64
Values: [97.1 91.1 73.  ... 73.  78.6 82.8]
Name: BMXWT


# Rows (index) & Columns

In [13]:
body

Unnamed: 0,BMXWT,BMXHT,BMXARML,BMXLEG,BMXARMC,BMXHIP,BMXWAIST
0,97.1,160.2,34.7,40.8,35.8,126.1,117.9
1,91.1,152.7,33.5,33.0,38.5,125.5,103.1
2,73.0,161.2,37.4,38.0,31.8,106.2,92.0
3,61.7,157.4,38.0,34.7,29.0,101.0,90.5
4,55.4,154.6,34.6,34.0,28.3,92.5,73.2
...,...,...,...,...,...,...,...
4216,66.8,157.0,32.6,38.4,30.7,103.8,92.5
4217,116.9,167.4,42.2,43.0,40.7,128.4,120.0
4218,73.0,159.6,36.2,37.0,31.4,104.6,99.3
4219,78.6,168.5,38.1,40.2,36.0,102.4,98.5


In [14]:
# The index slot of a data frame stores an object of class Index (or one of its
# derivatives) that gives the row names
print(body.index)

RangeIndex(start=0, stop=4221, step=1)


In [15]:
print(body.columns)

Index(['BMXWT', 'BMXHT', 'BMXARML', 'BMXLEG', 'BMXARMC', 'BMXHIP', 'BMXWAIST'], dtype='object')


In [16]:
df

Unnamed: 0,a,b,c,d,e,f,g
0,0.44,1.0,True,A,spam,2021-01-01,[spam]
1,0.06,2.5,True,B,spam,2022-02-02,"[bacon, spam]"
2,0.4,,False,C,bacon,2023-03-03,
3,0.74,4.0,False,,spam,2024-04-04,"[eggs, bacon, spam]"
4,0.18,,True,E,eggs,2025-05-05,[ham]


In [17]:
df2 = df.set_index("e")
df2

Unnamed: 0_level_0,a,b,c,d,f,g
e,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
spam,0.44,1.0,True,A,2021-01-01,[spam]
spam,0.06,2.5,True,B,2022-02-02,"[bacon, spam]"
bacon,0.4,,False,C,2023-03-03,
spam,0.74,4.0,False,,2024-04-04,"[eggs, bacon, spam]"
eggs,0.18,,True,E,2025-05-05,[ham]


In [18]:
print(df2.index.name)

e


In [19]:
df2.rename_axis("new_column").reset_index()

Unnamed: 0,new_column,a,b,c,d,f,g
0,spam,0.44,1.0,True,A,2021-01-01,[spam]
1,spam,0.06,2.5,True,B,2022-02-02,"[bacon, spam]"
2,bacon,0.4,,False,C,2023-03-03,
3,spam,0.74,4.0,False,,2024-04-04,"[eggs, bacon, spam]"
4,eggs,0.18,,True,E,2025-05-05,[ham]


In [20]:
df2

Unnamed: 0_level_0,a,b,c,d,f,g
e,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
spam,0.44,1.0,True,A,2021-01-01,[spam]
spam,0.06,2.5,True,B,2022-02-02,"[bacon, spam]"
bacon,0.4,,False,C,2023-03-03,
spam,0.74,4.0,False,,2024-04-04,"[eggs, bacon, spam]"
eggs,0.18,,True,E,2025-05-05,[ham]


In [21]:
df2.reset_index(drop=True)

Unnamed: 0,a,b,c,d,f,g
0,0.44,1.0,True,A,2021-01-01,[spam]
1,0.06,2.5,True,B,2022-02-02,"[bacon, spam]"
2,0.4,,False,C,2023-03-03,
3,0.74,4.0,False,,2024-04-04,"[eggs, bacon, spam]"
4,0.18,,True,E,2025-05-05,[ham]


In [22]:
df2.rename(columns={"a": "spam"})

Unnamed: 0_level_0,spam,b,c,d,f,g
e,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
spam,0.44,1.0,True,A,2021-01-01,[spam]
spam,0.06,2.5,True,B,2022-02-02,"[bacon, spam]"
bacon,0.4,,False,C,2023-03-03,
spam,0.74,4.0,False,,2024-04-04,"[eggs, bacon, spam]"
eggs,0.18,,True,E,2025-05-05,[ham]


# groupby

In [23]:
nhanes = pd.read_csv(
    "data/nhanes_p_demo_bmx_2020.csv",
    comment="#").rename({
        "BMXBMI": "bmival",
        "RIAGENDR": "gender",
        "DMDBORN4": "usborn"
    }, axis=1)

In [24]:
nhanes

Unnamed: 0,SEQN,BMDSTATS,BMXWT,BMIWT,BMXRECUM,BMIRECUM,BMXHEAD,BMIHEAD,BMXHT,BMIHT,...,FIAINTRP,MIALANG,MIAPROXY,MIAINTRP,AIALANGA,WTINTPRP,WTMECPRP,SDMVPSU,SDMVSTRA,INDFMPIR
0,109263,4,,,,,,,,,...,2.0,,,,,7891.762435,8951.815567,3,156,4.66
1,109264,1,42.2,,,,,,154.7,,...,2.0,1.0,2.0,2.0,1.0,11689.747264,12271.157043,1,155,0.83
2,109265,1,12.0,,91.6,,,,89.3,,...,2.0,,,,,16273.825939,16658.764203,1,157,3.06
3,109266,1,97.1,,,,,,160.2,,...,2.0,1.0,2.0,2.0,1.0,7825.646112,8154.968193,2,168,5.00
4,109269,3,13.6,,90.9,,,,,1.0,...,2.0,,,,,5906.250521,6848.271782,2,152,0.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14295,124818,1,108.8,,,,,,168.7,3.0,...,2.0,1.0,2.0,2.0,1.0,21586.596728,21666.889837,1,166,3.82
14296,124819,1,15.4,,94.9,,,,93.7,,...,2.0,,,,,1664.919253,1838.169709,2,171,0.07
14297,124820,1,22.9,,,,,,123.3,,...,2.0,,,,,14819.783161,16497.806674,1,157,1.22
14298,124821,1,79.5,,,,,,176.4,,...,2.0,1.0,2.0,2.0,1.0,4666.817952,4853.430230,1,158,3.71


In [25]:
res = nhanes.groupby(["gender","usborn"])["bmival"].mean()
res

gender  usborn
1       1         25.734110
        2         27.405251
2       1         27.120261
        2         27.579448
        77        28.725000
        99        32.600000
Name: bmival, dtype: float64

In [26]:
res = res.reset_index()
res

Unnamed: 0,gender,usborn,bmival
0,1,1,25.73411
1,1,2,27.405251
2,2,1,27.120261
3,2,2,27.579448
4,2,77,28.725
5,2,99,32.6


# Aggregating DataFrame

In [27]:
df = pd.DataFrame(
    dict(u = np.round(np.random.rand(5), 2),
         v = np.round(np.random.randn(5), 2),
         w = ["spam", "bacon", "spam", "eggs", "sausage"]),
    index=["a", "b", "c", "d", "e"]
)
df

Unnamed: 0,u,v,w
a,0.18,0.74,spam
b,0.53,1.49,bacon
c,0.53,-0.94,spam
d,0.63,1.18,eggs
e,0.85,-1.25,sausage


In [28]:
u = df.loc[:, "u"]
print(u)

a    0.18
b    0.53
c    0.53
d    0.63
e    0.85
Name: u, dtype: float64


In [29]:
np.quantile(u, [0, 0.5, 1])


array([0.18, 0.53, 0.85])

In [30]:
uv = df.loc[:, ["u", "v"]]
uv

Unnamed: 0,u,v
a,0.18,0.74
b,0.53,1.49
c,0.53,-0.94
d,0.63,1.18
e,0.85,-1.25


In [31]:
np.quantile(uv, [0, 0.5, 1], axis=0)

array([[ 0.18, -1.25],
       [ 0.53,  0.74],
       [ 0.85,  1.49]])

In [32]:
df.mean(numeric_only=True)

u    0.544
v    0.244
dtype: float64

In [33]:
df.quantile([0, 0.5, 1], numeric_only=True)

Unnamed: 0,u,v
0.0,0.18,-1.25
0.5,0.53,0.74
1.0,0.85,1.49


In [34]:
df.median(numeric_only=True)

u    0.53
v    0.74
dtype: float64

In [35]:
df.min(numeric_only=True)

u    0.18
v   -1.25
dtype: float64

In [36]:
df.max(numeric_only=True)

u    0.85
v    1.49
dtype: float64

In [37]:
df.var(numeric_only=True)

u    0.05848
v    1.57713
dtype: float64

In [38]:
df.std(numeric_only=True)

u    0.241826
v    1.255838
dtype: float64

In [39]:
df.skew(numeric_only=True)

u   -0.558188
v   -0.445562
dtype: float64

In [40]:
def q25(x):
    return x.quantile(0.25)
q25.__name__ = "25%"


def q50(x):
    return x.quantile(0.5)
q50.__name__ = "50%"


def q75(x):
    return x.quantile(0.75)
q75.__name__ = "75%"

In [41]:
df.loc[:, ["u", "v"]].agg(["count", "mean", "std", "min", q25, q50, q75, "max"])

Unnamed: 0,u,v
count,5.0,5.0
mean,0.544,0.244
std,0.241826,1.255838
min,0.18,-1.25
25%,0.53,-0.94
50%,0.53,0.74
75%,0.63,1.18
max,0.85,1.49


In [42]:
df.describe()

Unnamed: 0,u,v
count,5.0,5.0
mean,0.544,0.244
std,0.241826,1.255838
min,0.18,-1.25
25%,0.53,-0.94
50%,0.53,0.74
75%,0.63,1.18
max,0.85,1.49


# Transform DataFrame

In [43]:
df = pd.DataFrame(
    dict(u = np.round(np.random.rand(5), 2),
         v = np.round(np.random.randn(5), 2),
         w = ["spam", "bacon", "spam", "eggs", "sausage"]),
    index=["a", "b", "c", "d", "e"]
)

df

Unnamed: 0,u,v,w
a,0.29,-0.64,spam
b,0.63,-2.75,bacon
c,0.09,-0.23,spam
d,0.43,-0.7,eggs
e,0.43,-1.77,sausage


In [44]:
np.exp(df.u)

a    1.336427
b    1.877611
c    1.094174
d    1.537258
e    1.537258
Name: u, dtype: float64

In [45]:
np.exp(df.loc[:, ["u", "v"]])

Unnamed: 0,u,v
a,1.336427,0.527292
b,1.877611,0.063928
c,1.094174,0.794534
d,1.537258,0.496585
e,1.537258,0.170333


In [46]:
u = df.u

# standardized version of u
u_s = (u - np.mean(u)) / np.std(u)

print(u_s)

a   -0.470280
b    1.433235
c   -1.589995
d    0.313520
e    0.313520
Name: u, dtype: float64


In [47]:
df.loc[:, "u"] > df.loc[:, "v"]

a    True
b    True
c    True
d    True
e    True
dtype: bool

In [48]:
# standardized version of u and v
df.loc[:, ["u", "v"]].apply( lambda x: (x-np.mean(x))/np.std(x) )

Unnamed: 0,u,v
a,-0.47028,0.628389
b,1.433235,-1.665556
c,-1.589995,1.074131
d,0.31352,0.563158
e,0.31352,-0.600122


In [49]:
df.loc[:, ["u_s", "v_s"]] = np.array(df.loc[:, ["u", "v"]].apply(lambda x: (x-np.mean(x))/np.std(x)))
df

Unnamed: 0,u,v,w,u_s,v_s
a,0.29,-0.64,spam,-0.47028,0.628389
b,0.63,-2.75,bacon,1.433235,-1.665556
c,0.09,-0.23,spam,-1.589995,1.074131
d,0.43,-0.7,eggs,0.31352,0.563158
e,0.43,-1.77,sausage,0.31352,-0.600122


In [50]:
df.loc[:, "uv_squared"] = (df.loc[:, "u"] * df.loc[:, "v"])**2
df

Unnamed: 0,u,v,w,u_s,v_s,uv_squared
a,0.29,-0.64,spam,-0.47028,0.628389,0.034447
b,0.63,-2.75,bacon,1.433235,-1.665556,3.001556
c,0.09,-0.23,spam,-1.589995,1.074131,0.000428
d,0.43,-0.7,eggs,0.31352,0.563158,0.090601
e,0.43,-1.77,sausage,0.31352,-0.600122,0.579273


# Indexing Series

In [51]:
np.random.seed(123)
b = pd.Series(np.round(np.random.rand(10), 2))
b

0    0.70
1    0.29
2    0.23
3    0.55
4    0.72
5    0.42
6    0.98
7    0.68
8    0.48
9    0.39
dtype: float64

In [52]:
new_index = np.random.permutation(np.arange(10))
new_index

array([2, 1, 8, 7, 9, 4, 5, 6, 3, 0])

In [53]:
b.index = new_index
b

2    0.70
1    0.29
8    0.23
7    0.55
9    0.72
4    0.42
5    0.98
6    0.68
3    0.48
0    0.39
dtype: float64

In [54]:
c = b.copy()
c.index = list("abcdefghij")
c

a    0.70
b    0.29
c    0.23
d    0.55
e    0.72
f    0.42
g    0.98
h    0.68
i    0.48
j    0.39
dtype: float64

## .loc

In [55]:
print(b.loc[0])
print(c.loc["j"])

0.39
0.39


In [56]:
print(b.loc[[0, 1, 0]])

0    0.39
1    0.29
0    0.39
dtype: float64


In [57]:
print(c.loc[["j", "b", "j"]])

j    0.39
b    0.29
j    0.39
dtype: float64


In [58]:
b.loc[(b > 0.4) & (b < 0.6)]

7    0.55
4    0.42
3    0.48
dtype: float64

## .iloc

In [59]:
print(b.iloc[0])

0.7


In [60]:
print(b.iloc[0:7]) # returns the 1st, 2nd, ..., 7th element

2    0.70
1    0.29
8    0.23
7    0.55
9    0.72
4    0.42
5    0.98
dtype: float64


In [61]:
print(b.iloc[1:7]) # returns the 2nd, 3rd, ..., 7th element

1    0.29
8    0.23
7    0.55
9    0.72
4    0.42
5    0.98
dtype: float64


# Indexing DataFrame

In [62]:
np.random.seed(123)
df = pd.DataFrame(dict(
    u = np.round(np.random.rand(5), 2),
    v = np.round(np.random.randn(5), 2),
    w = ["spam", "bacon", "spam", "eggs", "sausage"],
    x = [True, False, True, False, True]
))

df

Unnamed: 0,u,v,w,x
0,0.7,0.32,spam,True
1,0.29,-0.05,bacon,False
2,0.23,-0.2,spam,True
3,0.55,1.98,eggs,False
4,0.72,-1.62,sausage,True


In [63]:
df.loc[:, "u":"v"]

Unnamed: 0,u,v
0,0.7,0.32
1,0.29,-0.05
2,0.23,-0.2
3,0.55,1.98
4,0.72,-1.62


In [64]:
df.loc[ df.loc[:, "u"] > 0.5, "u":"v"]

Unnamed: 0,u,v
0,0.7,0.32
3,0.55,1.98
4,0.72,-1.62


In [65]:
df.iloc[:3, :]

Unnamed: 0,u,v,w,x
0,0.7,0.32,spam,True
1,0.29,-0.05,bacon,False
2,0.23,-0.2,spam,True


In [66]:
df.iloc[:3, :].loc[:, ["u", "v"]] # first 3 rows

Unnamed: 0,u,v
0,0.7,0.32
1,0.29,-0.05
2,0.23,-0.2


In [67]:
# DIFFERENT FROM THE CODE ABOVE
df.loc[:3, ["u", "v"]] # all rows up to index "3"

Unnamed: 0,u,v
0,0.7,0.32
1,0.29,-0.05
2,0.23,-0.2
3,0.55,1.98


In [68]:
df.loc[ (df.u > 0.5) & (df.u <= 0.7), ["u", "w"]]

Unnamed: 0,u,w
0,0.7,spam
3,0.55,eggs


In [69]:
df.drop(columns=["v"])

Unnamed: 0,u,w,x
0,0.7,spam,True
1,0.29,bacon,False
2,0.23,spam,True
3,0.55,eggs,False
4,0.72,sausage,True


In [70]:
df.loc[df.w.isin(["spam", "bacon"]), :]

Unnamed: 0,u,v,w,x
0,0.7,0.32,spam,True
1,0.29,-0.05,bacon,False
2,0.23,-0.2,spam,True


In [71]:
df.loc[:, "y"] = df.loc[:, "u"]**2

df

Unnamed: 0,u,v,w,x,y
0,0.7,0.32,spam,True,0.49
1,0.29,-0.05,bacon,False,0.0841
2,0.23,-0.2,spam,True,0.0529
3,0.55,1.98,eggs,False,0.3025
4,0.72,-1.62,sausage,True,0.5184


In [72]:
df.insert(1, "new_col", df.u*-1)
df

Unnamed: 0,u,new_col,v,w,x,y
0,0.7,-0.7,0.32,spam,True,0.49
1,0.29,-0.29,-0.05,bacon,False,0.0841
2,0.23,-0.23,-0.2,spam,True,0.0529
3,0.55,-0.55,1.98,eggs,False,0.3025
4,0.72,-0.72,-1.62,sausage,True,0.5184


In [73]:
df2 = pd.DataFrame(dict(
    u = [2],
    new_col = [-2],
    v = [0],
    w = ["cheese"],
    x = [False],
    y = [0],
), index=[999])

df2

Unnamed: 0,u,new_col,v,w,x,y
999,2,-2,0,cheese,False,0


In [74]:
pd.concat([df, df2])

Unnamed: 0,u,new_col,v,w,x,y
0,0.7,-0.7,0.32,spam,True,0.49
1,0.29,-0.29,-0.05,bacon,False,0.0841
2,0.23,-0.23,-0.2,spam,True,0.0529
3,0.55,-0.55,1.98,eggs,False,0.3025
4,0.72,-0.72,-1.62,sausage,True,0.5184
999,2.0,-2.0,0.0,cheese,False,0.0


# Sampling & Splitting

In [75]:
body

Unnamed: 0,BMXWT,BMXHT,BMXARML,BMXLEG,BMXARMC,BMXHIP,BMXWAIST
0,97.1,160.2,34.7,40.8,35.8,126.1,117.9
1,91.1,152.7,33.5,33.0,38.5,125.5,103.1
2,73.0,161.2,37.4,38.0,31.8,106.2,92.0
3,61.7,157.4,38.0,34.7,29.0,101.0,90.5
4,55.4,154.6,34.6,34.0,28.3,92.5,73.2
...,...,...,...,...,...,...,...
4216,66.8,157.0,32.6,38.4,30.7,103.8,92.5
4217,116.9,167.4,42.2,43.0,40.7,128.4,120.0
4218,73.0,159.6,36.2,37.0,31.4,104.6,99.3
4219,78.6,168.5,38.1,40.2,36.0,102.4,98.5


## Select N of rows

In [76]:
# 5 rows without replacement
body.sample(5, random_state=123)

Unnamed: 0,BMXWT,BMXHT,BMXARML,BMXLEG,BMXARMC,BMXHIP,BMXWAIST
4214,58.4,156.2,35.2,34.7,27.2,99.5,77.5
3361,73.7,161.0,36.5,34.5,29.0,107.6,98.2
3759,61.4,164.6,37.5,40.4,26.9,93.5,84.4
3733,120.4,158.8,33.5,34.6,40.5,147.2,129.3
1121,123.5,157.5,35.5,29.0,50.5,143.0,136.4


## Select % of rows

In [77]:
body2 = body.head(10)
body2

Unnamed: 0,BMXWT,BMXHT,BMXARML,BMXLEG,BMXARMC,BMXHIP,BMXWAIST
0,97.1,160.2,34.7,40.8,35.8,126.1,117.9
1,91.1,152.7,33.5,33.0,38.5,125.5,103.1
2,73.0,161.2,37.4,38.0,31.8,106.2,92.0
3,61.7,157.4,38.0,34.7,29.0,101.0,90.5
4,55.4,154.6,34.6,34.0,28.3,92.5,73.2
5,62.0,144.7,32.5,34.2,29.8,106.7,84.8
6,66.2,166.5,37.5,37.6,32.0,96.3,95.7
7,75.9,154.5,35.4,37.6,32.7,107.7,98.7
8,77.2,159.2,38.5,40.5,35.7,102.0,97.5
9,91.6,174.5,36.1,45.9,35.2,121.3,100.3


In [78]:
print(id(body))
print(id(body2))

1747795306720
1747801852368


In [79]:
idx = np.random.permutation(body2.shape[0])
print(idx)

[2 1 8 7 0 4 5 6 3 9]


In [80]:
# select 80% rows, with replacement
k = int(body2.shape[0]*0.8)
body2.iloc[idx[:k], :]

Unnamed: 0,BMXWT,BMXHT,BMXARML,BMXLEG,BMXARMC,BMXHIP,BMXWAIST
2,73.0,161.2,37.4,38.0,31.8,106.2,92.0
1,91.1,152.7,33.5,33.0,38.5,125.5,103.1
8,77.2,159.2,38.5,40.5,35.7,102.0,97.5
7,75.9,154.5,35.4,37.6,32.7,107.7,98.7
0,97.1,160.2,34.7,40.8,35.8,126.1,117.9
4,55.4,154.6,34.6,34.0,28.3,92.5,73.2
5,62.0,144.7,32.5,34.2,29.8,106.7,84.8
6,66.2,166.5,37.5,37.6,32.0,96.3,95.7


In [81]:
# select the other 20%
body2.iloc[idx[k:], :]

Unnamed: 0,BMXWT,BMXHT,BMXARML,BMXLEG,BMXARMC,BMXHIP,BMXWAIST
3,61.7,157.4,38.0,34.7,29.0,101.0,90.5
9,91.6,174.5,36.1,45.9,35.2,121.3,100.3


# Method chaining

In [82]:
air = pd.read_csv("data/air_quality_2018_means.csv", comment="#")

In [83]:
air.head()

Unnamed: 0,sp_name,param_id,value
0,Alphington,API,0.542584
1,Alphington,BPM2.5,7.848758
2,Alphington,CO,0.210884
3,Alphington,HPM10,17.835714
4,Alphington,NO2,9.55812


In [84]:
print(f"Shape: {air.shape}")
print(f"\ndtype:\n{air.dtypes}")

Shape: (56, 3)

dtype:
sp_name      object
param_id     object
value       float64
dtype: object


In [85]:
air.param_id.isin(["BPM2.5", "NO2"]).head(10)

0    False
1     True
2    False
3    False
4     True
5    False
6    False
7    False
8    False
9     True
Name: param_id, dtype: bool

In [86]:
air.loc[air.param_id.isin(["BPM2.5", "NO2"]), :]

Unnamed: 0,sp_name,param_id,value
1,Alphington,BPM2.5,7.848758
4,Alphington,NO2,9.55812
9,Altona North,NO2,9.467912
13,Churchill,BPM2.5,6.39123
15,Dandenong,NO2,9.800705
18,Footscray,BPM2.5,7.640948
20,Footscray,NO2,10.274531
25,Geelong South,BPM2.5,6.502762
27,Geelong South,NO2,5.681722
31,Melbourne CBD,BPM2.5,8.072998


In [87]:
# We can use backslash "\" or round brackets "()"
air.\
loc[air.param_id.isin(["BPM2.5", "NO2"]), :].\
reset_index(drop=True)

Unnamed: 0,sp_name,param_id,value
0,Alphington,BPM2.5,7.848758
1,Alphington,NO2,9.55812
2,Altona North,NO2,9.467912
3,Churchill,BPM2.5,6.39123
4,Dandenong,NO2,9.800705
5,Footscray,BPM2.5,7.640948
6,Footscray,NO2,10.274531
7,Geelong South,BPM2.5,6.502762
8,Geelong South,NO2,5.681722
9,Melbourne CBD,BPM2.5,8.072998


In [88]:
(
    air.
    loc[air.param_id.isin(["BPM2.5", "NO2"]), :].
    reset_index(drop=True)
)

Unnamed: 0,sp_name,param_id,value
0,Alphington,BPM2.5,7.848758
1,Alphington,NO2,9.55812
2,Altona North,NO2,9.467912
3,Churchill,BPM2.5,6.39123
4,Dandenong,NO2,9.800705
5,Footscray,BPM2.5,7.640948
6,Footscray,NO2,10.274531
7,Geelong South,BPM2.5,6.502762
8,Geelong South,NO2,5.681722
9,Melbourne CBD,BPM2.5,8.072998


In [89]:
air2 = (
    air.
    loc[air.param_id.isin(["BPM2.5", "NO2"]), :].
    reset_index(drop=True)
)

In [90]:
air2

Unnamed: 0,sp_name,param_id,value
0,Alphington,BPM2.5,7.848758
1,Alphington,NO2,9.55812
2,Altona North,NO2,9.467912
3,Churchill,BPM2.5,6.39123
4,Dandenong,NO2,9.800705
5,Footscray,BPM2.5,7.640948
6,Footscray,NO2,10.274531
7,Geelong South,BPM2.5,6.502762
8,Geelong South,NO2,5.681722
9,Melbourne CBD,BPM2.5,8.072998


# Sorting

In [91]:
air2.sort_values("value", ascending=False)

Unnamed: 0,sp_name,param_id,value
6,Footscray,NO2,10.274531
4,Dandenong,NO2,9.800705
1,Alphington,NO2,9.55812
2,Altona North,NO2,9.467912
9,Melbourne CBD,BPM2.5,8.072998
14,Traralgon,BPM2.5,8.024735
0,Alphington,BPM2.5,7.848758
5,Footscray,BPM2.5,7.640948
11,Morwell East,BPM2.5,6.784596
12,Morwell South,BPM2.5,6.512849


In [92]:
air2.sort_values(["param_id", "value"], ascending=[True, False])

Unnamed: 0,sp_name,param_id,value
9,Melbourne CBD,BPM2.5,8.072998
14,Traralgon,BPM2.5,8.024735
0,Alphington,BPM2.5,7.848758
5,Footscray,BPM2.5,7.640948
11,Morwell East,BPM2.5,6.784596
12,Morwell South,BPM2.5,6.512849
7,Geelong South,BPM2.5,6.502762
10,Moe,BPM2.5,6.427079
3,Churchill,BPM2.5,6.39123
6,Footscray,NO2,10.274531


In [93]:
# air2.sort_values("value", ascending=False)
pd.DataFrame(dict(
    a1 = air2.value,
    a2 = np.argsort(air2.value)
))

Unnamed: 0,a1,a2
0,7.848758,13
1,9.55812,8
2,9.467912,15
3,6.39123,3
4,9.800705,10
5,7.640948,7
6,10.274531,12
7,6.502762,11
8,5.681722,5
9,8.072998,0


In [94]:
air2.iloc[np.argsort(air2.value), :]

Unnamed: 0,sp_name,param_id,value
13,Morwell South,NO2,5.12443
8,Geelong South,NO2,5.681722
15,Traralgon,NO2,5.776333
3,Churchill,BPM2.5,6.39123
10,Moe,BPM2.5,6.427079
7,Geelong South,BPM2.5,6.502762
12,Morwell South,BPM2.5,6.512849
11,Morwell East,BPM2.5,6.784596
5,Footscray,BPM2.5,7.640948
0,Alphington,BPM2.5,7.848758


In [95]:
air2.sort_values("value", ascending=True)

Unnamed: 0,sp_name,param_id,value
13,Morwell South,NO2,5.12443
8,Geelong South,NO2,5.681722
15,Traralgon,NO2,5.776333
3,Churchill,BPM2.5,6.39123
10,Moe,BPM2.5,6.427079
7,Geelong South,BPM2.5,6.502762
12,Morwell South,BPM2.5,6.512849
11,Morwell East,BPM2.5,6.784596
5,Footscray,BPM2.5,7.640948
0,Alphington,BPM2.5,7.848758


In [96]:
# The following 2 lines are equivalent
a1 = air2.iloc[np.argsort(air2.value), :]
a2 = air2.sort_values("value", ascending=True)

x1 = pd.Series(['X']).repeat(air2.shape[0])
x1.name = "X"
x1.index = a1.index

pd.concat([a1, x1, a2], axis=1)

Unnamed: 0,sp_name,param_id,value,X,sp_name.1,param_id.1,value.1
13,Morwell South,NO2,5.12443,X,Morwell South,NO2,5.12443
8,Geelong South,NO2,5.681722,X,Geelong South,NO2,5.681722
15,Traralgon,NO2,5.776333,X,Traralgon,NO2,5.776333
3,Churchill,BPM2.5,6.39123,X,Churchill,BPM2.5,6.39123
10,Moe,BPM2.5,6.427079,X,Moe,BPM2.5,6.427079
7,Geelong South,BPM2.5,6.502762,X,Geelong South,BPM2.5,6.502762
12,Morwell South,BPM2.5,6.512849,X,Morwell South,BPM2.5,6.512849
11,Morwell East,BPM2.5,6.784596,X,Morwell East,BPM2.5,6.784596
5,Footscray,BPM2.5,7.640948,X,Footscray,BPM2.5,7.640948
0,Alphington,BPM2.5,7.848758,X,Alphington,BPM2.5,7.848758


# Stacking and Unstacking (Long and Wide Forms)

In [97]:
# Long format (stacked)
air2

Unnamed: 0,sp_name,param_id,value
0,Alphington,BPM2.5,7.848758
1,Alphington,NO2,9.55812
2,Altona North,NO2,9.467912
3,Churchill,BPM2.5,6.39123
4,Dandenong,NO2,9.800705
5,Footscray,BPM2.5,7.640948
6,Footscray,NO2,10.274531
7,Geelong South,BPM2.5,6.502762
8,Geelong South,NO2,5.681722
9,Melbourne CBD,BPM2.5,8.072998


In [98]:
# Wide format (unstacked)
air2.set_index(["sp_name", "param_id"])

Unnamed: 0_level_0,Unnamed: 1_level_0,value
sp_name,param_id,Unnamed: 2_level_1
Alphington,BPM2.5,7.848758
Alphington,NO2,9.55812
Altona North,NO2,9.467912
Churchill,BPM2.5,6.39123
Dandenong,NO2,9.800705
Footscray,BPM2.5,7.640948
Footscray,NO2,10.274531
Geelong South,BPM2.5,6.502762
Geelong South,NO2,5.681722
Melbourne CBD,BPM2.5,8.072998


In [99]:
# Wide format (unstacked)
air2.set_index(["sp_name", "param_id"]).unstack()

Unnamed: 0_level_0,value,value
param_id,BPM2.5,NO2
sp_name,Unnamed: 1_level_2,Unnamed: 2_level_2
Alphington,7.848758,9.55812
Altona North,,9.467912
Churchill,6.39123,
Dandenong,,9.800705
Footscray,7.640948,10.274531
Geelong South,6.502762,5.681722
Melbourne CBD,8.072998,
Moe,6.427079,
Morwell East,6.784596,
Morwell South,6.512849,5.12443


In [100]:
# Wide format (unstacked)
# .loc[...] to drop the last level of the hierarchy
air2_wide = air2.set_index(["sp_name", "param_id"]).unstack().loc[:, "value"]
air2_wide

param_id,BPM2.5,NO2
sp_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alphington,7.848758,9.55812
Altona North,,9.467912
Churchill,6.39123,
Dandenong,,9.800705
Footscray,7.640948,10.274531
Geelong South,6.502762,5.681722
Melbourne CBD,8.072998,
Moe,6.427079,
Morwell East,6.784596,
Morwell South,6.512849,5.12443


In [101]:
# Back to the long format
air2_wide.T

sp_name,Alphington,Altona North,Churchill,Dandenong,Footscray,Geelong South,Melbourne CBD,Moe,Morwell East,Morwell South,Traralgon
param_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BPM2.5,7.848758,,6.39123,,7.640948,6.502762,8.072998,6.427079,6.784596,6.512849,8.024735
NO2,9.55812,9.467912,,9.800705,10.274531,5.681722,,,,5.12443,5.776333


In [102]:
# Back to the long format
air2_wide.T.rename_axis(index="location", columns="param")

param,Alphington,Altona North,Churchill,Dandenong,Footscray,Geelong South,Melbourne CBD,Moe,Morwell East,Morwell South,Traralgon
location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
BPM2.5,7.848758,,6.39123,,7.640948,6.502762,8.072998,6.427079,6.784596,6.512849,8.024735
NO2,9.55812,9.467912,,9.800705,10.274531,5.681722,,,,5.12443,5.776333


In [103]:
# Back to the long format
air2_wide.T.rename_axis(index="location", columns="param").\
stack()

location  param        
BPM2.5    Alphington        7.848758
          Churchill         6.391230
          Footscray         7.640948
          Geelong South     6.502762
          Melbourne CBD     8.072998
          Moe               6.427079
          Morwell East      6.784596
          Morwell South     6.512849
          Traralgon         8.024735
NO2       Alphington        9.558120
          Altona North      9.467912
          Dandenong         9.800705
          Footscray        10.274531
          Geelong South     5.681722
          Morwell South     5.124430
          Traralgon         5.776333
dtype: float64

In [104]:
# Back to the long format
air2_wide.T.rename_axis(index="location", columns="param").\
stack().rename("value")

location  param        
BPM2.5    Alphington        7.848758
          Churchill         6.391230
          Footscray         7.640948
          Geelong South     6.502762
          Melbourne CBD     8.072998
          Moe               6.427079
          Morwell East      6.784596
          Morwell South     6.512849
          Traralgon         8.024735
NO2       Alphington        9.558120
          Altona North      9.467912
          Dandenong         9.800705
          Footscray        10.274531
          Geelong South     5.681722
          Morwell South     5.124430
          Traralgon         5.776333
Name: value, dtype: float64

In [105]:
# Back to the long format
air2_wide.T.rename_axis(index="location", columns="param").\
stack().rename("value").reset_index()

Unnamed: 0,location,param,value
0,BPM2.5,Alphington,7.848758
1,BPM2.5,Churchill,6.39123
2,BPM2.5,Footscray,7.640948
3,BPM2.5,Geelong South,6.502762
4,BPM2.5,Melbourne CBD,8.072998
5,BPM2.5,Moe,6.427079
6,BPM2.5,Morwell East,6.784596
7,BPM2.5,Morwell South,6.512849
8,BPM2.5,Traralgon,8.024735
9,NO2,Alphington,9.55812


# Merging

In [106]:
A = pd.DataFrame({
    "x": ["a0", "a1", "a2", "a3"],
    "y": ["b0", "b1", "b2", "b3"]
})

A

Unnamed: 0,x,y
0,a0,b0
1,a1,b1
2,a2,b2
3,a3,b3


In [107]:
B = pd.DataFrame({
    "x": ["a0", "a2", "a2", "a4"],
    "z": ["c0", "c1", "c2", "c3"]
})

B

Unnamed: 0,x,z
0,a0,c0
1,a2,c1
2,a2,c2
3,a4,c3


In [108]:
# Inner join
pd.merge(A, B, on="x")

Unnamed: 0,x,y,z
0,a0,b0,c0
1,a2,b2,c1
2,a2,b2,c2


In [109]:
# Left join
pd.merge(A, B, how="left", on="x")

Unnamed: 0,x,y,z
0,a0,b0,c0
1,a1,b1,
2,a2,b2,c1
3,a2,b2,c2
4,a3,b3,


In [110]:
# Right join
pd.merge(A, B, how="right", on="x")

Unnamed: 0,x,y,z
0,a0,b0,c0
1,a2,b2,c1
2,a2,b2,c2
3,a4,,c3


In [111]:
# Full outer join
pd.merge(A, B, how="outer", on="x")

Unnamed: 0,x,y,z
0,a0,b0,c0
1,a1,b1,
2,a2,b2,c1
3,a2,b2,c2
4,a3,b3,
5,a4,,c3
