In [1]:
import pandas as pd

In [138]:
# Load the Titanic passenger table; every cell below indexes into `df`.
# NOTE(review): the preview further down shows an `Unnamed: 0` column, which
# suggests the CSV was written with its index. `index_col=0` would absorb it,
# but the recorded shapes and the `df.columns[0]` example would then change.
df = pd.read_csv("titanic.csv")
df.shape

(891, 15)

In [139]:
# Transpose the five-row preview so every column is shown as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
survived,0,1,1,1,0
pclass,3,1,3,1,3
sex,male,female,female,female,male
age,22.0,38.0,26.0,35.0,35.0
sibsp,1,1,0,1,0
parch,0,0,0,0,0
fare,7.25,71.2833,7.925,53.1,8.05
embarked,S,C,S,S,S
class,Third,First,Third,First,Third
who,man,woman,woman,woman,man


In [140]:
# Per (sex, age) group: head-count and survival rate, mean fare, and the
# first / last / number of distinct embarkation ports.
# A list of functions per column yields two-level (column, function) columns.
agg_spec = {
    "survived": ["size", "mean"],
    "fare": "mean",
    "embarked": ["first", "last", "nunique"],
}
by_sex_and_age = df.groupby(["sex", "age"], dropna=False, observed=False)
df2 = by_sex_and_age.agg(agg_spec)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,survived,survived,fare,embarked,embarked,embarked
Unnamed: 0_level_1,Unnamed: 1_level_1,size,mean,mean,first,last,nunique
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
female,0.75,2,1.000000,19.258300,C,C,1
female,1.00,2,1.000000,13.437500,S,C,2
female,2.00,6,0.333333,43.245833,S,S,1
female,3.00,2,0.500000,31.327100,C,S,2
female,4.00,5,1.000000,22.828340,S,S,2
...,...,...,...,...,...,...,...
male,70.50,1,0.000000,7.750000,Q,Q,1
male,71.00,2,0.000000,42.079200,C,C,1
male,74.00,1,0.000000,7.775000,S,S,1
male,80.00,1,1.000000,30.000000,S,S,1


In [189]:
def parse_res(res):
    """Describe an indexing result as ``"<type name>[, <size>]"``.

    The object is classified by the name of its type only:

    * ``"Series"``    -> the type name followed by the length of ``res.index``
    * ``"DataFrame"`` -> the type name followed by ``res.shape``
    * anything else   -> the bare type name

    Returns:
        str: the one-line description.
    """
    kind = type(res).__name__
    if kind == "Series":
        return f"{kind}, {len(res.index)}"
    if kind == "DataFrame":
        return f"{kind}, {res.shape}"
    return kind


# test: one call per branch of parse_res (scalar, Series, DataFrame)
print(f"{parse_res(df.iloc[0,0]) = }")  # single cell -> scalar
print(f"{parse_res(df[df.columns[0]]) = }")  # single column -> Series
print(f"{parse_res(df[1:3]) = }")  # row slice -> DataFrame

parse_res(df.iloc[0,0]) = 'int64'
parse_res(df[df.columns[0]]) = 'Series, 891'
parse_res(df[1:3]) = 'DataFrame, (2, 15)'


### index operator `[]`

inputs:
- label
- list of labels
- slice object
- boolean array


In [190]:
# index operator on srs
# `survived` values keyed by `fare`, giving a float index with repeated labels.
# Built with set_index so the result is a fresh object: assigning to `.index`
# on the Series returned by df["survived"] would re-label that object in place.
srs = df.set_index("fare")["survived"]
srs

fare
7.2500     0
71.2833    1
7.9250     1
53.1000    1
8.0500     0
          ..
13.0000    0
30.0000    1
23.4500    0
30.0000    1
7.7500     0
Name: survived, Length: 891, dtype: int64

In [191]:
# `[]` on a Series whose index holds float fare values with repeated labels.
print(f"{parse_res(srs[0]) = }")  # label shared by several rows -> Series
print(f"{parse_res(srs[211.5]) = }")  # unique label -> scalar

print(f"{parse_res(srs[[211.5,0]]) = }")  # list of labels -> Series

# NOTE(review): the recorded result has 873 rows, so this slice was evidently
# resolved by label (up to fare 5.0) rather than by position. The stray echo
# of this line in the recorded output looks like a pandas warning; confirm,
# and prefer `.loc[:5]` / `.iloc[:5]` when the intent must be unambiguous.
print(f"{parse_res(srs[:5]) = }")  # slice

print(f"{parse_res(srs[srs >0.5]) = }")  # boolean array -> rows where the mask is True

parse_res(srs[0]) = 'Series, 15'
parse_res(srs[211.5]) = 'int64'
parse_res(srs[[211.5,0]]) = 'Series, 16'
parse_res(srs[:5]) = 'Series, 873'
parse_res(srs[srs >0.5]) = 'Series, 342'


  print(f"{parse_res(srs[:5]) = }") # slice


In [192]:
# index operator on a DataFrame: labels pick columns, while boolean arrays
# and slices pick rows (compare the shapes printed below).
print(f"{parse_res(df['survived']) = }")  # label -> one column as a Series

print(
    f"{parse_res(df[[i for i in df.columns if i.startswith('s')]]) = }"
)  # list of labels -> DataFrame with those columns

print(f"{parse_res(df[df['age']>15]) = }")  # boolean array -> matching rows

print(f"{parse_res(df[1:3]) = }")  # integer slice -> rows, end excluded

parse_res(df['survived']) = 'Series, 891'
parse_res(df[[i for i in df.columns if i.startswith('s')]]) = 'DataFrame, (891, 3)'
parse_res(df[df['age']>15]) = 'DataFrame, (631, 15)'
parse_res(df[1:3]) = 'DataFrame, (2, 15)'


In [193]:
# index operator on multi-index df (df2 has two-level columns)
print(f"{parse_res(df2['survived']) = }")  # level 0 label -> DataFrame of its sub-columns

print(f"{parse_res(df2[('survived','size')]) = }")  # full (level 0, level 1) label -> Series

print(
    f"{parse_res(df2[[i for i in df2.columns if i[0] == 'embarked']]) = }"
)  # list of full labels -> DataFrame

print(f"{parse_res(df2[1:3]) = }")  # integer slice -> rows, end excluded

parse_res(df2['survived']) = 'DataFrame, (147, 2)'
parse_res(df2[('survived','size')]) = 'Series, 147'
parse_res(df2[[i for i in df2.columns if i[0] == 'embarked']]) = 'DataFrame, (147, 3)'
parse_res(df2[1:3]) = 'DataFrame, (2, 6)'



### index operator on the attribute `.loc[]`

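inputs (all label-based):
- a single label
- a list of labels
- a slice object with labels (both endpoints are included)
- a boolean array
- a tuple of the above
- any axis left out of the tuple is treated as the null slice `:`

Note: the index operator on the `.iloc` attribute (`.iloc[]`) takes the same
kinds of input, but based on integer position instead of label.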

In [194]:
# series
# `survived` values keyed by `fare`, giving a float index with repeated labels.
# Built with set_index so the result is a fresh object: assigning to `.index`
# on the Series returned by df["survived"] would re-label that object in place.
srs = df.set_index("fare")["survived"]
srs

fare
7.2500     0
71.2833    1
7.9250     1
53.1000    1
8.0500     0
          ..
13.0000    0
30.0000    1
23.4500    0
30.0000    1
7.7500     0
Name: survived, Length: 891, dtype: int64

In [195]:
# `.loc[]` on the fare-indexed Series: every key is looked up as a label.
print(f"{parse_res(srs.loc) = }")  # loc attr: the indexer object itself

print(f"{parse_res(srs.loc[0]) = }")  # label shared by several rows -> Series
print(f"{parse_res(srs.loc[6.8583]) = }")  # unique label -> scalar

# -1 is treated as a label, not as "last position"; no fare equals -1,
# so the lookup raises KeyError, which is caught and printed on purpose.
try:
    print(f"{parse_res(srs.loc[-1]) = }")
except KeyError as e:
    print(f"KeyError: {e}")

print(f"{parse_res(srs.loc[srs.index[0]]) = }")  # label from integer location

print(f"{parse_res(srs.loc[[0,211.5]]) = }")  # list of label

print(f"{parse_res(srs.loc[6.8583:35.0]) = }")  # slice between two labels

# `survived` only holds 0/1 in the preview, so this mask selects nothing.
print(f"{parse_res(srs.loc[srs > 6]) = }")  # boolean array

# NOTE(review): despite its name, `srs2` is a DataFrame (the recorded result
# of the lookup below is a one-element Series, not a scalar). If a Series
# example is intended here, aggregate the `survived` column directly.
srs2 = df.groupby(["sex", "class"]).agg({"survived": "mean"})
print(f"{parse_res(srs2.loc[('male','First')]) = }")  # tuple of above

parse_res(srs.loc) = '_LocIndexer'
parse_res(srs.loc[0]) = 'Series, 15'
parse_res(srs.loc[6.8583]) = 'int64'
KeyError: -1
parse_res(srs.loc[srs.index[0]]) = 'Series, 13'
parse_res(srs.loc[[0,211.5]]) = 'Series, 16'
parse_res(srs.loc[6.8583:35.0]) = 'Series, 0'
parse_res(srs.loc[srs > 6]) = 'Series, 0'
parse_res(srs2.loc[('male','First')]) = 'Series, 1'


In [196]:
# df: `.loc[]` with a single key selects rows by index label
print(f"{parse_res(df.loc) = }")  # loc attr: the indexer object itself

print(f"{parse_res(df.loc[0]) = }")  # index label -> that row as a Series

print(f"{parse_res(df.loc[[0,1]]) = }")  # list of index label -> DataFrame

print(f"{parse_res(df.loc[1:2]) = }")  # slice: both endpoints included (2 rows)

print(f"{parse_res(df.loc[df['age'] > 6]) = }")  # boolean array -> matching rows

# Grouping by two keys gives a two-level row index; a tuple addresses one
# (sex, class) entry of it.
df3 = df.groupby(["sex", "class"]).agg({"survived": "mean"})
print(f"{parse_res(df3.loc[('male','First')]) = }")  # tuple of above

parse_res(df.loc) = '_LocIndexer'
parse_res(df.loc[0]) = 'Series, 15'
parse_res(df.loc[[0,1]]) = 'DataFrame, (2, 15)'
parse_res(df.loc[1:2]) = 'DataFrame, (2, 15)'
parse_res(df.loc[df['age'] > 6]) = 'DataFrame, (667, 15)'
parse_res(df3.loc[('male','First')]) = 'Series, 1'


### boolean indexing

In [144]:
# Passenger count, survivor count and survival rate for each class.
survival_stats = {"survived": ["size", "sum", "mean"]}
by_class = df.groupby("class", dropna=False, observed=False)
gdf = by_class.agg(survival_stats)
gdf

Unnamed: 0_level_0,survived,survived,survived
Unnamed: 0_level_1,size,sum,mean
class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
First,216,136,0.62963
Second,184,87,0.472826
Third,491,119,0.242363


In [162]:
s = gdf["survived"]["mean"]

# Two spellings of the same filter (De Morgan). The parentheses around each
# comparison are required: `|`, `&` and `~` bind tighter than `<` / `>`.
outside_band = (s > 0.7) | (s < 0.3)
not_inside_band = ~((s <= 0.7) & (s >= 0.3))

gdf.loc[outside_band].index == gdf.loc[not_inside_band].index

array([ True])

### isin

In [167]:
# level arg: test membership against one level of the MultiIndex only
is_male_row = df2.index.isin(["male"], level=0)
male_rows = df2.loc[is_male_row]
male_rows.index.get_level_values(0).unique()

Index(['male'], dtype='object', name='sex')

In [175]:
# df isin dict arg: each column is checked against its own list of allowed
# values; `.all(axis=1)` then keeps the rows where every column matched.
# The mask gets a real name because `_` is the variable IPython uses for the
# previous cell's output, and assigning to it shadows that.
class_and_sex_match = (
    df[["class", "sex"]]
    .isin({"class": ["First", "Third"], "sex": ["male"]})
    .all(axis=1)
)

len(df[class_and_sex_match])

469

### where

In [198]:
# `where` masks the rows that fail the condition instead of dropping them,
# so the result keeps the same shape as `df`.
older_than_50 = df["age"] > 50
df.where(older_than_50, axis="index").shape

(891, 15)

### filter

In [None]:
# select columns based on column name string value
# Patterns refer to columns shown in the preview above, so the cell selects
# something. Both results are returned as one tuple because a cell only
# renders its last expression.
cols_containing_class = df.filter(like="class").columns.tolist()  # substring match
cols_starting_with_s = df.filter(regex="^s").columns.tolist()  # regular-expression match
cols_containing_class, cols_starting_with_s

### str accessor

In [None]:
# select rows based on column string value
# `who` is one of the string columns shown in the preview; the substring
# "man" matches both "man" and "woman". `na=False` maps missing values to
# False so the mask stays strictly boolean and is safe to pass to `.loc`.
df.loc[df["who"].str.contains("man", na=False), :]