ref: https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats

In [3]:
# Load the titanic table from CSV, then echo (n_rows, n_cols) as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="titanic.csv")
df.shape

(891, 15)

In [4]:
# Load the flights table and index it by a daily period anchored on the
# first day of each (year, month) row.
ts = pd.read_csv("flights.csv")
# Assemble "<year>-<month>-01" strings; %b parses abbreviated month names (Jan, Feb, ...).
first_of_month = ts["year"].astype(str) + "-" + ts["month"].astype(str) + "-01"
ts.index = pd.to_datetime(first_of_month, format="%Y-%b-%d").dt.to_period("D")
ts.shape

(144, 3)

### overview

In [28]:
# Overview starts with the frame's (n_rows, n_cols).
df.shape

(891, 15)

In [75]:
# Transpose the first row so every column name is listed vertically next to a sample value.
df.iloc[:1].transpose()

Unnamed: 0,0
survived,0
pclass,3
sex,male
age,22.0
sibsp,1
parch,0
fare,7.25
embarked,S
class,Third
who,man


In [78]:
# Vertical info view: one row per column of `df`, showing the first record's
# value, the column dtype, and the count of missing values.
# A descriptive name is used instead of `_`: IPython reserves `_` for the
# previous cell's output and stops refreshing it once user code rebinds it.
column_overview = pd.concat([df.head(1).T, df.dtypes, df.isna().sum()], axis=1)
column_overview.columns = ["example_value", "dtypes", "n_null"]
column_overview

Unnamed: 0,example_value,dtypes,n_null
survived,0,int64,0
pclass,3,int64,0
sex,male,object,0
age,22.0,float64,177
sibsp,1,int64,0
parch,0,int64,0
fare,7.25,float64,0
embarked,S,object,2
class,Third,object,0
who,man,object,0


### reshape

In [17]:
# Reference-only cell naming the reshaping methods; nothing is called here.
# Only the last bare expression (pivot_table) is echoed as the cell output.
pd.DataFrame.melt
pd.DataFrame.pivot
pd.DataFrame.pivot_table

<function pandas.core.frame.DataFrame.pivot_table(self, values=None, index=None, columns=None, aggfunc: 'AggFuncType' = 'mean', fill_value=None, margins: 'bool' = False, dropna: 'bool' = True, margins_name: 'Level' = 'All', observed: 'bool | lib.NoDefault' = <no_default>, sort: 'bool' = True) -> 'DataFrame'>

### filtering

In [62]:
# Boolean-mask filtering with .loc. Each comparison is parenthesized before
# being combined with `&`.
# Results get descriptive names rather than `_`, which IPython reserves for
# the last cell output (rebinding it stops IPython from updating it).
age_mask = (df["age"] > 10) & (df["age"] <= 20)
age_10_to_20 = df.loc[age_mask, :]
print(len(age_10_to_20))

# Membership filter: keep rows whose `who` value is one of the listed labels.
men_and_women = df.loc[df["who"].isin(["man", "woman"]), :]
print(len(men_and_women))

115
808


In [60]:
# The same two filters expressed as .query() strings.
# Results get descriptive names rather than `_`, which IPython reserves for
# the last cell output (rebinding it stops IPython from updating it).
age_10_to_20_q = df.query("age > 10 & age <= 20")
print(len(age_10_to_20_q))

men_and_women_q = df.query("who in ['man', 'woman']")
print(len(men_and_women_q))

115
808


In [69]:
# Column filtering: keep only the columns whose name ends with "class",
# then preview the first rows and report the preview's shape.
# A descriptive name replaces `_`, which IPython reserves for the last cell output.
class_columns_preview = df.filter(regex="class$").head()
print(class_columns_preview.shape)

(5, 2)


### group

In [None]:
# Mean of `survived` within each `who` group.
(
    df.groupby(by="who", dropna=False, observed=False)["survived"]
    .agg("mean")
)

who
man      0.163873
child    0.590361
woman    0.756458
Name: survived, dtype: float64

In [34]:
# Per-`who` group size, sum and mean of `survived`; only the mean column is
# rounded to 2 decimals (addressed by its two-level column key).
(
    df.groupby(by="who", dropna=False, observed=False)
    .agg({"survived": ["size", "sum", "mean"]})
    .round({("survived", "mean"): 2})
)

Unnamed: 0_level_0,survived,survived,survived
Unnamed: 0_level_1,size,sum,mean
who,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
child,83,49,0.59
man,537,88,0.16
woman,271,205,0.76


In [33]:
# For every (who, class) pair, pack (group size, mean of `survived` rounded to
# 2 decimals) into a tuple, then pivot the `class` level into columns.
(
    df.groupby(by=["who", "class"], dropna=False, observed=False)["survived"]
    .agg(lambda grp: (len(grp), float(grp.mean().round(2))))
    .unstack()
)

class,First,Second,Third
who,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
child,"(6, 0.83)","(19, 1.0)","(58, 0.43)"
man,"(119, 0.35)","(99, 0.08)","(319, 0.12)"
woman,"(91, 0.98)","(66, 0.91)","(114, 0.49)"


In [None]:
# Bucket `age` into four quantile-based bins. The column is added to `df`
# itself so that it can be used as a grouping key below.
df["age_bin"] = pd.qcut(df["age"], q=4)

# Named aggregation: every key is an output column mapped to
# (input column, aggregation). The dict is unpacked with ** because
# "class" is a Python keyword and cannot be written as a keyword argument.
df.groupby(["age_bin"], dropna=False, observed=False).agg(
    **{
        "n": ("age_bin", "size"),
        "survived_rate": ("survived", "mean"),
        # Share of the group whose `who` value equals "man".
        "pct_man": ("who", lambda x: (x == "man").mean()),
        # Most frequent class in the group. `x` is already the group's
        # `class` values, so no lookup back into `df` is needed.
        # Series.mode() returns every tied value; take the first one so a
        # tie yields a scalar instead of making agg raise.
        "class": ("class", lambda x: x.mode().iloc[0]),
    }
).round({"survived_rate": 2, "pct_man": 2})

Unnamed: 0_level_0,n,survived_rate,pct_man,class
age_bin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"(0.419, 20.125]",179,0.46,0.35,Third
"(20.125, 28.0]",183,0.36,0.66,Third
"(28.0, 38.0]",175,0.43,0.65,Third
"(38.0, 80.0]",177,0.37,0.66,First
,177,0.29,0.7,Third


In [None]:
# Time-based rolling window: mean of `passengers` over a trailing 100-day
# window; positions whose window holds fewer than 4 observations yield NaN.
# "100D" uses the documented uppercase day alias (the lowercase "d" spelling
# is deprecated in newer pandas releases); the computed values are unchanged.
ts.rolling(window="100D", min_periods=4)["passengers"].agg("mean")

1949-01-01       NaN
1949-02-01       NaN
1949-03-01       NaN
1949-04-01    122.75
1949-05-01    125.00
               ...  
1960-08-01    558.75
1960-09-01    567.75
1960-10-01    549.25
1960-11-01    491.25
1960-12-01    447.75
Freq: D, Name: passengers, Length: 144, dtype: float64

In [None]:
# Reference-only: the method is named here, not called.
pd.DataFrame.rank  # applied per group, the result keeps the original shape (presumably via groupby(...).rank() -- TODO confirm)

### mutation

In [None]:
# Reference-only list of Series tools for deriving new values; nothing is
# called here, and only the last bare expression would be echoed.
pd.Series
pd.Series.map
pd.Series.apply
pd.Series.dt  # datetime accessor
pd.Series.str  # string accessor

### join

In [None]:
# Reference-only: the join method is named here, not called.
pd.DataFrame.merge