[Reference](https://towardsdatascience.com/11-useful-pandas-functionalities-you-might-have-overlooked-ad080527c768)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy frame: three groups in "x" with missing values scattered through "y".
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df = pd.DataFrame(
    data={
        "x": ["a", "a", "a", "b", "b", "b", "c"],
        "y": [np.nan, 2, 3, 1, np.nan, 3, np.nan],
    }
)
# Row at position 0 within each group of "x" (missing values are not skipped).
df.groupby("x").nth(0)

Unnamed: 0_level_0,y
x,Unnamed: 1_level_1
a,
b,1.0
c,


In [3]:
# Rows at positions 0 and 2 within each group of "x".
df.groupby(by="x").nth(n=[0, 2])

Unnamed: 0_level_0,y
x,Unnamed: 1_level_1
a,
a,3.0
b,1.0
b,3.0
c,


In [4]:
# Feature frame that still carries the target column "y".
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
X = pd.DataFrame(
    data={
        "x": ["a", "a", "a", "b", "b", "b", "c"],
        "y": [np.nan, 2, 3, 1, np.nan, 3, np.nan],
    }
)

# pop() removes "y" from X in place and hands it back as a Series.
y = X.pop("y")

print(X.shape)
print(y.shape)

(7, 1)
(7,)


In [5]:
# Two frames that agree everywhere except the last row of "col_1".
a = pd.DataFrame({"col_1": [1, 2, 3, 4], "col_2": [5, 6, 7, 8]})
b = pd.DataFrame({"col_1": [1, 2, 3, 9], "col_2": [5, 6, 7, 8]})

# compare() reports only the cells where the two frames differ.
a.compare(other=b)

Unnamed: 0_level_0,col_1,col_1
Unnamed: 0_level_1,self,other
3,4.0,9.0


In [6]:
# Comparing a frame with itself: there are no differing cells to report.
a.compare(other=a)

In [7]:
# keep_shape=True keeps every row and column; cells that match show as NaN.
a.compare(other=b, keep_shape=True)

Unnamed: 0_level_0,col_1,col_1,col_2,col_2
Unnamed: 0_level_1,self,other,self,other
0,,,,
1,,,,
2,,,,
3,4.0,9.0,,


In [8]:
# keep_equal=True additionally shows the matching values instead of NaN.
a.compare(other=b, keep_shape=True, keep_equal=True)

Unnamed: 0_level_0,col_1,col_1,col_2,col_2
Unnamed: 0_level_1,self,other,self,other
0,1,1,5,5
1,2,2,6,6
2,3,3,7,7
3,4,9,8,8


In [9]:
# Six-row frame; "y" is split off as the target column.
X = pd.DataFrame(
    dict(
        a=[1, 2, 3, 4, 5, 6],
        b=[2, 3, 4, 5, 6, 7],
        y=[3, 4, 5, 6, 7, 8],
    )
)
y = X.pop("y")

# Keep rows 0, 3 and 5 only, so X no longer lines up with y.
X = X.iloc[[0, 3, 5]]
X

Unnamed: 0,a,b
0,1,2
3,4,5
5,6,7


In [10]:
# An inner join on the index keeps only the labels present in both objects.
y, X = y.align(other=X, join="inner")
# Element-wise check that both indexes now match.
y.index == X.index

array([ True,  True,  True])

In [11]:
# Same toy frame as before, rebuilt for the export examples.
# np.nan is used instead of the np.NaN alias, which NumPy 2.0 removed.
df = pd.DataFrame(
    data={
        "x": ["a", "a", "a", "b", "b", "b", "c"],
        "y": [np.nan, 2, 3, 1, np.nan, 3, np.nan],
    }
)
# Render the frame, index included, as a Markdown table.
print(df.to_markdown(index=True))

|    | x   |   y |
|---:|:----|----:|
|  0 | a   | nan |
|  1 | a   |   2 |
|  2 | a   |   3 |
|  3 | b   |   1 |
|  4 | b   | nan |
|  5 | b   |   3 |
|  6 | c   | nan |


In [12]:
# The same export, switched to a LaTeX tabular via the tablefmt option.
print(df.to_markdown(index=True, tablefmt="latex"))

\begin{tabular}{rlr}
\hline
    & x   &   y \\
\hline
  0 & a   & nan \\
  1 & a   &   2 \\
  2 & a   &   3 \\
  3 & b   &   1 \\
  4 & b   & nan \\
  5 & b   &   3 \\
  6 & c   & nan \\
\hline
\end{tabular}


In [13]:
# Mixed-type frame: integers, floats with gaps, strings with a gap,
# booleans stored as object, and plain strings.
df = pd.DataFrame(
    data=dict(
        a=[1, 2, 3, 4, 5],
        b=[1, 2, np.nan, 4, 5],
        c=["x", "y", np.nan, "x", "y"],
        d=pd.Series([True, False, True, True, False], dtype="object"),
        e=[np.nan, 100.5, 200, 200, 100],
        f=["a", "b", "c", "a", "c"],
    )
)
df

Unnamed: 0,a,b,c,d,e,f
0,1,1.0,x,True,,a
1,2,2.0,y,False,100.5,b
2,3,,,True,200.0,c
3,4,4.0,x,True,200.0,a
4,5,5.0,y,False,100.0,c


In [14]:
# Column dtypes as inferred when the frame was constructed.
df.dtypes

a      int64
b    float64
c     object
d     object
e    float64
f     object
dtype: object

In [15]:
# convert_dtypes() returns a new frame whose columns use pandas' nullable
# extension dtypes where possible; the source frame is left untouched.
df_2 = df.convert_dtypes(infer_objects=True)
df_2

Unnamed: 0,a,b,c,d,e,f
0,1,1.0,x,True,,a
1,2,2.0,y,False,100.5,b
2,3,,,True,200.0,c
3,4,4.0,x,True,200.0,a
4,5,5.0,y,False,100.0,c


In [16]:
# Column dtypes after convert_dtypes().
df_2.dtypes

a      Int64
b      Int64
c     string
d    boolean
e    Float64
f     string
dtype: object

In [17]:
# Same conversion, but with the conversion to the nullable boolean dtype
# switched off.
df_3 = df.convert_dtypes(infer_objects=True, convert_boolean=False)
df_3

Unnamed: 0,a,b,c,d,e,f
0,1,1.0,x,1,,a
1,2,2.0,y,0,100.5,b
2,3,,,1,200.0,c
3,4,4.0,x,1,200.0,a
4,5,5.0,y,0,100.0,c


In [19]:
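# Not executed (the current df has no "x" column):
# df["x"].astype("category") would cast a column to an unordered categorical dtype.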

In [20]:
from pandas.api.types import CategoricalDtype

# Clothing sizes with their sales figures; sizes start out as plain strings.
df = pd.DataFrame(
    data={
        "size": ["XL", "S", "M", "XS", "L"],
        "sales": [50, 10, 20, 90, 100],
    }
)

# Ordered categorical dtype: categories are listed from smallest to largest,
# which is what lets sorting and comparisons follow the size order.
categories = CategoricalDtype(categories=["XS", "S", "M", "L", "XL"], ordered=True)

df["size"] = df["size"].astype(dtype=categories)
df

Unnamed: 0,size,sales
0,XL,50
1,S,10
2,M,20
3,XS,90
4,L,100


In [21]:
# Sorting follows the category order (XS < S < M < L < XL), not the alphabet.
df.sort_values(by=["size"])

Unnamed: 0,size,sales
3,XS,90
1,S,10
2,M,20
4,L,100
0,XL,50


In [22]:
# Ordered categories support comparisons: keep the rows whose size is above "M".
df.loc[df["size"] > "M"]

Unnamed: 0,size,sales
0,XL,50
4,L,100


In [23]:
# 10,000,000 x 5 frame of random integers drawn from [0, 100).
# NOTE(review): no seed is set, so the values differ on every run.
df = pd.DataFrame(np.random.randint(0, 100, size=(10000000, 5)))
# Zero out every value <= 90 so that most of the frame is 0.
df[df <= 90] = 0

In [24]:
def memory_usage(df):
    """Return the total memory footprint of ``df`` in MiB, rounded to 2 decimals.

    The per-column byte counts come from ``DataFrame.memory_usage(deep=True)``,
    so object columns are measured by their actual contents.
    """
    total_bytes = df.memory_usage(deep=True).sum()
    bytes_per_mib = 1024 ** 2
    return round(total_bytes / bytes_per_mib, 2)

In [25]:
# Baseline footprint with the default integer dtype.
memory_usage(df=df)

381.47

In [26]:
# All values lie in [0, 100), so one unsigned byte per cell is enough.
df_1 = df.astype(dtype="uint8")
memory_usage(df=df_1)

47.68

In [27]:
# Sparse uint8 with fill value 0: the zeros that dominate the frame are not
# stored individually.
sparse_uint8 = pd.SparseDtype(dtype="uint8", fill_value=0)
df_2 = df.astype(dtype=sparse_uint8)
memory_usage(df=df_2)

21.47

In [28]:
# Synthetic sales records used by the crosstab examples.
# A seeded Generator makes the frame, and every table derived from it,
# identical on each run; the unseeded np.random calls gave different counts
# every time the cell was executed.
N = 1000
rng = np.random.default_rng(42)
df = pd.DataFrame({
    "group": rng.choice(["AA", "BB"], N),
    # p gives the sampling probability of each label, in order.
    "region": rng.choice(["a", "b", "c"], N, p=[0.5, 0.3, 0.2]),
    "category": rng.choice(["x", "y", "z"], N, p=[0.3, 0.3, 0.4]),
    # Normally distributed sales: mean 1000, standard deviation 50.
    "sales": rng.normal(1000, 50, N)
})
df

Unnamed: 0,group,region,category,sales
0,BB,c,x,1025.874952
1,BB,b,z,988.441799
2,AA,b,z,1037.014329
3,BB,a,x,983.474403
4,BB,b,y,1095.410817
...,...,...,...,...
995,BB,a,z,1060.178121
996,AA,a,y,1107.760925
997,BB,b,x,938.843566
998,BB,a,z,962.499966


In [29]:
# Frequency table: regions as rows, categories as columns.
pd.crosstab(index=df["region"], columns=df["category"])

category,x,y,z
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,160,154,181
b,88,81,122
c,69,66,79


In [30]:
# Passing a list for the columns produces a two-level column index (group, category).
pd.crosstab(index=df["region"], columns=[df["group"], df["category"]])

group,AA,AA,AA,BB,BB,BB
category,x,y,z,x,y,z
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
a,78,76,82,82,78,99
b,51,42,40,37,39,82
c,33,32,48,36,34,31


In [31]:
# margins=True appends an "All" row and column holding the totals.
pd.crosstab(index=df["region"], columns=df["category"], margins=True)

category,x,y,z,All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a,160,154,181,495
b,88,81,122,291
c,69,66,79,214
All,317,301,382,1000


In [32]:
# normalize=True turns the counts into fractions of the grand total.
pd.crosstab(index=df["region"], columns=df["category"], normalize=True)

category,x,y,z
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,0.16,0.154,0.181
b,0.088,0.081,0.122
c,0.069,0.066,0.079


In [33]:
# Aggregate a value column instead of counting: mean sales for each
# region/category pair, rounded to two decimals.
pd.crosstab(
    index=df["region"],
    columns=df["category"],
    values=df["sales"],
    aggfunc="mean",
).round(decimals=2)

category,x,y,z
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
a,999.99,1004.3,994.48
b,998.86,997.52,1004.0
c,996.33,1003.18,1001.91


In [34]:
# Counts with a two-level row index (group, category) and regions as columns.
df_agg = pd.crosstab(index=[df["group"], df["category"]], columns=df["region"])
df_agg

Unnamed: 0_level_0,region,a,b,c
group,category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AA,x,78,51,33
AA,y,76,42,32
AA,z,82,40,48
BB,x,82,37,36
BB,y,78,39,34
BB,z,99,82,31


In [35]:
# Swap the two row-index levels: (group, category) becomes (category, group).
# The rows keep their original order.
df_agg.swaplevel(i=-2, j=-1)

Unnamed: 0_level_0,region,a,b,c
category,group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
x,AA,78,51,33
y,AA,76,42,32
z,AA,82,40,48
x,BB,82,37,36
y,BB,78,39,34
z,BB,99,82,31


In [36]:
# Daily index covering the whole of 2023, with a running day counter as the
# only column (0 for 1 January).
df = pd.DataFrame(index=pd.date_range(start="2023-01-01", end="2023-12-31"))
df["value"] = list(range(df.shape[0]))
df

Unnamed: 0,value
2023-01-01,0
2023-01-02,1
2023-01-03,2
2023-01-04,3
2023-01-05,4
...,...
2023-12-27,360
2023-12-28,361
2023-12-29,362
2023-12-30,363


In [37]:
# Rows per weekly bin, first five bins only.
df.resample(rule="W").count().head(n=5)

Unnamed: 0,value
2023-01-01,1
2023-01-08,7
2023-01-15,7
2023-01-22,7
2023-01-29,7


In [38]:
# Monthly totals, each labelled with the first day of its month ("MS").
df.resample(rule="MS").sum()

Unnamed: 0,value
2023-01-01,465
2023-02-01,1246
2023-03-01,2294
2023-04-01,3135
2023-05-01,4185
2023-06-01,4965
2023-07-01,6076
2023-08-01,7037
2023-09-01,7725
2023-10-01,8928


In [39]:
# Maximum per four-month bin, labelled by month end.
# NOTE(review): newer pandas releases deprecate the "M" alias in favour of
# "ME"; confirm against the pandas version this notebook targets.
df.resample(rule="4M").max()

Unnamed: 0,value
2023-01-31,30
2023-05-31,150
2023-09-30,272
2024-01-31,364
