## 8. Grouping and Aggregation

- **Grouping**: `groupby()`
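- **Aggregation**: `agg()` (alias: `aggregate()`) reduces each group to a single row; `transform()` instead returns a result with the same index as the original data, so it can be assigned back as a new column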

In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd

# Load the California Housing dataset; as_frame=True makes the returned
# bunch expose its contents as pandas objects.
housing = fetch_california_housing(as_frame=True)
df = housing.frame  # single DataFrame; its columns are listed by the print below

# Show the available column names before picking a grouping key.
print(df.columns)

# Split the rows into one group per distinct HouseAge value, then report
# how many rows landed in each group.
grouped = df.groupby(by='HouseAge')
print(grouped.size())

# Bare last expression: the notebook renders the frame as a rich table.
df

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'MedHouseVal'],
      dtype='object')
HouseAge
1.0        4
2.0       58
3.0       62
4.0      191
5.0      244
6.0      160
7.0      175
8.0      206
9.0      205
10.0     264
11.0     254
12.0     238
13.0     302
14.0     412
15.0     512
16.0     771
17.0     698
18.0     570
19.0     502
20.0     465
21.0     446
22.0     399
23.0     448
24.0     478
25.0     566
26.0     619
27.0     488
28.0     471
29.0     461
30.0     476
31.0     458
32.0     565
33.0     615
34.0     689
35.0     824
36.0     862
37.0     537
38.0     394
39.0     369
40.0     304
41.0     296
42.0     368
43.0     353
44.0     356
45.0     294
46.0     245
47.0     198
48.0     177
49.0     134
50.0     136
51.0      48
52.0    1273
dtype: int64


Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [2]:
# Summarise every HouseAge group with a different reducer per column:
# the average MedInc, the largest AveBedrms, and the total Population.
# Keyword ("named") aggregation: output_column=(input_column, reducer).
agg_df = df.groupby('HouseAge').agg(
    MedInc=('MedInc', 'mean'),
    AveBedrms=('AveBedrms', 'max'),
    Population=('Population', 'sum'),
)
print(agg_df)


            MedInc  AveBedrms  Population
HouseAge                                 
1.0       4.003400   2.928571      1314.0
2.0       5.167766   2.222222    120817.0
3.0       5.460258   4.046358    186110.0
4.0       5.180673   3.333333    537314.0
5.0       4.697636   4.703196    603017.0
6.0       4.383393   2.372093    346502.0
7.0       4.464457   4.676471    457289.0
8.0       4.459338   6.304348    415781.0
9.0       4.316425   5.045455    386830.0
10.0      4.050508   7.022321    452212.0
11.0      3.932593  11.181818    505436.0
12.0      3.988412   2.861789    506848.0
13.0      4.150874   7.968750    536457.0
14.0      4.115541  11.410714    765142.0
15.0      4.062305   6.084112    830622.0
16.0      4.293998  10.153846   1143822.0
17.0      3.935827  14.111111   1120882.0
18.0      3.977344   5.273585    916939.0
19.0      3.760389  11.000000    812882.0
20.0      3.778618   5.263158    718697.0
21.0      3.852120   8.053846    739376.0
22.0      3.922650   8.857143    6

In [3]:
def _zscore(values):
    """Standardize one group's values.

    Subtracts the group's mean and divides by the group's standard
    deviation as computed by ``Series.std()`` (pandas defaults to the
    sample standard deviation, ddof=1).
    """
    return (values - values.mean()) / values.std()


# transform() applies the helper per HouseAge group and returns a result
# aligned to df's original index, so it can be stored directly as a column.
income_by_age = df.groupby('HouseAge')['MedInc']
df['NormInc'] = income_by_age.transform(_zscore)

# Preview the grouping key, the raw value, and its within-group z-score.
df[['HouseAge', 'MedInc', 'NormInc']].head()

Unnamed: 0,HouseAge,MedInc,NormInc
0,41.0,8.3252,3.666781
1,21.0,8.3014,2.462188
2,52.0,7.2574,1.459793
3,52.0,5.6431,0.76018
4,52.0,3.8462,-0.018569
