In [2]:
df = pd.DataFrame({'A': [1,2,3,4], 'B': [5,6,7,8]})
df

Unnamed: 0,A,B
0,1,5
1,2,6
2,3,7
3,4,8


In [3]:
# 3 statistical and 3 summarization features
df.agg(['mean', 'std', 'var', 'min', 'max', 'count'])

Unnamed: 0,A,B
mean,2.5,6.5
std,1.290994,1.290994
var,1.666667,1.666667
min,1.0,5.0
max,4.0,8.0
count,4.0,4.0


In [7]:
(df > 6).agg(['any', 'all'])

Unnamed: 0,A,B
any,False,True
all,False,False


In [6]:
df

Unnamed: 0,A,B
0,1,5
1,2,6
2,3,7
3,4,8


In [8]:
def range_func(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    ``x`` only has to provide ``max()`` and ``min()`` methods
    (for example a pandas Series or one DataFrame column).
    """
    highest = x.max()
    lowest = x.min()
    return highest - lowest

In [10]:
x = df.agg({'A':range_func})
print(x)

A    3
dtype: int64


In [11]:
df = pd.DataFrame({
    'Category': ['A', 'A', 'B', 'B'],
    'Value': [10,20,30,40]
})
df

Unnamed: 0,Category,Value
0,A,10
1,A,20
2,B,30
3,B,40


In [12]:
df.groupby('Category').agg(['sum', 'count', 'mean'])

Unnamed: 0_level_0,Value,Value,Value
Unnamed: 0_level_1,sum,count,mean
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,30,2,15.0
B,70,2,35.0


In [13]:
df = pd.DataFrame({
    'Team': ['A', 'A', 'B', 'B'],
    'Points': [10,20,30,40]
})

In [14]:
df

Unnamed: 0,Team,Points
0,A,10
1,A,20
2,B,30
3,B,40


In [19]:
grouped = df.groupby('Team')

In [16]:
x = grouped['Points'].sum()

In [17]:
x

Team
A    30
B    70
Name: Points, dtype: int64

In [20]:
# Multi column groupby

df = pd.DataFrame({
    'Team': ['A', 'A', 'B', 'B'],
    'Position': ['Forward', 'Guard', 'Forward', 'Guard'],
    'Points': [10,20,30,40]
})

In [21]:
df

Unnamed: 0,Team,Position,Points
0,A,Forward,10
1,A,Guard,20
2,B,Forward,30
3,B,Guard,40


In [22]:
print(df.groupby(['Team','Position'])['Points'].sum())

Team  Position
A     Forward     10
      Guard       20
B     Forward     30
      Guard       40
Name: Points, dtype: int64


In [23]:
df.groupby('Team').agg({'Points': ['sum', 'mean', 'max']})

Unnamed: 0_level_0,Points,Points,Points
Unnamed: 0_level_1,sum,mean,max
Team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
A,30,15.0,20
B,70,35.0,40


In [24]:
# Custom aggregation using groupby
def range_func(x):
    """Spread of the values in ``x`` (maximum minus minimum), usable as a custom aggregator."""
    upper, lower = x.max(), x.min()
    return upper - lower

df.groupby('Team').agg({'Points': range_func})

Unnamed: 0_level_0,Points
Team,Unnamed: 1_level_1
A,10
B,10


In [29]:
# Filter on groupby: keep the rows of every team whose mean Points is above 15.
# NOTE(review): the recorded output shows a Points_mean column, which is only
# created by the transform cell that appears below (In [26]); this cell (In [29])
# was executed after it, so a fresh top-to-bottom run will not show that column.
print(df.groupby('Team').filter(lambda x: x['Points'].mean() > 15))

  Team Position  Points  Points_mean
2    B  Forward      30         35.0
3    B    Guard      40         35.0


In [26]:
# broadcasting on group by
df['Points_mean'] = df.groupby('Team')['Points'].transform('mean')

In [27]:
df

Unnamed: 0,Team,Position,Points,Points_mean
0,A,Forward,10,15.0
1,A,Guard,20,15.0
2,B,Forward,30,35.0
3,B,Guard,40,35.0


# Case Study
## Groupby and aggregate Methods Exercise

In [30]:
import pandas as pd
# Sample Power Grid Utility data
data = {
    'Region': ['North', 'North', 'East', 'East', 'East', 'South', 'South', 'West', 'West', 'West'],
    'Substation': ['N1', 'N2', 'E1', 'E2', 'E3', 'S1', 'S2', 'W1', 'W2', 'W3'],
    'Energy_Consumed_MWh': [1200, 1400, 1100, 1000, 1050, 950, 970, 1800, 1750, 1690],
    'Energy_Supplied_MWh': [1250, 1450, 1150, 1080, 1100, 1000, 990, 1850, 1800, 1700],
    'Outage_Hours': [2, 3, 1, 0, 1, 2, 3, 4, 3, 2],
    'Billing_Cycle': ['2023-Q1', '2023-Q1', '2023-Q1', '2023-Q1', '2023-Q2', '2023-Q2', '2023-Q2', '2023-Q1', '2023-Q2', '2023-Q2']
}
df = pd.DataFrame(data)

In [31]:
df

Unnamed: 0,Region,Substation,Energy_Consumed_MWh,Energy_Supplied_MWh,Outage_Hours,Billing_Cycle
0,North,N1,1200,1250,2,2023-Q1
1,North,N2,1400,1450,3,2023-Q1
2,East,E1,1100,1150,1,2023-Q1
3,East,E2,1000,1080,0,2023-Q1
4,East,E3,1050,1100,1,2023-Q2
5,South,S1,950,1000,2,2023-Q2
6,South,S2,970,990,3,2023-Q2
7,West,W1,1800,1850,4,2023-Q1
8,West,W2,1750,1800,3,2023-Q2
9,West,W3,1690,1700,2,2023-Q2


In [34]:
df.isnull().sum()

Region                 0
Substation             0
Energy_Consumed_MWh    0
Energy_Supplied_MWh    0
Outage_Hours           0
Billing_Cycle          0
dtype: int64

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Region               10 non-null     object
 1   Substation           10 non-null     object
 2   Energy_Consumed_MWh  10 non-null     int64 
 3   Energy_Supplied_MWh  10 non-null     int64 
 4   Outage_Hours         10 non-null     int64 
 5   Billing_Cycle        10 non-null     object
dtypes: int64(3), object(3)
memory usage: 612.0+ bytes


## 1. Total Energy Consumption per Region

In [37]:
# Group by Region and calculate total Energy_Consumed_MWh.
df.groupby('Region')['Energy_Consumed_MWh'].sum()

Region
East     3150
North    2600
South    1920
West     5240
Name: Energy_Consumed_MWh, dtype: int64

## 2. Max and Min Outage per Region

In [38]:
# Group by Region and get the maximum and minimum Outage_Hours.
df.groupby('Region')['Outage_Hours'].agg(['max', 'min'])

Unnamed: 0_level_0,max,min
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
East,1,0
North,3,2
South,3,2
West,4,2


## 3. Average Loss % per Region

In [39]:
def compute_loss_percent(group):
    """Return the energy loss of ``group`` as a percentage of the energy supplied.

    The loss is the group's total ``Energy_Supplied_MWh`` minus its total
    ``Energy_Consumed_MWh``. Totals are taken before dividing, so the result is
    the group's aggregate loss ratio, not the mean of per-row loss percentages.

    Parameters
    ----------
    group : DataFrame-like
        Rows exposing ``Energy_Supplied_MWh`` and ``Energy_Consumed_MWh`` columns.

    Returns
    -------
    float
        ``(supplied - consumed) / supplied * 100``, or NaN when the supplied
        total is zero (empty group or all-zero supply), where the ratio is
        undefined.
    """
    total_supplied = group['Energy_Supplied_MWh'].sum()
    if total_supplied == 0:
        # Avoid dividing by zero (inf/NaN plus a RuntimeWarning) for empty or all-zero groups.
        return float('nan')
    total_loss = total_supplied - group['Energy_Consumed_MWh'].sum()
    return (total_loss / total_supplied) * 100


In [40]:
# Loss percentage per region. The two energy columns are selected explicitly so that
# apply() is not handed the grouping column 'Region' (pandas >= 2.2 emits a
# DeprecationWarning for that); compute_loss_percent only reads these two columns.
(
    df.groupby('Region')[['Energy_Supplied_MWh', 'Energy_Consumed_MWh']]
    .apply(compute_loss_percent)
    .round(2)
    .rename("Loss_Percent (%)")
)

  df.groupby('Region').apply(compute_loss_percent).round(2).rename("Loss_Percent (%)")


Region
East     5.41
North    3.70
South    3.52
West     2.06
Name: Loss_Percent (%), dtype: float64

## 4. Normalize Energy Consumption within Region

In [41]:
# Z-score of each row's consumption relative to its own region:
# subtract the region mean, then divide by the region standard deviation.
# NOTE(review): a region with a single row (or identical values) would presumably
# yield NaN here — confirm that is acceptable for downstream use.
df['Normalized_Consumption'] = df.groupby('Region')['Energy_Consumed_MWh'].transform(lambda x: (x - x.mean()) / x.std())

In [42]:
df

Unnamed: 0,Region,Substation,Energy_Consumed_MWh,Energy_Supplied_MWh,Outage_Hours,Billing_Cycle,Normalized_Consumption
0,North,N1,1200,1250,2,2023-Q1,-0.707107
1,North,N2,1400,1450,3,2023-Q1,0.707107
2,East,E1,1100,1150,1,2023-Q1,1.0
3,East,E2,1000,1080,0,2023-Q1,-1.0
4,East,E3,1050,1100,1,2023-Q2,0.0
5,South,S1,950,1000,2,2023-Q2,-0.707107
6,South,S2,970,990,3,2023-Q2,0.707107
7,West,W1,1800,1850,4,2023-Q1,0.968364
8,West,W2,1750,1800,3,2023-Q2,0.060523
9,West,W3,1690,1700,2,2023-Q2,-1.028887


## 5. Regions With Average Outage > 2

In [44]:
# Mean Outage_Hours for every region
avg_outages = df['Outage_Hours'].groupby(df['Region']).mean()
# Names of the regions whose mean outage exceeds 2, collected into a plain list
regions_with_high_outages = avg_outages.loc[avg_outages.gt(2)].index.tolist()
# Keep only the rows that belong to one of those regions
df.loc[df['Region'].isin(regions_with_high_outages)]

Unnamed: 0,Region,Substation,Energy_Consumed_MWh,Energy_Supplied_MWh,Outage_Hours,Billing_Cycle,Normalized_Consumption
0,North,N1,1200,1250,2,2023-Q1,-0.707107
1,North,N2,1400,1450,3,2023-Q1,0.707107
5,South,S1,950,1000,2,2023-Q2,-0.707107
6,South,S2,970,990,3,2023-Q2,0.707107
7,West,W1,1800,1850,4,2023-Q1,0.968364
8,West,W2,1750,1800,3,2023-Q2,0.060523
9,West,W3,1690,1700,2,2023-Q2,-1.028887


## 6. Most Efficient Substation

In [45]:
# Absolute loss per row: energy supplied minus energy consumed
df['Loss'] = df['Energy_Supplied_MWh'].sub(df['Energy_Consumed_MWh'])
# Show every row that attains the smallest loss (ties are all kept)
df.loc[df['Loss'].eq(df['Loss'].min())]

Unnamed: 0,Region,Substation,Energy_Consumed_MWh,Energy_Supplied_MWh,Outage_Hours,Billing_Cycle,Normalized_Consumption,Loss
9,West,W3,1690,1700,2,2023-Q2,-1.028887,10


## 7. Total Supplied Energy per Billing Cycle

In [46]:
df.groupby('Billing_Cycle')['Energy_Supplied_MWh'].sum()

Billing_Cycle
2023-Q1    6780
2023-Q2    6590
Name: Energy_Supplied_MWh, dtype: int64

## 8. Multiple Aggregations per Region

In [48]:
df.groupby('Region').agg({
    'Energy_Consumed_MWh': 'mean',
    'Energy_Supplied_MWh': 'max',
    'Outage_Hours': 'sum'
})

Unnamed: 0_level_0,Energy_Consumed_MWh,Energy_Supplied_MWh,Outage_Hours
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
East,1050.0,1150,2
North,1300.0,1450,5
South,960.0,1000,5
West,1746.666667,1850,9


## 9. Outage Severity Flag + Count

In [52]:
# Flag rows whose Outage_Hours is 3 or more
df['Severe_Outage'] = df['Outage_Hours'].ge(3)
# Summing the boolean flag counts the flagged rows in each region
df.groupby('Region')['Severe_Outage'].sum()

Region
East     0
North    1
South    1
West     2
Name: Severe_Outage, dtype: int64

In [53]:
df

Unnamed: 0,Region,Substation,Energy_Consumed_MWh,Energy_Supplied_MWh,Outage_Hours,Billing_Cycle,Normalized_Consumption,Loss,Severe_Outage
0,North,N1,1200,1250,2,2023-Q1,-0.707107,50,False
1,North,N2,1400,1450,3,2023-Q1,0.707107,50,True
2,East,E1,1100,1150,1,2023-Q1,1.0,50,False
3,East,E2,1000,1080,0,2023-Q1,-1.0,80,False
4,East,E3,1050,1100,1,2023-Q2,0.0,50,False
5,South,S1,950,1000,2,2023-Q2,-0.707107,50,False
6,South,S2,970,990,3,2023-Q2,0.707107,20,True
7,West,W1,1800,1850,4,2023-Q1,0.968364,50,True
8,West,W2,1750,1800,3,2023-Q2,0.060523,50,True
9,West,W3,1690,1700,2,2023-Q2,-1.028887,10,False


## 10. Pivot Table of Average Consumption

In [54]:
# Mean Energy_Consumed_MWh for each Region (rows) and Billing_Cycle (columns);
# Region/cycle combinations with no rows show up as NaN.
pivot_df = df.pivot_table(
    values='Energy_Consumed_MWh',
    index='Region',
    columns='Billing_Cycle',
    aggfunc='mean',
)

pivot_df

Billing_Cycle,2023-Q1,2023-Q2
Region,Unnamed: 1_level_1,Unnamed: 2_level_1
East,1050.0,1050.0
North,1300.0,
South,,960.0
West,1800.0,1720.0


# Rolling Mean

In [55]:
# calculate Rolling mean 
df = pd.DataFrame({'value': [10, 20, 30, 40, 50]})

In [56]:
df

Unnamed: 0,value
0,10
1,20
2,30
3,40
4,50


In [57]:
df['rolling_mean'] = df['value'].rolling(window=3).mean()

In [58]:
df

Unnamed: 0,value,rolling_mean
0,10,
1,20,
2,30,20.0
3,40,30.0
4,50,40.0


In [59]:
#Rolling with min_periods
df['rolling_mean_min1'] = df['value'].rolling(window=3, min_periods=1).mean()
# Useful when you want early values instead of NaN.

In [60]:
df

Unnamed: 0,value,rolling_mean,rolling_mean_min1
0,10,,10.0
1,20,,15.0
2,30,20.0,20.0
3,40,30.0,30.0
4,50,40.0,40.0


In [61]:
#Centered Rolling Window
df['centered_mean'] = df['value'].rolling(window=3, center=True).mean()
# Labels the result at the center of the window.


In [62]:
df

Unnamed: 0,value,rolling_mean,rolling_mean_min1,centered_mean
0,10,,10.0,
1,20,,15.0,20.0
2,30,20.0,20.0,30.0
3,40,30.0,30.0,40.0
4,50,40.0,40.0,


In [63]:
# time based rolling window 
df = pd.DataFrame({
    'date': pd.date_range(start='2023-01-01', periods=5, freq='D'),
    'value': [10, 20, 30, 40, 50]
})

In [64]:
df

Unnamed: 0,date,value
0,2023-01-01,10
1,2023-01-02,20
2,2023-01-03,30
3,2023-01-04,40
4,2023-01-05,50


In [65]:
# An offset window such as '2D' needs a datetime-like index, so move 'date' into the index.
# Rebinding df (instead of inplace=True) and skipping the step when 'date' is already
# the index lets this cell be re-run without raising KeyError.
if df.index.name != 'date':
    df = df.set_index('date')
# Mean of 'value' over the trailing 2-day window ending at each timestamp
df['time_roll'] = df['value'].rolling('2D').mean()

In [66]:
df

Unnamed: 0_level_0,value,time_roll
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01,10,10.0
2023-01-02,20,15.0
2023-01-03,30,25.0
2023-01-04,40,35.0
2023-01-05,50,45.0


# Expanding Mean

In [67]:
df = pd.DataFrame({'value': [10, 20, 30, 40, 50]})
df['expanding_mean'] = df['value'].expanding().mean()

In [68]:
df

Unnamed: 0,value,expanding_mean
0,10,10.0
1,20,15.0
2,30,20.0
3,40,25.0
4,50,30.0


In [69]:
#Expanding Sum with min_periods
df['expanding_sum_min3'] = df['value'].expanding(min_periods=3).sum()
# Returns NaN until at least 3 values are available.

In [70]:
df

Unnamed: 0,value,expanding_mean,expanding_sum_min3
0,10,10.0,
1,20,15.0,
2,30,20.0,60.0
3,40,25.0,100.0
4,50,30.0,150.0


In [71]:
df = pd.DataFrame({
    'A': [1, 2, 3],
    'B': [4, 5, 6]
})

In [72]:
df.expanding().sum()

Unnamed: 0,A,B
0,1.0,4.0
1,3.0,9.0
2,6.0,15.0
