In [1]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'name': ['Alice', 'Bob', 'Charlie'],
    'score': [85, 92, 78],
    'income': [50000, 60000, 55000]
})

In [2]:
df['normalised'] = df.apply(lambda row: row['score'] + row['income'], axis = 1).transform(lambda x: x/x.sum())
df

Unnamed: 0,name,score,income,normalised
0,Alice,85,50000,0.303077
1,Bob,92,60000,0.363632
2,Charlie,78,55000,0.333291


In [3]:
import pandas as pd
import numpy as np

np.random.seed(42)
n = 100

df_util = pd.DataFrame({
    'CustomerID': np.random.choice(range(1001, 1051), size=n),
    'Region': np.random.choice(['North', 'South', 'East', 'West'], size=n),
    'Tariff': np.random.choice(['Residential', 'Commercial', 'Industrial'], size=n),
    'Usage_kWh': np.random.gamma(2.5, 1.5, size=n).round(2),
    'ReadingDate': pd.date_range('2024-01-01', periods=n, freq='D')
})

# Add billing rate and compute bill
rate_map = {'Residential': 5.0, 'Commercial': 6.5, 'Industrial': 7.5}
df_util['Rate'] = df_util['Tariff'].map(rate_map)
df_util['BillAmount'] = (df_util['Usage_kWh'] * df_util['Rate']).round(2)

# 1. .map() Exercises (Series-level)

## 1. Flag usage level

In [4]:
df_util['flag'] = df_util['Usage_kWh'].map(lambda x: 'High' if x > 5 else 'Low')

## 2. Map region to zone code

In [5]:
df_util['zone'] = df_util['Region'].map({'North': 'Z1', 'South': 'Z2', 'East': 'Z3', 'West': 'Z4'})

## 3. Map tariff to billing tier

In [6]:
df_util['billing_tier'] = df_util['Tariff'].map({'Residential': 'B1', 'Commercial': 'B2', 'Industrial': 'B3'})

## 4. Map usage to rounded buckets

In [7]:
df_util['Usage_kWh'] = df_util['Usage_kWh'].map(lambda x: round(x))

## 5. Map boolean flag for weekend readings

In [8]:
df_util['Weekdays'] = df_util['ReadingDate'].dt.day_name()
df_util['Weekend_flag'] = df_util['Weekdays'].map(lambda x: 'Weekend' if (x == 'Saturday' or x == 'Sunday') else 'Weekday')

In [9]:
df_util

Unnamed: 0,CustomerID,Region,Tariff,Usage_kWh,ReadingDate,Rate,BillAmount,flag,zone,billing_tier,Weekdays,Weekend_flag
0,1039,North,Commercial,4,2024-01-01,6.5,25.09,Low,Z1,B2,Monday,Weekday
1,1029,West,Industrial,5,2024-01-02,7.5,39.75,High,Z4,B3,Tuesday,Weekday
2,1015,North,Residential,5,2024-01-03,5.0,22.70,Low,Z1,B1,Wednesday,Weekday
3,1043,West,Industrial,4,2024-01-04,7.5,29.62,Low,Z4,B3,Thursday,Weekday
4,1008,West,Commercial,2,2024-01-05,6.5,12.80,Low,Z4,B2,Friday,Weekday
...,...,...,...,...,...,...,...,...,...,...,...,...
95,1015,West,Commercial,5,2024-04-05,6.5,35.30,High,Z4,B2,Friday,Weekday
96,1045,West,Residential,5,2024-04-06,5.0,27.00,High,Z4,B1,Saturday,Weekend
97,1001,South,Residential,7,2024-04-07,5.0,35.35,High,Z2,B1,Sunday,Weekend
98,1025,South,Commercial,3,2024-04-08,6.5,21.45,Low,Z2,B2,Monday,Weekday


# 2. .apply() Exercises (Row-wise or Column-wise)

## 1. Create summary string per row

In [10]:
df_util['summary'] = df_util.apply(lambda row: str(f'Date: {row['ReadingDate']} Amount: {row['BillAmount']}'), axis = 1)

In [11]:
df_util

Unnamed: 0,CustomerID,Region,Tariff,Usage_kWh,ReadingDate,Rate,BillAmount,flag,zone,billing_tier,Weekdays,Weekend_flag,summary
0,1039,North,Commercial,4,2024-01-01,6.5,25.09,Low,Z1,B2,Monday,Weekday,Date: 2024-01-01 00:00:00 Amount: 25.09
1,1029,West,Industrial,5,2024-01-02,7.5,39.75,High,Z4,B3,Tuesday,Weekday,Date: 2024-01-02 00:00:00 Amount: 39.75
2,1015,North,Residential,5,2024-01-03,5.0,22.70,Low,Z1,B1,Wednesday,Weekday,Date: 2024-01-03 00:00:00 Amount: 22.7
3,1043,West,Industrial,4,2024-01-04,7.5,29.62,Low,Z4,B3,Thursday,Weekday,Date: 2024-01-04 00:00:00 Amount: 29.62
4,1008,West,Commercial,2,2024-01-05,6.5,12.80,Low,Z4,B2,Friday,Weekday,Date: 2024-01-05 00:00:00 Amount: 12.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1015,West,Commercial,5,2024-04-05,6.5,35.30,High,Z4,B2,Friday,Weekday,Date: 2024-04-05 00:00:00 Amount: 35.3
96,1045,West,Residential,5,2024-04-06,5.0,27.00,High,Z4,B1,Saturday,Weekend,Date: 2024-04-06 00:00:00 Amount: 27.0
97,1001,South,Residential,7,2024-04-07,5.0,35.35,High,Z2,B1,Sunday,Weekend,Date: 2024-04-07 00:00:00 Amount: 35.35
98,1025,South,Commercial,3,2024-04-08,6.5,21.45,Low,Z2,B2,Monday,Weekday,Date: 2024-04-08 00:00:00 Amount: 21.45


## 2. Apply conditional billing category

In [12]:
df_util['BillingCategory'] = df_util['BillAmount'].apply(lambda x: 'High Amount' if x>35 else 'Low Amount')

## 3. Apply function to multiple columns

In [13]:
df_util['TotalBill'] = df_util.apply(lambda x: x['Rate'] * x['Usage_kWh'], axis=1)

## 4. Apply lambda to column-wise stats

In [14]:
df_util['Running_usage'] = df_util['Usage_kWh'].apply(lambda x: x).mean()

In [15]:
df_util

Unnamed: 0,CustomerID,Region,Tariff,Usage_kWh,ReadingDate,Rate,BillAmount,flag,zone,billing_tier,Weekdays,Weekend_flag,summary,BillingCategory,TotalBill,Running_usage
0,1039,North,Commercial,4,2024-01-01,6.5,25.09,Low,Z1,B2,Monday,Weekday,Date: 2024-01-01 00:00:00 Amount: 25.09,Low Amount,26.0,3.84
1,1029,West,Industrial,5,2024-01-02,7.5,39.75,High,Z4,B3,Tuesday,Weekday,Date: 2024-01-02 00:00:00 Amount: 39.75,High Amount,37.5,3.84
2,1015,North,Residential,5,2024-01-03,5.0,22.70,Low,Z1,B1,Wednesday,Weekday,Date: 2024-01-03 00:00:00 Amount: 22.7,Low Amount,25.0,3.84
3,1043,West,Industrial,4,2024-01-04,7.5,29.62,Low,Z4,B3,Thursday,Weekday,Date: 2024-01-04 00:00:00 Amount: 29.62,Low Amount,30.0,3.84
4,1008,West,Commercial,2,2024-01-05,6.5,12.80,Low,Z4,B2,Friday,Weekday,Date: 2024-01-05 00:00:00 Amount: 12.8,Low Amount,13.0,3.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1015,West,Commercial,5,2024-04-05,6.5,35.30,High,Z4,B2,Friday,Weekday,Date: 2024-04-05 00:00:00 Amount: 35.3,High Amount,32.5,3.84
96,1045,West,Residential,5,2024-04-06,5.0,27.00,High,Z4,B1,Saturday,Weekend,Date: 2024-04-06 00:00:00 Amount: 27.0,Low Amount,25.0,3.84
97,1001,South,Residential,7,2024-04-07,5.0,35.35,High,Z2,B1,Sunday,Weekend,Date: 2024-04-07 00:00:00 Amount: 35.35,High Amount,35.0,3.84
98,1025,South,Commercial,3,2024-04-08,6.5,21.45,Low,Z2,B2,Monday,Weekday,Date: 2024-04-08 00:00:00 Amount: 21.45,Low Amount,19.5,3.84


## 5. Apply conditional logic to flag anomalies

In [16]:
df_util['anamolyFlag'] = df_util['Usage_kWh'].apply(lambda x: 'Very Low Usage' if x < 3 else ('Appropriate Usage' if x>=3 and x<=7 else 'Very High Usage'))

# 3. .transform() Exercises (Group-wise)

## 1. Normalize usage per region

In [17]:
df_util['Normalize'] = df_util.groupby('Region')['Usage_kWh'].transform(lambda x: (x - x.min()) / (x.max() - x.min()))

## 2. Rank usage within tariff

In [18]:
df_util['rank'] = df_util.groupby('Tariff')['Usage_kWh'].transform(lambda x: x.rank(method='dense'))

## 3. Flag top 2 usage per region

In [19]:
df_util.groupby('Region')['Usage_kWh'].transform(lambda x: x>=x.nlargest(2).min())

0     False
1     False
2     False
3     False
4     False
      ...  
95    False
96    False
97     True
98    False
99    False
Name: Usage_kWh, Length: 100, dtype: bool

## 4. Rolling mean of usage per customer

In [20]:
df_util['Rolling_mean'] = df_util.groupby('CustomerID')['Usage_kWh'].transform(lambda x: x.rolling(window=2,min_periods = 1).mean())

In [21]:
df_util

Unnamed: 0,CustomerID,Region,Tariff,Usage_kWh,ReadingDate,Rate,BillAmount,flag,zone,billing_tier,Weekdays,Weekend_flag,summary,BillingCategory,TotalBill,Running_usage,anamolyFlag,Normalize,rank,Rolling_mean
0,1039,North,Commercial,4,2024-01-01,6.5,25.09,Low,Z1,B2,Monday,Weekday,Date: 2024-01-01 00:00:00 Amount: 25.09,Low Amount,26.0,3.84,Appropriate Usage,0.272727,4.0,4.0
1,1029,West,Industrial,5,2024-01-02,7.5,39.75,High,Z4,B3,Tuesday,Weekday,Date: 2024-01-02 00:00:00 Amount: 39.75,High Amount,37.5,3.84,Appropriate Usage,0.571429,6.0,5.0
2,1015,North,Residential,5,2024-01-03,5.0,22.70,Low,Z1,B1,Wednesday,Weekday,Date: 2024-01-03 00:00:00 Amount: 22.7,Low Amount,25.0,3.84,Appropriate Usage,0.363636,5.0,5.0
3,1043,West,Industrial,4,2024-01-04,7.5,29.62,Low,Z4,B3,Thursday,Weekday,Date: 2024-01-04 00:00:00 Amount: 29.62,Low Amount,30.0,3.84,Appropriate Usage,0.428571,5.0,4.0
4,1008,West,Commercial,2,2024-01-05,6.5,12.80,Low,Z4,B2,Friday,Weekday,Date: 2024-01-05 00:00:00 Amount: 12.8,Low Amount,13.0,3.84,Very Low Usage,0.142857,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1015,West,Commercial,5,2024-04-05,6.5,35.30,High,Z4,B2,Friday,Weekday,Date: 2024-04-05 00:00:00 Amount: 35.3,High Amount,32.5,3.84,Appropriate Usage,0.571429,5.0,6.5
96,1045,West,Residential,5,2024-04-06,5.0,27.00,High,Z4,B1,Saturday,Weekend,Date: 2024-04-06 00:00:00 Amount: 27.0,Low Amount,25.0,3.84,Appropriate Usage,0.571429,5.0,4.5
97,1001,South,Residential,7,2024-04-07,5.0,35.35,High,Z2,B1,Sunday,Weekend,Date: 2024-04-07 00:00:00 Amount: 35.35,High Amount,35.0,3.84,Appropriate Usage,1.000000,7.0,7.0
98,1025,South,Commercial,3,2024-04-08,6.5,21.45,Low,Z2,B2,Monday,Weekday,Date: 2024-04-08 00:00:00 Amount: 21.45,Low Amount,19.5,3.84,Appropriate Usage,0.428571,3.0,3.0


## 5. Bill-to-usage ratio per tariff

In [22]:
# df_util['Bill_to_Usage_Ratio'] = df_util['BillAmount'] / df_util['Usage_kWh']
df_util['Bill_to_Usage_Ratio'] = df_util.groupby('Tariff').apply(lambda x: x['BillAmount']).reset_index(drop=True)

  df_util['Bill_to_Usage_Ratio'] = df_util.groupby('Tariff').apply(lambda x: x['BillAmount']).reset_index(drop=True)


In [23]:
df_util

Unnamed: 0,CustomerID,Region,Tariff,Usage_kWh,ReadingDate,Rate,BillAmount,flag,zone,billing_tier,...,Weekend_flag,summary,BillingCategory,TotalBill,Running_usage,anamolyFlag,Normalize,rank,Rolling_mean,Bill_to_Usage_Ratio
0,1039,North,Commercial,4,2024-01-01,6.5,25.09,Low,Z1,B2,...,Weekday,Date: 2024-01-01 00:00:00 Amount: 25.09,Low Amount,26.0,3.84,Appropriate Usage,0.272727,4.0,4.0,25.09
1,1029,West,Industrial,5,2024-01-02,7.5,39.75,High,Z4,B3,...,Weekday,Date: 2024-01-02 00:00:00 Amount: 39.75,High Amount,37.5,3.84,Appropriate Usage,0.571429,6.0,5.0,12.80
2,1015,North,Residential,5,2024-01-03,5.0,22.70,Low,Z1,B1,...,Weekday,Date: 2024-01-03 00:00:00 Amount: 22.7,Low Amount,25.0,3.84,Appropriate Usage,0.363636,5.0,5.0,28.66
3,1043,West,Industrial,4,2024-01-04,7.5,29.62,Low,Z4,B3,...,Weekday,Date: 2024-01-04 00:00:00 Amount: 29.62,Low Amount,30.0,3.84,Appropriate Usage,0.428571,5.0,4.0,12.02
4,1008,West,Commercial,2,2024-01-05,6.5,12.80,Low,Z4,B2,...,Weekday,Date: 2024-01-05 00:00:00 Amount: 12.8,Low Amount,13.0,3.84,Very Low Usage,0.142857,2.0,2.0,21.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1015,West,Commercial,5,2024-04-05,6.5,35.30,High,Z4,B2,...,Weekday,Date: 2024-04-05 00:00:00 Amount: 35.3,High Amount,32.5,3.84,Appropriate Usage,0.571429,5.0,6.5,27.55
96,1045,West,Residential,5,2024-04-06,5.0,27.00,High,Z4,B1,...,Weekend,Date: 2024-04-06 00:00:00 Amount: 27.0,Low Amount,25.0,3.84,Appropriate Usage,0.571429,5.0,4.5,28.35
97,1001,South,Residential,7,2024-04-07,5.0,35.35,High,Z2,B1,...,Weekend,Date: 2024-04-07 00:00:00 Amount: 35.35,High Amount,35.0,3.84,Appropriate Usage,1.000000,7.0,7.0,27.00
98,1025,South,Commercial,3,2024-04-08,6.5,21.45,Low,Z2,B2,...,Weekday,Date: 2024-04-08 00:00:00 Amount: 21.45,Low Amount,19.5,3.84,Appropriate Usage,0.428571,3.0,3.0,35.35


# 4. reduce() Exercises (Aggregation)

## 1. Total usage across all customers

In [24]:
from functools import reduce

df_util['TotalUsage'] = reduce(lambda x,y: x+y, df_util['Usage_kWh'])

In [25]:
df_util

Unnamed: 0,CustomerID,Region,Tariff,Usage_kWh,ReadingDate,Rate,BillAmount,flag,zone,billing_tier,...,summary,BillingCategory,TotalBill,Running_usage,anamolyFlag,Normalize,rank,Rolling_mean,Bill_to_Usage_Ratio,TotalUsage
0,1039,North,Commercial,4,2024-01-01,6.5,25.09,Low,Z1,B2,...,Date: 2024-01-01 00:00:00 Amount: 25.09,Low Amount,26.0,3.84,Appropriate Usage,0.272727,4.0,4.0,25.09,384
1,1029,West,Industrial,5,2024-01-02,7.5,39.75,High,Z4,B3,...,Date: 2024-01-02 00:00:00 Amount: 39.75,High Amount,37.5,3.84,Appropriate Usage,0.571429,6.0,5.0,12.80,384
2,1015,North,Residential,5,2024-01-03,5.0,22.70,Low,Z1,B1,...,Date: 2024-01-03 00:00:00 Amount: 22.7,Low Amount,25.0,3.84,Appropriate Usage,0.363636,5.0,5.0,28.66,384
3,1043,West,Industrial,4,2024-01-04,7.5,29.62,Low,Z4,B3,...,Date: 2024-01-04 00:00:00 Amount: 29.62,Low Amount,30.0,3.84,Appropriate Usage,0.428571,5.0,4.0,12.02,384
4,1008,West,Commercial,2,2024-01-05,6.5,12.80,Low,Z4,B2,...,Date: 2024-01-05 00:00:00 Amount: 12.8,Low Amount,13.0,3.84,Very Low Usage,0.142857,2.0,2.0,21.32,384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1015,West,Commercial,5,2024-04-05,6.5,35.30,High,Z4,B2,...,Date: 2024-04-05 00:00:00 Amount: 35.3,High Amount,32.5,3.84,Appropriate Usage,0.571429,5.0,6.5,27.55,384
96,1045,West,Residential,5,2024-04-06,5.0,27.00,High,Z4,B1,...,Date: 2024-04-06 00:00:00 Amount: 27.0,Low Amount,25.0,3.84,Appropriate Usage,0.571429,5.0,4.5,28.35,384
97,1001,South,Residential,7,2024-04-07,5.0,35.35,High,Z2,B1,...,Date: 2024-04-07 00:00:00 Amount: 35.35,High Amount,35.0,3.84,Appropriate Usage,1.000000,7.0,7.0,27.00,384
98,1025,South,Commercial,3,2024-04-08,6.5,21.45,Low,Z2,B2,...,Date: 2024-04-08 00:00:00 Amount: 21.45,Low Amount,19.5,3.84,Appropriate Usage,0.428571,3.0,3.0,35.35,384


## 2. Concatenate all unique regions

In [34]:
unique_values = reduce(lambda x,y: x|{y}, df_util['Region'],set())
unique_values = list(unique_values)
unique_values

['West', 'South', 'North', 'East']

## 3. Reduce list of bill amounts to max difference

In [45]:
df_util['Maximum'] = reduce(lambda x,y: x.max() - y.min(), df_util['BillAmount'])

AttributeError: 'float' object has no attribute 'max'

df_util

## 4. Reduce usage to average manually

In [47]:
df_util['Average_usage'] = reduce(lambda x,y: (x + y), df_util['Usage_kWh']) / len(df_util['Usage_kWh'])

In [48]:
df_util

Unnamed: 0,CustomerID,Region,Tariff,Usage_kWh,ReadingDate,Rate,BillAmount,flag,zone,billing_tier,...,TotalBill,Running_usage,anamolyFlag,Normalize,rank,Rolling_mean,Bill_to_Usage_Ratio,TotalUsage,Maximum,Average_usage
0,1039,North,Commercial,4,2024-01-01,6.5,25.09,Low,Z1,B2,...,26.0,3.84,Appropriate Usage,0.272727,4.0,4.0,25.09,384,10.48,3.84
1,1029,West,Industrial,5,2024-01-02,7.5,39.75,High,Z4,B3,...,37.5,3.84,Appropriate Usage,0.571429,6.0,5.0,12.80,384,10.48,3.84
2,1015,North,Residential,5,2024-01-03,5.0,22.70,Low,Z1,B1,...,25.0,3.84,Appropriate Usage,0.363636,5.0,5.0,28.66,384,10.48,3.84
3,1043,West,Industrial,4,2024-01-04,7.5,29.62,Low,Z4,B3,...,30.0,3.84,Appropriate Usage,0.428571,5.0,4.0,12.02,384,10.48,3.84
4,1008,West,Commercial,2,2024-01-05,6.5,12.80,Low,Z4,B2,...,13.0,3.84,Very Low Usage,0.142857,2.0,2.0,21.32,384,10.48,3.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1015,West,Commercial,5,2024-04-05,6.5,35.30,High,Z4,B2,...,32.5,3.84,Appropriate Usage,0.571429,5.0,6.5,27.55,384,10.48,3.84
96,1045,West,Residential,5,2024-04-06,5.0,27.00,High,Z4,B1,...,25.0,3.84,Appropriate Usage,0.571429,5.0,4.5,28.35,384,10.48,3.84
97,1001,South,Residential,7,2024-04-07,5.0,35.35,High,Z2,B1,...,35.0,3.84,Appropriate Usage,1.000000,7.0,7.0,27.00,384,10.48,3.84
98,1025,South,Commercial,3,2024-04-08,6.5,21.45,Low,Z2,B2,...,19.5,3.84,Appropriate Usage,0.428571,3.0,3.0,35.35,384,10.48,3.84


## 5. Reduce customer IDs to a string

In [55]:
df_util['CustomerID'] = reduce(lambda acc,x: acc+[str(x)], df_util['CustomerID'], [])

In [56]:
df_util

Unnamed: 0,CustomerID,Region,Tariff,Usage_kWh,ReadingDate,Rate,BillAmount,flag,zone,billing_tier,...,TotalBill,Running_usage,anamolyFlag,Normalize,rank,Rolling_mean,Bill_to_Usage_Ratio,TotalUsage,Maximum,Average_usage
0,1039,North,Commercial,4,2024-01-01,6.5,25.09,Low,Z1,B2,...,26.0,3.84,Appropriate Usage,0.272727,4.0,4.0,25.09,384,10.48,3.84
1,1029,West,Industrial,5,2024-01-02,7.5,39.75,High,Z4,B3,...,37.5,3.84,Appropriate Usage,0.571429,6.0,5.0,12.80,384,10.48,3.84
2,1015,North,Residential,5,2024-01-03,5.0,22.70,Low,Z1,B1,...,25.0,3.84,Appropriate Usage,0.363636,5.0,5.0,28.66,384,10.48,3.84
3,1043,West,Industrial,4,2024-01-04,7.5,29.62,Low,Z4,B3,...,30.0,3.84,Appropriate Usage,0.428571,5.0,4.0,12.02,384,10.48,3.84
4,1008,West,Commercial,2,2024-01-05,6.5,12.80,Low,Z4,B2,...,13.0,3.84,Very Low Usage,0.142857,2.0,2.0,21.32,384,10.48,3.84
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1015,West,Commercial,5,2024-04-05,6.5,35.30,High,Z4,B2,...,32.5,3.84,Appropriate Usage,0.571429,5.0,6.5,27.55,384,10.48,3.84
96,1045,West,Residential,5,2024-04-06,5.0,27.00,High,Z4,B1,...,25.0,3.84,Appropriate Usage,0.571429,5.0,4.5,28.35,384,10.48,3.84
97,1001,South,Residential,7,2024-04-07,5.0,35.35,High,Z2,B1,...,35.0,3.84,Appropriate Usage,1.000000,7.0,7.0,27.00,384,10.48,3.84
98,1025,South,Commercial,3,2024-04-08,6.5,21.45,Low,Z2,B2,...,19.5,3.84,Appropriate Usage,0.428571,3.0,3.0,35.35,384,10.48,3.84
