In [54]:
import pandas as pd
import numpy as np
import random

np.random.seed(42)

regions = ['North', 'South', 'East', 'West']
types = ['Residential', 'Commercial']
statuses = ['Paid', 'Unpaid', '', np.nan]
tariffs = ['T1', 'T2', 'T3']
remarks = ['OK', 'Check meter', 'Replaced', '', np.nan, 'Duplicate']

data = []
for i in range(200):
    date_format = random.choice(['%Y-%m-%d', '%d/%m/%Y', '%m-%d-%Y'])
    date = pd.Timestamp('2023-01-01') + pd.to_timedelta(random.randint(0, 180), unit='D')
    formatted_date = date.strftime(date_format) if random.random() > 0.05 else ''

    row = {
        'CustomerID': f"C{str(i+1).zfill(4)}",
        'MeterID': f"M{str(random.randint(1000, 9999))}",
        'Region': random.choice(regions),
        'ConnectionType': random.choice(types),
        'ReadingDate': formatted_date,
        'Energy_kWh': random.choice([round(random.uniform(100, 1200), 2), '', np.nan, 99999]),
        'BillingAmount': random.choice([round(random.uniform(500, 5000), 2), 'NA', '', np.nan]),
        'PaymentStatus': random.choice(statuses),
        'TariffCode': random.choice(tariffs),
        'Remarks': random.choice(remarks)
    }
    data.append(row)

df = pd.DataFrame(data)
df.to_csv('utility_meter_readings.csv', index=False)
print("✅ Dataset saved as 'utility_meter_readings.csv'")

✅ Dataset saved as 'utility_meter_readings.csv'


In [55]:
df = pd.read_csv('utility_meter_readings.csv')
df1 = df

In [56]:
df1

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks
0,C0001,M6587,East,Commercial,01/02/2023,753.59,,Paid,T3,Check meter
1,C0002,M9122,East,Residential,2023-03-13,99999.00,,Paid,T3,
2,C0003,M1018,North,Commercial,03/03/2023,99999.00,,Paid,T2,Replaced
3,C0004,M1615,West,Commercial,2023-01-23,,,,T2,Check meter
4,C0005,M7523,North,Residential,09/02/2023,,,,T2,OK
...,...,...,...,...,...,...,...,...,...,...
195,C0196,M7600,North,Commercial,2023-03-25,99999.00,2032.17,Paid,T3,OK
196,C0197,M2103,East,Residential,22/05/2023,99999.00,,Paid,T3,Check meter
197,C0198,M8002,South,Commercial,06-25-2023,581.91,,,T2,
198,C0199,M8029,West,Residential,2023-06-12,,1869.82,Unpaid,T1,Check meter


In [57]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   CustomerID      200 non-null    object 
 1   MeterID         200 non-null    object 
 2   Region          200 non-null    object 
 3   ConnectionType  200 non-null    object 
 4   ReadingDate     191 non-null    object 
 5   Energy_kWh      110 non-null    float64
 6   BillingAmount   51 non-null     float64
 7   PaymentStatus   99 non-null     object 
 8   TariffCode      200 non-null    object 
 9   Remarks         140 non-null    object 
dtypes: float64(2), object(8)
memory usage: 15.8+ KB


In [58]:
df1.isna().sum()

CustomerID          0
MeterID             0
Region              0
ConnectionType      0
ReadingDate         9
Energy_kWh         90
BillingAmount     149
PaymentStatus     101
TariffCode          0
Remarks            60
dtype: int64

# Category: Missing & Format Fixes

## 1.	Replace all '', 'NA', and ' ' with np.nan

In [59]:
for col in df1.columns:
  df1[col] = df1[col].astype(str).replace(to_replace=r'^\s*$|^NA$', value=np.nan, regex=True)

In [60]:
df1.isna().sum()

CustomerID        0
MeterID           0
Region            0
ConnectionType    0
ReadingDate       0
Energy_kWh        0
BillingAmount     0
PaymentStatus     0
TariffCode        0
Remarks           0
dtype: int64

## 2.	Convert ReadingDate to datetime (handle mixed formats)

In [61]:
df1['ReadingDate'] = pd.to_datetime(df1['ReadingDate'], errors='coerce', format='mixed')
df1['ReadingDate'].fillna(method='ffill', inplace=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   CustomerID      200 non-null    object        
 1   MeterID         200 non-null    object        
 2   Region          200 non-null    object        
 3   ConnectionType  200 non-null    object        
 4   ReadingDate     200 non-null    datetime64[ns]
 5   Energy_kWh      200 non-null    object        
 6   BillingAmount   200 non-null    object        
 7   PaymentStatus   200 non-null    object        
 8   TariffCode      200 non-null    object        
 9   Remarks         200 non-null    object        
dtypes: datetime64[ns](1), object(9)
memory usage: 15.8+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df1['ReadingDate'].fillna(method='ffill', inplace=True)
  df1['ReadingDate'].fillna(method='ffill', inplace=True)


## 3.	Fill missing Energy_kWh with median per Region

In [62]:
df1['Energy_kWh'] = pd.to_numeric(df1['Energy_kWh'], errors='coerce')
median_by_region = df1.groupby('Region')['Energy_kWh'].median()
for region in df1['Region'].unique():
    df1.loc[(df1['Region'] == region) & (df1['Energy_kWh'].isna()), 'Energy_kWh'] = median_by_region[region]

df1['Energy_kWh']

0        753.59
1      99999.00
2      99999.00
3      99999.00
4       1075.76
         ...   
195    99999.00
196    99999.00
197      581.91
198    99999.00
199     1038.35
Name: Energy_kWh, Length: 200, dtype: float64

## 4.	Fill missing BillingAmount with 0 and convert to float

In [63]:
df1['BillingAmount'] = pd.to_numeric(df1['BillingAmount'], errors='coerce')
df1['BillingAmount'] = df1['BillingAmount'].fillna(0)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   CustomerID      200 non-null    object        
 1   MeterID         200 non-null    object        
 2   Region          200 non-null    object        
 3   ConnectionType  200 non-null    object        
 4   ReadingDate     200 non-null    datetime64[ns]
 5   Energy_kWh      200 non-null    float64       
 6   BillingAmount   200 non-null    float64       
 7   PaymentStatus   200 non-null    object        
 8   TariffCode      200 non-null    object        
 9   Remarks         200 non-null    object        
dtypes: datetime64[ns](1), float64(2), object(7)
memory usage: 15.8+ KB


## 5. Drop rows where CustomerID or MeterID is missing

In [64]:
df1 = df1.dropna(subset=['CustomerID', 'MeterID'])
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   CustomerID      200 non-null    object        
 1   MeterID         200 non-null    object        
 2   Region          200 non-null    object        
 3   ConnectionType  200 non-null    object        
 4   ReadingDate     200 non-null    datetime64[ns]
 5   Energy_kWh      200 non-null    float64       
 6   BillingAmount   200 non-null    float64       
 7   PaymentStatus   200 non-null    object        
 8   TariffCode      200 non-null    object        
 9   Remarks         200 non-null    object        
dtypes: datetime64[ns](1), float64(2), object(7)
memory usage: 15.8+ KB


# Category: String & Categorical Cleanup

## 6. Strip whitespace and standardize PaymentStatus to title case

In [65]:
df1['PaymentStatus'] = df1['PaymentStatus'].astype('string')
df1['PaymentStatus'] = df1['PaymentStatus'].str.strip().str.title()
df1

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks
0,C0001,M6587,East,Commercial,2023-01-02,753.59,0.00,Paid,T3,Check meter
1,C0002,M9122,East,Residential,2023-03-13,99999.00,0.00,Paid,T3,
2,C0003,M1018,North,Commercial,2023-03-03,99999.00,0.00,Paid,T2,Replaced
3,C0004,M1615,West,Commercial,2023-01-23,99999.00,0.00,Nan,T2,Check meter
4,C0005,M7523,North,Residential,2023-09-02,1075.76,0.00,Nan,T2,OK
...,...,...,...,...,...,...,...,...,...,...
195,C0196,M7600,North,Commercial,2023-03-25,99999.00,2032.17,Paid,T3,OK
196,C0197,M2103,East,Residential,2023-05-22,99999.00,0.00,Paid,T3,Check meter
197,C0198,M8002,South,Commercial,2023-06-25,581.91,0.00,Nan,T2,
198,C0199,M8029,West,Residential,2023-06-12,99999.00,1869.82,Unpaid,T1,Check meter


## 7. Replace missing PaymentStatus with 'Unknown'

In [66]:
df1['PaymentStatus'] = df1['PaymentStatus'].replace('Nan', 'Unknown')
df1

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks
0,C0001,M6587,East,Commercial,2023-01-02,753.59,0.00,Paid,T3,Check meter
1,C0002,M9122,East,Residential,2023-03-13,99999.00,0.00,Paid,T3,
2,C0003,M1018,North,Commercial,2023-03-03,99999.00,0.00,Paid,T2,Replaced
3,C0004,M1615,West,Commercial,2023-01-23,99999.00,0.00,Unknown,T2,Check meter
4,C0005,M7523,North,Residential,2023-09-02,1075.76,0.00,Unknown,T2,OK
...,...,...,...,...,...,...,...,...,...,...
195,C0196,M7600,North,Commercial,2023-03-25,99999.00,2032.17,Paid,T3,OK
196,C0197,M2103,East,Residential,2023-05-22,99999.00,0.00,Paid,T3,Check meter
197,C0198,M8002,South,Commercial,2023-06-25,581.91,0.00,Unknown,T2,
198,C0199,M8029,West,Residential,2023-06-12,99999.00,1869.82,Unpaid,T1,Check meter


## 8. Normalize TariffCode to uppercase and validate against known codes

In [67]:
df1['TariffCode'] = df1['TariffCode'].str.upper()
df1['TariffCode'].isin(['T1', 'T2', 'T3'])

0      True
1      True
2      True
3      True
4      True
       ... 
195    True
196    True
197    True
198    True
199    True
Name: TariffCode, Length: 200, dtype: bool

## 9. Remove rows where Remarks contains 'Duplicate'

In [68]:
masked_data = df1[df1['Remarks'] == 'Duplicate'].index
df1.drop(masked_data, inplace=True)
df1

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks
0,C0001,M6587,East,Commercial,2023-01-02,753.59,0.00,Paid,T3,Check meter
1,C0002,M9122,East,Residential,2023-03-13,99999.00,0.00,Paid,T3,
2,C0003,M1018,North,Commercial,2023-03-03,99999.00,0.00,Paid,T2,Replaced
3,C0004,M1615,West,Commercial,2023-01-23,99999.00,0.00,Unknown,T2,Check meter
4,C0005,M7523,North,Residential,2023-09-02,1075.76,0.00,Unknown,T2,OK
...,...,...,...,...,...,...,...,...,...,...
195,C0196,M7600,North,Commercial,2023-03-25,99999.00,2032.17,Paid,T3,OK
196,C0197,M2103,East,Residential,2023-05-22,99999.00,0.00,Paid,T3,Check meter
197,C0198,M8002,South,Commercial,2023-06-25,581.91,0.00,Unknown,T2,
198,C0199,M8029,West,Residential,2023-06-12,99999.00,1869.82,Unpaid,T1,Check meter


## 10 .Standardize ConnectionType to 'Residential' or 'Commercial' only

In [69]:
df1['ConnectionType'].str.strip().str.title()
df1['ConnectionType'].where(df1['ConnectionType'].isin(['Residential','Commercial']), np.nan)

0       Commercial
1      Residential
2       Commercial
3       Commercial
4      Residential
          ...     
195     Commercial
196    Residential
197     Commercial
198    Residential
199     Commercial
Name: ConnectionType, Length: 173, dtype: object

# Category: Filtering & Searching

## 11. Filter all Unpaid customers with usage > 1000 kWh

In [70]:
df1.query("PaymentStatus == 'Unpaid' & Energy_kWh > 1000")

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks
11,C0012,M4035,South,Commercial,2023-09-05,1038.35,0.0,Unpaid,T3,OK
14,C0015,M3145,West,Residential,2023-01-28,99999.0,1397.04,Unpaid,T1,Replaced
15,C0016,M7787,East,Residential,2023-01-06,50497.985,0.0,Unpaid,T3,OK
20,C0021,M5820,West,Residential,2023-05-06,99999.0,0.0,Unpaid,T2,OK
46,C0047,M1619,South,Commercial,2023-05-02,99999.0,721.36,Unpaid,T2,
57,C0058,M3670,West,Commercial,2023-05-05,99999.0,0.0,Unpaid,T2,
61,C0062,M6072,North,Commercial,2023-03-10,1075.76,0.0,Unpaid,T3,Check meter
62,C0063,M1974,West,Commercial,2023-04-18,99999.0,1570.33,Unpaid,T3,Check meter
68,C0069,M8354,East,Residential,2023-03-14,50497.985,4232.83,Unpaid,T1,OK
72,C0073,M5046,South,Residential,2023-03-14,1128.24,0.0,Unpaid,T1,OK


## 12. Select customers from South region with TariffCode == 'T2'

In [71]:
df1.query("Region == 'South' & TariffCode == 'T2'")

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks
46,C0047,M1619,South,Commercial,2023-05-02,99999.0,721.36,Unpaid,T2,
47,C0048,M8316,South,Residential,2023-01-05,620.25,0.0,Unknown,T2,
56,C0057,M6383,South,Commercial,2023-01-08,1038.35,0.0,Paid,T2,
90,C0091,M4458,South,Commercial,2023-01-14,99999.0,0.0,Unknown,T2,Replaced
103,C0104,M1086,South,Commercial,2023-06-15,99999.0,4808.33,Paid,T2,
111,C0112,M3054,South,Residential,2023-04-04,157.33,1121.93,Unknown,T2,Replaced
113,C0114,M1730,South,Residential,2023-04-21,1182.88,4890.05,Unpaid,T2,Check meter
124,C0125,M1032,South,Commercial,2023-05-29,1010.47,0.0,Paid,T2,OK
126,C0127,M8222,South,Commercial,2023-06-21,1038.35,0.0,Unknown,T2,
127,C0128,M1414,South,Commercial,2023-03-28,1038.35,0.0,Unknown,T2,OK


## 13. Filter readings between March and May 2023

In [72]:
df1[df1['ReadingDate'].between('2023-03-01', '2023-05-31')]

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks
1,C0002,M9122,East,Residential,2023-03-13,99999.00,0.00,Paid,T3,
2,C0003,M1018,North,Commercial,2023-03-03,99999.00,0.00,Paid,T2,Replaced
9,C0010,M3965,West,Commercial,2023-03-31,99999.00,1478.62,Paid,T3,
12,C0013,M6083,South,Residential,2023-04-12,235.56,1798.40,Unknown,T3,Check meter
16,C0017,M8123,North,Residential,2023-05-01,1075.76,0.00,Paid,T1,Replaced
...,...,...,...,...,...,...,...,...,...,...
187,C0188,M6786,South,Residential,2023-04-29,496.53,0.00,Unpaid,T1,OK
193,C0194,M2387,North,Commercial,2023-05-04,409.59,0.00,Unknown,T2,OK
195,C0196,M7600,North,Commercial,2023-03-25,99999.00,2032.17,Paid,T3,OK
196,C0197,M2103,East,Residential,2023-05-22,99999.00,0.00,Paid,T3,Check meter


## 14. Find customers with BillingAmount > 4000 and Energy_kWh < 500

In [73]:
df1.query("BillingAmount > 4000 & Energy_kWh < 500")

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks


## 15. Use .query() to extract all Commercial connections with missing remarks

In [74]:
df1.query("Remarks == 'nan' & ConnectionType == 'Commercial'")

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks
9,C0010,M3965,West,Commercial,2023-03-31,99999.0,1478.62,Paid,T3,
28,C0029,M2751,South,Commercial,2023-02-07,99999.0,0.0,Unknown,T3,
44,C0045,M9466,North,Commercial,2023-03-09,1142.5,3390.49,Unknown,T2,
46,C0047,M1619,South,Commercial,2023-05-02,99999.0,721.36,Unpaid,T2,
56,C0057,M6383,South,Commercial,2023-01-08,1038.35,0.0,Paid,T2,
57,C0058,M3670,West,Commercial,2023-05-05,99999.0,0.0,Unpaid,T2,
64,C0065,M6074,North,Commercial,2023-06-30,1075.76,878.06,Unknown,T3,
65,C0066,M3386,South,Commercial,2023-04-22,1038.35,3960.23,Unknown,T3,
78,C0079,M9724,West,Commercial,2023-02-08,99999.0,0.0,Paid,T3,
84,C0085,M9449,West,Commercial,2023-06-15,99999.0,0.0,Paid,T2,


## 16. Sort by ReadingDate descending and extract top 10 recent readings

In [75]:
df1.sort_values(by=['ReadingDate'], ascending=False)

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks
121,C0122,M3948,West,Commercial,2023-10-06,99999.000,4795.69,Unknown,T1,
175,C0176,M7456,North,Commercial,2023-10-05,99999.000,0.00,Unknown,T3,Replaced
110,C0111,M7787,East,Commercial,2023-10-03,50497.985,3700.77,Unpaid,T2,OK
59,C0060,M9780,East,Commercial,2023-10-03,50497.985,0.00,Unknown,T3,Check meter
129,C0130,M6062,West,Commercial,2023-10-03,99999.000,0.00,Unpaid,T1,OK
...,...,...,...,...,...,...,...,...,...,...
47,C0048,M8316,South,Residential,2023-01-05,620.250,0.00,Unknown,T2,
144,C0145,M5546,East,Residential,2023-01-04,50497.985,0.00,Unknown,T2,
0,C0001,M6587,East,Commercial,2023-01-02,753.590,0.00,Paid,T3,Check meter
7,C0008,M8168,West,Commercial,2023-01-02,99999.000,0.00,Unknown,T1,Replaced


## 17. Sort by Region and then by Energy_kWh ascending

In [76]:
df1.sort_values(by=['Region','Energy_kWh'], ascending=[True, False])

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks
1,C0002,M9122,East,Residential,2023-03-13,99999.00,0.00,Paid,T3,
29,C0030,M3272,East,Commercial,2023-02-18,99999.00,0.00,Unknown,T3,Check meter
30,C0031,M8481,East,Commercial,2023-01-18,99999.00,0.00,Unknown,T2,Replaced
66,C0067,M8110,East,Commercial,2023-01-23,99999.00,2568.85,Unknown,T3,Replaced
99,C0100,M7124,East,Commercial,2023-06-12,99999.00,0.00,Unknown,T3,OK
...,...,...,...,...,...,...,...,...,...,...
70,C0071,M6925,West,Residential,2023-04-23,776.28,0.00,Paid,T1,
19,C0020,M4721,West,Commercial,2023-05-29,683.00,0.00,Unknown,T1,OK
176,C0177,M4212,West,Residential,2023-05-26,639.25,0.00,Unknown,T2,
109,C0110,M9456,West,Commercial,2023-04-13,495.54,0.00,Paid,T3,


## 18. Extract columns: CustomerID, ReadingDate, Energy_kWh, BillingAmount

In [79]:
df1[['CustomerID', 'ReadingDate', 'Energy_kWh', 'BillingAmount']]

Unnamed: 0,CustomerID,ReadingDate,Energy_kWh,BillingAmount
0,C0001,2023-01-02,753.59,0.00
1,C0002,2023-03-13,99999.00,0.00
2,C0003,2023-03-03,99999.00,0.00
3,C0004,2023-01-23,99999.00,0.00
4,C0005,2023-09-02,1075.76,0.00
...,...,...,...,...
195,C0196,2023-03-25,99999.00,2032.17
196,C0197,2023-05-22,99999.00,0.00
197,C0198,2023-06-25,581.91,0.00
198,C0199,2023-06-12,99999.00,1869.82


## 19. Create a new column CostPerUnit = BillingAmount / Energy_kWh

In [82]:
df1['CostPerUnit'] = df1['BillingAmount'] / df1['Energy_kWh']
df1

Unnamed: 0,CustomerID,MeterID,Region,ConnectionType,ReadingDate,Energy_kWh,BillingAmount,PaymentStatus,TariffCode,Remarks,CostPerUnit
0,C0001,M6587,East,Commercial,2023-01-02,753.59,0.00,Paid,T3,Check meter,0.000000
1,C0002,M9122,East,Residential,2023-03-13,99999.00,0.00,Paid,T3,,0.000000
2,C0003,M1018,North,Commercial,2023-03-03,99999.00,0.00,Paid,T2,Replaced,0.000000
3,C0004,M1615,West,Commercial,2023-01-23,99999.00,0.00,Unknown,T2,Check meter,0.000000
4,C0005,M7523,North,Residential,2023-09-02,1075.76,0.00,Unknown,T2,OK,0.000000
...,...,...,...,...,...,...,...,...,...,...,...
195,C0196,M7600,North,Commercial,2023-03-25,99999.00,2032.17,Paid,T3,OK,0.020322
196,C0197,M2103,East,Residential,2023-05-22,99999.00,0.00,Paid,T3,Check meter,0.000000
197,C0198,M8002,South,Commercial,2023-06-25,581.91,0.00,Unknown,T2,,0.000000
198,C0199,M8029,West,Residential,2023-06-12,99999.00,1869.82,Unpaid,T1,Check meter,0.018698


## 20. Export cleaned DataFrame to utility_meter_readings_cleaned.csv

In [83]:
df1.to_csv('utility_meter_readings_cleaned.csv')