In [100]:
import json

# Synthetic JSON Dataset: smart_meter_data.json
data = {
  "region": "West",
  "city": "Mumbai",
  "meters": [
    {
      "meter_id": "MTR001",
      "customer": {"name": "Asha", "type": "Residential"},
      "readings": [
        {"timestamp": "2025-07-01T00:00", "kWh": 1.2},
        {"timestamp": "2025-07-01T01:00", "kWh": None},
        {"timestamp": "2025-07-01T02:00", "kWh": 1.5}
      ]
    },
    {
      "meter_id": "MTR002",
      "customer": {"name": "Ravi", "type": "Commercial"},
      "readings": [
        {"timestamp": "2025-07-01T00:00", "kWh": 5.0},
        {"timestamp": "2025-07-01T01:00", "kWh": 5.5},
        {"timestamp": "2025-07-01T02:00", "kWh": "error"}
      ]
    }
  ]
}
with open('smart_meter_data.json','w') as f:
    json.dump(data, f)

In [101]:
with open('smart_meter_data.json','r') as f:
    raw_data = json.load(f)

## 1.	Load the JSON file and print its keys and structure.

In [102]:
raw_data

{'region': 'West',
 'city': 'Mumbai',
 'meters': [{'meter_id': 'MTR001',
   'customer': {'name': 'Asha', 'type': 'Residential'},
   'readings': [{'timestamp': '2025-07-01T00:00', 'kWh': 1.2},
    {'timestamp': '2025-07-01T01:00', 'kWh': None},
    {'timestamp': '2025-07-01T02:00', 'kWh': 1.5}]},
  {'meter_id': 'MTR002',
   'customer': {'name': 'Ravi', 'type': 'Commercial'},
   'readings': [{'timestamp': '2025-07-01T00:00', 'kWh': 5.0},
    {'timestamp': '2025-07-01T01:00', 'kWh': 5.5},
    {'timestamp': '2025-07-01T02:00', 'kWh': 'error'}]}]}

## 2.	Flatten the readings list using json_normalize.

In [103]:
from pandas import json_normalize

df = json_normalize(
    raw_data['meters'],
    record_path = 'readings',
    meta = ['meter_id', ['customer', 'name'], ['customer', 'type']],
    meta_prefix = 'meta_',
    errors = 'ignore'
)

In [104]:
df.columns = df.columns.str.replace('meta_', '', regex=False)
df.columns = df.columns.str.replace('.', '_', regex=False)

In [105]:
df

Unnamed: 0,timestamp,kWh,meter_id,customer_name,customer_type
0,2025-07-01T00:00,1.2,MTR001,Asha,Residential
1,2025-07-01T01:00,,MTR001,Asha,Residential
2,2025-07-01T02:00,1.5,MTR001,Asha,Residential
3,2025-07-01T00:00,5.0,MTR002,Ravi,Commercial
4,2025-07-01T01:00,5.5,MTR002,Ravi,Commercial
5,2025-07-01T02:00,error,MTR002,Ravi,Commercial


## 3.	Extract meter ID, customer name, and type from nested objects.

In [106]:
df[['meter_id','customer_name','customer_type']]

Unnamed: 0,meter_id,customer_name,customer_type
0,MTR001,Asha,Residential
1,MTR001,Asha,Residential
2,MTR001,Asha,Residential
3,MTR002,Ravi,Commercial
4,MTR002,Ravi,Commercial
5,MTR002,Ravi,Commercial


## 4.	Convert timestamp to datetime format.

In [107]:
import pandas as pd 

df['timestamp'] = pd.to_datetime(df['timestamp'])

## 5.	Convert kWh column to numeric, handling errors.

In [108]:
df['kWh'] = pd.to_numeric(df['kWh'], errors='coerce')

## 6.	Use forward fill to treat nulls in kWh.

In [109]:
df['kWh'].fillna(method='ffill', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['kWh'].fillna(method='ffill', inplace=True)
  df['kWh'].fillna(method='ffill', inplace=True)


In [110]:
df

Unnamed: 0,timestamp,kWh,meter_id,customer_name,customer_type
0,2025-07-01 00:00:00,1.2,MTR001,Asha,Residential
1,2025-07-01 01:00:00,1.2,MTR001,Asha,Residential
2,2025-07-01 02:00:00,1.5,MTR001,Asha,Residential
3,2025-07-01 00:00:00,5.0,MTR002,Ravi,Commercial
4,2025-07-01 01:00:00,5.5,MTR002,Ravi,Commercial
5,2025-07-01 02:00:00,5.5,MTR002,Ravi,Commercial


## 7.	Replace remaining missing values with 0.

In [111]:
df.fillna(0, inplace = True)

In [112]:
df

Unnamed: 0,timestamp,kWh,meter_id,customer_name,customer_type
0,2025-07-01 00:00:00,1.2,MTR001,Asha,Residential
1,2025-07-01 01:00:00,1.2,MTR001,Asha,Residential
2,2025-07-01 02:00:00,1.5,MTR001,Asha,Residential
3,2025-07-01 00:00:00,5.0,MTR002,Ravi,Commercial
4,2025-07-01 01:00:00,5.5,MTR002,Ravi,Commercial
5,2025-07-01 02:00:00,5.5,MTR002,Ravi,Commercial


## 8.	Filter only Residential type customers.

In [113]:
df.query("customer_type == 'Residential'")

Unnamed: 0,timestamp,kWh,meter_id,customer_name,customer_type
0,2025-07-01 00:00:00,1.2,MTR001,Asha,Residential
1,2025-07-01 01:00:00,1.2,MTR001,Asha,Residential
2,2025-07-01 02:00:00,1.5,MTR001,Asha,Residential


## 9.	Slice data from 1 AM to 2 AM.

In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   timestamp      6 non-null      datetime64[ns]
 1   kWh            6 non-null      float64       
 2   meter_id       6 non-null      object        
 3   customer_name  6 non-null      object        
 4   customer_type  6 non-null      object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 372.0+ bytes


In [128]:
import datetime

df[df['timestamp'].dt.time.between(datetime.time(1,0), datetime.time(2,0))]

Unnamed: 0,timestamp,kWh,meter_id,customer_name,customer_type
1,2025-07-01 01:00:00,1.2,MTR001,Asha,Residential
2,2025-07-01 02:00:00,1.5,MTR001,Asha,Residential
4,2025-07-01 01:00:00,5.5,MTR002,Ravi,Commercial
5,2025-07-01 02:00:00,5.5,MTR002,Ravi,Commercial


## 10.	Add a column called hour based on timestamp.

In [129]:
df['hour'] = df['timestamp'].dt.hour

In [130]:
df

Unnamed: 0,timestamp,kWh,meter_id,customer_name,customer_type,hour
0,2025-07-01 00:00:00,1.2,MTR001,Asha,Residential,0
1,2025-07-01 01:00:00,1.2,MTR001,Asha,Residential,1
2,2025-07-01 02:00:00,1.5,MTR001,Asha,Residential,2
3,2025-07-01 00:00:00,5.0,MTR002,Ravi,Commercial,0
4,2025-07-01 01:00:00,5.5,MTR002,Ravi,Commercial,1
5,2025-07-01 02:00:00,5.5,MTR002,Ravi,Commercial,2


## 11.	Group by meter_id and get average usage.

In [133]:
average_usage = df.groupby('meter_id')['kWh'].mean()
average_usage

meter_id
MTR001    1.300000
MTR002    5.333333
Name: kWh, dtype: float64

## 12.	Flag rows where kWh exceeds 4.0 as an anomaly.

In [135]:
df[df['kWh'] > 4.0]

Unnamed: 0,timestamp,kWh,meter_id,customer_name,customer_type,hour
3,2025-07-01 00:00:00,5.0,MTR002,Ravi,Commercial,0
4,2025-07-01 01:00:00,5.5,MTR002,Ravi,Commercial,1
5,2025-07-01 02:00:00,5.5,MTR002,Ravi,Commercial,2


## 13.	Create a pivot table showing average kWh by hour.

In [141]:
average_kWh_by_hour = df.groupby('hour')['kWh'].mean()

In [142]:
print(average_kWh_by_hour)

hour
0    3.10
1    3.35
2    3.50
Name: kWh, dtype: float64
