In [19]:
import os
import glob
import pandas as pd
import json

In [21]:
# Input/output locations used by the loading and saving steps below.
# Each location can be overridden through an environment variable so the
# notebook also runs on machines where these absolute Windows paths do not
# exist; when the variable is unset the original path is used unchanged.
electricity_folder = os.environ.get(
    "ELECTRICITY_RAW_DIR",
    r"C:\Users\M. Faizan\Desktop\SE\6th\Data_Science_Assign2\raw\electricity_raw_data",
)
weather_folder = os.environ.get(
    "WEATHER_RAW_DIR",
    r"C:\Users\M. Faizan\Desktop\SE\6th\Data_Science_Assign2\raw\weather_raw_data",
)
output_dir = os.environ.get(
    "MERGED_OUTPUT_DIR",
    r"C:\Users\M. Faizan\Desktop\SE\6th\Saved",
)

In [23]:
# Load every raw electricity JSON file and flatten its records into one list.
# Files are visited in sorted order: glob returns paths in arbitrary order,
# and the "keep first" de-duplication below would otherwise depend on it.
electricity_data = []
for file_path in sorted(glob.glob(os.path.join(electricity_folder, "*.json"))):
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Records are read from response -> data; a file without them adds nothing.
    records = data.get("response", {}).get("data", [])
    electricity_data.extend(records)
    print(f"Loaded {file_path}: {len(records)} records")

if not electricity_data:
    # Without this guard the empty frame fails later with KeyError: 'datetime'.
    raise ValueError(f"No electricity records found in {electricity_folder}")

electricity_df = pd.DataFrame(electricity_data).rename(
    columns={"period": "datetime", "value": "electricity_demand"}
)
# Periods are parsed as "YYYY-MM-DDTHH"; anything else becomes NaT.
electricity_df["datetime"] = pd.to_datetime(
    electricity_df["datetime"], format="%Y-%m-%dT%H", errors="coerce"
)

# NOTE(review): the recorded run loaded 275 records per daily file, i.e. more
# than one record per hour, so there are presumably several series per
# timestamp. De-duplicating on datetime alone keeps only the first of them --
# confirm this is intended.
electricity_df = electricity_df.drop_duplicates(subset=["datetime"])
print(f"Electricity Data: {electricity_df.shape[0]} records, {electricity_df.shape[1]} features")

# Read every raw weather CSV into its own frame, then stack them.
# Files are visited in sorted order so the row order of the combined frame
# does not depend on the arbitrary order in which glob lists the directory.
weather_data = []
for file_path in sorted(glob.glob(os.path.join(weather_folder, "*.csv"))):
    df = pd.read_csv(file_path, encoding="utf-8", header=0)
    df.columns = df.columns.str.strip()  # remove whitespace around header names
    weather_data.append(df)
    print(f"Loaded {file_path}: {df.shape[0]} records")

if not weather_data:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise ValueError(f"No weather CSV files found in {weather_folder}")

weather_df = pd.concat(weather_data, ignore_index=True)
print(f"Weather Data: {weather_df.shape[0]} records, {weather_df.shape[1]} features")

# Find the weather timestamp column. Candidates are tried in priority order;
# the first one present is renamed to "datetime" (if needed) and parsed.
for candidate in ("datetime", "timestamp", "date"):
    if candidate not in weather_df.columns:
        continue
    if candidate != "datetime":
        weather_df.rename(columns={candidate: "datetime"}, inplace=True)
    weather_df["datetime"] = pd.to_datetime(weather_df["datetime"], errors='coerce')
    break
else:
    # No candidate column exists, so the merge key cannot be built.
    raise KeyError("No valid datetime column found in weather data")

# Rows whose timestamp failed to parse (NaT) cannot be joined; discard them.
weather_df.dropna(subset=["datetime"], inplace=True)

# Remove timezone information from both frames so the merge keys compare
# as plain (naive) timestamps.
for frame in (electricity_df, weather_df):
    frame["datetime"] = frame["datetime"].dt.tz_localize(None)

# Join demand and weather on the shared timestamp; the inner join keeps only
# timestamps that appear in both sources.
final_df = pd.merge(electricity_df, weather_df, on="datetime", how="inner")

# Ensure electricity_demand is numeric. This has to happen BEFORE the
# de-duplication and NaN drop: values that cannot be parsed become NaN here
# and would otherwise survive into the saved file.
final_df["electricity_demand"] = pd.to_numeric(final_df["electricity_demand"], errors='coerce')

final_df.drop_duplicates(inplace=True)
final_df.dropna(inplace=True)

# Detect anomalies (e.g., extreme values): rows whose demand lies above the
# 99th percentile are counted and reported, but not removed.
outlier_threshold = final_df["electricity_demand"].quantile(0.99)
anomalies = final_df[final_df["electricity_demand"] > outlier_threshold]
print(f"Detected {anomalies.shape[0]} anomalies (top 1% extreme values)")
    
# Write the merged, cleaned frame to disk, creating the output folder first
# if it does not exist yet.
os.makedirs(output_dir, exist_ok=True)

output_csv_path = os.path.join(output_dir, "merged_and_concatenated_data.csv")
final_df.to_csv(output_csv_path, index=False)

print(f"Final cleaned data saved to: {output_csv_path}")

print("Data Summary:")
# DataFrame.info() prints the summary itself and returns None, so wrapping it
# in print() would append a stray "None" line to the output.
final_df.info()

Loaded C:\Users\M. Faizan\Desktop\SE\6th\Data_Science_Assign2\raw\electricity_raw_data\hourly_demand_2022-01-01.json: 275 records
Loaded C:\Users\M. Faizan\Desktop\SE\6th\Data_Science_Assign2\raw\electricity_raw_data\hourly_demand_2022-01-02.json: 275 records
Loaded C:\Users\M. Faizan\Desktop\SE\6th\Data_Science_Assign2\raw\electricity_raw_data\hourly_demand_2022-01-03.json: 275 records
Loaded C:\Users\M. Faizan\Desktop\SE\6th\Data_Science_Assign2\raw\electricity_raw_data\hourly_demand_2022-01-04.json: 275 records
Loaded C:\Users\M. Faizan\Desktop\SE\6th\Data_Science_Assign2\raw\electricity_raw_data\hourly_demand_2022-01-05.json: 275 records
Loaded C:\Users\M. Faizan\Desktop\SE\6th\Data_Science_Assign2\raw\electricity_raw_data\hourly_demand_2022-01-06.json: 275 records
Loaded C:\Users\M. Faizan\Desktop\SE\6th\Data_Science_Assign2\raw\electricity_raw_data\hourly_demand_2022-01-07.json: 275 records
Loaded C:\Users\M. Faizan\Desktop\SE\6th\Data_Science_Assign2\raw\electricity_raw_data\hou