In [None]:
## Electricity Demand Data Analysis and Processing

This project involves integrating multiple CSV and JSON files containing electricity demand and weather data. The objectives 
are to clean and preprocess the data, detect and handle outliers, perform exploratory data analysis (EDA), and build a regression 
model to predict electricity demand. The final deliverables are a cleaned CSV file and this Jupyter Notebook documenting the entire process.

## Environment Setup

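The following libraries are required for this project:
- pandas
- matplotlib
- seaborn
- scipy
- scikit-learn
- statsmodels
- json (part of the Python standard library; no installation needed)

You can install the third-party packages using pip:

    pip install pandas matplotlib seaborn scipy scikit-learn statsmodels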


import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew, kurtosis, zscore
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


## Data Integration

The project data is split into multiple CSV files (weather data) and JSON files (electricity demand data). The following code merges 
each group of files into a single pandas DataFrame and then combines the two DataFrames for further processing.


# Function to merge CSV files
def merge_csv_files(directory):
    """Read every ``.csv`` file in *directory* and stack them into one DataFrame.

    Files are read in sorted filename order so that the row order of the
    result is reproducible (``os.listdir`` returns entries in arbitrary
    order).

    Args:
        directory: Path of the folder to scan. Sub-folders are not searched.

    Returns:
        A DataFrame holding the rows of all CSV files, one file after the
        other, with a fresh 0..n-1 index.

    Raises:
        ValueError: If the directory contains no ``.csv`` files.
    """
    csv_names = sorted(f for f in os.listdir(directory) if f.endswith('.csv'))
    if not csv_names:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not point at the real cause.
        raise ValueError(f"No .csv files found in {directory!r}")
    frames = [pd.read_csv(os.path.join(directory, name)) for name in csv_names]
    return pd.concat(frames, ignore_index=True)

# Function to merge JSON files
def merge_json_files(directory):
    """Read every ``.json`` file in *directory* and stack them into one DataFrame.

    Each file is expected to be a JSON object whose records live under
    ``["response"]["data"]``. Files are read in sorted filename order so the
    row order of the result is reproducible (``os.listdir`` returns entries
    in arbitrary order).

    Args:
        directory: Path of the folder to scan. Sub-folders are not searched.

    Returns:
        A DataFrame holding the records of all JSON files, one file after the
        other, with a fresh 0..n-1 index.

    Raises:
        ValueError: If the directory contains no ``.json`` files.
        KeyError: If a file lacks the ``response`` / ``data`` keys.
    """
    json_names = sorted(f for f in os.listdir(directory) if f.endswith('.json'))
    if not json_names:
        # pd.concat([]) would also raise ValueError, but with a message that
        # does not point at the real cause.
        raise ValueError(f"No .json files found in {directory!r}")

    frames = []
    for name in json_names:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding.
        with open(os.path.join(directory, name), 'r', encoding='utf-8') as f:
            payload = json.load(f)
        # Adjust the following line based on your JSON structure
        frames.append(pd.DataFrame(payload["response"]["data"]))
    return pd.concat(frames, ignore_index=True)

# Locations of the raw inputs (placeholders: point these at the real folders)
csv_directory, json_directory = "path/to/csv_files", "path/to/json_files"

# Load each source into its own frame
weather_df = merge_csv_files(csv_directory)
electricity_df = merge_json_files(json_directory)

# Put the weather and demand columns side by side.
# NOTE(review): axis=1 pairs rows by index position, not by timestamp.
# Confirm both sources are row-aligned, or join on a shared time key instead.
source_frames = [weather_df, electricity_df]
combined_df = pd.concat(source_frames, axis=1)


## Data Preprocessing and Cleaning

In this step, we:
- Drop columns with more than 50% missing values.
- Fill missing values using median imputation for numeric columns and mode imputation for categorical columns.
- Convert the `period` column to a datetime format and extract features such as hour, day, month, year, day of week, and a weekend flag.
- Remove duplicate rows.


def preprocess_data(df):
    """Clean a raw frame: drop sparse columns, impute gaps, add time features.

    Steps, in order:
      1. Drop columns in which more than half of the values are missing.
      2. Fill remaining gaps: mode for text/categorical columns, median for
         everything else.
      3. If a ``period`` column exists, parse it as datetime (unparseable
         entries are forward-filled) and derive ``hour``, ``day``, ``month``,
         ``year``, ``day_of_week`` and ``is_weekend``.
      4. Remove duplicate rows.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new, cleaned DataFrame.
    """
    # Keep a column only if at least half of its values are present. The
    # integer ceil(len/2) selects the same columns as `len(df) * 0.5` while
    # giving dropna the integer threshold it is documented to take.
    min_non_missing = (len(df) + 1) // 2
    # Work on an explicit copy so the assignments below never write through
    # to (or warn about) the caller's frame.
    df = df.dropna(thresh=min_non_missing, axis=1).copy()

    # Fill missing values. Assign the result back instead of using chained
    # `df[col].fillna(..., inplace=True)`, which is deprecated and does not
    # update `df` when pandas Copy-on-Write is enabled.
    for col in df.columns:
        column = df[col]
        if not column.isna().any():
            continue
        is_text_like = (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
            or isinstance(column.dtype, pd.CategoricalDtype)
        )
        if is_text_like:
            modes = column.mode()
            if modes.empty:
                # No observed value to impute with; leave the column as is.
                continue
            fill_value = modes.iloc[0]
        else:
            fill_value = column.median()
        df[col] = column.fillna(fill_value)

    # Convert 'period' to datetime and extract features.
    # NOTE(review): a missing 'period' has already been imputed with the most
    # frequent value by the loop above; confirm that is intended for timestamps.
    if 'period' in df.columns:
        df['period'] = pd.to_datetime(df['period'], errors='coerce').ffill()
        df['period'] = df['period'].astype('datetime64[ns]')
        df['hour'] = df['period'].dt.hour
        df['day'] = df['period'].dt.day
        df['month'] = df['period'].dt.month
        df['year'] = df['period'].dt.year
        df['day_of_week'] = df['period'].dt.dayofweek
        # dayofweek is 0=Monday .. 6=Sunday, so 5 and 6 are the weekend.
        df['is_weekend'] = (df['day_of_week'] >= 5).astype(int)

    # Remove duplicate rows
    return df.drop_duplicates()

# Run the cleaning pipeline on the combined frame; `cleaned_df` is the input
# to the EDA and regression steps that follow.
cleaned_df = preprocess_data(combined_df)


## Exploratory Data Analysis (EDA)

Below are some visualizations and statistical summaries to understand the distribution, relationships, and trends in the data.


# Statistical Summary
# Printed explicitly: a bare expression in the middle of a cell (or in a
# script) would otherwise display nothing.
print(cleaned_df.describe())

# Histogram of key features
plt.figure(figsize=(10,6))
sns.histplot(cleaned_df['value'], kde=True)
plt.title("Distribution of Electricity Demand")
plt.xlabel("Electricity Demand")
plt.ylabel("Frequency")
plt.show()

# Time Series Plot (if applicable)
if 'period' in cleaned_df.columns:
    # Sort by time first: the rows follow file-merge order, and an unsorted
    # line plot would zig-zag back and forth along the time axis.
    demand_over_time = cleaned_df.sort_values('period')
    plt.figure(figsize=(12,6))
    plt.plot(demand_over_time['period'], demand_over_time['value'])
    plt.title("Electricity Demand Over Time")
    plt.xlabel("Time")
    plt.ylabel("Demand")
    plt.show()

# Correlation Heatmap
# Restrict to numeric columns: DataFrame.corr() raises on text columns in
# pandas >= 2.0, and correlations are only meaningful for numbers anyway.
numeric_columns = cleaned_df.select_dtypes(include='number')
plt.figure(figsize=(10,8))
sns.heatmap(numeric_columns.corr(), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()


## Regression Modeling

We use a linear regression model to predict electricity demand based on time-based features extracted from the 'period' column. The dataset is split into training and testing sets, and the model's performance is evaluated using Mean Squared Error (MSE), Root Mean Squared Error (RMSE), and R² score.

# Predictors are the calendar features derived from 'period'; the target is demand
features = ['hour', 'day', 'month', 'day_of_week', 'is_weekend']
X, y = cleaned_df[features], cleaned_df['value']

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit an ordinary least-squares model on the training split
model = LinearRegression().fit(X_train, y_train)

# Score the model on the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # RMSE is the square root of MSE
r2 = r2_score(y_test, y_pred)

# Report the metrics, one per line
for label, score in (("MSE:", mse), ("RMSE:", rmse), ("R2 Score:", r2)):
    print(label, score)


## Final Submission

The final deliverables for this project are:
- **final_cleaned_data.csv:** The cleaned and processed dataset.
- **project.ipynb:** This Jupyter Notebook documenting the entire data processing, analysis, and modeling workflow.

These files are submitted as part of the project.
