<a href="https://colab.research.google.com/github/EnDFLab/EMCS_Project/blob/smart_meter_data_analytics/Smart_Meter_data_analytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing Required Packages**

In [88]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pytz

**Creating a Dictionary for Consumers name**

In [89]:
consumer = {"physics":"1DN8M_FNpJ73Zb43nap6910TrStzjIlyF",
              "management":"1DcuVrXwOICXyI_KtcYxp2WWEtsVn7m6x",
              "electrical":"1DnhLgBQ8YSqDmI7rA8nplQEAyOWH_oiO",
              "civil":"1DaW-MB6cVM8jm5MPTg8br6pxJdqIEsEQ",
              "boyshostel":"1DsGE4NfgtfM8CvPQ-Vvh5EjW2MMo1YXW",
              "biotech":"1D_Bb5EJHCejxnhzdhEqc6geRXxWtGr4_",
              "transformer":"1DynnA5YDAH9xJq7KIhIc4fSzrvH_y_22"
              }

**Function to create a df from a csv file and remove unwanted rows, column.**

In [90]:
def read_raw_csv(id):
  if id in consumer.keys():
    Url = "https://drive.google.com/uc?export=download&id=" + consumer[id]
    df = pd.read_csv(Url,skiprows =1)
    df.columns = df.columns.str.strip() # Remove Leading and Trailing Whitespaces from Column
    df.set_index("Last Updated",inplace = True);
    df.drop(labels=["Voltage(V)","Current(A)","Power(W)","PF","Exported Energy(kWh)"],axis=1,inplace=True)
    return(df)
  else:
    return(None)

**Function for Data Preprocessing**

In [91]:
def data_preprocessing(df):
      df = replace_comma(df)
      #df.to_csv("/content/drive/MyDrive/sorted_data/" + "physics_comma_replaced.csv",float_format='%.2f');
      df = df_total(df)
      #df.to_csv("/content/drive/MyDrive/sorted_data/" + "physics_time_stamp_added.csv",float_format='%.2f');
      df = fill_missing_timestamp(df)
      #df.to_csv("/content/drive/MyDrive/sorted_data/" + "physics_added_time_stamp_added.csv",float_format='%.2f');
      df = fill_missing_value(df)
      #df.to_csv("/content/drive/MyDrive/sorted_data/" + "physics_added_missing_value_added.csv",float_format='%.2f');
      df = calculate_consumption(df)
      #df.to_csv("/content/drive/MyDrive/sorted_data/" + "physics_consumption.csv",float_format='%.2f');
      return df

**Function to Calculate Actual Energy Consumption from unit consumed**

In [92]:
def calculate_consumption(df):
  df["total"] = df["total"] - df["total"].shift(1)
  df.drop(index=df.index[0],axis=0,inplace=True)
  return df

**Function to fill missing values of energy consumption**

In [93]:
def fill_missing_value(df):
    """
    Fill missing values in the 'total' column using linear interpolation.

    Args:
        df (pd.DataFrame): Input DataFrame with a 'total' column.

    Returns:
        pd.DataFrame: DataFrame with missing values in 'total' filled.
    """
    # Perform linear interpolation for missing values
    df["total"] = df["total"].interpolate(method='linear', limit_direction='both')

    # Round values to two decimal places
    df["total"] = df["total"].round(2)

    return df

**Function to fill missing time stamp**

In [94]:

def fill_missing_timestamp(df):
    # Reset index to make 'timestamp' a regular column
    df = df.reset_index()
    print(df.head())  # Debugging step: Print first few rows

    # Round the 'timestamp' to the nearest 15 minutes
    df['timestamp'] = pd.to_datetime(df['timestamp']).dt.round(freq='15min')

    # Remove duplicate timestamps (keeping the last one)
    df = df.loc[~df['timestamp'].duplicated(keep='last')]

    # Set 'timestamp' back as the index
    df.set_index('timestamp', inplace=True)

    # Calculate the start and end timestamps
    start = df.index.min()
    end = df.index.max()

    # Generate the desired date range with the specified frequency
    dates = pd.date_range(start=start, end=end, freq='15Min')

    # Reindex the dataframe to the new date range
    df = df.reindex(dates)
    df.index.name = 'timestamp'

    return df

**Calculating Total row sum**



In [95]:
def df_total(df):
    #create pivot table
    df = pd.pivot_table(data=df,index=["Last Updated"],values=["Grid Consumption(kWh)"],columns="Meter")
    df.index = pd.to_datetime(df.index) #need this before sorting index
    df = df.sort_index(ascending=True)
    df = df.ffill()
    df["Total"] = df.sum(axis=1)
    df.drop(labels=["Grid Consumption(kWh)"],axis=1,inplace=True)
    df.reset_index(inplace=True) #Remove Index Name
    df.columns = [None] * len(df.columns) # Remove Column name

    df.columns = ['timestamp', 'total']
    df.set_index("timestamp",inplace=True)
    df["total"] = df["total"].round(2)



    #df.index.name = None
    #df.set_index("Last Updated",inplace=True)

    #df.rename_axis(None,inplace = True)
    #df.reset_index(inplace=True)
    #df.index.name = "timestamp"

    #df.drop(index=1, inplace=True) #Removes 2nd row index 1 is second row

    return df

**Function to Remove commas from Grid Consumption Column**

In [96]:
#Function to remove the comma from Grid Consumption Column
def replace_comma(df):
    df["Grid Consumption(kWh)"] = df["Grid Consumption(kWh)"].replace(",","",regex = True)
    df['Grid Consumption(kWh)'] = pd.to_numeric(df['Grid Consumption(kWh)'], errors='coerce')
    print("dtype of Grid Consumption(kWh)is:")
    print(df.dtypes)
    return(df)

In [97]:
def fetch_weather_data():
  weather_df = pd.read_csv("https://drive.google.com/uc?export=download&id=1mezuQQr_XKMYOT_X46UoUgbtDmoV_qPj")
  weather_df.drop(labels=["period", "dni", "ghi"], axis=1, inplace=True)
  return weather_df

In [98]:
def add_temperature(df, weather_df):
    # Convert 'period_end' in weather_df to datetime and set to Nepali time zone
    weather_df["period_end"] = pd.to_datetime(weather_df["period_end"], utc=True).dt.tz_convert("Asia/Kathmandu")

    # Remove timezone information
    weather_df["period_end"] = weather_df["period_end"].dt.tz_localize(None)

    # Set 'period_end' as the index
    weather_df.set_index("period_end", inplace=True)

    # Ensure 'timestamp' (or equivalent) in df is also datetime
    df.index = pd.to_datetime(df.index)

    # Resample weather_df to 15-minute intervals
    weather_df = weather_df.resample('15T').mean().round(2)

    # Outer join to align both DataFrames
    merged_df = df.join(weather_df, how='outer')

    # Drop rows with NaN values (optional, based on your use case)
    merged_df.dropna(inplace=True)

    # Reset index to make 'period_end' a regular column
    merged_df.reset_index(inplace=True)
    merged_df.rename(columns={'index': 'period_end'}, inplace=True)

    # Add a holiday flag (1 if Saturday, 0 otherwise)
    merged_df['holiday'] = merged_df['period_end'].apply(lambda x: 1 if pd.Timestamp(x).weekday() == 5 else 0)

    # Set 'period_end' back as the index
    merged_df.set_index('period_end', inplace=True)

    return merged_df


**Main Body**

In [99]:
#Uncomment for using all  consumer id from dictonary defined above
consumer_id = tuple(consumer.keys())
#Create a df for a single consumer only
#consumer_id = ["physics"] #
#print(consumer_id)
weather_df = fetch_weather_data()
temp_df = weather_df
print(weather_df.head())
for id in consumer_id:
    df = read_raw_csv(id)
    if df is not None:
        #print(f"first 5 rows of consumer {id} is: \n")
        #print(df.head())
        #print(df.dtypes)
        #print(df['Grid Consumption(kWh)'].unique())
        #sending data frame for pre-Processing
        df = data_preprocessing(df);
        #print(df.index)
        #print(df.head)
        df = add_temperature(df,weather_df)
        #print(df.head())
        df.to_csv("/content/drive/MyDrive/sorted_data/" + f"{id}_preprocessed.csv",float_format='%.2f');

        #call weather_df


        #print(f"first 5 rows of consumer {id} after pre-processing is: \n")
        #print(df.head())
        #print(df.dtypes)
        #print("After Processing")

    else:
        print(f"No data found for consumer {id}")
    weather_df = temp_df
    weather_df = weather_df.reset_index()

   air_temp                 period_end
0         8  2024-01-01T00:05:00+00:00
1         8  2024-01-01T00:10:00+00:00
2         8  2024-01-01T00:15:00+00:00
3         8  2024-01-01T00:20:00+00:00
4         8  2024-01-01T00:25:00+00:00


  df = pd.read_csv(Url,skiprows =1)


dtype of Grid Consumption(kWh)is:
Meter                     object
Grid Consumption(kWh)    float64
dtype: object
            timestamp  total
0 2024-01-07 12:00:26   5.94
1 2024-01-07 12:05:19   5.95
2 2024-01-07 12:11:36   5.95
3 2024-01-07 12:16:39   5.96
4 2024-01-07 12:20:33   5.99


  weather_df = weather_df.resample('15T').mean().round(2)


dtype of Grid Consumption(kWh)is:
Meter                     object
Grid Consumption(kWh)    float64
dtype: object
            timestamp   total
0 2024-01-05 12:06:39    0.00
1 2024-01-10 09:37:42  220.52
2 2024-01-10 09:40:40  220.89
3 2024-01-10 09:45:34  221.46
4 2024-01-10 09:50:29  222.09


  weather_df = weather_df.resample('15T').mean().round(2)


dtype of Grid Consumption(kWh)is:
Meter                     object
Grid Consumption(kWh)    float64
dtype: object
            timestamp  total
0 2024-07-24 11:39:11  12.11
1 2024-07-24 11:40:10  12.14
2 2024-07-24 11:45:35  12.26
3 2024-07-24 11:50:27  12.38
4 2024-07-24 11:55:24  12.48


  weather_df = weather_df.resample('15T').mean().round(2)
  df = pd.read_csv(Url,skiprows =1)


dtype of Grid Consumption(kWh)is:
Meter                     object
Grid Consumption(kWh)    float64
dtype: object
            timestamp  total
0 2024-01-12 00:00:02  20.67
1 2024-01-12 00:05:53  20.69
2 2024-01-12 00:10:46  20.70
3 2024-01-12 00:15:39  20.72
4 2024-01-12 00:20:32  20.73


  weather_df = weather_df.resample('15T').mean().round(2)


dtype of Grid Consumption(kWh)is:
Meter                     object
Grid Consumption(kWh)    float64
dtype: object
            timestamp  total
0 2024-09-26 12:18:49  31.66
1 2024-09-26 12:20:46  31.69
2 2024-09-26 12:25:40  31.77
3 2024-09-26 12:30:33  31.84
4 2024-09-26 12:35:25  31.91


  weather_df = weather_df.resample('15T').mean().round(2)


dtype of Grid Consumption(kWh)is:
Meter                     object
Grid Consumption(kWh)    float64
dtype: object
            timestamp  total
0 2024-01-04 15:50:06   0.34
1 2024-01-04 15:55:57   0.43
2 2024-01-04 16:00:50   0.52
3 2024-01-04 16:05:45   0.62
4 2024-01-04 16:10:36   0.70


  weather_df = weather_df.resample('15T').mean().round(2)


dtype of Grid Consumption(kWh)is:
Meter                     object
Grid Consumption(kWh)    float64
dtype: object
            timestamp  total
0 2024-08-01 14:34:24  43.37
1 2024-08-01 14:35:22  45.18
2 2024-08-01 14:40:17  54.44
3 2024-08-01 14:45:08  63.60
4 2024-08-01 14:50:01  73.09


  weather_df = weather_df.resample('15T').mean().round(2)
