# Labeling Performance Analysis

This document runs a single site through a performance ratio analysis.

It uses this analysis to label the site (for each day) based on Level 1 faults.

# 1. Libraries Import

In [None]:
# ========================================================
# = Libraries import
# ========================================================

import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import boto3
import pytz
import datetime
import os

import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

from datetime import timedelta

# 2. AWS credentials

In [None]:
# ========================================================
# = AWS Credentials
# ========================================================

# Named AWS profile and region used for every Timestream query in this notebook
PROD_AWS_PROFILE = "gsesami-prod"
AWS_REGION = "ap-southeast-2"

# Session bound to the production profile
prod_session = boto3.session.Session(profile_name=PROD_AWS_PROFILE)

# Timestream query client; read_metric_site() pages through query results with it
prod_client = prod_session.client(
    "timestream-query", region_name=AWS_REGION)

# 3. Defining the Site ID, and dates:

In [None]:
# Reading all sites
# (the timezone lookup further down reads its 'source' and 'timezone' columns)
sites_list = pd.read_csv('./input_data/Site_List.csv')

# Reading all monitors
# NOTE(review): monitors_list is not referenced again in the visible part of this notebook
monitors_list = pd.read_csv('./input_data/Monitors_List.csv')

In [None]:
# Define site_id to be analysed:
site_id = 'c4e9450c-1d4c-44ad-af9e-38a0b4a1a58d'

# Time period, as 'YYYY-MM-DD' strings.
# They are inserted as-is into the query's `date BETWEEN ... AND ...` filter.
date_start = '2023-04-25'   
date_end = '2023-10-25'

In [None]:
# Site key in the 'SITE|<id>' form that the site list's 'source' column is matched against:
site_id_full = f'SITE|{site_id}'

# Echo it so the reader can confirm which site is being analysed
print("This analysis will be performed on the site: ", site_id_full)

In [None]:
# Resolve the site's timezone from the site list.
# 'Australia/Sydney' is the fallback used when the site is not listed or when
# its 'timezone' entry is empty, so the lookup can no longer fail with an
# IndexError or hand a NaN timezone to build_dataframe().
timezone_value = 'Australia/Sydney'
site_timezones = sites_list.loc[sites_list['source'] == site_id_full, 'timezone'].dropna()
if not site_timezones.empty:
    timezone_value = site_timezones.iloc[0]
else:
    print('no timezone in the table for', site_id_full, '- using', timezone_value)

# Start and end of the analysis window as timezone-aware (UTC) datetimes
time_starttz = pytz.timezone('UTC').localize(datetime.datetime.strptime(date_start, '%Y-%m-%d'))
time_endtz = pytz.timezone('UTC').localize(datetime.datetime.strptime(date_end, '%Y-%m-%d'))

# 4. Functions

## 4.1. Querying and dataframe

In [None]:
def read_metric_site(date_start, date_end, measure_name, site_id):
    """Query one daily measure of one site and return it as two parallel lists.

    The query runs against "DiagnoProd"."DiagnoProd" through the module-level
    ``prod_client``. For every ``date`` between ``date_start`` and ``date_end``
    it keeps the measure value that has the latest ``time`` (``max_by``).

    Parameters
    ----------
    date_start, date_end : str
        Bounds of the ``date BETWEEN`` filter.
    measure_name : str
        Value matched against ``measure_name``.
    site_id : str
        Value matched against ``siteId``.

    Returns
    -------
    tuple of (list, list)
        ``(timeid, data_values)``: the first and second 'ScalarValue' of every
        returned row, in result order. Values are returned as received, no
        type conversion is done here.

    Notes
    -----
    The arguments are inserted directly into the SQL text, so they must come
    from a trusted source. A page that raises ``KeyError`` while being read is
    reported with a printed message and skipped as a whole.
    """
    ##----------------- read the Performance  --------------##
    query = f"""SELECT date, max_by(measure_value::double, time) as prod_val
                FROM "DiagnoProd"."DiagnoProd"
                WHERE measure_name = '{measure_name}'
                AND siteId = '{site_id}'
                AND date BETWEEN '{date_start}'
                AND '{date_end}'
                GROUP BY date
                ORDER BY date """

    timeid = []
    data_values = []

    pages = prod_client.get_paginator("query").paginate(QueryString=query,)
    for page_number, page in enumerate(pages, start=1):
        try:
            # Each entry of 'Data' is one result row: [date cell, value cell]
            cells = pd.DataFrame(page["Rows"])['Data']
            page_times = [cell[0]['ScalarValue'] for cell in cells]
            page_values = [cell[1]['ScalarValue'] for cell in cells]
        except KeyError:
            print('Page {%d} has no data available:' % page_number)
        else:
            # Only reached when both columns of the page were read successfully
            timeid.extend(page_times)
            data_values.extend(page_values)

    return timeid, data_values

In [None]:
def build_dataframe(timeid, measure_name, data_values):
    """Build a time-indexed, single-column DataFrame from query results.

    Parameters
    ----------
    timeid : sequence
        Timestamps (anything ``pd.to_datetime`` accepts), one per value.
    measure_name : str
        Name given to the value column.
    data_values : sequence
        Values matching ``timeid``; they are cast to float.

    Returns
    -------
    pandas.DataFrame
        Sorted by time, indexed by 'time', with one float column named
        ``measure_name``. When ``timeid`` is empty an empty frame with that
        column is returned.

    Notes
    -----
    Timezone-naive timestamps are treated as UTC and converted to the
    module-level ``timezone_value`` ('Australia/Sydney' when that is None).
    Timezone-aware timestamps are kept as they are.
    """
    # ============== Check if there is data available for the pv system =============
    if len(timeid) == 0:
        return pd.DataFrame(data_values, index=timeid, columns=[measure_name])

    timeid = pd.to_datetime(timeid)
    if timeid.tzinfo is None:
        print('this is not tz-aware')
        if timezone_value is not None:
            target_tz = timezone_value
        else:
            print('no timezone in the table')
            target_tz = 'Australia/Sydney'
        # Naive timestamps are read as UTC, then shifted to the target zone
        timeid = timeid.tz_localize('UTC').tz_convert(target_tz)
    else:
        print('this is tz-aware')

    data = pd.DataFrame(data={'time': timeid, measure_name: data_values})
    data.sort_values('time', inplace=True)
    data.set_index('time', inplace=True)
    data[measure_name] = data[measure_name].astype(float)

    return data

## 4.2. Helper Functions

In [None]:
# ==================================
# = Merging clear skies and expected
# ==================================

def merge_clear_expe(df1, df2):
    """Join two frames on their index and add the expected/clear ratio.

    The joined frame must contain the columns 'Irrad.kWh.m2.Daily' and
    'EnergyYield.kWh.Daily'. Two columns are added to it:

    * 'expected_over_clear': Irrad / EnergyYield * 100, rounded to 0 decimals
    * 'date': a copy of the index

    The inputs are not modified; the joined frame is returned.
    """
    combined = df1.join(df2)
    ratio_pct = combined['Irrad.kWh.m2.Daily'] / combined['EnergyYield.kWh.Daily'] * 100
    combined['expected_over_clear'] = ratio_pct.round(0)
    combined['date'] = combined.index
    return combined

In [None]:
# ===============================================
# = Getting colours from values for plotting bars
# ===============================================

def colors_from_values(values, palette_name):
    """Pick one palette colour per value, ordered by the value's magnitude.

    A palette with ``len(values)`` colours is requested from seaborn. Each
    value is scaled linearly between the minimum and maximum of ``values``
    and mapped to the palette entry at the matching position.

    Parameters
    ----------
    values : array-like of numbers
        One-dimensional values to colour (list, numpy array or pandas Series).
    palette_name : str
        Palette name passed to ``sns.color_palette``.

    Returns
    -------
    numpy.ndarray
        One row of colour components per input value.
    """
    arr = np.asarray(values, dtype=float)
    n_colors = arr.size

    if n_colors == 0:
        # Nothing to colour. Assumes the palette entries are 3-component
        # colours, so the empty result keeps that width - TODO confirm.
        return np.empty((0, 3))

    lowest = arr.min()
    span = arr.max() - lowest

    if span == 0:
        # All values are equal: scaling would divide by zero and produce
        # NaN indices, so every value gets the first palette colour instead.
        indices = np.zeros(n_colors, dtype=np.int32)
    else:
        # normalize the values to range [0, 1], then convert to palette indices
        normalized = (arr - lowest) / span
        indices = np.round(normalized * (n_colors - 1)).astype(np.int32)

    # use the indices to get the colors
    palette = sns.color_palette(palette_name, n_colors)
    return np.array(palette).take(indices, axis=0)

# 5. Getting the clear sky, expected and measured values

## 5.1. Clear sky values

In [None]:
# ===================================================
# = Reading EnergyYield.kWh.Daily from AWS TimeStream
# ===================================================

# This measure is kept as the "clear sky" series (df_clear) for the rest of the notebook
measure_name = 'EnergyYield.kWh.Daily'
# Raw query result: two parallel lists (dates, values)
timeid, data_values = read_metric_site(date_start, date_end, measure_name, site_id)
# Time-indexed frame with one column named after the measure
df_clear = build_dataframe(timeid, measure_name, data_values)

## 5.2. Expected Generation

In [None]:
# ================================================
# = Reading Irrad.kWh.m2.Daily from AWS TimeStream
# ================================================

# This measure is kept as the "expected generation" series (df_expected)
measure_name = 'Irrad.kWh.m2.Daily'
timeid, data_values = read_metric_site(date_start, date_end, measure_name, site_id)
df_expected = build_dataframe(timeid, measure_name, data_values)
# Fixing it as a float:
# (build_dataframe already casts when data was returned; this also covers its empty-result branch)
df_expected['Irrad.kWh.m2.Daily'] = df_expected['Irrad.kWh.m2.Daily'].astype(float)

## 5.3. Measured Generation

In [None]:
# ==================================================
# = Reading Production.kWh.Daily from AWS TimeStream
# ==================================================

# This measure is kept as the "measured generation" series (df_production)
measure_name = 'Production.kWh.Daily'
timeid, data_values = read_metric_site(date_start, date_end, measure_name, site_id)
df_production = build_dataframe(timeid, measure_name, data_values)
# Fixing it as float
# (build_dataframe already casts when data was returned; this also covers its empty-result branch)
df_production['Production.kWh.Daily'] = df_production['Production.kWh.Daily'].astype(float)

In [None]:
# Join the three daily series on their time index, keeping the rows of the
# clear-sky frame: clear sky first, then expected, then measured production.
df_merged = df_clear.join(df_expected).join(df_production)

## 5.4. Cloudiness - Comparison on Clear skies x Expected

In [None]:
# Ratio (in %) of the expected series to the clear-sky series, rounded to whole percent.
# This is the cloudiness indicator, not the performance ratio of the site.
df_merged['expected_over_clear'] =  (df_merged['Irrad.kWh.m2.Daily'] / df_merged['EnergyYield.kWh.Daily'] * 100).round(0)
# Getting this extra column for plotting (a copy of the time index):
df_merged['date'] =  df_merged.index

# 6. Features

## 6.1. Adding low-cloudiness

In [None]:
# ========================================================
# = Checking values above a certain threshold when comparing clear skies and expected
# ========================================================

# Define the threshold for low cloudiness days
# (compared against 'expected_over_clear', which is a percentage):
threshold_low_cloudiness = 80

# Make it low_cloudiness aware:
# Rows at or above the threshold are flagged True, rows below it False.
# Rows whose ratio is NaN match neither comparison and are left unset.
# NOTE: the column name is spelled 'is_low_clousdiness_day' everywhere it is read
# later in the notebook, so the spelling must stay consistent.
df_merged.loc[df_merged['expected_over_clear'] >= threshold_low_cloudiness, 'is_low_clousdiness_day'] = True 
df_merged.loc[df_merged['expected_over_clear'] < threshold_low_cloudiness, 'is_low_clousdiness_day'] = False

## 6.2. Daily performance ratio (%)

In [None]:
# Name used for the following analysis.
# NOTE(review): this binds a second name to the same DataFrame object (no copy is made),
# so every column added to df_performance from here on also appears in df_merged.
df_performance = df_merged
# Daily performance ratio (%): measured production over expected generation, rounded to whole percent
df_performance['Performance.perc.Daily'] = (df_performance['Production.kWh.Daily'] / df_performance['Irrad.kWh.m2.Daily'] * 100).round(0)

## 6.3. Weekend and Weekdays

In [None]:
# Day of the week (Monday=0 ... Sunday=6), read through the vectorised
# datetime accessor instead of calling weekday() row by row
df_performance['day_of_week'] = df_performance['date'].dt.weekday

# Binary indicator for weekends vs weekdays: True for Saturday (5) and Sunday (6)
df_performance['is_weekend'] = df_performance['day_of_week'].isin([5, 6])

## 6.4. Seasons

In [None]:
# Get the month (1-12) using the datetime.month attribute
df_performance['month'] = df_performance['date'].dt.month

# Dictionary to map the month to the season
# Note that this has been done for Australia (Southern hemisphere)
seasons = {
    1: 'summer',
    2: 'summer',
    3: 'autumn',
    4: 'autumn',
    5: 'autumn',
    6: 'winter',
    7: 'winter',
    8: 'winter',
    9: 'spring',
    10: 'spring',
    11: 'spring',
    12: 'summer',
}

# Vectorised dictionary lookup: one season name per row
df_performance['season'] = df_performance['month'].map(seasons)

## 6.5. Removing outliers

In [None]:
# If necessary to kick off outliers:
#df_performance = df_performance[df_performance['Performance.perc.Daily'] < 120]

# 7. Exploring the performance variance

## 7.1. Functions to check performance

In [None]:
def performance_check(row):
    """Classify one row by its 'Performance.perc.Daily' value.

    Returns 'ok' for 80 and above, 'medium' for 60 up to (not including) 80,
    and 'under' for everything else.
    """
    ratio = row['Performance.perc.Daily']
    if ratio >= 80:
        return 'ok'
    if ratio >= 60:
        return 'medium'
    return 'under'

def performance_and_LC_check(row):
    """Override the performance label on days that are not low-cloudiness days.

    Returns 'High Cloudiness' when 'is_low_clousdiness_day' equals False;
    otherwise returns the row's existing 'performancelabel' unchanged.
    """
    # The explicit comparison with False is kept on purpose: an unset (NaN)
    # flag does not equal False and therefore keeps its performance label.
    return 'High Cloudiness' if row['is_low_clousdiness_day'] == False else row['performancelabel']

# Step 1: label every day from its performance ratio ('ok' / 'medium' / 'under')
df_performance['performancelabel'] = df_performance.apply(performance_check, axis=1)
# Step 2: replace that label with 'High Cloudiness' on days flagged as not low-cloudiness.
# Must run after step 1, because it reads the 'performancelabel' column written there.
df_performance['performancelabel'] = df_performance.apply(performance_and_LC_check, axis=1)

## 7.2. Visual performance check - Barplots (Measured over expected)

In [None]:
# Bar colour for each performance label
palette ={"ok": "green", "medium": "yellow", "under": "red", "High Cloudiness":"grey"}

# Reset matplotlib's rc settings before building this figure
matplotlib.rc_file_defaults()

# NOTE(review): the value bound to ax1 here is replaced by plt.subplots() two statements below
ax1 = sns.set_style(style=None, rc=None)

fig, ax1 = plt.subplots(figsize=(50,15))
plt.xticks(rotation=90)
plt.grid()

# Expected generation as a green line on the primary axis
sns.lineplot(data = df_performance['Irrad.kWh.m2.Daily'], marker='o', sort = False, ax=ax1, label='Expected', color='green')

# Secondary y-axis sharing the x-axis of ax1
ax2 = ax1.twinx()

# Measured generation as a blue line, also on ax1 (ax=ax1), so both lines share one scale
sns.lineplot(data = df_performance['Production.kWh.Daily'], marker='X', sort = False, ax=ax1, label='Measured', color='blue')

# Daily performance ratio as bars coloured by label.
# No ax= is passed, so the bars go to matplotlib's current axes - presumably ax2,
# the twin axis created just above (TODO confirm).
sns.barplot(data = df_performance, x='date', y='Performance.perc.Daily', hue='performancelabel', palette=palette, alpha=0.8, dodge=None)

# NOTE(review): 'Site Name' and 'Site ID' both print site_id_full
fig.suptitle('Site Name = '+ str(site_id_full) +'\nSite ID = '+ str(site_id_full) +'\nPerformance over time (daily aggregate) - All days' + '\nLow cloudiness threshold = '+ str(threshold_low_cloudiness) + '% [expected/clear_sky]')
# Place both legends to the right of the plotting area, one below the other
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))
sns.move_legend(ax2, "upper left", bbox_to_anchor=(1, 0.8))

# File name for the optional export below (only used when savefig is uncommented)
figname = str(site_id + '.png')

## Saving:
# fig.savefig(figname)

# 8. Working with low cloudiness days

## 8.1. Getting low cloudiness days

In [None]:
# Keep only the days flagged as low cloudiness.
# .copy() makes df_LC an independent frame: the labelling functions add
# columns to it in place, which should not happen on a slice of df_performance.
df_LC = df_performance[df_performance['is_low_clousdiness_day'] == True].copy()

## 8.2. Visual check on low cloudiness' days

In [None]:
# Bar colour for each performance label
# (no 'High Cloudiness' entry: df_LC only holds low-cloudiness days)
palette ={"ok": "green", "medium": "yellow", "under": "red"}

# Reset matplotlib's rc settings before building this figure
matplotlib.rc_file_defaults()
# NOTE(review): the value bound to ax1 here is replaced by plt.subplots() just below
ax1 = sns.set_style(style=None, rc=None )

fig, ax1 = plt.subplots(figsize=(40,10))
plt.xticks(rotation=90)
plt.grid()

# NOTE(review): 'Site Name' and 'Site ID' both print site_id_full
fig.suptitle('Site Name = '+ str(site_id_full) +'\nSite ID = '+ str(site_id_full) +'\nPerformance over time (daily aggregate) - Only low cloudiness days' + '\nLow cloudiness threshold = '+ str(threshold_low_cloudiness) + '% [expected/clear_sky]')

# One bar per low-cloudiness day, coloured by its performance label
ax = sns.barplot(
    data=df_LC, 
    x='date',
    y='Performance.perc.Daily',
    hue='performancelabel',
    palette=palette,
    dodge=None
    )
# Write each bar's value on top of it (one container per hue level)
for i in ax.containers:
    ax.bar_label(i,)

plt.show()

# 9. Functions to label Level 1 faults based on performance ratio

## 9.1. Major Underperformance

System is performing at less than 60 % for 3 consecutive days or more.

In [None]:
def check_major_underperformance(df, window_size, threshold):
    """Label runs of consecutive rows whose daily performance is below ``threshold``.

    Adds the column 'major_underperformance' to ``df`` in place. Every row
    starts as 'FALSE'. Whenever ``window_size`` consecutive rows (by position)
    all have 'Performance.perc.Daily' below ``threshold``, each row of that
    window is set to 'Major underperformance'. Overlapping windows extend the
    labelled run, so longer runs are labelled in full.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Performance.perc.Daily'. Modified in place.
    window_size : int
        Number of consecutive rows that must be below the threshold.
    threshold : number
        Performance value below which a row counts as underperforming.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    label_col = 'major_underperformance'

    # True where the row is below the threshold (NaN compares as False)
    below = (df['Performance.perc.Daily'] < threshold).to_numpy()

    # Default label for every row
    df[label_col] = 'FALSE'

    # `last` is the final position of the window under test
    for last in range(window_size - 1, len(df)):
        window = [last - back for back in range(window_size)]
        if all(below[pos] for pos in window):
            # Label the whole window retroactively
            for pos in window:
                df.loc[df.index[pos], label_col] = 'Major underperformance'

    return df

## 9.2. Minor Underperformance

System is performing at less than 80 % for 7 consecutive days or more.

In [None]:
def check_minor_underperformance(df, window_size, threshold):
    """Label runs of consecutive rows whose daily performance is below ``threshold``.

    Adds the column 'minor_underperformance' to ``df`` in place. Every row
    starts as 'FALSE'. Whenever ``window_size`` consecutive rows (by position)
    all have 'Performance.perc.Daily' below ``threshold``, each row of that
    window is set to 'Minor underperformance'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Performance.perc.Daily'. Modified in place.
    window_size : int
        Number of consecutive rows that must be below the threshold.
    threshold : number
        Performance value below which a row counts as underperforming.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    label_col = 'minor_underperformance'

    # True where the row is below the threshold (NaN compares as False)
    below = (df['Performance.perc.Daily'] < threshold).to_numpy()

    # Default label for every row
    df[label_col] = 'FALSE'

    # `last` is the final position of the window under test
    for last in range(window_size - 1, len(df)):
        first = last - window_size + 1
        if below[last] and below[first:last + 1].all():
            # Label the whole window retroactively
            for back in range(window_size):
                df.loc[df.index[last - back], label_col] = 'Minor underperformance'

    return df

## 9.3. Weekend and Weekdays underperformance

Weekend Underperformance: The performance is often lower on weekends.

Weekdays Underperformance: The performance is often lower on weekdays.

In [None]:
# Checking if:
# The performance of days in the current week (weekdays or weekends)
# falls below the performance of the opposite type of days (weekends or weekdays)
# from the previous week by a certain threshold.
# If so, it labels them as underperforming.


def check_week_performance(df, threshold, performance_col='Performance.perc.Daily'):
    """Label days that underperform the previous week's opposite day type.

    Adds four columns to ``df`` in place:

    * 'Week': the weekly period of each row's index, as a string
    * 'Prev Weekday Avg' / 'Prev Weekend Avg': mean of ``performance_col``
      over the weekdays / weekend days of the previous week (0.0 for rows of
      the first week, which has nothing to compare against)
    * 'week_underperformance': 'FALSE' by default;
      'Weekday underperformance' for a weekday below the previous week's
      weekend average minus ``threshold``;
      'Weekend underperformance' for a weekend day below the previous week's
      weekday average minus ``threshold``

    "Previous week" is the preceding week that is present in ``df``, in order
    of appearance. When weeks are missing from the data it is therefore not
    necessarily the calendar week directly before.

    Parameters
    ----------
    df : pandas.DataFrame
        Datetime-indexed frame with a boolean 'is_weekend' column and the
        column named by ``performance_col``. Modified in place.
    threshold : number
        Margin subtracted from the reference average.
    performance_col : str
        Column holding the daily performance values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    # using datetime to add week
    df['Week'] = df.index.to_period('W').astype(str)
    # Averages start as floats so the means assigned below keep the column dtype
    df['Prev Weekday Avg'] = 0.0
    df['Prev Weekend Avg'] = 0.0
    # Default label; may become 'Weekday underperformance' or 'Weekend underperformance'
    df['week_underperformance'] = 'FALSE'

    # Neither of these is modified inside the loop
    is_weekend = df['is_weekend']
    performance = df[performance_col]

    # Pair every week with the one before it; the first week has no predecessor
    unique_weeks = df['Week'].unique()
    for prev_week, week in zip(unique_weeks[:-1], unique_weeks[1:]):
        prev_week_df = df[df['Week'] == prev_week]
        prev_weekday_avg = prev_week_df.loc[~prev_week_df['is_weekend'], performance_col].mean()
        prev_weekend_avg = prev_week_df.loc[prev_week_df['is_weekend'], performance_col].mean()

        in_week = df['Week'] == week

        # Weekdays are compared with the previous weekend average, weekend
        # days with the previous weekday average. A NaN average (no such days
        # in the previous week) compares as False, so nothing is labelled.
        weekday_low = in_week & ~is_weekend & (performance < prev_weekend_avg - threshold)
        weekend_low = in_week & is_weekend & (performance < prev_weekday_avg - threshold)
        df.loc[weekday_low, 'week_underperformance'] = 'Weekday underperformance'
        df.loc[weekend_low, 'week_underperformance'] = 'Weekend underperformance'

        # store the previous week's averages in the dataframe
        df.loc[in_week, 'Prev Weekday Avg'] = prev_weekday_avg
        df.loc[in_week, 'Prev Weekend Avg'] = prev_weekend_avg

    return df

## 9.4. Seasonal underperformance

Winter Underperformance: There is a seasonal underperformance in winter, when compared to the average of the other 3 seasons.

Summer Underperformance: There is a seasonal underperformance in summer, when compared to the average of the other 3 seasons.

In [None]:
def check_seasonal_performance(df, threshold):
    """Label summer / winter days that underperform the other three seasons.

    Adds the column 'seasonal_underperformance' to ``df`` in place ('FALSE' by
    default). Labelling only happens when the data spans at least 365 days
    between its oldest and newest record; otherwise a message is printed and
    ``df`` is returned with every row left as 'FALSE'.

    When the span is long enough, the column 'season_avg_performance' (mean
    'Performance.perc.Daily' of the row's season) is added too. A row that
    lies at least 365 days after the oldest record is then labelled:

    * 'summer underperformance' if it is a summer day and its performance is
      below the mean of the autumn, winter and spring averages minus ``threshold``
    * 'winter underperformance' if it is a winter day and its performance is
      below the mean of the autumn, summer and spring averages minus ``threshold``

    Parameters
    ----------
    df : pandas.DataFrame
        Datetime-indexed frame with 'season' and 'Performance.perc.Daily'
        columns. Modified in place.
    threshold : number
        Margin subtracted from the reference average.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    Raises
    ------
    KeyError
        If the span is long enough but one of the four seasons has no rows.
    """
    df['seasonal_underperformance'] = 'FALSE'

    # Nothing to evaluate on an empty frame
    if df.empty:
        return df

    one_year = timedelta(days=365)

    # The span is measured between the oldest and the newest record. Comparing
    # the oldest record with itself always gave zero, so the check below used
    # to return early on every call and no row was ever labelled.
    oldest_date = df.index.min()
    newest_date = df.index.max()
    if (newest_date - oldest_date) < one_year:
        print("Oldest data is less than a year old. The function needs data older than one year.")
        return df

    # Get the average for each season
    season_avg = df.groupby('season')['Performance.perc.Daily'].mean()

    # Average of the row's own season, stored for reference
    df['season_avg_performance'] = df['season'].map(season_avg)

    # Reference for each checked season: mean of the other three seasons' averages
    other_seasons_avg = {
        'summer': (season_avg['autumn'] + season_avg['winter'] + season_avg['spring']) / 3,
        'winter': (season_avg['autumn'] + season_avg['summer'] + season_avg['spring']) / 3,
    }
    labels = {
        'summer': 'summer underperformance',
        'winter': 'winter underperformance',
    }

    # Only start labeling after a year has passed since the oldest record
    eligible = (df.index - oldest_date) >= one_year
    season_of_row = df['season'].to_numpy()
    performance = df['Performance.perc.Daily'].to_numpy()

    for season, reference in other_seasons_avg.items():
        flagged = eligible & (season_of_row == season) & (performance < reference - threshold)
        df.loc[flagged, 'seasonal_underperformance'] = labels[season]

    return df

# 10. Labelling

## 10.1. Parameters

In [None]:
# Parameters for checking major underperformance:
# number of consecutive days, and performance (%) below which a day counts
window_size_major_und = 3
threshold_performance_major_und = 60

# Parameters for checking minor underperformance:
# number of consecutive days, and performance (%) below which a day counts
window_size_minor_und = 7
threshold_performance_minor_und = 80

# Parameters for weekend and weekdays underperformance:
# margin subtracted from the previous week's reference average
threshold_performance_weekends_weekdays = 20

# Parameters for checking seasonal underperformance:
# margin subtracted from the average of the other three seasons
threshold_performance_seasonal = 20

## 10.2. Running functions

In [None]:
# Each function adds its label column(s) to df_LC in place and returns df_LC;
# the return values are not stored here.

# Adds 'major_underperformance'
check_major_underperformance(df_LC, window_size_major_und, threshold_performance_major_und)

# Adds 'minor_underperformance'
check_minor_underperformance(df_LC, window_size_minor_und, threshold_performance_minor_und)

# Adds 'Week', 'Prev Weekday Avg', 'Prev Weekend Avg' and 'week_underperformance'
check_week_performance(df_LC, threshold_performance_weekends_weekdays)

# Adds 'seasonal_underperformance'
check_seasonal_performance(df_LC, threshold_performance_seasonal)

# 11. Cleaning up

In [None]:
# Keep only the performance and label columns.
# The first four columns are measurements, the last four are the fault labels.
# .copy() makes the selection an independent frame, so adding a column to it
# afterwards does not write into a slice of df_LC.
level1_sites_labels_df = df_LC[[
    'EnergyYield.kWh.Daily','Irrad.kWh.m2.Daily','Production.kWh.Daily','Performance.perc.Daily',
    'major_underperformance', 'minor_underperformance', 'week_underperformance', 'seasonal_underperformance'
    ]].copy()

# Function to create the array for the new column
# Scanning starts at position 4, since the first 4 columns are NOT meant to be in the level1-labels-site column
def create_array(row):
    """Return the entries of ``row`` from position 4 onwards that are not 'FALSE'."""
    labels = []
    for value in row[4:]:
        if value != 'FALSE':
            labels.append(value)
    return labels

# Create the new 'level1-labels-site' column:
# one list per day holding every label of that day that is not 'FALSE'
level1_sites_labels_df['level1-labels-site'] = level1_sites_labels_df.apply(create_array, axis=1)


# If want to drop rows with empty arrays in 'level1-labels-site':
# Currently not dropping, to keep rows in which there wasn't any fault.
# level1_sites_labels_df = level1_sites_labels_df.loc[level1_sites_labels_df['level1-labels-site'].apply(len) > 0]

# Keeping even the empty arrays to showcase non-faulty days:
# (bare expression: shows the table as the cell output)
level1_sites_labels_df

# 12. Saving the result

In [None]:
# To save it: the table is written to <output_dir><site_id>.csv
output_dir = './1A_individual_outputs/'
# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, so no separate existence check is needed
os.makedirs(output_dir, exist_ok=True)

level1_sites_labels_df.to_csv(f'{output_dir}{site_id}.csv')