# Performance Analysis

This document runs a single site through a performance ratio analysis.

# 1. Libraries Import

In [None]:
# ========================================================
# = Libraries import
# ========================================================

from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import boto3
import pytz
import math
from zoneinfo import ZoneInfo
import datetime
import geopy.distance
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import plotly.express as px

# 2. AWS credentials

In [None]:
# ========================================================
# = AWS Credentials
# ========================================================

# Named AWS profile and region used for every Timestream query in this notebook
PROD_AWS_PROFILE = "gsesami-prod"
AWS_REGION = "us-west-2"

# Session bound to the production profile
prod_session = boto3.session.Session(profile_name=PROD_AWS_PROFILE)

# Timestream *query* client created from that session
prod_client = prod_session.client("timestream-query", region_name=AWS_REGION)

# 3. Getting performance

### Notes on timestream variable names:

Irrad.kWh.m2.Daily = Expected Generation

EnergyYield.kWh.Daily = Clear Sky Model

Production.kWh.Daily = Measured Generation

Performance = Measured / Expected * 100

## 3.1. Defining the Site ID, and dates:

In [None]:
# Site under analysis
site_id = '4dddc226-3464-4c95-aded-875e490a2f02'

# Analysis window: fixed start date, end date is the day the notebook is run
date_start = '2022-01-01'
today = datetime.date.today().isoformat()  # formatted as YYYY-MM-DD
date_end = today

In [None]:
# Full source identifier: the site id prefixed with 'SITE|'
site_id_full = f"SITE|{site_id}"

# Site-name lookup from a DynamoDB export (currently disabled):
# df_names_full = pd.read_csv('./input_data/Site_List_2023-01-24.csv')
# site_name = df_names_full.loc[df_names_full['source'] == site_id_full, 'name'].iloc[0]

# Sanity check
print("This analysis will be performed on the site: ", site_id_full)

## 3.2. Getting the Clear Sky Model

In [None]:
# ========================================================
# = Reading EnergyYield.kWh.Daily from AWS TimeStream
# ========================================================

# For now, while testing, the analysis is limited to a single site to avoid querying Timestream repeatedly:

def readClear(date_start, date_end, measure_name, site_id):
    """Read one value per date for a site and measure from AWS Timestream.

    For each date in the range the query keeps the value with the latest
    ``time`` (``max_by(measure_value::double, time)``). The notebook calls
    this with ``measure_name='EnergyYield.kWh.Daily'``.

    Args:
        date_start: Lower bound of the ``date BETWEEN`` clause (string).
        date_end: Upper bound of the ``date BETWEEN`` clause (string).
        measure_name: Value matched against ``measure_name``.
        site_id: Value matched against ``siteId``.

    Returns:
        ``(timeid, data_values)``: two equally long lists with the date and
        the value of every returned row, both as the strings Timestream sent.
    """
    # NOTE: the arguments are interpolated straight into the SQL text, so
    # only pass trusted values.
    query = f"""SELECT date, max_by(measure_value::double, time) as prod_val
                FROM "GSESTimeseries"."GSESTimeseriesTable"
                WHERE measure_name = '{measure_name}'
                AND siteId = '{site_id}'
                AND date BETWEEN '{date_start}'
                AND '{date_end}'
                GROUP BY date
                ORDER BY date """

    timeid = []
    data_values = []
    paginator = prod_client.get_paginator("query")
    pages = paginator.paginate(QueryString=query)
    for page_number, page in enumerate(pages, start=1):
        rows = page.get("Rows") or []
        if not rows:
            print('Page %d has no data available' % page_number)
            continue
        for row in rows:
            date_cell, value_cell = row["Data"][0], row["Data"][1]
            # A datum without 'ScalarValue' (e.g. a null) is skipped on its
            # own instead of discarding every row of the page.
            if "ScalarValue" not in date_cell or "ScalarValue" not in value_cell:
                continue
            timeid.append(date_cell["ScalarValue"])
            data_values.append(value_cell["ScalarValue"])
    return timeid, data_values

# Clear-sky model series for the configured site and period
measure_name = 'EnergyYield.kWh.Daily'
timeid, data_values = readClear(
    date_start=date_start,
    date_end=date_end,
    measure_name=measure_name,
    site_id=site_id,
)

# df_production = pd.DataFrame(data_values, index=timeid, columns=measure_name)

In [None]:
# Clear-sky frame: a single float column named after the measure, indexed by the date strings.
# NOTE: reads the globals timeid / data_values / measure_name as they currently stand.
df_clear = pd.DataFrame(data_values, index=timeid, columns=[measure_name]).astype(
    {'EnergyYield.kWh.Daily': float}
)

df_clear

## 3.3. Getting Expected Generation (Irrad.kWh.m2.Daily)

In [None]:
# ========================================================
# = Reading Irrad.kWh.m2.Daily from AWS TimeStream
# ========================================================

# As for now, during testing, we'll keep the test limited to a single location so not to query timestream constantly:


def readExpected(date_start, date_end, measure_name, site_id):
    """Read one value per date for a site and measure from AWS Timestream.

    For each date in the range the query keeps the value with the latest
    ``time`` (``max_by(measure_value::double, time)``). The notebook calls
    this with ``measure_name='Irrad.kWh.m2.Daily'``.

    Args:
        date_start: Lower bound of the ``date BETWEEN`` clause (string).
        date_end: Upper bound of the ``date BETWEEN`` clause (string).
        measure_name: Value matched against ``measure_name``.
        site_id: Value matched against ``siteId``.

    Returns:
        ``(timeid, data_values)``: two equally long lists with the date and
        the value of every returned row, both as the strings Timestream sent.
    """
    # NOTE: the arguments are interpolated straight into the SQL text, so
    # only pass trusted values.
    query = f"""SELECT date, max_by(measure_value::double, time) as prod_val
                FROM "GSESTimeseries"."GSESTimeseriesTable"
                WHERE measure_name = '{measure_name}'
                AND siteId = '{site_id}'
                AND date BETWEEN '{date_start}'
                AND '{date_end}'
                GROUP BY date
                ORDER BY date """

    timeid = []
    data_values = []
    paginator = prod_client.get_paginator("query")
    pages = paginator.paginate(QueryString=query)
    for page_number, page in enumerate(pages, start=1):
        rows = page.get("Rows") or []
        if not rows:
            print('Page %d has no data available' % page_number)
            continue
        for row in rows:
            date_cell, value_cell = row["Data"][0], row["Data"][1]
            # A datum without 'ScalarValue' (e.g. a null) is skipped on its
            # own instead of discarding every row of the page.
            if "ScalarValue" not in date_cell or "ScalarValue" not in value_cell:
                continue
            timeid.append(date_cell["ScalarValue"])
            data_values.append(value_cell["ScalarValue"])
    return timeid, data_values

# Expected-generation series for the configured site and period
measure_name = 'Irrad.kWh.m2.Daily'
timeid, data_values = readExpected(
    date_start=date_start,
    date_end=date_end,
    measure_name=measure_name,
    site_id=site_id,
)
# df_production = pd.DataFrame(data_values, index=timeid, columns=measure_name)

In [None]:
# Expected-generation frame: a single float column indexed by the date strings
df_expected = pd.DataFrame(data_values, index=timeid, columns=[measure_name]).astype(
    {'Irrad.kWh.m2.Daily': float}
)
df_expected

In [None]:
## Saving to CSV
# df_expected.to_csv('./input_data/2_Testing_Performance_analysis/Drummoyne Oval/df_expected_DO.csv')

## Reading to CSV
#df_expected = pd.read_csv('./input_data/2_Testing_Performance_analysis/Drummoyne Oval/df_expected_DO.csv', index_col=0)

## 3.4. Comparison on Clear skies x Expected

In [None]:
# Left-join the expected generation onto the clear-sky model (both indexed by date)
df_merged = df_clear.join(df_expected)
# Add expected generation as a whole-number percentage of the clear-sky model,
# plus a 'date' column mirroring the index
df_merged = df_merged.assign(
    expected_over_clear=lambda d: (d['Irrad.kWh.m2.Daily'] / d['EnergyYield.kWh.Daily'] * 100).round(0),
    date=lambda d: d.index,
)
df_merged

In [None]:
# ========================================================
# = Merging clear skies and expected
# ========================================================

def merge_clear_expe(df1, df2):
    """Join two frames on their index and add the expected/clear-sky ratio.

    The joined frame must contain the columns 'Irrad.kWh.m2.Daily' and
    'EnergyYield.kWh.Daily'. Two columns are added: 'expected_over_clear'
    (their ratio in percent, rounded to 0 decimals) and 'date' (the index).

    Returns the joined frame; ``df1`` and ``df2`` are not modified.
    """
    joined = df1.join(df2)
    ratio_pct = joined['Irrad.kWh.m2.Daily'] / joined['EnergyYield.kWh.Daily'] * 100
    return joined.assign(
        expected_over_clear=ratio_pct.round(0),
        date=joined.index,
    )

In [None]:
# Saving to CSV
# df_merged.to_csv('./input_data/2_Testing_Performance_analysis/Drummoyne Oval/df_merged_clear_and_expected_DO.csv')

# Reading from CSV
#df_merged = pd.read_csv('./input_data/2_Testing_Performance_analysis/Drummoyne Oval/df_merged_clear_and_expected_DO.csv', index_col=0)

In [None]:
# Getting colours for plotting:
def colors_from_values(values, palette_name):
    """Pick one palette colour per value according to its relative magnitude.

    A seaborn palette with ``len(values)`` colours is built; each value is
    min-max normalised and mapped to the palette entry at the matching
    position (smallest value -> first colour, largest -> last colour).

    Args:
        values: 1-D numeric sequence (list, array or pandas Series).
        palette_name: Name passed to ``sns.color_palette``.

    Returns:
        Array with one colour row per input value, in input order.
        NaN and -inf map to the first colour, +inf to the last; if all finite
        values are equal every value maps to the first colour.
    """
    numeric = np.asarray(values, dtype=float)
    count = numeric.size
    if count == 0:
        return np.empty((0, 3))

    # Normalise using finite values only, so a NaN/inf (e.g. from a missing
    # day or a division by zero upstream) cannot poison min/max.
    finite = np.isfinite(numeric)
    normalized = np.zeros(count)
    if finite.any():
        low = numeric[finite].min()
        span = numeric[finite].max() - low
        if span > 0:
            normalized[finite] = (numeric[finite] - low) / span
    normalized[numeric == np.inf] = 1.0

    # convert to palette indices
    indices = np.round(normalized * (count - 1)).astype(np.int32)
    palette = np.array(sns.color_palette(palette_name, count))
    return palette.take(indices, axis=0)

In [None]:
# ========================================================
# = Plotting Performance Ratio over time
# ========================================================

# Plots, for every date in df_merged: the expected generation and the clear-sky
# model as lines, and their ratio ('expected_over_clear') as coloured bars.
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

# Reset matplotlib rc settings before styling
matplotlib.rc_file_defaults()
# NOTE(review): ax1 is rebound by plt.subplots() just below, so this assignment is not used
ax1 = sns.set_style(style=None, rc=None )

fig, ax1 = plt.subplots(figsize=(30,10))

# Rotate the x tick labels of the current axes
plt.xticks(rotation=90)


# Both line series are drawn on ax1
sns.lineplot(data = df_merged['Irrad.kWh.m2.Daily'], marker='o', sort = False, ax=ax1)
# ax2 receives no artist here; it only serves as the parent of ax3
ax2 = ax1.twinx()
sns.lineplot(data = df_merged['EnergyYield.kWh.Daily'], marker='o', sort = False, ax=ax1)
ax3 = ax2.twinx()

# Ratio bars on ax3, one colour per bar derived from the ratio value
sns.barplot(data = df_merged, x='date', y='expected_over_clear', alpha=0.3, ax=ax3, palette=colors_from_values(df_merged['expected_over_clear'], "RdYlGn"))

In [None]:
# ========================================================
# = Checking values above a certain threshold when comparing clear skies and expected
# ========================================================

# Threshold for low cloudiness, in percent of expected generation over the clear-sky model:
# a day whose 'expected_over_clear' is at or above it is considered a low cloudiness day
threshold_low_cloudiness = 80

# Flag every day; rows whose ratio is NaN match neither mask and are left unset
_ratio = df_merged['expected_over_clear']
df_merged.loc[_ratio >= threshold_low_cloudiness, 'is_low_clousdiness_day'] = True
df_merged.loc[_ratio < threshold_low_cloudiness, 'is_low_clousdiness_day'] = False


## 3.5. Getting measured generation (Production.kWh.Daily)

In [None]:
# ========================================================
# = Reading Production.kWh.Daily from AWS TimeStream
# ========================================================

def readMeasured(date_start, date_end, measure_name, site_id):
    """Read one value per date for a site and measure from AWS Timestream.

    For each date in the range the query keeps the value with the latest
    ``time`` (``max_by(measure_value::double, time)``). The notebook calls
    this with ``measure_name='Production.kWh.Daily'``.

    Args:
        date_start: Lower bound of the ``date BETWEEN`` clause (string).
        date_end: Upper bound of the ``date BETWEEN`` clause (string).
        measure_name: Value matched against ``measure_name``.
        site_id: Value matched against ``siteId``.

    Returns:
        ``(timeid, data_values)``: two equally long lists with the date and
        the value of every returned row, both as the strings Timestream sent.
    """
    # NOTE: the arguments are interpolated straight into the SQL text, so
    # only pass trusted values.
    query = f"""SELECT date, max_by(measure_value::double, time) as prod_val
                FROM "GSESTimeseries"."GSESTimeseriesTable"
                WHERE measure_name = '{measure_name}'
                AND siteId = '{site_id}'
                AND date BETWEEN '{date_start}'
                AND '{date_end}'
                GROUP BY date
                ORDER BY date """

    timeid = []
    data_values = []
    paginator = prod_client.get_paginator("query")
    pages = paginator.paginate(QueryString=query)
    for page_number, page in enumerate(pages, start=1):
        rows = page.get("Rows") or []
        if not rows:
            print('Page %d has no data available' % page_number)
            continue
        for row in rows:
            date_cell, value_cell = row["Data"][0], row["Data"][1]
            # A datum without 'ScalarValue' (e.g. a null) is skipped on its
            # own instead of discarding every row of the page.
            if "ScalarValue" not in date_cell or "ScalarValue" not in value_cell:
                continue
            timeid.append(date_cell["ScalarValue"])
            data_values.append(value_cell["ScalarValue"])
    return timeid, data_values

# Measured-generation series for the configured site and period
measure_name = 'Production.kWh.Daily'
timeid, data_values = readMeasured(
    date_start=date_start,
    date_end=date_end,
    measure_name=measure_name,
    site_id=site_id,
)
# df_production = pd.DataFrame(data_values, index=timeid, columns=measure_name)

In [None]:
# Measured-generation frame: a single float column indexed by the date strings
df_production = pd.DataFrame(data_values, index=timeid, columns=[measure_name]).astype(
    {'Production.kWh.Daily': float}
)

In [None]:
## Saving to CSV
# df_production.to_csv('./input_data/2_Testing_Performance_analysis/Drummoyne Oval/df_production_clear_and_expected_DO.csv')

## Reading CSV
# df_production = pd.read_csv('./input_data/2_Testing_Performance_analysis/Drummoyne Oval/df_production_clear_and_expected_DO.csv', index_col=0)

## 3.5. Merging them and getting daily performance (%)

In [None]:
# Keep every production day and attach the expected / clear-sky columns by index
df_performance = df_production.join(df_merged, how='left')
df_performance

In [None]:
# Saving to CSV
# df_performance.to_csv('./input_data/2_Testing_Performance_analysis/Drummoyne Oval/df_performance_DO.csv')

# Reading CSV
# df_performance = pd.read_csv('./input_data/2_Testing_Performance_analysis/Drummoyne Oval/df_performance_DO.csv', index_col=0)

In [None]:
# Daily performance: measured production as a whole-number percentage of expected generation
df_performance['Performance.perc.Daily'] = (
    df_performance['Production.kWh.Daily']
    .div(df_performance['Irrad.kWh.m2.Daily'])
    .mul(100)
    .round(0)
)

## 3.6. Kicking off outliers

In [None]:
# Optional outlier removal (disabled): uncomment the line below to keep only
# the days whose daily performance is below 120 %.

#df_performance = df_performance[df_performance['Performance.perc.Daily'] < 120]

# Display the frame as it currently stands
df_performance

# 4. Exploring the performance variance

In [None]:
## Currently in diagno, the following is hard-coded:
# > 80% performance = Good
# < 80% and > 60% = Average
# < 60% = Underperforming

def performance_check(row):
    """Classify a row by its 'Performance.perc.Daily' value.

    Returns 'ok' for values of 80 or more, 'medium' for values of 60 or more,
    and 'under' for everything else (including NaN, which fails both checks).
    """
    performance = row['Performance.perc.Daily']
    if performance >= 80:
        return 'ok'
    if performance >= 60:
        return 'medium'
    return 'under'

def performance_and_LC_check(row):
    """Override the performance label on days that are not low-cloudiness.

    Returns 'High Cloudiness' when 'is_low_clousdiness_day' equals False;
    otherwise returns the row's existing 'performancelabel'.
    """
    is_high_cloudiness = row['is_low_clousdiness_day'] == False
    return 'High Cloudiness' if is_high_cloudiness else row['performancelabel']

# Label by performance first, then override the days that are not low-cloudiness.
# Order matters: the second pass reads the 'performancelabel' written by the first.
for _labeller in (performance_check, performance_and_LC_check):
    df_performance['performancelabel'] = df_performance.apply(_labeller, axis=1)


In [None]:
## Helper function to improve plotting:

def colors_from_values(values, palette_name):
    """Pick one palette colour per value according to its relative magnitude.

    A seaborn palette with ``len(values)`` colours is built; each value is
    min-max normalised and mapped to the palette entry at the matching
    position (smallest value -> first colour, largest -> last colour).

    Args:
        values: 1-D numeric sequence (list, array or pandas Series).
        palette_name: Name passed to ``sns.color_palette``.

    Returns:
        Array with one colour row per input value, in input order.
        NaN and -inf map to the first colour, +inf to the last; if all finite
        values are equal every value maps to the first colour.
    """
    numeric = np.asarray(values, dtype=float)
    count = numeric.size
    if count == 0:
        return np.empty((0, 3))

    # Normalise using finite values only, so a NaN/inf (e.g. from a missing
    # day or a division by zero upstream) cannot poison min/max.
    finite = np.isfinite(numeric)
    normalized = np.zeros(count)
    if finite.any():
        low = numeric[finite].min()
        span = numeric[finite].max() - low
        if span > 0:
            normalized[finite] = (numeric[finite] - low) / span
    normalized[numeric == np.inf] = 1.0

    # convert to palette indices
    indices = np.round(normalized * (count - 1)).astype(np.int32)
    palette = np.array(sns.color_palette(palette_name, count))
    return palette.take(indices, axis=0)

In [None]:
# Plots expected vs. measured generation as lines and the daily performance as
# bars coloured by 'performancelabel', then saves the figure under ./plots/.
# Fixed colour per performance label
palette ={"ok": "green", "medium": "yellow", "under": "red", "High Cloudiness":"grey"}

matplotlib.rc_file_defaults()

# NOTE(review): ax1 is rebound by plt.subplots() just below, so this assignment is not used
ax1 = sns.set_style(style=None, rc=None)

fig, ax1 = plt.subplots(figsize=(50,15))
plt.xticks(rotation=90)
plt.grid()

# Lineplot for Expected
sns.lineplot(data = df_performance['Irrad.kWh.m2.Daily'], marker='o', sort = False, ax=ax1, label='Expected', color='green', linewidth=3)
# Secondary y-axis, used by the performance bars
ax2 = ax1.twinx()

# Lineplot for Measured
sns.lineplot(data = df_performance['Production.kWh.Daily'], marker='X', sort = False, ax=ax1, label='Measured', color='blue', linewidth=3)

# Barplot for performance
sns.barplot(data = df_performance, x='date', y='Performance.perc.Daily', hue='performancelabel', palette=palette, alpha=0.8, ax=ax2)

# NOTE(review): the title shows site_id_full twice (the site-name lookup is disabled)
fig.suptitle('Performance over time (daily aggregate) for ' + str(site_id_full) + '\n'+ 'Includes all days' +'\n' + str(site_id_full))
# Place both legends outside the plot area, to the right
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))
sns.move_legend(ax2, "upper left", bbox_to_anchor=(1, 0.8))

# File name is the bare site id plus '.png'
figname = str(site_id + '.png')
fig.savefig('./plots/' + figname)

In [None]:
# Plots only the daily performance bars (all days), coloured by
# 'performancelabel', and saves the figure under ./plots/ONLY_PERF_<site id>.png.
# Fixed colour per performance label
palette ={"ok": "green", "medium": "yellow", "under": "red", "High Cloudiness":"grey"}

matplotlib.rc_file_defaults()

sns.set_style(style=None, rc=None)

fig, ax1 = plt.subplots(figsize=(50,15))
plt.xticks(rotation=90)
plt.grid()

#sns.lineplot(data = df_performance['Irrad.kWh.m2.Daily'], marker='o', sort = False, ax=ax1, label='Expected', color='green')

#ax2 = ax1.twinx()

#sns.lineplot(data = df_performance['Production.kWh.Daily'], marker='X', sort = False, ax=ax1, label='Measured', color='blue')

# Draw explicitly on this figure's axes instead of relying on the implicit current axes
sns.barplot(data = df_performance, x='date', y='Performance.perc.Daily', hue='performancelabel', palette=palette, alpha=0.8, dodge=None, ax=ax1)

fig.suptitle('Site Name = '+ str(site_id_full) +'\nSite ID = '+ str(site_id_full) +'\nPerformance over time (daily aggregate) - All days' + '\nLow cloudiness threshold = '+ str(threshold_low_cloudiness) + '% [expected/clear_sky]')
# Only ax1 exists in this figure (the twin axis is commented out above), so only
# its legend is moved; an 'ax2' here would be a leftover from another cell.
sns.move_legend(ax1, "upper left", bbox_to_anchor=(1, 1))

figname = str(site_id + '.png')
fig.savefig('./plots/ONLY_PERF_' + figname)

# 5. Working with low cloudiness days

## 5.1. Getting low cloudiness days

In [None]:
# Keep only the low-cloudiness days. The explicit copy makes df_LC an independent
# frame, so columns added to it in place later do not write into a slice of df_performance.
df_LC = df_performance[df_performance['is_low_clousdiness_day'] == True].copy()
df_LC

## 5.2. Reading from CSV

### If reading directly from CSV (not querying AWS timestream) start here:

In [None]:
# Disabled alternative: load df_LC from a locally stored CSV instead of querying.
# The code is kept inside a string literal so it does not run.
# NOTE(review): it references site_name, whose assignment is commented out earlier in this notebook.
'''
site_to_read_csv = str(site_id) + "_" + str(site_name)
site_to_read_csv

df_LC = pd.read_csv('./input_data/sites_stored_locally/'+ str(site_to_read_csv) +'.csv', index_col=0)
'''

## 5.3. Plotting low cloudiness' days

In [None]:
# Bar chart of the daily performance for the low-cloudiness days only (df_LC),
# coloured by 'performancelabel', with the value printed on every bar.
palette ={"ok": "green", "medium": "yellow", "under": "red"}

matplotlib.rc_file_defaults()
# NOTE(review): ax1 is rebound by plt.subplots() just below, so this assignment is not used
ax1 = sns.set_style(style=None, rc=None )

fig, ax1 = plt.subplots(figsize=(40,10))
plt.xticks(rotation=90)
plt.grid()

# NOTE(review): the title shows site_id_full for both 'Site Name' and 'Site ID'
fig.suptitle('Site Name = '+ str(site_id_full) +'\nSite ID = '+ str(site_id_full) +'\nPerformance over time (daily aggregate) - Only low cloudiness days' + '\nLow cloudiness threshold = '+ str(threshold_low_cloudiness) + '% [expected/clear_sky]')

# No ax= is passed, so seaborn draws on the current axes and returns them
ax = sns.barplot(
    data=df_LC, 
    x='date',
    y='Performance.perc.Daily',
    hue='performancelabel',
    palette=palette,
    dodge=None
    )
# Annotate every bar container (one per hue level) with its values
for i in ax.containers:
    ax.bar_label(i,)

plt.show()

## 5.4. Functions to check underperformance

In [None]:
# Number of rows in the rolling window used for the performance moving average ('SMA')
window_size = 7
# A day is flagged as underperforming when (daily performance - SMA) is below this value
threshold_performance = -10
# Passed as `days` to the fault checks: streak length from which a fault is reported
threshold_underperformance_days = 7

In [None]:
def get_rolling_average(df, window_size):
    """Add an 'SMA' column holding the rolling mean of 'Performance.perc.Daily'.

    The mean is taken over ``window_size`` consecutive rows; rows without a
    full window get NaN. ``df`` is modified in place and also returned.
    """
    daily_performance = df['Performance.perc.Daily']
    df['SMA'] = daily_performance.rolling(window=window_size).mean()
    return df

def add_comparative (df):
    """Add a 'comparative' column: daily performance minus its moving average.

    Computes ``df['Performance.perc.Daily'] - df['SMA']`` row by row (NaN
    where either input is NaN). ``df`` is modified in place and returned.

    The subtraction is done as a single column assignment: writing through
    ``df[col].iloc[i] = ...`` is chained assignment, which pandas does not
    guarantee to propagate back to ``df``.
    """
    df['comparative'] = df['Performance.perc.Daily'] - df['SMA']
    return df

def underperformance_check(df, threshold):
    """Add a boolean 'underperforming' column: ``comparative < threshold``.

    Rows whose 'comparative' is NaN compare as False. ``df`` is modified in
    place and returned.

    The comparison is assigned as a whole column so the result has a real
    boolean dtype (required for the ``~`` inversion applied to it later) and
    does not depend on chained ``df[col].iloc[i] = ...`` writes.
    """
    df['underperforming'] = df['comparative'] < threshold
    return df

def rolling_underperformance(df, days):
    """Add 'countUnder': length of the current run of underperforming days.

    Every row where 'underperforming' is not True restarts the count at 0;
    each following underperforming row increments it (1, 2, ...). Rows that
    come before the first non-underperforming row get NaN.

    Args:
        df: Frame with an 'underperforming' column; modified in place.
        days: Unused; kept for interface compatibility.

    Returns:
        The same ``df`` with the 'countUnder' column set.
    """
    # Coerce to a plain boolean array first: `~` on an object or float column
    # is not a logical negation.
    flagged = df['underperforming'].eq(True).to_numpy()

    # Position of the most recent non-underperforming row, carried forward
    positions = pd.Series(range(df.shape[0]))
    anchors = positions.where(~flagged).ffill().to_numpy()
    has_anchor = pd.notna(anchors)

    # Rank of each row inside its run; rows without an anchor stay NaN
    df["countUnder"] = df[has_anchor].groupby(anchors[has_anchor]).cumcount()
    return df

In [None]:
# Updating SMA based on TRUE values of underperforming
def compare_underperfDay_with_SMA_of_under(df):
    """Compare each day with a moving average that is frozen during 'under' runs.

    For a row labelled 'under' in 'performancelabel', the row's 'SMA' is
    replaced by the 'SMA' of the most recent row that was not 'under', and
    'comparative_of_under' is the daily performance minus that value. For
    any other row 'comparative_of_under' is the daily performance minus the
    row's own 'SMA'.

    'under' rows that come before any non-'under' row have no baseline: their
    'SMA' and 'comparative_of_under' are set to NaN (a negative positional
    offset would otherwise wrap around to the end of the frame).

    ``df`` is modified in place ('SMA' and 'comparative_of_under') and returned.
    """
    labels = df['performancelabel'].to_numpy()
    performance = df['Performance.perc.Daily'].to_numpy(dtype=float)
    sma = df['SMA'].to_numpy(dtype=float).copy()
    comparative = np.full(len(df), np.nan)

    baseline = np.nan  # SMA of the most recent non-'under' row
    for position, label in enumerate(labels):
        if label == 'under':
            sma[position] = baseline
        else:
            baseline = sma[position]
        comparative[position] = performance[position] - sma[position]

    # Whole-column assignments instead of chained df[col].iloc[i] writes
    df['SMA'] = sma
    df['comparative_of_under'] = comparative
    return df

def underperformance_check_of_under(df, threshold):
    """Add a boolean 'underperforming_of_under' column.

    The column is ``comparative_of_under < threshold``; rows whose
    'comparative_of_under' is NaN compare as False. ``df`` is modified in
    place and returned.

    The comparison is assigned as a whole column so the result has a real
    boolean dtype (required for the ``~`` inversion applied to it later) and
    does not depend on chained ``df[col].iloc[i] = ...`` writes.
    """
    df['underperforming_of_under'] = df['comparative_of_under'] < threshold
    return df

def rolling_underperformance_of_under(df, days):
    """Add 'countTrue_of_under': length of the current underperformance run.

    Every row where 'underperforming_of_under' is not True restarts the count
    at 0; each following flagged row increments it (1, 2, ...). Rows that
    come before the first non-flagged row get NaN.

    Args:
        df: Frame with an 'underperforming_of_under' column; modified in place.
        days: Unused; kept for interface compatibility.

    Returns:
        The same ``df`` with the 'countTrue_of_under' column set.
    """
    # Coerce to a plain boolean array first: `~` on an object or float column
    # is not a logical negation.
    flagged = df['underperforming_of_under'].eq(True).to_numpy()

    # Position of the most recent non-flagged row, carried forward
    positions = pd.Series(range(df.shape[0]))
    anchors = positions.where(~flagged).ffill().to_numpy()
    has_anchor = pd.notna(anchors)

    # Rank of each row inside its run; rows without an anchor stay NaN
    df["countTrue_of_under"] = df[has_anchor].groupby(anchors[has_anchor]).cumcount()
    return df


In [None]:
def persistent_fault_check(df, days):
    """Report whether the latest row is in an underperformance run of ``days`` or more.

    Looks only at the last value of 'countTrue_of_under' and prints either a
    fault message (with the module-level window_size and
    threshold_performance) or a no-fault message.

    Returns:
        The module-level ``site_id_full`` in both cases.
    """
    latest_run_length = df['countTrue_of_under'].iloc[-1]
    if not latest_run_length >= days:
        print("No long persistant faults detected")
        return site_id_full

    print(
        f'Persistent fault detected at {site_id_full}'
        f'\nParameters are:'
        f'\nRolling average window = {window_size}'
        f'\nPerformance threshold = {threshold_performance}'
    )
    return site_id_full

def retro_persistent_fault_check(df, days):
    """Check the whole history for underperformance runs of ``days`` or more.

    Rows whose 'countTrue_of_under' is at least ``days`` are treated as
    faulty. If any exist, a report is printed and details are returned;
    otherwise a no-fault message is printed.

    Args:
        df: Frame with 'countTrue_of_under' and 'date' columns.
        days: Minimum run length that counts as a fault.

    Returns:
        ``None`` when no fault is found, otherwise the tuple
        ``(site_id_full, site_id_full, sudden_unresolved, count_fault,
        faulty_df, dates_fault_started)`` where ``dates_fault_started`` is the
        'date' Series of the rows on which the counter equals ``days``.
        (``site_id_full`` appears twice; the second slot is a stand-in for the
        site name.)
    """
    faulty_df = df[df['countTrue_of_under'] >= days]
    if faulty_df.empty:
        print("No long persistant faults detected at " + site_id_full)
        return None

    dates_fault_started = faulty_df[faulty_df['countTrue_of_under'] == days]['date']
    # Non-null count of the first column, looked up by position explicitly
    count_fault = faulty_df.count().iloc[0]
    sudden_unresolved = True

    # Format the dates as text: concatenating a str with the Series itself
    # yields an element-wise Series instead of a readable message.
    started_text = ', '.join(str(day) for day in dates_fault_started)
    print(
        'Sudden and unresolved fault detected at '
        + site_id_full
        + '\nSiteID: '
        + site_id_full
        + '\nDates in which fault started are: '
        + started_text
        + '\n'
        + '\nTotal days of fault = '
        + str(count_fault)
    )
    return site_id_full, site_id_full, sudden_unresolved, count_fault, faulty_df, dates_fault_started

## 5.5. Checking underperformance

In [None]:
# Absolute analysis:
# Each step adds a column to df_LC in place and reads the one added by the
# previous step, so the order must be kept:
# 'SMA' -> 'comparative' -> 'underperforming' -> 'countUnder'
get_rolling_average(df_LC, window_size)
add_comparative(df_LC)
underperformance_check(df_LC, threshold_performance)
rolling_underperformance(df_LC, threshold_underperformance_days)

In [None]:
# Analysis on days that underperformed, excluding such days from the rolling average:
# Order matters here as well; each step reads the column added by the previous one:
# 'comparative_of_under' -> 'underperforming_of_under' -> 'countTrue_of_under'
compare_underperfDay_with_SMA_of_under(df_LC)
underperformance_check_of_under(df_LC, threshold_performance)
rolling_underperformance_of_under(df_LC, threshold_underperformance_days)

In [None]:
# To retroactively check if there was fault:
# Requires the 'countTrue_of_under' column to be present on df_LC
retro_persistent_fault_check(df_LC, threshold_underperformance_days)

In [None]:
# site_id_full, site_id_full, sudden_unresolved, count_fault, faulty_df, dates_fault_started = retro_persistent_fault_check(df_LC, threshold_underperformance_days)

## 5.6. Saving the result

In [None]:
## To save it:
# df_LC.to_csv(str(site_id_full) + str(site_id) + '_dataframe_low_cloudiness.csv')