In [None]:
# ========================================================
# = Libraries import
# ========================================================

from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd
import numpy as np
import boto3
import pytz
import math
from zoneinfo import ZoneInfo
import datetime
import geopy.distance
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

In [None]:
# ========================================================
# = AWS session and Timestream query client
# ========================================================

# Name of the local AWS profile and the region the query client is bound to.
PROD_AWS_PROFILE = "gsesami-prod"
AWS_REGION = "us-west-2"

# The session is built from the named profile; no keys are stored in the notebook.
prod_session = boto3.Session(profile_name=PROD_AWS_PROFILE)

# Client used by every read* function below to run Timestream queries.
prod_client = prod_session.client(
    service_name="timestream-query",
    region_name=AWS_REGION,
)

# 2. Getting SiteIDs

In [None]:
# Load the site list and keep only the sites that have a value in
# 'performanceSinceInceptionSortKey'; the index is renumbered from 0 so the
# rows can be addressed by position in the per-site loop.
# To review: whether this filter is the right selection criterion.
df_sites = (
    pd.read_csv('./input_data/Site_List.csv')
    .dropna(subset=['performanceSinceInceptionSortKey'])
    .reset_index(drop=True)
)

In [None]:
# Timestream measure names. Each one is passed as `measure_name` to the
# matching read* function and is also the column name of the resulting frame.
# NOTE(review): `clear_sky` points at an 'EnergyYield' measure and `expected`
# at an 'Irrad' measure — confirm the variable names match what the measures hold.
clear_sky = 'EnergyYield.kWh.Daily'
expected = 'Irrad.kWh.m2.Daily'
measured = 'Production.kWh.Daily'

# ========================================================
# = Define period
# ========================================================

# Both bounds are interpolated into a SQL `date BETWEEN ... AND ...` clause.
date_start = '2022-01-01'
date_end = '2023-02-01'

# ========================================================
# = Thresholds
# ========================================================

# Days whose 'expected_over_clear' percentage is >= this value are flagged as
# low-cloudiness days; only those days are analysed.
threshold_low_cloudiness = 80
# Number of rows in the rolling mean ('SMA') of 'Performance.perc.Daily'.
window_size = 7
# A day is flagged as underperforming when (performance - SMA) is below this value.
threshold_performance = -10
# Rows whose consecutive-underperformance counter reaches this value are
# reported as a persistent fault.
threshold_underperformance_days = 7

# 3. Functions to build the dataframe

In [None]:
def get_site_id(df, sequence):
    """Return one row's site identifier with the ``'SITE|'`` prefix removed.

    Args:
        df: DataFrame with a ``'source'`` column of strings.
        sequence: Index label of the row, looked up with ``.loc``.

    Returns:
        The row's ``'source'`` value without a leading ``'SITE|'``. A value
        that does not start with that prefix is returned unchanged.
    """
    full_identifier = df.loc[sequence, 'source']
    return full_identifier.removeprefix('SITE|')

def get_site_id_full(df, sequence):
    """Return one row's ``'source'`` value exactly as stored.

    Args:
        df: DataFrame with a ``'source'`` column.
        sequence: Index label of the row, looked up with ``.loc``.

    Returns:
        The unmodified ``'source'`` value of that row.
    """
    return df.loc[sequence, 'source']

def get_site_name(df, sequence):
    """Return one row's ``'name'`` value.

    Args:
        df: DataFrame with a ``'name'`` column.
        sequence: Index label of the row, looked up with ``.loc``.

    Returns:
        The ``'name'`` value of that row.
    """
    return df.loc[sequence, 'name']

In [None]:
def get_site_info(df, sequence):
    """Return both forms of a site's identifier for one row of the site list.

    Args:
        df: DataFrame with a ``'source'`` column.
        sequence: Index label of the row.

    Returns:
        Tuple ``(site_id, site_id_full)``: the identifier with the ``'SITE|'``
        prefix removed, followed by the identifier as stored. The site name
        is not part of the result.
    """
    return get_site_id(df, sequence), get_site_id_full(df, sequence)

In [None]:
def performance_check(row):
    """Classify one day's performance percentage.

    Args:
        row: Mapping or Series exposing ``'Performance.perc.Daily'``.

    Returns:
        ``'ok'`` for values >= 80, ``'medium'`` for values >= 60, and
        ``'under'`` otherwise. NaN fails both comparisons, so it is also
        labelled ``'under'``.
    """
    if row['Performance.perc.Daily'] >= 80:
        return 'ok'
    if row['Performance.perc.Daily'] >= 60:
        return 'medium'
    return 'under'

# 4. Functions to fetch data from AWS

In [None]:
# ========================================================
# = Reading EnergyYield.kWh.Daily from AWS TimeStream
# ========================================================

def readClear(date_start, date_end, measure_name, site_id):
    """Read one value per day of a site's measure from AWS Timestream.

    The query runs through the notebook-level ``prod_client`` against
    "GSESTimeseries"."GSESTimeseriesTable". For every date in the range it
    selects the ``measure_value::double`` that has the greatest ``time``.

    Args:
        date_start: First date of the ``BETWEEN`` range, as a string.
        date_end: Last date of the ``BETWEEN`` range, as a string.
        measure_name: Value matched against the ``measure_name`` column.
        site_id: Value matched against the ``siteId`` column.

    Returns:
        Tuple ``(timeid, data_values)`` of two equally long lists holding the
        ``ScalarValue`` strings of the date column and of the value column,
        in query order. Rows with a NULL cell are skipped.
    """
    # NOTE(review): the arguments are concatenated straight into the SQL text;
    # only pass values from trusted configuration.
    query = """SELECT date, max_by(measure_value::double, time) as prod_val
                FROM "GSESTimeseries"."GSESTimeseriesTable"
                WHERE measure_name = '""" + measure_name + """'
                AND siteId = '""" + site_id + """'
                AND date BETWEEN '""" + date_start + """'
                AND '""" + date_end + """'
                GROUP BY date
                ORDER BY date """

    timeid = []
    data_values = []
    paginator = prod_client.get_paginator("query")
    for page_number, page in enumerate(paginator.paginate(QueryString=query), start=1):
        rows = page.get("Rows", [])
        if not rows:
            # Pages without rows are reported, as before, and then skipped.
            print('Page {%d} has no data available:' % page_number)
            continue
        for row in rows:
            date_cell, value_cell = row["Data"][0], row["Data"][1]
            # A NULL datum carries no 'ScalarValue'. Skip only that row
            # instead of discarding every row of the page.
            if "ScalarValue" not in date_cell or "ScalarValue" not in value_cell:
                continue
            timeid.append(date_cell["ScalarValue"])
            data_values.append(value_cell["ScalarValue"])
    return timeid, data_values

In [None]:
# ========================================================
# = Reading Irrad.kWh.m2.Daily from AWS TimeStream
# ========================================================


def readExpected(date_start, date_end, measure_name, site_id):
    """Read one value per day of a site's measure from AWS Timestream.

    The query runs through the notebook-level ``prod_client`` against
    "GSESTimeseries"."GSESTimeseriesTable". For every date in the range it
    selects the ``measure_value::double`` that has the greatest ``time``.

    Args:
        date_start: First date of the ``BETWEEN`` range, as a string.
        date_end: Last date of the ``BETWEEN`` range, as a string.
        measure_name: Value matched against the ``measure_name`` column.
        site_id: Value matched against the ``siteId`` column.

    Returns:
        Tuple ``(timeid, data_values)`` of two equally long lists holding the
        ``ScalarValue`` strings of the date column and of the value column,
        in query order. Rows with a NULL cell are skipped.
    """
    # NOTE(review): the arguments are concatenated straight into the SQL text;
    # only pass values from trusted configuration.
    query = """SELECT date, max_by(measure_value::double, time) as prod_val
                FROM "GSESTimeseries"."GSESTimeseriesTable"
                WHERE measure_name = '""" + measure_name + """'
                AND siteId = '""" + site_id + """'
                AND date BETWEEN '""" + date_start + """'
                AND '""" + date_end + """'
                GROUP BY date
                ORDER BY date """

    timeid = []
    data_values = []
    paginator = prod_client.get_paginator("query")
    for page_number, page in enumerate(paginator.paginate(QueryString=query), start=1):
        rows = page.get("Rows", [])
        if not rows:
            # Pages without rows are reported, as before, and then skipped.
            print('Page {%d} has no data available:' % page_number)
            continue
        for row in rows:
            date_cell, value_cell = row["Data"][0], row["Data"][1]
            # A NULL datum carries no 'ScalarValue'. Skip only that row
            # instead of discarding every row of the page.
            if "ScalarValue" not in date_cell or "ScalarValue" not in value_cell:
                continue
            timeid.append(date_cell["ScalarValue"])
            data_values.append(value_cell["ScalarValue"])
    return timeid, data_values

In [None]:
# ========================================================
# = Reading Production.kWh.Daily from AWS TimeStream
# ========================================================

def readMeasured(date_start, date_end, measure_name, site_id):
    """Read one value per day of a site's measure from AWS Timestream.

    The query runs through the notebook-level ``prod_client`` against
    "GSESTimeseries"."GSESTimeseriesTable". For every date in the range it
    selects the ``measure_value::double`` that has the greatest ``time``.

    Args:
        date_start: First date of the ``BETWEEN`` range, as a string.
        date_end: Last date of the ``BETWEEN`` range, as a string.
        measure_name: Value matched against the ``measure_name`` column.
        site_id: Value matched against the ``siteId`` column.

    Returns:
        Tuple ``(timeid, data_values)`` of two equally long lists holding the
        ``ScalarValue`` strings of the date column and of the value column,
        in query order. Rows with a NULL cell are skipped.
    """
    # NOTE(review): the arguments are concatenated straight into the SQL text;
    # only pass values from trusted configuration.
    query = """SELECT date, max_by(measure_value::double, time) as prod_val
                FROM "GSESTimeseries"."GSESTimeseriesTable"
                WHERE measure_name = '""" + measure_name + """'
                AND siteId = '""" + site_id + """'
                AND date BETWEEN '""" + date_start + """'
                AND '""" + date_end + """'
                GROUP BY date
                ORDER BY date """

    timeid = []
    data_values = []
    paginator = prod_client.get_paginator("query")
    for page_number, page in enumerate(paginator.paginate(QueryString=query), start=1):
        rows = page.get("Rows", [])
        if not rows:
            # Pages without rows are reported, as before, and then skipped.
            print('Page {%d} has no data available:' % page_number)
            continue
        for row in rows:
            date_cell, value_cell = row["Data"][0], row["Data"][1]
            # A NULL datum carries no 'ScalarValue'. Skip only that row
            # instead of discarding every row of the page.
            if "ScalarValue" not in date_cell or "ScalarValue" not in value_cell:
                continue
            timeid.append(date_cell["ScalarValue"])
            data_values.append(value_cell["ScalarValue"])
    return timeid, data_values

# 5. Functions to check faults

In [None]:
def get_rolling_average(df, window_size):
    """Add an ``'SMA'`` column holding the rolling mean of the daily performance.

    Args:
        df: DataFrame with a ``'Performance.perc.Daily'`` column. It is
            modified in place.
        window_size: Number of consecutive rows in each rolling window.

    Returns:
        The same ``df`` object. Rows that do not yet have a full window get NaN.
    """
    daily_performance = df['Performance.perc.Daily']
    df['SMA'] = daily_performance.rolling(window=window_size).mean()
    return df

def add_comparative(df):
    """Add a ``'comparative'`` column: daily performance minus its rolling mean.

    The column is computed in one vectorised step. Writing element by element
    through ``df['comparative'].iloc[i] = ...`` is a chained assignment, which
    leaves the frame untouched when pandas Copy-on-Write is active.

    Args:
        df: DataFrame with ``'Performance.perc.Daily'`` and ``'SMA'`` columns.
            It is modified in place.

    Returns:
        The same ``df`` object. ``'comparative'`` is NaN wherever either
        input is NaN.
    """
    df['comparative'] = (df['Performance.perc.Daily'] - df['SMA']).astype(float)
    return df

def underperformance_check(df, threshold):
    """Add a boolean ``'underperforming'`` column.

    A row is flagged when its ``'comparative'`` value is strictly below
    ``threshold``. NaN compares as False. The column is built in one
    vectorised comparison so that it always has a real boolean dtype, which
    the later ``~df['underperforming']`` step depends on. It also avoids
    chained ``.iloc`` assignment, which does not write through under pandas
    Copy-on-Write.

    Args:
        df: DataFrame with a ``'comparative'`` column. It is modified in place.
        threshold: Upper bound (exclusive) for a flagged value.

    Returns:
        The same ``df`` object.
    """
    df['underperforming'] = df['comparative'] < threshold
    return df

def rolling_underperformance(df, days):
    """Add ``'countUnder'``: the length of the current underperformance streak.

    Every row that is not underperforming starts a new group and gets 0. Each
    following underperforming row gets 1, 2, ... until the next row that is
    not underperforming. Rows before the first non-underperforming row belong
    to no group and get NaN.

    Args:
        df: DataFrame with a boolean ``'underperforming'`` column. It is
            modified in place.
        days: Not used by this function.

    Returns:
        The same ``df`` object.
    """
    df['countUnder'] = np.nan
    is_not_under = (~df['underperforming']).values
    row_positions = pd.Series(range(df.shape[0]))
    # Position of the most recent non-underperforming row, carried forward.
    streak_anchor = row_positions.where(is_not_under, np.nan).ffill().values
    has_anchor = pd.notna(streak_anchor)
    streak_length = df[has_anchor].groupby(streak_anchor[has_anchor]).cumcount()
    df["countUnder"] = streak_length
    return df

In [None]:
# Updating SMA based on TRUE values of underperforming
def compare_underperfDay_with_SMA_of_under(df):
    """Compare each day with an SMA that is frozen during 'under' streaks.

    For a row labelled ``'under'`` the baseline is the ``'SMA'`` of the most
    recent row that is not labelled ``'under'``; that baseline also overwrites
    the row's own ``'SMA'``. For every other row the baseline is the row's own
    ``'SMA'``. ``'comparative_of_under'`` is the performance minus the baseline.

    Rows labelled ``'under'`` that come before any other row have no baseline
    and get NaN. A negative position computed for such rows would otherwise
    wrap around to the last row of the frame.

    All lookups are positional, so the frame's index labels are irrelevant.

    Args:
        df: DataFrame with ``'performancelabel'``, ``'Performance.perc.Daily'``
            and ``'SMA'`` columns. It is modified in place.

    Returns:
        The same ``df`` object.
    """
    labels = df['performancelabel'].to_numpy()
    performance = df['Performance.perc.Daily'].to_numpy(dtype=float)
    sma = df['SMA'].to_numpy(dtype=float).copy()
    comparative = np.full(len(df), np.nan)

    last_not_under = None  # position of the latest row that is not 'under'
    for i in range(len(df)):
        if labels[i] == 'under':
            baseline = sma[last_not_under] if last_not_under is not None else np.nan
            sma[i] = baseline
        else:
            last_not_under = i
            baseline = sma[i]
        comparative[i] = performance[i] - baseline

    # Whole-column assignment instead of chained `.iloc` writes, which do not
    # write through under pandas Copy-on-Write.
    df['comparative_of_under'] = comparative
    df['SMA'] = sma
    return df

def underperformance_check_of_under(df, threshold):
    """Add a boolean ``'underperforming_of_under'`` column.

    A row is flagged when its ``'comparative_of_under'`` value is strictly
    below ``threshold``. NaN compares as False. The column is built in one
    vectorised comparison so that it always has a real boolean dtype, which
    the later ``~df['underperforming_of_under']`` step depends on. It also
    avoids chained ``.iloc`` assignment, which does not write through under
    pandas Copy-on-Write.

    Args:
        df: DataFrame with a ``'comparative_of_under'`` column. It is
            modified in place.
        threshold: Upper bound (exclusive) for a flagged value.

    Returns:
        The same ``df`` object.
    """
    df['underperforming_of_under'] = df['comparative_of_under'] < threshold
    return df

def rolling_underperformance_of_under(df, days):
    """Add ``'countTrue_of_under'``: the length of the current flagged streak.

    Every row whose ``'underperforming_of_under'`` is False starts a new group
    and gets 0. Each following flagged row gets 1, 2, ... until the next
    unflagged row. Rows before the first unflagged row belong to no group and
    get NaN.

    Args:
        df: DataFrame with a boolean ``'underperforming_of_under'`` column.
            It is modified in place.
        days: Not used by this function.

    Returns:
        The same ``df`` object.
    """
    df['countTrue_of_under'] = np.nan
    is_unflagged = (~df['underperforming_of_under']).values
    row_positions = pd.Series(range(df.shape[0]))
    # Position of the most recent unflagged row, carried forward.
    streak_anchor = row_positions.where(is_unflagged, np.nan).ffill().values
    has_anchor = pd.notna(streak_anchor)
    streak_length = df[has_anchor].groupby(streak_anchor[has_anchor]).cumcount()
    df["countTrue_of_under"] = streak_length
    return df


In [None]:
def retro_persistent_fault_check(df, days, site_id_full=None):
    """Report whether a site has a streak of at least ``days`` flagged days.

    Args:
        df: DataFrame with ``'countTrue_of_under'`` and ``'date'`` columns.
        days: Minimum streak counter value that counts as a persistent fault.
        site_id_full: Identifier used in the messages and in the result. When
            omitted, the notebook-level variable ``site_id_full`` is used, so
            existing two-argument calls keep working.

    Returns:
        ``None`` when no row reaches ``days``. Otherwise the tuple
        ``(site_id_full, sudden_unresolved, count_fault, faulty_df,
        dates_fault_started)``. ``sudden_unresolved`` is always True in that
        case. ``count_fault`` is the number of rows whose counter is
        ``>= days``. ``faulty_df`` holds those rows. ``dates_fault_started``
        is the ``'date'`` Series of the rows whose counter equals ``days``.

    Raises:
        NameError: if ``site_id_full`` is omitted and no notebook-level
            variable of that name exists.
    """
    if site_id_full is None:
        try:
            site_id_full = globals()['site_id_full']
        except KeyError:
            raise NameError("name 'site_id_full' is not defined") from None

    faulty_df = df[df['countTrue_of_under'] >= days]
    if faulty_df.empty:
        print("No long persistant faults detected at " + str(site_id_full))
        return None

    dates_fault_started = faulty_df[faulty_df['countTrue_of_under'] == days]['date']
    count_fault = len(faulty_df)
    sudden_unresolved = True
    # The dates are joined into plain text: adding a Series to a str would
    # produce a Series that repeats the whole message once per date.
    started_text = ', '.join(str(day) for day in dates_fault_started)
    print(
        'Sudden and unresolved fault detected at '
        + str(site_id_full)
        + '\nDates in which fault started are: '
        + started_text
        + '\n'
        + '\nTotal days of fault = '
        + str(count_fault)
    )
    return site_id_full, sudden_unresolved, count_fault, faulty_df, dates_fault_started

# 6. Checking long and persistent faults

In [None]:
# Per-site results of retro_persistent_fault_check: a 5-tuple for a site with a
# persistent fault, None for a site without one. Sites that raise are skipped.
fault_sites = []
fault_dict = {}


# Defined once, before the loop, instead of being re-created for every site.
def merge_clear_expe(df1, df2):
    """Join the two daily frames and add their percentage ratio.

    Adds 'expected_over_clear' (= 'Irrad.kWh.m2.Daily' divided by
    'EnergyYield.kWh.Daily', times 100, rounded to 0 decimals) and a 'date'
    column copied from the index.
    """
    df_merged = df1.join(df2)
    df_merged['expected_over_clear'] = (df_merged['Irrad.kWh.m2.Daily'] / df_merged['EnergyYield.kWh.Daily'] * 100).round(0)
    df_merged['date'] = df_merged.index
    return df_merged


for i in range(len(df_sites)):
    try:
        # `site_id_full` must stay a notebook-level name:
        # retro_persistent_fault_check reads it when called with two arguments.
        site_id, site_id_full = get_site_info(df_sites, i)

        # ========================================================
        # = Getting Clear sky and expected generation values
        # ========================================================
        timeid, data_values = readClear(date_start, date_end, clear_sky, site_id)

        df_clear = pd.DataFrame(data_values, index=timeid, columns=[clear_sky])
        df_clear['EnergyYield.kWh.Daily'] = df_clear['EnergyYield.kWh.Daily'].astype(float)

        timeid, data_values = readExpected(date_start, date_end, expected, site_id)

        df_expected = pd.DataFrame(data_values, index=timeid, columns=[expected])
        df_expected['Irrad.kWh.m2.Daily'] = df_expected['Irrad.kWh.m2.Daily'].astype(float)

        # ========================================================
        # = Merging clear skies and expected
        # ========================================================

        df_merged = merge_clear_expe(df_clear, df_expected)

        # ========================================================
        # = Getting low cloudiness days
        # ========================================================

        # Rows whose ratio is NaN match neither condition and keep a NaN flag.
        df_merged.loc[df_merged['expected_over_clear'] >= threshold_low_cloudiness, 'is_low_clousdiness_day'] = True
        df_merged.loc[df_merged['expected_over_clear'] < threshold_low_cloudiness, 'is_low_clousdiness_day'] = False

        # ========================================================
        # = Reading Production.kWh.Daily from AWS TimeStream
        # ========================================================

        timeid, data_values = readMeasured(date_start, date_end, measured, site_id)

        df_production = pd.DataFrame(data_values, index=timeid, columns=[measured])
        df_production['Production.kWh.Daily'] = df_production['Production.kWh.Daily'].astype(float)

        # ========================================================
        # = Merging it and getting a % of performance daily
        # ========================================================

        df_performance = df_production.join(df_merged)
        df_performance['Performance.perc.Daily'] = (df_performance['Production.kWh.Daily'] / df_performance['Irrad.kWh.m2.Daily'] * 100).round(0)

        df_performance['performancelabel'] = df_performance.apply(performance_check, axis=1)

        # Explicit copy: the helpers below add and overwrite columns on df_LC,
        # which must not be a view or slice of df_performance.
        df_LC = df_performance[df_performance['is_low_clousdiness_day'] == True].copy()

        print("Checking persistant faults for: " + str(site_id_full))

        # Absolute analysis (each helper modifies df_LC in place):
        get_rolling_average(df_LC, window_size)
        add_comparative(df_LC)
        underperformance_check(df_LC, threshold_performance)
        rolling_underperformance(df_LC, threshold_underperformance_days)

        # Analysis on days that underperformed, excluding such days from the rolling average:
        compare_underperfDay_with_SMA_of_under(df_LC)
        underperformance_check_of_under(df_LC, threshold_performance)
        rolling_underperformance_of_under(df_LC, threshold_underperformance_days)

        fault_sites.append(retro_persistent_fault_check(df_LC, threshold_underperformance_days))

    except Exception as e:
        # Best effort: a failing site is skipped, but the message now says
        # which site row failed and with which exception type.
        print("Skipping site row " + str(i) + ": " + type(e).__name__ + ": " + str(e))

print(fault_sites)

In [None]:
# Display the raw per-site results collected by the loop: one entry per site
# that was processed without an exception (None where no fault was reported).
fault_sites

In [None]:
# Creating a dataframe of faulty days
# NOTE(review): six columns are declared here, including 'site_name', but
# retro_persistent_fault_check returns five values and none of them is a site
# name — confirm the intended schema.
df_faulty = pd.DataFrame(columns=['site_id','site_name','sudden_unresolved','faulty_days', 'faulty_df', 'dates_fault_started'])

In [None]:
# Making a dataframe with not-none faulty sites
fault_res = [result for result in fault_sites if result is not None]

# retro_persistent_fault_check returns five values and no site name, so the
# name is looked up in the site list when it has a 'name' column; otherwise
# the 'site_name' cell stays empty.
site_names = (
    dict(zip(df_sites['source'], df_sites['name']))
    if 'name' in df_sites.columns
    else {}
)

# Build every row first and create the frame once, instead of growing it row
# by row and reading a sixth tuple element that does not exist.
faulty_rows = [
    [site, site_names.get(site), sudden_unresolved, count_fault, faulty_days_df, dates_started]
    for site, sudden_unresolved, count_fault, faulty_days_df, dates_started in fault_res
]
df_faulty = pd.DataFrame(
    faulty_rows,
    columns=['site_id', 'site_name', 'sudden_unresolved', 'faulty_days', 'faulty_df', 'dates_fault_started'],
)
df_faulty

In [None]:
# Saving it to a CSV
# The path is relative, so the file lands in the current working directory.
# NOTE(review): the 'faulty_df' and 'dates_fault_started' cells hold whole
# pandas objects, which will be written as their text representation — confirm
# this is the intended export format.
df_faulty.to_csv('SiteIDs_with_sudden_and_unresolved.csv')

# Cleaning Up

In [None]:
# Display the final table of sites with a persistent fault.
df_faulty