# 1. Libraries Import

In [None]:
# ========================================================
# = Libraries import
# ========================================================

import numpy as np
import boto3
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import datetime
from datetime import timedelta
from pandas.tseries.offsets import DateOffset

import os

import pytz

import pvlib
from pvlib import irradiance

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go

# 2. AWS credentials

In [None]:
# ========================================================
# = AWS Credentials
# ========================================================

# Named AWS profile and region used for every Timestream query in this notebook
PROD_AWS_PROFILE = "gsesami-prod"
AWS_REGION = "ap-southeast-2"

# One session bound to that profile; the Timestream query client is created from it
prod_session = boto3.session.Session(profile_name=PROD_AWS_PROFILE)
prod_client = prod_session.client(service_name="timestream-query", region_name=AWS_REGION)

# 3. Querying TimeStream

## 3.1. Monitor ID, Site ID, and Metadata

In [None]:
# Site table; later cells read its 'source' and 'timezone' columns
sites_list = pd.read_csv('./input_data/Site_List.csv')
# Monitor table; later cells read 'source', 'siteId' and the per-monitor
# installation fields (latitude, longitude, loss, pvSizeWatt, tilt, azimuth, ...)
monitors_list = pd.read_csv('./input_data/Monitors_List.csv')

In [None]:
# Bare monitor id (kept as a string) and its prefixed form as used in the monitor table
MID = '282136'
MID_full = str("MNTR|" + MID)
print(f'Analysing {MID_full}')

# Site that owns this monitor: first matching row of the monitor table
site_id = monitors_list.loc[monitors_list['source'] == MID_full, 'siteId'].iloc[0]
print(f'Included under {site_id}')

In [None]:
# Force the identifier column to str so the equality filter below compares like with like
monitors_list['source'] = monitors_list['source'].astype(str)

# Row(s) of the monitor table describing the monitor under analysis
monitor_row = monitors_list.loc[monitors_list['source'] == MID_full]

if monitor_row.empty:
    print(f"No data found for the Monitor ID {MID}")
else:
    # Position of the installation
    mid_latitude = float(monitor_row['latitude'].values[0])
    mid_longitude = float(monitor_row['longitude'].values[0])

    # Orientation of the array
    mid_tilt = float(monitor_row['tilt'].values[0])
    mid_azimuth = float(monitor_row['azimuth'].values[0])

    # Stored as the complement of the 'loss' column (1 - loss)
    mid_loss = float( 1 - monitor_row['loss'].values[0] )

    # Remaining descriptive fields, taken as-is from the table
    mid_pvsizewatt = monitor_row['pvSizeWatt'].values[0]
    mid_manufacturerApi = monitor_row['manufacturerApi'].values[0]
    mid_weatherStationId = monitor_row['weatherStationId'].values[0]

    # Summary of everything that was read
    print(f'Latitude: {mid_latitude}')
    print(f'Longitude: {mid_longitude}')
    print(f'Loss: {mid_loss}')
    print(f'pvSizeWatt: {mid_pvsizewatt}')
    print(f'ManufacturerApi: {mid_manufacturerApi}')
    print(f'Tilt: {mid_tilt}')
    print(f'WeatherStationId: {mid_weatherStationId}')
    print(f'Azimuth: {mid_azimuth}')

## 3.2. Time period

In [None]:
###### TIME PERIOD #######
# Dates of the manually labelled fault window ('YYYY-MM-DD')
time_start = '2023-04-20'
time_end = '2023-04-25'

# Pad the window by one day on each side so the plots show some context around
# the faults. The SQL helpers need strings, hence the round trip through datetime.
time_start = (datetime.datetime.strptime(time_start, '%Y-%m-%d') - timedelta(days=1)).strftime('%Y-%m-%d')
time_end = (datetime.datetime.strptime(time_end, '%Y-%m-%d') + timedelta(days=1)).strftime('%Y-%m-%d')

# Timezone recorded for the site that owns the monitor
timezone_value = sites_list[sites_list['source'] == site_id].iloc[0]['timezone']

# The padded bounds again, this time as UTC-aware datetimes
time_starttz = pytz.utc.localize(datetime.datetime.strptime(time_start, '%Y-%m-%d'))
time_endtz = pytz.utc.localize(datetime.datetime.strptime(time_end, '%Y-%m-%d'))

print(f'Current analysis being performed for days {time_start} to {time_end}')
print(f'This monitor is located at: {timezone_value}')

## 3.3 Helper functions to read metrics

In [None]:
def read_metric(time_start, time_end, measure_name, MID):
    """
    Read the raw time series of one metric for one monitor from AWS Timestream.

    :param time_start: time start, e.g., '2022-10-02'
    :param time_end: time end, e.g., '2023-04-05'
    :param measure_name: measurement metric, e.g., 'Gen.W'
    :param MID: monitor id (string, interpolated into the query)
    :return: (timeid, data_values) -- two parallel lists with the raw
        'ScalarValue' strings of the first and second result column
    """
    # NOTE(review): the query is assembled by string concatenation; the inputs
    # are expected to come from this notebook only, never from untrusted users.
    query = """SELECT time, measure_value::bigint
                    FROM "DiagnoProd"."DiagnoProd"
                    WHERE measure_name = '""" + measure_name + """'
                    AND MID = '""" + MID + """'
                    AND time BETWEEN '""" + time_start + """'
                    AND '""" + time_end + """' """

    paginator = prod_client.get_paginator("query")

    timeid = []
    data_values = []
    for i, page in enumerate(paginator.paginate(QueryString=query), start=1):
        rows = page.get("Rows", [])
        if not rows:
            print('Page {%d} has no data available:'%i)
            continue
        for row in rows:
            cells = row['Data']
            try:
                stamp = cells[0]['ScalarValue']
                value = cells[1]['ScalarValue']
            except KeyError:
                # A cell without 'ScalarValue' (presumably a NULL datum): drop
                # only this row instead of discarding the whole page.
                continue
            timeid.append(stamp)
            data_values.append(value)
    return timeid, data_values

In [None]:
def build_dataframe(timeid, measure_name, data_values, timezone_value):
    """
    Build a time-sorted dataframe for one metric with timestamps expressed in
    the requested timezone.

    :param timeid: timestamps read from AWS; tz-naive values are taken as UTC
    :param measure_name: name given to the value column
    :param data_values: values read from AWS, parallel to ``timeid``
    :param timezone_value: target time zone; None or NaN falls back to
        'Australia/Sydney'
    :return: dataframe with a tz-aware 'time' column and a float
        ``measure_name`` column, sorted by 'time' (index is not reset)
    """
    # A missing timezone shows up as None, or as NaN when it was read from a CSV
    if timezone_value is None or pd.isna(timezone_value):
        print('no timezone in the table')
        timezone_value = 'Australia/Sydney'

    timeid = pd.to_datetime(timeid)
    if timeid.tzinfo is None:
        print('this is not tz-aware')
        timeid = timeid.tz_localize('UTC')
    else:
        print('this is tz-aware')
    # Convert in both cases so 'time' always carries the target timezone;
    # merging on 'time' later requires both sides to share the same tz dtype.
    timeid = timeid.tz_convert(timezone_value)

    data = pd.DataFrame(data={'time': timeid, measure_name: data_values})
    data.sort_values('time', inplace=True)
    data[measure_name] = data[measure_name].astype(float)
    return data

In [None]:
def change_tz(timeid):
    """
    Shift tz-aware timestamps by their zone's UTC offset and express them in UTC.

    The offset is the one reported by the zone of the first element for the
    fixed reference date 2022-01-01, so it does not follow DST changes on the
    actual dates.

    :param timeid: pandas Series of tz-aware timestamps
    :return: Series of the shifted timestamps converted to UTC
    """
    zone = timeid.iloc[0].tzinfo
    offset = zone.utcoffset(datetime.datetime(2022, 1, 1))
    # Add the timedelta directly: parsing str(offset) fails for zones west of
    # UTC, where it renders as e.g. '-1 day, 19:00:00'.
    time_modified = timeid + offset
    return time_modified.dt.tz_convert('UTC')

## 3.4. Helper functions to get low-cloudiness days

In [None]:
def read_metric_site(date_start, date_end, measure_name, site_id):
    """
    Read one daily, site-level metric from AWS Timestream.

    For every date in the range the query keeps the value with the latest
    'time' (``max_by(measure_value::double, time)``).

    :param date_start: first date as a string, e.g., '2023-04-19'
    :param date_end: last date as a string, e.g., '2023-04-26'
    :param measure_name: measurement metric, e.g., 'Production.kWh.Daily'
    :param site_id: site id (string, interpolated into the query)
    :return: (timeid, data_values) -- two parallel lists with the raw
        'ScalarValue' strings of the date column and the value column
    """
    # NOTE(review): the query is assembled by string concatenation; the inputs
    # are expected to come from this notebook only, never from untrusted users.
    query = """SELECT date, max_by(measure_value::double, time) as prod_val
                FROM "DiagnoProd"."DiagnoProd"
                WHERE measure_name = '""" + measure_name + """'
                AND siteId = '""" + site_id + """'
                AND date BETWEEN '""" + date_start + """'
                AND '""" + date_end + """'
                GROUP BY date
                ORDER BY date """

    paginator = prod_client.get_paginator("query")

    timeid = []
    data_values = []
    for i, page in enumerate(paginator.paginate(QueryString=query), start=1):
        rows = page.get("Rows", [])
        if not rows:
            print('Page {%d} has no data available:'%i)
            continue
        for row in rows:
            cells = row['Data']
            try:
                stamp = cells[0]['ScalarValue']
                value = cells[1]['ScalarValue']
            except KeyError:
                # A cell without 'ScalarValue' (presumably a NULL datum): drop
                # only this row instead of discarding the whole page.
                continue
            timeid.append(stamp)
            data_values.append(value)
    return timeid, data_values

In [None]:
def build_dataframe_site(timeid, measure_name, data_values, tz=None):
    """
    Build a dataframe indexed by time for one site-level metric.

    :param timeid: timestamps read from AWS; tz-naive values are taken as UTC
    :param measure_name: name given to the value column
    :param data_values: values read from AWS, parallel to ``timeid``
    :param tz: optional target time zone. When omitted, the notebook-level
        ``timezone_value`` is used (the original behaviour); if that is None
        or NaN the fallback is 'Australia/Sydney'
    :return: dataframe indexed by tz-aware 'time' (sorted) with a float
        ``measure_name`` column; an empty frame when ``timeid`` is empty
    """
    # No data for this site/metric: return an empty frame with the expected column
    if len(timeid) == 0:
        return pd.DataFrame(data_values, index=timeid, columns=[measure_name])

    # An explicit argument wins; otherwise fall back to the notebook global
    target_tz = tz if tz is not None else timezone_value
    if target_tz is None or pd.isna(target_tz):
        print('no timezone in the table')
        target_tz = 'Australia/Sydney'

    timeid = pd.to_datetime(timeid)
    if timeid.tzinfo is None:
        print('this is not tz-aware')
        timeid = timeid.tz_localize('UTC')
    else:
        print('this is tz-aware')
    # Convert in both cases so every site frame shares the same timezone
    timeid = timeid.tz_convert(target_tz)

    data = pd.DataFrame(data={'time': timeid, measure_name: data_values})
    data.sort_values('time', inplace=True)
    data.set_index('time', inplace=True)
    data[measure_name] = data[measure_name].astype(float)
    return data

In [None]:
# ==================================
# = Merging clear skies and expected
# ==================================

def merge_clear_expe(df1, df2):
    """
    Join two frames on their index and add the irradiance-over-yield percentage.

    The joined frame must contain the columns 'Irrad.kWh.m2.Daily' and
    'EnergyYield.kWh.Daily'. Two columns are added: 'expected_over_clear'
    (their ratio in percent, rounded to a whole number) and 'date' (a copy of
    the index).

    :param df1: left frame of the join
    :param df2: right frame of the join
    :return: the joined frame with the two extra columns
    """
    combined = df1.join(df2)
    ratio = combined['Irrad.kWh.m2.Daily'] / combined['EnergyYield.kWh.Daily']
    combined['expected_over_clear'] = (ratio * 100).round(0)
    combined['date'] = combined.index
    return combined

## 3.5. Setting up a 5minutes datetime dataframe

In [None]:
# 5-minute grid between the two UTC-aware bounds of the analysis window
time_index5min = pd.date_range(start=time_starttz, end=time_endtz, freq='5min').tz_convert('UTC')
# Frame with a plain 0..n-1 integer index; the grid becomes its 'time' column
df_5min = pd.DataFrame(index=np.arange(len(time_index5min)))
df_5min['time'] = time_index5min

# 4. Reading AC Data

## 4.1. Building the Gen.W dataframe

In [None]:
# ========================================================
# = Reading P(AC) total from AWS TimeStream
# = Metric is Gen.W 
# ========================================================

# Pull the raw Gen.W series for the monitor and turn it into a dataframe
measure_name = 'Gen.W'
timeid, data_values = read_metric(
    time_start=time_start, time_end=time_end, measure_name=measure_name, MID=MID)
df_genW = build_dataframe(
    timeid=timeid, measure_name=measure_name, data_values=data_values,
    timezone_value=timezone_value)

In [None]:
# ========================================================
# = Adjusting the df_genW to have 5 minutes increments
# ========================================================

# Express the 5-minute grid in the site timezone so both merge keys match
df_5min['time'] = df_5min['time'].dt.tz_convert(timezone_value)

# For every grid step take the most recent measurement at or before it
df_genW = pd.merge_asof(left=df_5min, right=df_genW, on="time")

# Drop the leading grid steps that precede the first measurement
first_valid_index = df_genW['Gen.W'].first_valid_index()
df_genW = df_genW.iloc[first_valid_index:].reset_index(drop=True)

In [None]:
def plot_5minGenw(df, time_start, time_end):
    """
    Plot the 5-minute Gen.W series, save it as a PNG and display it.

    The image is written to
    ./recurrent_faults_plots/5MIN_RAWGENW/<MID>/<time_start>_<time_end>.png,
    where MID is the notebook-level monitor id.

    :param df: dataframe with 'time' and 'Gen.W' columns
    :param time_start: start of the period, used in the title and file name
    :param time_end: end of the period, used in the title and file name
    """
    fig = plt.figure(figsize=(24,3))
    plt.plot(df['time'], df['Gen.W'])
    plt.title(f"Gen.W Data [5 minutes] from {time_start} to {time_end}")
    plt.xlabel('Time')
    plt.ylabel('Gen.W')
    plt.grid(True)

    # Create the target directory if needed (no error when it already exists)
    dir_path = f"./recurrent_faults_plots/5MIN_RAWGENW/{MID}"
    os.makedirs(dir_path, exist_ok=True)

    # Save BEFORE showing: once plt.show() has released the figure, a later
    # plt.savefig() would write an empty canvas.
    fig.savefig(f"{dir_path}/{time_start}_{time_end}.png")
    plt.show()

In [None]:
# Plot (and save) the raw 5-minute series for the analysis window
plot_5minGenw(df=df_genW, time_start=time_start, time_end=time_end)

In [None]:
# Keep the untouched readings in 'original_genW' so negative generation can be
# analysed later on
df_genW['original_genW'] = df_genW['Gen.W']

# Working column: negative generation is floored at zero
df_genW['Gen.W'] = df_genW['original_genW'].clip(lower=0)

In [None]:
# 5-minute Gen.W over the whole analysis window
fig = plt.figure(figsize=(24,3))
plt.plot(df_genW['time'], df_genW['Gen.W'])
plt.grid(True)
plt.xlabel('Time')
plt.ylabel('Gen.W')
plt.title(f"Gen.W Data [5 minutes] from {time_start} to {time_end}")
plt.show()

## 4.2. Plotting faults

In [None]:
# Bounds of the fault window as pandas Timestamps (parsed from the date strings)
start_faults = pd.to_datetime(time_start)
end_faults = pd.to_datetime(time_end)

In [None]:
# Keep the timestamps available as a regular column while 'time' is the index
df_genW['time_copy'] = df_genW['time']
df_genW = df_genW.set_index('time')

# Hourly means, with 'time' turned back into a column
df_genW_resampled = df_genW.resample('H').mean().reset_index(level=0)

# Restore 'time' as a column on the 5-minute frame as well
df_genW = df_genW.reset_index(level=0)

In [None]:
# Hourly rows whose calendar day lies inside the fault window (both ends inclusive)
hourly_dates = df_genW_resampled['time'].dt.date
in_fault_window = (hourly_dates >= start_faults.date()) & (hourly_dates <= end_faults.date())
df_sequence_fault = df_genW_resampled[in_fault_window]

# Hourly Gen.W over the fault window
plt.figure(figsize=(24,3))
plt.plot(df_sequence_fault['time'], df_sequence_fault['Gen.W'])
plt.grid(True)
plt.xlabel('Time')
plt.ylabel('Gen.W')
plt.title(f"Gen.W Data [hourly] from {start_faults} to {end_faults}")
plt.show()

## 4.3. Plotting single days:

In [None]:
# Single day to inspect: two days after the start of the fault window
day_to_analyse = start_faults + pd.Timedelta(days=2)

In [None]:
# Hourly rows belonging to the selected calendar day
df_day = df_genW_resampled[df_genW_resampled['time'].dt.date == day_to_analyse.date()]

# plot the data
plt.figure(figsize=(12,6))
plt.plot(df_day['time'], df_day['Gen.W'])
# The date already carries its year; the previous title appended a hard-coded
# ", 2023" after a full timestamp.
plt.title(f"Gen.W Data for {day_to_analyse.date()}")
plt.xlabel('Time')
plt.ylabel('Gen.W')
plt.grid(True)
plt.show()

## 4.4. Plotting a full week

In [None]:
# 7-day window centred on the selected day (3 days before to 3 days after)
start_date = day_to_analyse - DateOffset(days=3)
end_date = day_to_analyse + DateOffset(days=3)

df_7_days = df_genW_resampled[(df_genW_resampled['time'].dt.date >= start_date.date()) & (df_genW_resampled['time'].dt.date <= end_date.date())]

# plot the data
plt.figure(figsize=(24,3))
plt.plot(df_7_days['time'], df_7_days['Gen.W'])
# Both dates already carry their year; the previous title appended a
# hard-coded ", 2023".
plt.title(f"Gen.W Data from {start_date.date()} to {end_date.date()}")
plt.xlabel('Time')
plt.ylabel('Gen.W')
plt.grid(True)
plt.show()

## 4.5. Getting Low-Cloudiness days

In [None]:
# ===================================================
# = Reading EnergyYield.kWh.Daily from AWS TimeStream
# ===================================================

# Strip the 'SITE|' prefix before using the id in the site-level queries
site_id = site_id.removeprefix('SITE|')

# ---- EnergyYield.kWh.Daily ----
measure_name = 'EnergyYield.kWh.Daily'
timeid, data_values = read_metric_site(time_start, time_end, measure_name, site_id)
df_clear = build_dataframe_site(timeid, measure_name, data_values)

# ---- Irrad.kWh.m2.Daily ----
measure_name = 'Irrad.kWh.m2.Daily'
timeid, data_values = read_metric_site(time_start, time_end, measure_name, site_id)
df_expected = build_dataframe_site(timeid, measure_name, data_values)
df_expected[measure_name] = df_expected[measure_name].astype(float)

# ---- Production.kWh.Daily ----
measure_name = 'Production.kWh.Daily'
timeid, data_values = read_metric_site(time_start, time_end, measure_name, site_id)
df_production = build_dataframe_site(timeid, measure_name, data_values)
df_production[measure_name] = df_production[measure_name].astype(float)

# One frame with the three daily metrics side by side (joined on the time index)
df_merged = df_clear.join(df_expected).join(df_production)

# Irradiance as a percentage of energy yield, rounded to a whole number
df_merged['expected_over_clear'] =  (df_merged['Irrad.kWh.m2.Daily'] / df_merged['EnergyYield.kWh.Daily'] * 100).round(0)

# Calendar date of each row, handy for plotting
df_merged['date'] =  df_merged.index.date

# Days at or above this percentage are flagged as low-cloudiness days; rows
# without a ratio (NaN) match neither mask and keep a missing flag
threshold_low_cloudiness = 80
at_or_above = df_merged['expected_over_clear'] >= threshold_low_cloudiness
below = df_merged['expected_over_clear'] < threshold_low_cloudiness
df_merged.loc[at_or_above, 'is_low_cloudiness_day'] = True
df_merged.loc[below, 'is_low_cloudiness_day'] = False

df_site = df_merged

# 5. Data from PVLib

## 5.1. Getting sunrise and sunset times

In [None]:
# Sunrise, sunset and transit happen once per day, so one lookup per unique day is enough
unique_dates = df_genW['time'].dt.date.unique()

sun_info = {}
for date in unique_dates:
    # pvlib takes a tz-aware DatetimeIndex, so wrap the day (localised to the
    # site timezone) in a one-element index
    localized_date = pd.DatetimeIndex([pd.Timestamp(date).tz_localize(timezone_value)])
    # One-row dataframe with three columns; keep that single row
    sun_results = pvlib.solarposition.sun_rise_set_transit_spa(localized_date, mid_latitude, mid_longitude)
    sun_info[date] = sun_results.values[0]

# One row per day, indexed by the date
sun_info_df = pd.DataFrame.from_dict(sun_info, orient='index', columns=['sunrise', 'sunset', 'transit'])

## 5.2. Merging with df_genW

In [None]:
# Merge key on the measurements: calendar day of each sample as a datetime
df_genW['date'] = pd.to_datetime(df_genW['time'].dt.date)

# Same key on the sun table: move the date index into a 'date' column
sun_info_df = sun_info_df.reset_index().rename(columns={'index':'date'})
sun_info_df['date'] = pd.to_datetime(sun_info_df['date'])

# Attach sunrise / sunset / transit to every sample of the corresponding day
df_genW = df_genW.merge(sun_info_df, how='left', on='date')

## 5.3. Adding GHI

In [None]:
# Make sure 'time' is a datetime column and keep a copy of it, because the
# original column is consumed when it becomes the index
df_genW['time'] = pd.to_datetime(df_genW['time'])
df_genW['time_copy'] = df_genW['time']
df_genW = df_genW.set_index('time')

# Clear-sky GHI from pvlib (Ineichen model) at every timestamp of the index
location = pvlib.location.Location(mid_latitude, mid_longitude, tz=timezone_value)
clearsky_at_samples = location.get_clearsky(df_genW.index, model='ineichen')
df_genW['clear_sky_ghi'] = clearsky_at_samples['ghi']

# The copy takes over the 'time' column name
df_genW = df_genW.rename(columns={'time_copy': 'time'})

## 5.4. Clear Sky Values

Reference:

https://pvlib-python.readthedocs.io/en/stable/user_guide/clearsky.html

In [None]:
def get_irradiance(loc, times, tilt, surface_azimuth):
    """
    Compute clear-sky GHI and plane-of-array (POA) irradiance for a location.

    :param loc: object providing ``get_clearsky`` and ``get_solarposition``
        (a pvlib Location in this notebook)
    :param times: timestamps at which to evaluate the model
    :param tilt: surface tilt passed to the transposition
    :param surface_azimuth: surface azimuth passed to the transposition
    :return: dataframe with the columns 'GHI' and 'POA'
    """
    # Clear-sky GHI / DNI / DHI using the model get_clearsky applies by default
    sky = loc.get_clearsky(times)
    # Sun position, needed to transpose the irradiance onto the tilted plane
    sun = loc.get_solarposition(times=times)
    # Transposition of the clear-sky components to the plane of the array
    plane = irradiance.get_total_irradiance(
        surface_tilt=tilt,
        surface_azimuth=surface_azimuth,
        solar_zenith=sun['apparent_zenith'],
        solar_azimuth=sun['azimuth'],
        dni=sky['dni'],
        ghi=sky['ghi'],
        dhi=sky['dhi'])
    # Only the global horizontal and the global plane-of-array values are kept
    return pd.DataFrame({'GHI': sky['ghi'], 'POA': plane['poa_global']})

In [None]:
# 5-minute grid over the analysis window, localised to the site timezone
local_start = pd.to_datetime(time_start).tz_localize(timezone_value)
local_end = pd.to_datetime(time_end).tz_localize(timezone_value)
time_index5min_local = pd.date_range(start=local_start, end=local_end, freq='5min')

# Clear-sky GHI and POA for the monitor's position, tilt and azimuth
loc = pvlib.location.Location(mid_latitude, mid_longitude, tz=timezone_value)
pvlib_irr_pre = get_irradiance(loc, time_index5min_local, mid_tilt, mid_azimuth)

In [None]:
# Scale POA by the system size (mid_pvsizewatt) and the retained fraction
# (mid_loss, computed earlier as 1 - loss), then divide by 1000.
# NOTE(review): presumably this turns W/m2 into expected watts relative to a
# 1000 W/m2 reference irradiance -- confirm the units.
pvlib_irr_pre['POA'] = pvlib_irr_pre['POA'] * mid_pvsizewatt * mid_loss / 1000

In [None]:
# Align measurements and clear-sky model on their shared time index
# (default inner merge: only timestamps present in both frames survive)
merged_df = df_genW.merge(pvlib_irr_pre, left_index=True, right_index=True)

# The scaled POA is the theoretical clear-sky generation
merged_df['theoretical_clear-sky_generation.W'] = merged_df['POA']

# Continue working under the usual name
df_genW = merged_df

### 5.4.1. Resampling hourly:

In [None]:
# Work on a copy so the 5-minute frame stays untouched
df_genW_resample = df_genW.copy()

# Make sure 'time' is datetime, index by it and average every hour
df_genW_resample['time'] = pd.to_datetime(df_genW_resample['time'])
df_genW_resample = df_genW_resample.set_index('time').resample('H').mean()

## 5.5. Overlapping Clear sky values and measured generation

### 5.5.1. Before shifting the timeseries

In [None]:
# Restrict the hourly frame to the span covered by the 7-day selection
start_date = df_7_days['time'].min()
end_date = df_7_days['time'].max()
df_sliced = df_genW_resample.loc[start_date:end_date]

# Measured and theoretical generation on the same axes
plt.figure(figsize=(24, 3))
for column, label in (('Gen.W', 'Gen.W'),
                      ('theoretical_clear-sky_generation.W', 'Theoretical clear sky generation')):
    sns.lineplot(x = df_sliced.index, y = df_sliced[column], label = label)

plt.title(f'Gen.W and Theoretical Clear Sky Generation Over Time - With Azimuth = {mid_azimuth} and Tilt = {mid_tilt}')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()

plt.show()


### 5.5.2. Shifting the dataframe to overlap generation and clear sky

In [None]:
# Before shifting, values under 1% of each curve's own maximum are set to zero
# on the hourly frame, so that the measured curve does not end up with more
# non-zero rows than the theoretical one
threshold_minimum_generation = 0.01 * df_genW_resample['Gen.W'].max()
threshold_minimum_theoretical = 0.01 * df_genW_resample['theoretical_clear-sky_generation.W'].max()

negligible_generation = df_genW_resample['Gen.W'] < threshold_minimum_generation
df_genW_resample.loc[negligible_generation, 'Gen.W'] = 0

negligible_theoretical = df_genW_resample['theoretical_clear-sky_generation.W'] < threshold_minimum_theoretical
df_genW_resample.loc[negligible_theoretical, 'theoretical_clear-sky_generation.W'] = 0

In [None]:
# Zero out theoretical values below 1% of the theoretical maximum.
# NOTE(review): the cell right above already applies exactly this threshold to
# the same column, so this looks redundant -- confirm before removing.
threshold_minimum_theoretical = df_genW_resample['theoretical_clear-sky_generation.W'].max() * 0.01
df_genW_resample.loc[df_genW_resample['theoretical_clear-sky_generation.W'] <  threshold_minimum_theoretical, 'theoretical_clear-sky_generation.W'] = 0

In [None]:
# Note that cadence_of_observation is in minutes.

def shift_timeseries(df,cadence_of_observation = 5):
    """
    Shift 'Gen.W' so that its first active sample lines up with the first
    active sample of 'theoretical_clear-sky_generation.W'.

    A sample is "active" when it is neither zero nor NaN. The shift is signed:
    measured data starting earlier than the theoretical curve is moved later,
    and vice versa. Rows vacated by the shift, and any other NaN in 'Gen.W',
    are filled with 0. ``df`` is modified in place and returned.

    :param df: dataframe with the columns 'time', 'Gen.W' and
        'theoretical_clear-sky_generation.W'
    :param cadence_of_observation: minutes between consecutive rows of ``df``
    :return: the same dataframe with the shifted 'Gen.W' column
    """
    def _first_active(column):
        # Timestamp of the first sample that is neither zero nor missing
        values = df[column]
        active = df.loc[values.notna() & (values != 0), 'time']
        return None if active.empty else active.iloc[0]

    start_genW = _first_active('Gen.W')
    start_theoretical = _first_active('theoretical_clear-sky_generation.W')

    if start_genW is None or start_theoretical is None:
        # One of the curves never becomes active: nothing to align
        shift_minutes = 0
    else:
        # total_seconds() keeps the sign and the day part of the difference;
        # timedelta.seconds would wrap a negative difference to almost a day
        shift_minutes = int((start_theoretical - start_genW).total_seconds() / 60)

    print(f'shifting the timeseries by {shift_minutes} minutes')

    shift_periods = int(shift_minutes / cadence_of_observation)

    # Assign the result instead of a chained fillna(inplace=True), which may
    # operate on a temporary and leave the column untouched
    df['Gen.W'] = df['Gen.W'].shift(shift_periods).fillna(0)
    return df

In [None]:
# shift_timeseries reads the timestamps from a 'time' column, so expose the index as one
df_genW_resample['time'] = df_genW_resample.index
# df_genW_resample holds hourly means (resample('H')), so one row is 60 minutes;
# passing 5 here would turn a one-hour offset into a twelve-row shift
df_genW_resample = shift_timeseries(df_genW_resample, cadence_of_observation=60)

In [None]:
# Independent copy of the shifted hourly frame, indexed by its 'time' column
df_copy = df_genW_resample.copy()
df_copy['time'] = pd.to_datetime(df_copy['time'])
df_copy = df_copy.set_index('time')

### 5.5.3. Plot after shifting

In [None]:
# Span covered by the 7-day selection
start_date = df_7_days['time'].min()
end_date = df_7_days['time'].max()

# Hourly means of a deep copy of the shifted frame, cut to that span
df_to_plot = df_genW_resample.copy(deep=True).resample('H').mean()
df_sliced = df_to_plot.loc[start_date:end_date]

# Measured and theoretical generation on the same axes
plt.figure(figsize=(24, 3))
for column, label in (('Gen.W', 'Gen.W'),
                      ('theoretical_clear-sky_generation.W', 'Theoretical clear sky generation')):
    sns.lineplot(x = df_sliced.index, y = df_sliced[column], label = label)

plt.title(f'[After shifting] Gen.W and Theoretical Clear Sky Generation Over Time - With Azimuth = {mid_azimuth} and Tilt = {mid_tilt}')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()

plt.show()

### 5.5.4. Delta after shifting

In [None]:
# Expose the index as a regular column so seaborn can use x='time'
df_genW_resample['time'] = df_genW_resample.index

start_date = df_7_days['time'].min()
end_date = df_7_days['time'].max()

# Take an explicit copy of the window: adding columns to a bare slice is a
# chained assignment (SettingWithCopyWarning, silenced at the top of the notebook)
df_slice = df_genW_resample.loc[start_date:end_date].copy()

df_slice['time'] = pd.to_datetime(df_slice['time'])

# Measured minus theoretical generation (negative where the system under-produces)
df_slice['delta-clear-gen'] = df_slice['Gen.W'] - df_slice['theoretical_clear-sky_generation.W']

# Two stacked panels sharing the time axis
fig, axes = plt.subplots(2, 1, figsize=(20, 6), sharex=True)

# Top panel: measured and theoretical generation
sns.lineplot(ax=axes[0], x='time', y='Gen.W', data=df_slice, label='Gen.W', color='blue')
sns.lineplot(ax=axes[0], x='time', y='theoretical_clear-sky_generation.W', data=df_slice, label='Theoretical Clear Sky Generation', color='orange')
axes[0].legend()

# Bottom panel: their difference
sns.lineplot(ax=axes[1], x='time', y='delta-clear-gen', data=df_slice, label='Delta Clear Gen', color='green')
axes[1].legend()

# plt.title applies to the current axes, i.e. the bottom (delta) panel
plt.title(f'[After shifting] Delta of Gen.W and Theoretical Clear Sky Generation Over Time')

plt.tight_layout()  # adjust subplot params so that the subplots fit into the figure area
plt.show()


### 5.5.5. Interactive plot after shifting

In [None]:
# Interactive figure with one line per curve, both against the 'time' column
fig = go.Figure()
for column, trace_name in (('Gen.W', 'Gen.W'),
                           ('theoretical_clear-sky_generation.W', 'Theoretical')):
    fig.add_trace(go.Scatter(x=df_genW_resample['time'], y=df_genW_resample[column], mode='lines', name=trace_name))

# Render the figure
fig.show()

## 5.5. Normalising the curves

### 5.5.1. Normalising

In [None]:
# Scales both curves of one day by that day's largest value
def normalize_daywise(group):
    """
    Normalise measured and theoretical generation of one group (one day).

    Both columns are divided by the larger of the two column maxima, so the
    bigger curve peaks at 1. When that maximum is 0 the divisor is 1 and the
    values stay at 0. The two '*_normalized' columns are added to ``group``
    itself, which is also returned.

    :param group: dataframe with 'Gen.W' and 'theoretical_clear-sky_generation.W'
    :return: ``group`` with the two normalised columns added
    """
    measured = group['Gen.W']
    theoretical = group['theoretical_clear-sky_generation.W']

    # Common scale for both curves; a zero scale would divide by zero
    scale = max(measured.max(), theoretical.max())
    if scale == 0:
        scale = 1

    group['theoretical_clear-sky_generation.W_normalized'] = theoretical / scale
    group['Gen.W_normalized'] = measured / scale
    return group

# Group by calendar day and normalise each day on its own.
# group_keys=False keeps the original DatetimeIndex: newer pandas versions
# otherwise prepend the group key as an extra index level, and the later
# df_normalized.resample('H') needs a plain DatetimeIndex.
df_normalized = df_genW_resample.groupby(df_genW_resample.index.date, group_keys=False).apply(normalize_daywise)

### 5.5.2. Plot after normalising

In [None]:
# Bare expression: displays the normalised frame as the notebook cell output
df_normalized

In [None]:
# Span covered by the 7-day selection
start_date = df_7_days['time'].min()
end_date = df_7_days['time'].max()

# Hourly means of a deep copy of the normalised frame, cut to that span
df_to_plot = df_normalized.copy(deep=True).resample('H').mean()
df_sliced = df_to_plot.loc[start_date:end_date]

# Normalised measured and theoretical generation on the same axes
plt.figure(figsize=(24, 3))
for column, label in (('Gen.W_normalized', 'Normalised Gen.W'),
                      ('theoretical_clear-sky_generation.W_normalized', 'Normalised Theoretical clear sky generation')):
    sns.lineplot(x = df_sliced.index, y = df_sliced[column], label = label)

plt.title(f'Normalised Gen.W and Normalised Theoretical Clear Sky Generation Over Time - With Azimuth = {mid_azimuth} and Tilt = {mid_tilt}')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()

plt.show()

### 5.5.3. Interactive plot after normalising

In [None]:
# Interactive figure with one line per normalised curve, against the frame's index
fig = go.Figure()
for column in ('Gen.W_normalized', 'theoretical_clear-sky_generation.W_normalized'):
    # Each trace is named after the column it shows
    fig.add_trace(go.Scatter(x=df_normalized.index, y=df_normalized[column], mode='lines', name=column))

# Render the figure
fig.show()

## 5.6. Stretching theoretical

### 5.6.1. Stretching theoretical with a minimum threshold and MAX values

In [None]:
def stretch_theoretical(df):

    # I'm initialising an empty dict so I can keep track of the stretch factors:
    stretch_factors = {}

    # Threshold for theoretical value below which we won't compute the ratio
    ## The threshold helps in avoiding unrealistically high stretch factors caused by near-zero values in the denominator.
    
    ## Currently using 0.7, as 70% of the main normalised value (1)
    THRESHOLD = 0.7

    # Function to compute and adjust the theoretical curve for each day's group
    def compute_stretch(group):

        # Check if the input is a DataFrame and if not, return as is
        ## I've been getting some weird AttributeErrors here
        if not isinstance(group, pd.DataFrame):
            return group

        # getting the date from each row, so I can keep track of stretch factors
        current_date = group.index[0].date()

        # Compute ratio where theoretical value is above the threshold
        ## I want to make sure that I'm not dividing by tiny values and getting crazy spikes
        valid_idxs = group['theoretical_clear-sky_generation.W_normalized'] > THRESHOLD
        ratios = (group['Gen.W_normalized'] / group['theoretical_clear-sky_generation.W_normalized'])[valid_idxs]
        
        # Use maximum ratio as stretch factor
        stretch_factor = ratios.max()
        
        # If the stretch factor is NaN or zero (no valid ratios), set it to 1 to avoid NaN results
        # This will keep the value unchanged
        stretch_factor = 1 if pd.isna(stretch_factor) or stretch_factor == 0 else stretch_factor

        # Storing each stretch factor so I can keep track of it:
        stretch_factors[current_date] = stretch_factor

        # Multiply each value in the theoretical column by the stretched factor
        group['stretched_theoretical'] = group['theoretical_clear-sky_generation.W_normalized'] * stretch_factor
        return group

    # I'm grouping the datasframe by DAY and applying that function to it
    df_stretched = df.groupby(df.index.map(lambda x: x.date())).apply(compute_stretch)

    # Convert the stretch_factors dictionary to a DataFrame for better vis
    stretch_factors_df = pd.DataFrame(list(stretch_factors.items()), columns=['date', 'stretch_factor'])
    # Convert 'date' column to datetime and set as index
    stretch_factors_df['date'] = pd.to_datetime(stretch_factors_df['date'])
    stretch_factors_df.set_index('date', inplace=True)

    return df_stretched, stretch_factors_df


In [None]:
# Stretch the normalised theoretical curve day by day; the second return value holds the per-day stretch factors
df_stretched, stretch_factors_df = stretch_theoretical(df_normalized)

### 5.6.2. Plotting after stretching

In [None]:
# Plot window: the time span covered by df_7_days
start_date, end_date = df_7_days['time'].min(), df_7_days['time'].max()

# Hourly means of a deep copy of the stretched frame, cut to the window
df_to_plot = df_stretched.copy(deep=True).resample('H').mean()
df_sliced = df_to_plot.loc[start_date:end_date]

plt.figure(figsize=(24, 3))

# One line per series: (column, legend label)
for _col, _label in (
    ('Gen.W_normalized', 'Normalised Gen.W'),
    ('stretched_theoretical', 'Stretched Theoretical clear sky generation'),
):
    sns.lineplot(x=df_sliced.index, y=df_sliced[_col], label=_label)

plt.title(f'Normalised Gen.W and Stretched Theoretical Clear Sky Generation Over Time - With Azimuth = {mid_azimuth} and Tilt = {mid_tilt}')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()

plt.show()

### 5.6.3. Interactive Plotting after stretching

In [None]:
# Interactive figure: normalised generation against the stretched theoretical curve
fig = go.Figure()

# One trace per series: (column, trace name)
for _col, _trace_name in (
    ('Gen.W_normalized', 'Gen.W normalised'),
    ('stretched_theoretical', 'Theoretical Stretched'),
):
    fig.add_trace(go.Scatter(x=df_stretched.index, y=df_stretched[_col], mode='lines', name=_trace_name))

# Render the figure
fig.show()

## 5.7. Capping Max values

### 5.7.1. Capping max stretched theoretical based on normalised max gen

In [None]:
# Cap the normalised generation at the stretched theoretical curve: wherever
# 'Gen.W_normalized' exceeds 'stretched_theoretical', the theoretical value is
# used instead; every other row keeps its measured value.
_measured = df_stretched['Gen.W_normalized']
_ceiling = df_stretched['stretched_theoretical']
df_stretched['Gen.W_normalized'] = np.where(_measured > _ceiling, _ceiling, _measured)

### 5.7.2. Plotting

In [None]:
# Plot window: the time span covered by df_7_days
start_date, end_date = df_7_days['time'].min(), df_7_days['time'].max()

# No resampling here: the capped frame is plotted at its native resolution
df_sliced = df_stretched.loc[start_date:end_date]

plt.figure(figsize=(24, 3))

# In this plot each line is labelled with its column name
for _col in ('Gen.W_normalized', 'stretched_theoretical'):
    sns.lineplot(x=df_sliced.index, y=df_sliced[_col], label=_col)

plt.title(f'[RECODED] Normalized Gen.W and Normalized and Stretched Theoretical Clear Sky Generation Over Time - With Azimuth = {mid_azimuth} and Tilt = {mid_tilt}')
plt.xlabel('Time')
plt.ylabel('Value')
plt.legend()

plt.show()


## 5.8. Getting Deltas

### 5.8.1. Getting Deltas

In [None]:
# Expose the index of df_genW_resample as a regular 'time' column.
# NOTE(review): this adds 'time' to df_genW_resample only; df_stretched is not touched by this line.
df_genW_resample['time'] = df_genW_resample.index
# Per-row gap between normalised generation and the stretched theoretical curve
# (negative where generation falls short of the theoretical value)
df_stretched['delta-clear-gen'] = df_stretched['Gen.W_normalized'] - df_stretched['stretched_theoretical']

In [None]:
# Hourly totals of the raw, normalised and delta series.
# The datetime 'time' column is deliberately left out of the selection: it
# cannot be summed (pandas >= 2.0 raises a TypeError for it, older versions
# silently dropped it), and the timestamp is re-created from the index below.
df_MA = df_stretched.copy(deep=True)
df_MA = df_MA.loc[:, ['Gen.W', 'theoretical_clear-sky_generation.W', 'delta-clear-gen', 'Gen.W_normalized', 'stretched_theoretical']]
df_MA = df_MA.resample('H').sum()

# Keep the hourly timestamp available as a column for plotting and merging
df_MA['timestamp'] = df_MA.index

# Downstream code refers to the hourly delta as 'value'
df_MA = df_MA.rename(columns={"delta-clear-gen": "value"})

### 5.8.2. Plotting

In [None]:
# Plot window: the time span covered by df_7_days
start_date, end_date = df_7_days['time'].min(), df_7_days['time'].max()

df_slice = df_MA[start_date:end_date]

# Two stacked panels sharing the time axis
fig, axes = plt.subplots(2, 1, figsize=(20, 6), sharex=True)
_upper, _lower = axes[0], axes[1]

# Upper panel: normalised generation against the stretched theoretical curve
for _col, _colour in (('Gen.W_normalized', 'blue'), ('stretched_theoretical', 'orange')):
    sns.lineplot(ax=_upper, x='timestamp', y=_col, data=df_slice, label=_col, color=_colour)
_upper.legend()

# Lower panel: the hourly delta ('value')
sns.lineplot(ax=_lower, x='timestamp', y='value', data=df_slice, label='Delta Clear Gen', color='green')
_lower.legend()

# Fit both panels inside the figure area before showing it
plt.tight_layout()
plt.show()


## 5.9. Identifying underperformance

In [None]:
## For plotting:
start_date =  start_faults - timedelta(days=7)
end_date =  end_faults + timedelta(days=7)

In [None]:
# Work on a deep copy so df_MA keeps the original sign
df_MA_inverted = df_MA.copy(deep=True)
# Flip the sign of the delta so that generation falling short of the
# theoretical curve shows up as a positive 'value'
df_MA_inverted['value'] = df_MA['value'] * (-1)

### 5.9.1. Adding information on Cloudy days

In [None]:
# Attach the per-day 'is_low_cloudiness_day' flag from df_site to every hourly
# row of df_MA_inverted, joining on the calendar date.

# Join key on the hourly frame: the date part of each timestamp
df_MA_inverted['timestamp'] = pd.to_datetime(df_MA_inverted['timestamp'])
df_MA_inverted['date_only'] = df_MA_inverted['timestamp'].dt.date

# Same join key on the site frame, derived from its index
df_site['time'] = pd.to_datetime(df_site.index)
df_site['date_only'] = df_site['time'].dt.date

# Keep exactly one flag per date. If df_site holds several rows for the same
# day, merging them all would duplicate every hourly row once per matching
# site row and distort the recurrence counts computed later.
daily_cloudiness = df_site[['date_only', 'is_low_cloudiness_day']].drop_duplicates(subset='date_only')

# Left merge keeps every hourly row, including those with no cloudiness info
df_with_cloud = df_MA_inverted.merge(daily_cloudiness, on='date_only', how='left')

# The helper key is no longer needed after the merge
df_with_cloud = df_with_cloud.drop(columns=['date_only'])

# merge() resets the index: restore 'timestamp' as the index while also
# keeping it as a regular column (later cells use both)
df_with_cloud = df_with_cloud.set_index('timestamp', drop=False)

df_MA_inverted = df_with_cloud.copy(deep=True)

In [None]:
df_to_plot = df_MA_inverted[start_date:end_date]

# Two stacked panels sharing the time axis
fig, axes = plt.subplots(2, 1, figsize=(20, 6), sharex=True)
_upper, _lower = axes[0], axes[1]

# Upper panel: normalised generation against the stretched theoretical curve
for _col, _colour in (('Gen.W_normalized', 'blue'), ('stretched_theoretical', 'orange')):
    sns.lineplot(ax=_upper, x='timestamp', y=_col, data=df_to_plot, label=_col, color=_colour)
_upper.legend()

# Split the window by the daily cloudiness flag
df_test_true = df_to_plot[df_to_plot['is_low_cloudiness_day'] == True]
df_test_false = df_to_plot[df_to_plot['is_low_cloudiness_day'] == False]

# Lower panel: the delta ('value'), green on low-cloudiness days and gray otherwise
for _subset, _label, _colour in (
    (df_test_true, 'Delta Clear Gen - Low Cloudiness', 'green'),
    (df_test_false, 'Delta Clear Gen - High Cloudiness', 'gray'),
):
    sns.lineplot(ax=_lower, x='timestamp', y='value', data=_subset, label=_label, color=_colour)
_lower.legend()

# Fit both panels inside the figure area before showing it
plt.tight_layout()
plt.show()


### 5.9.2. Limiting to a threshold

In [None]:
def limit_to_threshold(df, threshold_percentage=0.2):
    """Zero out 'value' entries that are small relative to the day's theoretical peak.

    For each calendar day in the index the threshold is
    ``threshold_percentage`` times that day's maximum 'stretched_theoretical'.
    Every row whose 'value' is strictly below its day's threshold gets
    'value' set to 0.

    Args:
        df: frame with 'value' and 'stretched_theoretical' columns and an
            index that is, or can be converted to, a DatetimeIndex.
        threshold_percentage: fraction of the daily 'stretched_theoretical'
            maximum used as the cut-off (0.2 = 20%).

    Returns:
        The same frame object, modified in place (index converted to a
        DatetimeIndex if it was not one already).
    """
    # Day-based grouping below needs a DatetimeIndex
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)

    # Daily maximum of the theoretical curve, broadcast back to every row of
    # that day. This replaces a per-day loop that rebuilt a full-length mask
    # for each date.
    daily_peak = df.groupby(df.index.normalize())['stretched_theoretical'].transform('max')

    # Positional boolean mask so no index alignment is involved
    below_threshold = (df['value'] < daily_peak * threshold_percentage).to_numpy()
    df.loc[below_threshold, 'value'] = 0

    return df

In [None]:
# Zero out deltas below the default 20% of each day's 'stretched_theoretical' maximum
df_MA_inverted = limit_to_threshold(df_MA_inverted)

In [None]:
# Re-slice after thresholding. The previous df_to_plot was taken before
# limit_to_threshold ran, and whether that older slice reflects the zeroed
# values depends on pandas view/copy internals, so it is rebuilt explicitly.
df_to_plot = df_MA_inverted[start_date:end_date]

# Quick look at the thresholded delta on its own
df_to_plot['value'].plot(figsize=(24, 6))

# Two stacked panels sharing the time axis
fig, axes = plt.subplots(2, 1, figsize=(20, 6), sharex=True)

# Upper panel: normalised generation against the stretched theoretical curve
sns.lineplot(ax=axes[0], x='timestamp', y='Gen.W_normalized', data=df_to_plot, label='Gen.W_normalized', color='blue')
sns.lineplot(ax=axes[0], x='timestamp', y='stretched_theoretical', data=df_to_plot, label='stretched_theoretical', color='orange')
axes[0].legend()

# Split the window by the daily cloudiness flag
df_test_true = df_to_plot[df_to_plot['is_low_cloudiness_day'] == True]
df_test_false = df_to_plot[df_to_plot['is_low_cloudiness_day'] == False]

# Lower panel: thresholded delta on low-cloudiness days (green) ...
sns.lineplot(ax=axes[1], x='timestamp', y='value', data=df_test_true, label='Delta Clear Gen - Low Cloudiness', color='green')

# ... and on the remaining days (gray)
sns.lineplot(ax=axes[1], x='timestamp', y='value', data=df_test_false, label='Delta Clear Gen - High Cloudiness', color='gray')

axes[1].legend()

plt.tight_layout()  # adjust subplot params so that the subplots fit into the figure area
plt.show()

## 5.10. Identifying the recurrent faults

### 5.10.1. Helper function to plot graphs

In [None]:
def plot_date_range(df, start_date, end_date):
    """Plot raw, normalised and delta signals for one date window.

    Draws three stacked panels sharing the time axis:
      1. 'Gen.W' against 'theoretical_clear-sky_generation.W';
      2. 'Gen.W_normalized' against 'stretched_theoretical';
      3. the delta ('value') split by 'is_low_cloudiness_day', with markers
         for 'underperformance' and 'is_recurrent_underperformance' rows.

    Args:
        df: frame holding a 'timestamp' column plus the columns listed above.
        start_date: inclusive lower bound (string, datetime or Timestamp).
        end_date: inclusive upper bound (string, datetime or Timestamp).
            Timezone-naive bounds are localised to the module-level
            ``timezone_value``; timezone-aware bounds are used as given.

    Returns:
        The matplotlib Figure. It is returned rather than shown so callers
        can display or save it themselves.
    """
    tz = pytz.timezone(timezone_value)

    def _localised(bound):
        """Return `bound` as a Timestamp, localising it to `tz` if it is naive."""
        # Converting first means plain strings work too; calling `.tzinfo`
        # directly on a str would raise AttributeError.
        bound = pd.to_datetime(bound)
        return bound.tz_localize(tz) if bound.tzinfo is None else bound

    start_date = _localised(start_date)
    end_date = _localised(end_date)

    # Rows inside the requested window (both bounds inclusive)
    in_window = (df['timestamp'] >= start_date) & (df['timestamp'] <= end_date)
    df_sliced = df.loc[in_window]

    fig, axes = plt.subplots(3, 1, figsize=(20, 9), sharex=True)

    # Panel 1: raw generation against the theoretical clear-sky generation
    sns.lineplot(ax=axes[0], x='timestamp', y='Gen.W', data=df_sliced, label='Gen.W', color='blue')
    sns.lineplot(ax=axes[0], x='timestamp', y='theoretical_clear-sky_generation.W', data=df_sliced, label='theoretical_clear-sky_generation.W', color='orange')
    axes[0].legend()

    # Panel 2: normalised generation against the stretched theoretical curve
    sns.lineplot(ax=axes[1], x='timestamp', y='Gen.W_normalized', data=df_sliced, label='Gen.W_normalized', color='blue')
    sns.lineplot(ax=axes[1], x='timestamp', y='stretched_theoretical', data=df_sliced, label='stretched_theoretical', color='orange')
    axes[1].legend()

    # Panel 3: delta, split by the daily cloudiness flag
    df_test_true = df_sliced[df_sliced['is_low_cloudiness_day'] == True]
    df_test_false = df_sliced[df_sliced['is_low_cloudiness_day'] == False]
    sns.lineplot(ax=axes[2], x='timestamp', y='value', data=df_test_true, label='Delta Clear Gen - Low Cloudiness', color='green')
    sns.lineplot(ax=axes[2], x='timestamp', y='value', data=df_test_false, label='Delta Clear Gen - High Cloudiness', color='gray')

    # Gray 'x' on every underperforming row, red 'o' on the recurrent ones
    underperforming = df_sliced[df_sliced['underperformance']]
    axes[2].scatter(underperforming['timestamp'], underperforming['value'], color='gray', marker='x', label='Underperformance')
    recurrent = df_sliced[df_sliced['is_recurrent_underperformance']]
    axes[2].scatter(recurrent['timestamp'], recurrent['value'], color='red', marker='o', label='Recurrent Underperformance')

    axes[2].legend()

    plt.tight_layout()  # adjust subplot params so that the subplots fit into the figure area
    return fig

### 5.10.2. All positive records that happen on the same hour for consecutive days - With similar values - Skipping low-cloudiness

In [None]:
def mark_recurrent_underperformance(df, threshold_of_recurrence=3, value_threshold_percentage=20):
    """Flag underperformance that repeats at the same hour with similar magnitude.

    Only rows on low-cloudiness days are examined; other rows are skipped
    entirely and therefore do not interrupt a run. For each hour of the day
    the examined rows are walked in order and a run of underperforming rows
    ('value' > 0) is tracked. A row extends the run when its 'value' is within
    ``value_threshold_percentage`` percent of the first value of the run;
    otherwise it starts a new run. A non-underperforming row clears the run.
    Once a run reaches ``threshold_of_recurrence`` rows, all of its rows are
    flagged.

    The frame is modified in place ('timestamp' becomes the index if it is
    not already, and the columns 'hour', 'underperformance',
    'is_recurrent_underperformance' and 'count_for_hour_of_day' are written)
    and is also returned.
    """
    if df.index.name != 'timestamp':
        df.set_index('timestamp', inplace=True)

    df['hour'] = df.index.hour
    df['underperformance'] = df['value'] > 0  # positive delta is treated as underperformance
    df['is_recurrent_underperformance'] = False
    df['count_for_hour_of_day'] = 0

    # Rows without cloudiness information count as "not low cloudiness"
    low_cloudiness_df = df[df['is_low_cloudiness_day'].fillna(False)]

    # Each hour of the day is analysed independently
    for _, hourly_df in low_cloudiness_df.groupby('hour'):
        run = []  # index labels of the current run of similar underperformances

        for idx, is_under, value in zip(hourly_df.index, hourly_df['underperformance'], hourly_df['value']):
            if not (is_under and value != 0):
                # Run broken by a row without underperformance
                run = []
                continue

            if run:
                # Similarity is always measured against the first value of the run
                reference_value = hourly_df.loc[run[0], 'value']
                percentage_diff = abs((value - reference_value) / reference_value) * 100
                if not (percentage_diff <= value_threshold_percentage):
                    # Too different: the current row starts a fresh run
                    run = []

            run.append(idx)
            df.loc[idx, 'count_for_hour_of_day'] = len(run)

            if len(run) >= threshold_of_recurrence:
                # Long enough: flag every row that belongs to the run
                df.loc[run, 'is_recurrent_underperformance'] = True

    return df

In [None]:
# Flag recurrent underperformance. The function writes its columns into
# df_MA_inverted in place; the returned frame is shown as the cell output.
mark_recurrent_underperformance(df_MA_inverted)

### 5.10.2. Plotting results:

In [None]:
# To plot a certain range:
# (start_date / end_date are the window bounds defined in an earlier cell)
plot_date_range(df_MA_inverted, start_date, end_date)

### 5.10.3. Saving plots to visual check all dataset

Only use the following to save ALL PLOTS from the date range.

This takes up a lot of space!

In [None]:
'''
# Getting first valid index
index_start = df_MA_inverted.first_valid_index()
# Getting last valid index
index_end = df_MA_inverted.last_valid_index()

# Create the desired directory if it doesn't exist
dir_path = f"./recurrent_faults_plots/{MID}"
if not os.path.exists(dir_path):
    os.makedirs(dir_path)

# start_plot = time_start - DateOffset(days=1)
# end_plot = time_end + DateOffset(days=1)

# Function to loop through the dates week by week and save plots
def save_weekly_plots(time_start, time_end, df, plot_func, dir_path):
    plt.ioff()  # Turn off interactive mode
    current_start = time_start
    while current_start < time_end:
        current_end = current_start + pd.Timedelta(weeks=1)
        
        # Call the plot function
        fig = plot_func(df, current_start, current_end)  # assume plot_func returns the figure
        
        # Save the plot
        fig.savefig(f"{dir_path}/{current_start.date()}_{current_end.date()}.png")
        plt.close(fig)

        current_start = current_end
    plt.ion()  # Turn on interactive mode


# CAREFUL - THIS TAKES UP A LOT OF STORAGE:
save_weekly_plots(index_start, index_end, df_MA_inverted, plot_date_range, dir_path)
'''

# 6. Functions to label Level 1 faults based on raw signal

## 6.1. Generation Tripping

In [None]:
def detect_solar_tripping(df, trip_threshold=3):
    """Flag days on which generation repeatedly drops to zero and recovers.

    A trip is counted on a row where the daytime zero-generation state
    changed compared with the previous row and the row itself has positive
    generation. Days with at least ``trip_threshold`` trips have every row
    marked in 'fault_tripping'.

    Args:
        df: frame with a DatetimeIndex and the columns 'time', 'Gen.W',
            'sun_thre_start' and 'sun_thre_end' (the last two are compared
            against the minute of the day of each row).
        trip_threshold: minimum number of trips per day for the day to be
            flagged.

    Returns:
        The same frame, modified in place, with the helper columns
        'date_from_time', 'is_zero_gen', 'diff_is_zero_gen', 'positive_gen',
        'is_tripping' and the result column 'fault_tripping'.
    """
    df['time'] = pd.to_datetime(df['time'])
    df['date_from_time'] = df['time'].dt.date

    # Daytime window: minute of the day between the two sun thresholds
    minute_of_day = (df.index.hour * 60) + df.index.minute
    in_window = (df['sun_thre_start'] <= minute_of_day) & (df['sun_thre_end'] >= minute_of_day)

    # Zero generation only counts inside the daytime window
    df['is_zero_gen'] = (df['Gen.W'] == 0) & in_window

    # +1 / -1 where the zero-generation state switches on / off
    df['diff_is_zero_gen'] = df['is_zero_gen'].astype(int).diff()
    df['positive_gen'] = df['Gen.W'] > 0

    # The first row has no predecessor, so its diff is NaN. NaN != 0 evaluates
    # to True and would count a spurious trip, hence the fillna(0) here.
    state_changed = df['diff_is_zero_gen'].fillna(0) != 0
    df['is_tripping'] = state_changed & df['positive_gen']

    # Days whose trip count reaches the threshold
    daily_tripping_counts = df.resample('D')['is_tripping'].sum()
    threshold_dates = daily_tripping_counts[daily_tripping_counts >= trip_threshold].index.date

    # Every row of a flagged day is marked
    df['fault_tripping'] = df['date_from_time'].isin(threshold_dates)

    return df

## 6.2. Non-zero generation tripping

In [None]:
def detect_nonZero_tripping(df, threshold_nonZero_tripping=0.8, min_daily_drops=3):
    """Flag days with repeated sharp drops between consecutive measurements.

    A drop is recorded on a row when ``Gen.W / previous Gen.W`` is below
    ``threshold_nonZero_tripping``. Days with at least ``min_daily_drops``
    drops have every row marked in 'fault_nonZero_tripping'.

    Args:
        df: frame with a DatetimeIndex and a 'Gen.W' column.
        threshold_nonZero_tripping: ratio below which a step counts as a drop.
        min_daily_drops: drops per day needed to flag the day (default 3).

    Returns:
        The same frame, modified in place, with the columns
        'nonZero_tripping' (0/1 per row), 'fault_nonZero_tripping' (bool) and
        'nonZero_tripping_sum' (drops on the row's day).
    """
    # Ratio of each measurement to the one right before it. The first row has
    # no predecessor (NaN ratio) and 0/0 is NaN as well; NaN compares False,
    # so neither is counted as a drop.
    generation = df['Gen.W']
    ratio = generation / generation.shift(1)

    # Assigning whole columns replaces a row-by-row chained
    # `df[col].iloc[i] = 1`, which is not guaranteed to write through to df.
    df['nonZero_tripping'] = (ratio < threshold_nonZero_tripping).astype(int)
    df['fault_nonZero_tripping'] = False

    # Number of drops on each row's calendar day
    df['nonZero_tripping_sum'] = df['nonZero_tripping'].resample('D').transform('sum')

    # Mark every row of a day that reaches the required number of drops
    df.loc[df['nonZero_tripping_sum'] >= min_daily_drops, 'fault_nonZero_tripping'] = True

    return df

## 6.3. Generation Clipping - Normalised with PVSize

In [None]:
def detect_solar_clipping(df, mid_pvsizewatt, threshold=0.001, min_duration='60min'):
    """Flag sustained flat-topped generation (clipping).

    A row is a clipping candidate when the change in 'Gen.W' since the
    previous row, divided by ``mid_pvsizewatt``, is smaller in magnitude than
    ``threshold``, 'Gen.W' is above 100, and the minute of the day lies
    between 'sun_thre_start' and 'sun_thre_end'. Candidates are grouped into
    consecutive runs; a run's duration is its number of candidate rows times
    the gap to the previous row in minutes, and rows of runs lasting at least
    ``min_duration`` are flagged in 'fault_clipping'.

    Returns a sorted copy of ``df`` with the columns 'power_diff_normalised',
    'is_clipping', 'clipping_period', 'clipping_duration' and
    'fault_clipping' added.
    """
    df = df.sort_index()

    # Step change in power relative to the system size
    # (a plain difference; a true time derivative is an open question)
    df['power_diff_normalised'] = df['Gen.W'].diff() / mid_pvsizewatt

    minute_of_day = (df.index.hour * 60) + df.index.minute

    # The three conditions a clipping candidate must satisfy
    is_flat = np.abs(df['power_diff_normalised']) < threshold
    is_generating = df['Gen.W'] > 100
    in_window = (df['sun_thre_start'] <= minute_of_day) & (df['sun_thre_end'] >= minute_of_day)
    df['is_clipping'] = is_flat & is_generating & in_window

    # A new period id starts every time the candidate state changes
    df['clipping_period'] = df['is_clipping'].diff().ne(0).cumsum()

    # Duration = candidate rows in the period x minutes since the previous row
    candidates_in_period = df.groupby('clipping_period')['is_clipping'].transform('sum')
    minutes_since_previous = df.index.to_series().diff().dt.total_seconds() / 60
    df['clipping_duration'] = candidates_in_period * minutes_since_previous

    # Only candidates in sufficiently long periods are reported as faults
    min_minutes = pd.Timedelta(min_duration).total_seconds() / 60
    df['fault_clipping'] = df['is_clipping'] & (df['clipping_duration'] >= min_minutes)

    return df

## 6.4. Zero Generation

In [None]:
def detect_zero_generation(df, min_duration='60min'):
    """Flag sustained zero generation inside the daytime window.

    A row is a candidate when 'Gen.W' equals 0 and the minute of the day lies
    between 'sun_thre_start' and 'sun_thre_end'. Candidates are grouped into
    consecutive runs; a run's duration is its number of candidate rows times
    the gap to the previous row in minutes, and rows of runs lasting at least
    ``min_duration`` are flagged in 'fault_zero_gen'.

    The frame is modified in place ('is_zero_gen', 'zero_gen_period',
    'zero_gen_duration', 'fault_zero_gen') and returned.
    """
    minute_of_day = (df.index.hour * 60) + df.index.minute
    in_window = (df['sun_thre_start'] <= minute_of_day) & (df['sun_thre_end'] >= minute_of_day)

    # Zero generation restricted to the daytime window
    # tbd -> optimise this together with the tripping detector
    df['is_zero_gen'] = (df['Gen.W'] == 0) & in_window

    # Idea kept from the original, not enabled: additionally require
    # clear_sky_ghi > 0 for a row to count as zero generation.

    # A new period id starts every time the zero-generation state changes
    df['zero_gen_period'] = df['is_zero_gen'].diff().ne(0).cumsum()

    # Duration = candidate rows in the period x minutes since the previous row
    zeros_in_period = df.groupby('zero_gen_period')['is_zero_gen'].transform('sum')
    minutes_since_previous = df.index.to_series().diff().dt.total_seconds() / 60
    df['zero_gen_duration'] = zeros_in_period * minutes_since_previous

    # Only candidates in sufficiently long periods are reported as faults
    min_minutes = pd.Timedelta(min_duration).total_seconds() / 60
    df['fault_zero_gen'] = df['is_zero_gen'] & (df['zero_gen_duration'] >= min_minutes)

    return df

## 6.5. Recurring underperformance

In [None]:
# df_MA_inverted already carries the recurrence flags; they are reused here to
# label the original df_genW.

df_recurrent_und = df_MA_inverted.copy(deep=True)
df_recurrent_und = df_recurrent_und[['timestamp','is_recurrent_underperformance']].rename(columns={'timestamp':'time'})

# df_MA_inverted is hourly, whereas df_genW has 5-minute increments.
# Upsampling with forward fill repeats each hourly row on the 5-minute slots
# that follow it, so a flag set on a full hour covers that whole hour.
# '5min' is the current spelling of the offset alias; the older '5T' is deprecated.
df_recurrent_und = df_recurrent_und.resample('5min').ffill()

## 6.6. Night-time Generation

In [None]:
def detect_nightTime_gen(df):
    """Flag generation recorded outside the daytime window for 12 rows in a row.

    A row counts as night-time generation when 'Gen.W' is positive and the
    minute of the day is NOT between 'sun_thre_start' and 'sun_thre_end'.
    'fault_nightTime_gen_1hr' is True on a row when it and the 11 rows before
    it all count (12 rows correspond to one hour at 5-minute sampling).

    The frame is modified in place and returned.
    """
    minute_of_day = (df.index.hour * 60) + df.index.minute
    in_daylight = (df['sun_thre_start'] <= minute_of_day) & (df['sun_thre_end'] >= minute_of_day)

    # Positive generation outside the daytime window
    generating_at_night = (df['Gen.W'] > 0) & ~in_daylight

    # A full window of 12 hits means the condition held on 12 consecutive rows
    hits_in_window = generating_at_night.rolling(window=12, min_periods=12).sum()
    df['fault_nightTime_gen_1hr'] = hits_in_window == 12

    return df

## 6.7. Negative Generation

In [None]:
# Note, need to use df_toCheck_negativeGen

def detect_negative_gen(df, mid_pvsizewatt):
    """Flag rows whose 'original_genW' is negative by at least 1% of the system size.

    The frame is modified in place ('fault_negative_gen') and returned.
    """
    # Negative readings smaller in magnitude than 1% of mid_pvsizewatt are ignored
    threshold_neg = 0.01 * mid_pvsizewatt

    original = df['original_genW']
    is_negative = original.lt(0)
    is_large_enough = original.abs().ge(threshold_neg)
    df['fault_negative_gen'] = is_negative & is_large_enough

    return df

## 6.8. Excessive Generation

In [None]:
def detect_excessive_gen(df, mid_pvsizewatt):
    """Flag rows whose 'Gen.W' is at least twice the system size.

    The frame is modified in place ('fault_excessive_gen') and returned.
    """
    # Twice the system size, i.e. 100% on top of it
    threshold_excessive = 2 * mid_pvsizewatt
    df['fault_excessive_gen'] = df['Gen.W'].ge(threshold_excessive)
    return df

## 6.9. No Data

In [None]:
def detect_no_data(df):
    """Set 'No Data' to True on every row when the whole 'Gen.W' column is NaN.

    If at least one 'Gen.W' value is present, 'No Data' is False on every row.
    The frame is modified in place and returned.
    """
    # One frame-wide flag, broadcast to all rows
    df['No Data'] = bool(df['Gen.W'].isna().all())
    return df

# 7. Label all

In [None]:
def label_dataframe(df):
    """Run every Level 1 fault detector on ``df`` and return the labelled frame.

    First derives the minute-of-day helpers the detectors rely on:
      * 'index_minute_vale': minute of the day of each row's index;
      * 'sun_thre_start': sunrise plus a one-hour buffer, in minutes of the day;
      * 'sun_thre_end': sunset, in minutes of the day.
    Minute resolution is used instead of whole hours because comparing hours
    only (e.g. a 17:01 sunset treated as "17") left a full hour of slots open
    to false positives.

    Args:
        df: frame with a DatetimeIndex and datetime columns 'sunrise' and
            'sunset', plus whatever columns the individual detectors read
            (e.g. 'time', 'Gen.W', 'original_genW').

    Returns:
        The frame with all detector columns added. The system size is read
        from the module-level ``mid_pvsizewatt``.
    """
    # Use the index of the frame that was passed in, not the global df_genW,
    # so the function also works on any other frame.
    df['index_minute_vale'] = (df.index.hour * 60) + df.index.minute

    # One hour of buffer after sunrise
    df['sun_thre_start'] = (((df['sunrise'].dt.hour + 1) % 24) * 60) + df['sunrise'].dt.minute

    # Sunset hour and sunset minute (the original added the *sunrise* minute
    # to the sunset hour here, which shifted the end of the window arbitrarily)
    df['sun_thre_end'] = (df['sunset'].dt.hour * 60) + df['sunset'].dt.minute

    df = detect_solar_tripping(df)
    df = detect_nonZero_tripping(df)
    df = detect_solar_clipping(df, mid_pvsizewatt)
    df = detect_zero_generation(df)
    df = detect_negative_gen(df, mid_pvsizewatt)
    df = detect_nightTime_gen(df)
    df = detect_excessive_gen(df, mid_pvsizewatt)
    df = detect_no_data(df)

    return df

In [None]:
# Run all Level 1 detectors on df_genW (label_dataframe also writes helper columns onto the frame it is given)
df_labelled = label_dataframe(df_genW)

In [None]:
# Bring the hourly recurrence and cloudiness flags of df_MA_inverted onto the
# 5-minute rows of df_labelled.
# https://pandas.pydata.org/docs/reference/api/pandas.core.resample.Resampler.ffill.html


# Both sides of the merge need real datetimes
df_labelled['time'] = pd.to_datetime(df_labelled['time'])
df_MA_inverted.index = pd.to_datetime(df_MA_inverted.index)

df_recurrent_und = df_MA_inverted.copy(deep=True)
df_recurrent_und = df_recurrent_und[['timestamp','is_recurrent_underperformance', 'is_low_cloudiness_day']].rename(columns={'timestamp':'time'})

# Upsample to 5-minute rows; forward fill repeats each hourly row on the slots
# that follow it. '5min' is the current spelling of the deprecated '5T' alias.
df_recurrent_und = df_recurrent_und.resample('5min').ffill()

# Left merge: every df_labelled row is kept and matched on its 'time' value
merged_df = pd.merge(df_labelled, df_recurrent_und[['is_recurrent_underperformance', 'is_low_cloudiness_day']], left_on='time', right_index=True, how='left')

# Fault columns share the 'fault_' prefix
merged_df.rename(columns={'is_recurrent_underperformance': 'fault_recurrent_underperformance'}, inplace=True)

df_labelled = merged_df

# Rows without a matching hourly record get False. The result is assigned back
# explicitly: an inplace fillna on a column selection is not guaranteed to
# update df_labelled itself.
df_labelled['fault_recurrent_underperformance'] = df_labelled['fault_recurrent_underperformance'].fillna(False).astype(bool)

# 8. Cleaning up

In [None]:
def find_faults(row):
    """Return the labels of all faults whose flag is truthy in ``row``.

    ``row`` is any mapping-like object (e.g. a DataFrame row) holding the
    fault flag columns. Labels come back in the fixed order listed below.
    """
    # (flag column, label reported for it)
    flag_labels = (
        ('fault_tripping', 'fault_tripping'),
        ('fault_nonZero_tripping', 'fault_nonZero_tripping'),
        ('fault_clipping', 'fault_clipping'),
        ('fault_zero_gen', 'fault_zero_gen'),
        ('fault_recurrent_underperformance', 'fault_recurrent_underperformance'),
        ('fault_negative_gen', 'Negative Generation'),
        ('fault_nightTime_gen_1hr', 'Night-Time Generation'),
        ('fault_excessive_gen', 'Excessive Generation'),
        ('No Data', 'No Data'),
    )
    return [label for flag, label in flag_labels if row[flag]]

# Build the per-row list of fault labels (find_faults is applied to every row)
df_labelled['faults'] = df_labelled.apply(find_faults, axis=1)

# 9. Saving individual

In [None]:
# To save it:
# One CSV per monitor, named after the monitor ID
df_labelled.to_csv(f'./2A_individual_outputs/{MID}.csv')

print(f'Fault Detection saved for {MID}')

In [None]:
# Work on a copy so df_labelled keeps its timestamp index
df_per_day = df_labelled.copy()

# Replace each timestamp with its calendar date so rows can be grouped per day
df_per_day.index = df_per_day.index.date

def agg_faults_per_day(faults_list):
    """Return the distinct fault labels found in an iterable of label lists.

    Args:
        faults_list: iterable whose items are lists of fault labels (one list
            per row of the day).

    Returns:
        The distinct labels as a sorted list. Sorting makes the output
        reproducible; the order of a plain ``list(set(...))`` of strings can
        change from one interpreter run to the next.
    """
    return sorted({fault for sublist in faults_list for fault in sublist})

# Per calendar day: the sum of the Gen.W samples and the distinct fault labels seen that day
result_per_day = df_per_day.groupby(df_per_day.index).agg({'Gen.W': 'sum', 'faults': agg_faults_per_day})

# Write the daily summary to the per_day output folder
result_per_day.to_csv(f'./2A_individual_outputs/per_day/{MID}_per_day.csv')

# 10. Visualising

In [None]:
'''# List of possible faults
faults_list = ['fault_tripping', 'fault_clipping', 'fault_zero_gen', 'fault_recurrent_underperformance']

# Assuming df_labelled is your DataFrame
# If 'timestamp' is not the index, set it
df_labelled.index = pd.to_datetime(df_labelled.index)

# Resample DataFrame by weeks
weeks = [group for _, group in df_labelled.resample('W-MON')]

# Determine the number of rows for the subplots (one week per row)
nrows = len(weeks)

fig, axes = plt.subplots(nrows=nrows, figsize=[15, 5 * nrows])
plt.subplots_adjust(hspace=0.5)

# Make sure axes is always a 1D array, even if there's only one subplot
if nrows == 1:
    axes = [axes]

for i, week in enumerate(weeks):
    ax = axes[i]
    ax.set_ylabel('Gen.W')
    ax.set_xlabel('Time')
    ax.set_title(f'Gen.W over Time for Week {i+1}')

    # Iterate through the days of the week
    days = [group for _, group in week.resample('D')]
    for day in days:
        color = 'blue' # Default color for no faults
        label = None # Default label

        # Check for faults in the day
        for fault in faults_list:
            if any(fault in faults for faults in day['faults']):
                color = 'red' # Change color if there's a fault
                label = 'Fault' # Set the label for the legend
                break

        ax.plot(day['Gen.W'], color=color, label=label)

    # Set the x-tick labels to be vertical
    ax.tick_params(axis='x', rotation=90)

    # Add legend without duplicate labels
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys())

plt.show()
'''

In [None]:
'''import matplotlib.pyplot as plt
import pandas as pd

# List of possible faults
faults_list = ['fault_tripping', 'fault_clipping', 'fault_zero_gen', 'fault_recurrent_underperformance']

# Colors for different cases
colors = {
    'no_monitor_fault': 'green',
    'low_cloudiness': 'grey'
}
colors.update({fault: 'red' for fault in faults_list})

# Assuming df_labelled is your DataFrame
# If 'timestamp' is not the index, set it
df_labelled.index = pd.to_datetime(df_labelled.index)

# Resample DataFrame by weeks
weeks = [group for _, group in df_labelled.resample('W-MON')]

# Determine the number of rows for the subplots (one week per row)
nrows = len(weeks)

fig, axes = plt.subplots(nrows=nrows, figsize=[15, 5 * nrows])
plt.subplots_adjust(hspace=0.5)

# Make sure axes is always a 1D array, even if there's only one subplot
if nrows == 1:
    axes = [axes]

for i, week in enumerate(weeks):
    ax = axes[i]
    ax.set_ylabel('Gen.W')
    ax.set_xlabel('Time')
    ax.set_title(f'Gen.W over Time for Week {i+1}')

    # Iterate through the days of the week
    days = [group for _, group in week.resample('D')]
    for day in days:
        label = 'no_monitor_fault' # Default label for no monitor faults
        if all(day['is_low_cloudiness_day'] == False):
            label = 'low_cloudiness' # Label for low cloudiness

        # Check for faults in the day
        for fault in faults_list:
            if any(fault in faults for faults in day['faults']):
                label = fault # Set label to the specific fault
                break

        ax.plot(day['Gen.W'], color=colors[label], label=label)

    # Set the x-tick labels to be vertical
    ax.tick_params(axis='x', rotation=90)

    # Add legend without duplicate labels
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys())

plt.show()'''