In [196]:
import os
import pandas as pd
import numpy as np

In [197]:
def detect_separator(line):
    if "," in line:
        return ","

    return ";"


def read_site(file_path):
    lines = []

    # Read only the first 10 lines into the 'lines' list
    with open(file_path, "r") as f:
        for _ in range(10):
            line = f.readline()
            if not line:
                break
            lines.append(line)

    # The first line where the word "timpestamps" (in any case) appears is the header line
    header_index = next(
        (i for i, line in enumerate(lines) if "timestamp" in line.lower()), None
    )

    if header_index is None:
        error_message = "Header not found in the file."
        print(error_message)
        exit()

    separator = detect_separator(lines[header_index])
    # The index of the found header is exactly the number of rows to skip when reading the data
    df = pd.read_csv(file_path, skiprows=header_index, header=0, sep=separator)
    # print(df.loc[0,:])

    # Check whether the second row in the dataframe is an extra unit row
    if pd.isna(df.iloc[0, 0]):
        df.drop(index=0, inplace=True)
        df.reset_index(
            drop=True, inplace=True
        )  # Resetting the index after dropping the row

    return df

In [198]:
file_path = "../data/2024-01-01-2024-01-31_NESM - Availability_Chiloquin Solar Farm.csv"
df = read_site(file_path)
df

Unnamed: 0,Timestamp,POA*,"Sungrow 60kW Inverter - 1.1, Line kW","Sungrow 60kW Inverter - 1.2, Line kW","Sungrow 60kW Inverter - 1.3, Line kW","Sungrow 60kW Inverter - 1.4, Line kW","Sungrow 60kW Inverter - 1.5, Line kW","Sungrow 60kW Inverter - 1.6, Line kW","Sungrow 60kW Inverter - 1.7, Line kW","Sungrow 60kW Inverter - 1.8, Line kW",...,"Sungrow 60kW Inverter - 11.7, Line kW","Sungrow 60kW Inverter - 11.8, Line kW","Sungrow 60kW Inverter - 11.9, Line kW","Sungrow 60kW Inverter - 11.10, Line kW","Sungrow 60kW Inverter - 11.11, Line kW","Sungrow 60kW Inverter - 11.12, Line kW","Sungrow 60kW Inverter - 11.13, Line kW","Sungrow 60kW Inverter - 11.14, Line kW","Sungrow 60kW Inverter - 11.15, Line kW",Production meter active power
0,2024-01-01 00:00:00,0,,,,,,,,,...,,,,,,,,,,-13.87813
1,2024-01-01 01:00:00,0,,,,,,,,,...,,,,,,,,,,-13.855
2,2024-01-01 02:00:00,0,,,,,,,,,...,,,,,,,,,,-13.84791
3,2024-01-01 03:00:00,0,,,,,,,,,...,,,,,,,,,,-13.8775
4,2024-01-01 04:00:00,0,,,,,,,,,...,,,,,,,,,,-13.82208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2024-01-31 19:00:00,0,,,,,,,,,...,,,,,,,,,,-15.23895
740,2024-01-31 20:00:00,0,,,,,,,,,...,,,,,,,,,,-15.22125
741,2024-01-31 21:00:00,0,,,,,,,,,...,,,,,,,,,,-15.04375
742,2024-01-31 22:00:00,0,,,,,,,,,...,,,,,,,,,,-15.15271


In [199]:
site_name = file_path.split("_")[-1].replace(".csv", "")
site_name

'Chiloquin Solar Farm'

In [200]:
def find_keywords(column, keywords_list):
    for keywords in keywords_list:
        if all(keyword.lower() in column.lower() for keyword in keywords):
            return True
    return False


def column_basic(df):
    keyword_mapping = {
        "Timestamp": [["timestamp"]],
        "POA Irradiance": [["poa"]],
    }

    rename_mapping = {}
    for new_name, keywords_list in keyword_mapping.items():
        found = False
        for col in df.columns:
            found = find_keywords(col, keywords_list)
            if found:
                rename_mapping[col] = new_name
                break
        if not found:
            df[new_name] = np.nan

    df.rename(columns=rename_mapping, inplace=True)

    return df

name_mapping = {}
def column_inverter(df):
    known_columns = {
        "Timestamp",
        "POA Irradiance",
    }
    inverter_index = 1

    for col in df.columns:
        if col not in known_columns:
            new_name = "Inverter_" + str(inverter_index)
            df.rename(columns={col: new_name}, inplace=True)
            # Used for renaming cols to their original names in the end of the processing
            name_mapping[new_name] = col
            inverter_index += 1

    return df


def column_reorder(df):
    inverter_columns = sorted(
        (col for col in df.columns if "Inverter" in col),
        key=lambda s: int(s.split("_")[1]),
    )
    columns_order = [
        "Timestamp",
        "POA Irradiance",
    ] + inverter_columns
    df = df[columns_order]

    return df


def rename(df):
    return (
        df.pipe(column_basic)
        .pipe(column_inverter)
        .pipe(column_reorder)
    )

df = rename(df)
df

Unnamed: 0,Timestamp,POA Irradiance,Inverter_1,Inverter_2,Inverter_3,Inverter_4,Inverter_5,Inverter_6,Inverter_7,Inverter_8,...,Inverter_157,Inverter_158,Inverter_159,Inverter_160,Inverter_161,Inverter_162,Inverter_163,Inverter_164,Inverter_165,Inverter_166
0,2024-01-01 00:00:00,0,,,,,,,,,...,,,,,,,,,,-13.87813
1,2024-01-01 01:00:00,0,,,,,,,,,...,,,,,,,,,,-13.855
2,2024-01-01 02:00:00,0,,,,,,,,,...,,,,,,,,,,-13.84791
3,2024-01-01 03:00:00,0,,,,,,,,,...,,,,,,,,,,-13.8775
4,2024-01-01 04:00:00,0,,,,,,,,,...,,,,,,,,,,-13.82208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2024-01-31 19:00:00,0,,,,,,,,,...,,,,,,,,,,-15.23895
740,2024-01-31 20:00:00,0,,,,,,,,,...,,,,,,,,,,-15.22125
741,2024-01-31 21:00:00,0,,,,,,,,,...,,,,,,,,,,-15.04375
742,2024-01-31 22:00:00,0,,,,,,,,,...,,,,,,,,,,-15.15271


In [201]:
# Convert the datetime string to a datetime object
def custom_to_datetime(df):
    formats = [
        "%m/%d/%Y %H:%M:%S",
        "%m/%d/%y %H:%M:%S",
        "%m/%d/%y %H:%M",
        "%m/%d/%Y %I:%M:%S %p",
        "%m-%d-%Y %H:%M:%S",
        "%m-%d-%y %H:%M:%S",
        "%m-%d-%Y %H:%M",
        "%m-%d-%y %H:%M",
        "%Y-%m-%d %H:%M:%S",
        "%d/%m/%Y %H:%M:%S",
        "%m/%d/%Y %H:%M",
        "%Y-%m-%d %H:%M",
    ]

    for fmt in formats:
        try:
            df["Timestamp"] = pd.to_datetime(df["Timestamp"], format=fmt)
            return df

        except ValueError:  # if the format doesn't match, continue to the next format
            continue

    # Quit the program if no suitable format is found
    raise ValueError("No suitable format found for the 'Timestamp' column.")


def normalize(df):
    cols_to_convert = df.columns[df.columns != "Timestamp"]
    df[cols_to_convert] = df[cols_to_convert].apply(pd.to_numeric, errors="coerce")
    df = custom_to_datetime(df)

    return df

df = normalize(df)
df

Unnamed: 0,Timestamp,POA Irradiance,Inverter_1,Inverter_2,Inverter_3,Inverter_4,Inverter_5,Inverter_6,Inverter_7,Inverter_8,...,Inverter_157,Inverter_158,Inverter_159,Inverter_160,Inverter_161,Inverter_162,Inverter_163,Inverter_164,Inverter_165,Inverter_166
0,2024-01-01 00:00:00,0.0,,,,,,,,,...,,,,,,,,,,-13.87813
1,2024-01-01 01:00:00,0.0,,,,,,,,,...,,,,,,,,,,-13.85500
2,2024-01-01 02:00:00,0.0,,,,,,,,,...,,,,,,,,,,-13.84791
3,2024-01-01 03:00:00,0.0,,,,,,,,,...,,,,,,,,,,-13.87750
4,2024-01-01 04:00:00,0.0,,,,,,,,,...,,,,,,,,,,-13.82208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2024-01-31 19:00:00,0.0,,,,,,,,,...,,,,,,,,,,-15.23895
740,2024-01-31 20:00:00,0.0,,,,,,,,,...,,,,,,,,,,-15.22125
741,2024-01-31 21:00:00,0.0,,,,,,,,,...,,,,,,,,,,-15.04375
742,2024-01-31 22:00:00,0.0,,,,,,,,,...,,,,,,,,,,-15.15271


In [202]:
# Calculate the average energy produced by the best 20% of inverters
# that have been working non-stop for at least an hour in conditions where POA Irradiance > 50
def compute_avg(df):
    inverter_cols = [col for col in df.columns if col.startswith("Inverter_")]
    # Initialize a Series to store the average value of top 20% inverter values
    avg_top_20_series = pd.Series(index=df.index, dtype="float64")
    for index, row in df.iterrows():
        if row["POA Irradiance"] > 50:
            candidate_values = row[inverter_cols].dropna()
            positive_values = candidate_values[candidate_values > 0]
            if not positive_values.empty:
                # 80% of the values in positive_values are less than or equal to the 80th percentile value
                percentile_80 = np.percentile(positive_values, 80)
                # Select values greater than or equal to the 80th percentile
                top_20 = positive_values[positive_values >= percentile_80]
                if not top_20.empty:
                    avg_top_20_series.at[index] = top_20.mean()

    return avg_top_20_series

avg_top_20_series = compute_avg(df)


In [203]:
def new_copy(df):
    df_new = df.copy()
    df_new = df_new.rename(
        columns=lambda x: (
            x.replace("Inverter_", "INV_") if x.startswith("Inverter_") else x
        )
    )
    return df_new


def process_inverter(df_new):
    df_new["Avg_Top_20%"] = avg_top_20_series
    inverter_cols = [col for col in df.columns if col.startswith("INV_")]

    def replace_values(row):
        if row['POA Irradiance'] > 50:
            for col in inverter_cols:
                if row[col] <= 0:
                    row[col] = row['Avg_Top_20%']
        return row

    df_new = df_new.apply(replace_values, axis=1)
    return df_new


df_new = new_copy(df)
df_new = process_inverter(df_new)
df_new

Unnamed: 0,Timestamp,POA Irradiance,INV_1,INV_2,INV_3,INV_4,INV_5,INV_6,INV_7,INV_8,...,INV_158,INV_159,INV_160,INV_161,INV_162,INV_163,INV_164,INV_165,INV_166,Avg_Top_20%
0,2024-01-01 00:00:00,0.0,,,,,,,,,...,,,,,,,,,-13.87813,
1,2024-01-01 01:00:00,0.0,,,,,,,,,...,,,,,,,,,-13.85500,
2,2024-01-01 02:00:00,0.0,,,,,,,,,...,,,,,,,,,-13.84791,
3,2024-01-01 03:00:00,0.0,,,,,,,,,...,,,,,,,,,-13.87750,
4,2024-01-01 04:00:00,0.0,,,,,,,,,...,,,,,,,,,-13.82208,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2024-01-31 19:00:00,0.0,,,,,,,,,...,,,,,,,,,-15.23895,
740,2024-01-31 20:00:00,0.0,,,,,,,,,...,,,,,,,,,-15.22125,
741,2024-01-31 21:00:00,0.0,,,,,,,,,...,,,,,,,,,-15.04375,
742,2024-01-31 22:00:00,0.0,,,,,,,,,...,,,,,,,,,-15.15271,


In [204]:
def sum_inv(df, col_name):
    inverter_cols = [col for col in df.columns if col.startswith("Inverter_")]
    df[col_name] = df[inverter_cols].sum(axis=1)
    return df

df = sum_inv(df, "Actual Sum")
df_new = sum_inv(df, "Expected Sum")

  df[col_name] = df[inverter_cols].sum(axis=1)
  df[col_name] = df[inverter_cols].sum(axis=1)


In [205]:
df = df[df["POA Irradiance"] > 50]
df

Unnamed: 0,Timestamp,POA Irradiance,Inverter_1,Inverter_2,Inverter_3,Inverter_4,Inverter_5,Inverter_6,Inverter_7,Inverter_8,...,Inverter_159,Inverter_160,Inverter_161,Inverter_162,Inverter_163,Inverter_164,Inverter_165,Inverter_166,Actual Sum,Expected Sum
8,2024-01-01 08:00:00,61.17756,3.839833,4.048021,3.992563,3.516438,3.763188,4.120250,3.608229,3.739021,...,3.557062,3.679917,3.762250,3.652688,3.784479,3.817833,3.542500,626.7177,1230.699929,1230.699929
9,2024-01-01 09:00:00,135.17960,10.744840,10.792870,10.790870,10.505790,10.652310,10.941440,10.630390,10.733790,...,9.294583,10.066790,9.431979,9.928249,9.628187,10.159980,10.030040,1676.3010,3322.484765,3322.484765
10,2024-01-01 10:00:00,221.19310,19.137290,18.979590,19.057860,18.961830,19.003390,19.167060,19.059830,18.981650,...,18.595940,18.656350,18.665230,18.601790,18.944690,18.700270,18.670000,3201.6180,6371.541580,6371.541580
11,2024-01-01 11:00:00,490.18210,33.183810,32.619650,32.783310,32.859150,32.729920,32.547880,32.877230,34.346890,...,30.457620,30.471560,30.547230,30.637920,30.882630,30.606520,30.723560,5499.0410,11007.016340,11007.016340
12,2024-01-01 12:00:00,683.27510,44.254460,43.968980,44.494750,45.095580,44.968250,44.872380,45.474190,45.667040,...,43.912170,46.591580,43.443100,46.428040,43.505880,46.387170,46.248540,6986.6360,14027.013850,14027.013850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,2024-01-31 11:00:00,131.83110,8.230790,8.232667,8.206814,7.804292,8.177230,8.437853,8.227791,8.190875,...,8.107084,8.142395,8.135374,8.234209,8.398626,8.190042,8.201167,1256.5150,2500.262497,2500.262497
732,2024-01-31 12:00:00,181.00160,10.843540,10.805380,10.772750,9.738002,10.760250,9.893750,10.857250,10.844460,...,10.492480,10.644580,10.517890,10.739190,10.801310,10.679750,10.742480,1641.4080,3262.921381,3262.921381
733,2024-01-31 13:00:00,225.45280,13.318730,13.136610,13.140960,13.234040,13.145710,13.339690,13.246400,13.255730,...,12.807710,12.969670,12.820560,13.050420,13.125480,13.003290,13.089080,2006.3380,3998.864767,3998.864767
734,2024-01-31 14:00:00,151.84790,9.460689,9.290124,9.277729,9.353021,9.269166,9.479166,9.332000,9.375374,...,8.818624,8.998855,8.824291,9.123916,9.088457,9.078855,9.082168,1370.3600,2734.792246,2734.792246


In [206]:
df_new = df_new[df_new["POA Irradiance"] > 50]
df_new

Unnamed: 0,Timestamp,POA Irradiance,Inverter_1,Inverter_2,Inverter_3,Inverter_4,Inverter_5,Inverter_6,Inverter_7,Inverter_8,...,Inverter_159,Inverter_160,Inverter_161,Inverter_162,Inverter_163,Inverter_164,Inverter_165,Inverter_166,Actual Sum,Expected Sum
8,2024-01-01 08:00:00,61.17756,3.839833,4.048021,3.992563,3.516438,3.763188,4.120250,3.608229,3.739021,...,3.557062,3.679917,3.762250,3.652688,3.784479,3.817833,3.542500,626.7177,1230.699929,1230.699929
9,2024-01-01 09:00:00,135.17960,10.744840,10.792870,10.790870,10.505790,10.652310,10.941440,10.630390,10.733790,...,9.294583,10.066790,9.431979,9.928249,9.628187,10.159980,10.030040,1676.3010,3322.484765,3322.484765
10,2024-01-01 10:00:00,221.19310,19.137290,18.979590,19.057860,18.961830,19.003390,19.167060,19.059830,18.981650,...,18.595940,18.656350,18.665230,18.601790,18.944690,18.700270,18.670000,3201.6180,6371.541580,6371.541580
11,2024-01-01 11:00:00,490.18210,33.183810,32.619650,32.783310,32.859150,32.729920,32.547880,32.877230,34.346890,...,30.457620,30.471560,30.547230,30.637920,30.882630,30.606520,30.723560,5499.0410,11007.016340,11007.016340
12,2024-01-01 12:00:00,683.27510,44.254460,43.968980,44.494750,45.095580,44.968250,44.872380,45.474190,45.667040,...,43.912170,46.591580,43.443100,46.428040,43.505880,46.387170,46.248540,6986.6360,14027.013850,14027.013850
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
731,2024-01-31 11:00:00,131.83110,8.230790,8.232667,8.206814,7.804292,8.177230,8.437853,8.227791,8.190875,...,8.107084,8.142395,8.135374,8.234209,8.398626,8.190042,8.201167,1256.5150,2500.262497,2500.262497
732,2024-01-31 12:00:00,181.00160,10.843540,10.805380,10.772750,9.738002,10.760250,9.893750,10.857250,10.844460,...,10.492480,10.644580,10.517890,10.739190,10.801310,10.679750,10.742480,1641.4080,3262.921381,3262.921381
733,2024-01-31 13:00:00,225.45280,13.318730,13.136610,13.140960,13.234040,13.145710,13.339690,13.246400,13.255730,...,12.807710,12.969670,12.820560,13.050420,13.125480,13.003290,13.089080,2006.3380,3998.864767,3998.864767
734,2024-01-31 14:00:00,151.84790,9.460689,9.290124,9.277729,9.353021,9.269166,9.479166,9.332000,9.375374,...,8.818624,8.998855,8.824291,9.123916,9.088457,9.078855,9.082168,1370.3600,2734.792246,2734.792246


In [207]:
selected_1 = df[["Timestamp", "POA Irradiance", "Actual Sum"]]
selected_2 = df_new[["Timestamp", "Expected Sum"]]
df_merged = pd.merge(selected_1, selected_2, on="Timestamp", how="inner")
df_merged["Availability %"] = df_merged["Actual Sum"] / df_merged["Expected Sum"] * 100
df_merged.to_csv(f"../output/{site_name}.csv", index=False)

In [208]:
availability = df_merged["Availability %"].mean()
availability

100.0