In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
def detect_separator(line):
    """Guess whether a header line is delimited by "," or ";".

    Delimiters are counted only outside double-quoted fields, so a
    semicolon-delimited header whose column names contain commas
    (e.g. ``"Inverter - 1.1, Line kW"``) is not mistaken for a comma file.

    Args:
        line: The header line as read from the file.

    Returns:
        "," or ";". On a tie the result is "," when the line contains a comma
        anywhere and ";" otherwise.
    """
    counts = {",": 0, ";": 0}
    in_quotes = False
    for char in line:
        if char == '"':
            in_quotes = not in_quotes
        elif not in_quotes and char in counts:
            counts[char] += 1

    if counts[","] == counts[";"]:
        # Tie (including "no delimiter found outside quotes"): a comma anywhere wins.
        return "," if "," in line else ";"

    return "," if counts[","] > counts[";"] else ";"


def read_site(file_path):
    """Load a site export into a DataFrame, skipping any lines above the header.

    The header is the first of the file's first 10 lines that contains the word
    "timestamp" (in any case). Its separator is guessed with
    ``detect_separator`` and everything above it is skipped.

    Args:
        file_path: Path of the delimited text file to read.

    Returns:
        The parsed DataFrame. If the first cell of the first data row is
        missing, that row is dropped and the index is renumbered from 0.

    Raises:
        ValueError: If none of the first 10 lines contains "timestamp".
    """
    # Read at most the first 10 lines; zip stops before pulling an 11th line.
    with open(file_path, "r") as f:
        lines = [line for _, line in zip(range(10), f)]

    # The first line where the word "timestamp" (in any case) appears is the header line
    header_index = next(
        (i for i, line in enumerate(lines) if "timestamp" in line.lower()), None
    )

    if header_index is None:
        # Raise instead of exit(): exit() would shut down the notebook kernel.
        raise ValueError("Header not found in the file.")

    separator = detect_separator(lines[header_index])
    # The index of the found header is exactly the number of rows to skip when reading the data
    df = pd.read_csv(file_path, skiprows=header_index, header=0, sep=separator)

    # A row directly below the header with an empty first cell is treated as an
    # extra unit row and removed. The emptiness guard avoids an IndexError on
    # files that contain a header but no data.
    if not df.empty and pd.isna(df.iloc[0, 0]):
        df = df.iloc[1:].reset_index(drop=True)

    return df

In [5]:
# Load one month of raw data for a single site; the path is relative to the
# directory the notebook is run from.
file_path = "../data/2024-01/2024-01_Chiloquin Solar Farm.csv"
df = read_site(file_path)
df

Unnamed: 0,Timestamp,POA*,"Sungrow 60kW Inverter - 1.1, Line kW","Sungrow 60kW Inverter - 1.2, Line kW","Sungrow 60kW Inverter - 1.3, Line kW","Sungrow 60kW Inverter - 1.4, Line kW","Sungrow 60kW Inverter - 1.5, Line kW","Sungrow 60kW Inverter - 1.6, Line kW","Sungrow 60kW Inverter - 1.7, Line kW","Sungrow 60kW Inverter - 1.8, Line kW",...,"Sungrow 60kW Inverter - 11.7, Line kW","Sungrow 60kW Inverter - 11.8, Line kW","Sungrow 60kW Inverter - 11.9, Line kW","Sungrow 60kW Inverter - 11.10, Line kW","Sungrow 60kW Inverter - 11.11, Line kW","Sungrow 60kW Inverter - 11.12, Line kW","Sungrow 60kW Inverter - 11.13, Line kW","Sungrow 60kW Inverter - 11.14, Line kW","Sungrow 60kW Inverter - 11.15, Line kW",Production meter active power
0,2024-01-01 00:00:00,0,,,,,,,,,...,,,,,,,,,,-13.87813
1,2024-01-01 01:00:00,0,,,,,,,,,...,,,,,,,,,,-13.855
2,2024-01-01 02:00:00,0,,,,,,,,,...,,,,,,,,,,-13.84791
3,2024-01-01 03:00:00,0,,,,,,,,,...,,,,,,,,,,-13.8775
4,2024-01-01 04:00:00,0,,,,,,,,,...,,,,,,,,,,-13.82208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2024-01-31 19:00:00,0,,,,,,,,,...,,,,,,,,,,-15.23895
740,2024-01-31 20:00:00,0,,,,,,,,,...,,,,,,,,,,-15.22125
741,2024-01-31 21:00:00,0,,,,,,,,,...,,,,,,,,,,-15.04375
742,2024-01-31 22:00:00,0,,,,,,,,,...,,,,,,,,,,-15.15271


In [6]:
# Derive both identifiers from the file name alone ("<year-month>_<site name>.csv"),
# so underscores in parent directories cannot leak into the result.
file_stem = os.path.splitext(os.path.basename(file_path))[0]
# Split on the first underscore only: the site name itself may contain underscores.
year_month, _separator, site_name = file_stem.partition("_")
year_month

'2024-01'

In [7]:
def find_keywords(column, keywords_list):
    """Tell whether a column name matches at least one keyword group.

    A group matches when every keyword in it occurs in ``column``; the
    comparison ignores case.

    Args:
        column: Column name to inspect.
        keywords_list: List of keyword groups, each a list of strings.

    Returns:
        True if any group matches completely, otherwise False.
    """
    return any(
        all(word.lower() in column.lower() for word in group)
        for group in keywords_list
    )


def column_basic(df):
    """Give the timestamp, irradiance and meter columns their standard names.

    For each standard name, the first column whose name matches one of its
    keyword groups (see ``find_keywords``) is renamed to it. When no column
    matches, a new all-NaN column with the standard name is added instead.

    Args:
        df: Frame to standardise. It is modified in place.

    Returns:
        The same ``df`` object, after renaming.
    """
    standard_names = {
        "Timestamp": [["timestamp"]],
        "POA Irradiance": [["poa"]],
        "Meter Power": [["meter", "power"], ["electric", "power"]]
    }

    pending_renames = {}
    for standard_name, keyword_groups in standard_names.items():
        # First matching column wins; later matches are left untouched.
        matched_column = next(
            (name for name in df.columns if find_keywords(name, keyword_groups)),
            None,
        )
        if matched_column is None:
            df[standard_name] = np.nan
        else:
            pending_renames[matched_column] = standard_name

    # All renames are applied together once every standard name has been resolved.
    df.rename(columns=pending_renames, inplace=True)

    return df

# Maps each generated "Inverter_N" name back to the original column name.
# Used for renaming cols to their original names in the end of the processing.
# NOTE(review): entries are only ever added, so names from an earlier call
# remain in this dict on later calls.
name_mapping = {}
def column_inverter(df):
    """Rename every column that is not a standard column to "Inverter_N".

    N starts at 1 and follows the current column order. All renames are applied
    in a single call, so an original column that is itself called e.g.
    "Inverter_1" cannot collide with a name generated earlier in the pass.

    Args:
        df: Frame whose columns are renamed. It is modified in place.

    Returns:
        The same ``df`` object. As a side effect, ``name_mapping`` gains one
        ``new name -> original name`` entry per renamed column.
    """
    known_columns = {
        "Timestamp",
        "POA Irradiance",
        "Meter Power",
    }

    renames = {}
    for col in df.columns:
        # Skip standard columns and labels that were already assigned a number.
        if col in known_columns or col in renames:
            continue
        renames[col] = "Inverter_" + str(len(renames) + 1)

    df.rename(columns=renames, inplace=True)
    name_mapping.update({new_name: col for col, new_name in renames.items()})

    return df


def column_reorder(df):
    """Return the frame with standard columns first and inverters in numeric order.

    Only "Timestamp", "POA Irradiance", "Meter Power" and the columns whose
    name contains "Inverter" are kept. Inverter columns are ordered by the
    integer that follows the first underscore in their name.

    Args:
        df: Frame with standardised column names.

    Returns:
        A new frame restricted to, and ordered by, the columns described above.
    """
    leading = ["Timestamp", "POA Irradiance", "Meter Power"]

    inverters = [name for name in df.columns if "Inverter" in name]
    # Numeric sort so that Inverter_10 comes after Inverter_2.
    inverters.sort(key=lambda name: int(name.split("_")[1]))

    return df[leading + inverters]


def rename(df):
    """Standardise column names by running the three renaming steps in order.

    The steps are ``column_basic``, then ``column_inverter``, then
    ``column_reorder``; each receives the result of the previous one.

    Args:
        df: Raw frame as loaded from the site file.

    Returns:
        The frame returned by ``column_reorder``.
    """
    with_standard_names = column_basic(df)
    with_inverter_names = column_inverter(with_standard_names)
    return column_reorder(with_inverter_names)

# Standardise the column names of the loaded frame and display the result.
df = rename(df)
df

Unnamed: 0,Timestamp,POA Irradiance,Meter Power,Inverter_1,Inverter_2,Inverter_3,Inverter_4,Inverter_5,Inverter_6,Inverter_7,...,Inverter_156,Inverter_157,Inverter_158,Inverter_159,Inverter_160,Inverter_161,Inverter_162,Inverter_163,Inverter_164,Inverter_165
0,2024-01-01 00:00:00,0,-13.87813,,,,,,,,...,,,,,,,,,,
1,2024-01-01 01:00:00,0,-13.855,,,,,,,,...,,,,,,,,,,
2,2024-01-01 02:00:00,0,-13.84791,,,,,,,,...,,,,,,,,,,
3,2024-01-01 03:00:00,0,-13.8775,,,,,,,,...,,,,,,,,,,
4,2024-01-01 04:00:00,0,-13.82208,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2024-01-31 19:00:00,0,-15.23895,,,,,,,,...,,,,,,,,,,
740,2024-01-31 20:00:00,0,-15.22125,,,,,,,,...,,,,,,,,,,
741,2024-01-31 21:00:00,0,-15.04375,,,,,,,,...,,,,,,,,,,
742,2024-01-31 22:00:00,0,-15.15271,,,,,,,,...,,,,,,,,,,


In [8]:
# Parse the "Timestamp" strings into datetime values
def custom_to_datetime(df):
    """Convert the "Timestamp" column to datetimes using the first format that fits.

    The candidate formats are tried in the order listed; a format is accepted
    when pandas parses the whole column with it without raising ValueError.

    Args:
        df: Frame with a "Timestamp" column. It is modified in place.

    Returns:
        The same ``df`` object with "Timestamp" converted.

    Raises:
        ValueError: If none of the candidate formats parses the column.
    """
    candidate_formats = (
        "%m/%d/%Y %H:%M:%S",
        "%m/%d/%y %H:%M:%S",
        "%m/%d/%y %H:%M",
        "%m/%d/%Y %I:%M:%S %p",
        "%m-%d-%Y %H:%M:%S",
        "%m-%d-%y %H:%M:%S",
        "%m-%d-%Y %H:%M",
        "%m-%d-%y %H:%M",
        "%Y-%m-%d %H:%M:%S",
        "%d/%m/%Y %H:%M:%S",
        "%m/%d/%Y %H:%M",
        "%Y-%m-%d %H:%M",
    )

    for pattern in candidate_formats:
        try:
            df["Timestamp"] = pd.to_datetime(df["Timestamp"], format=pattern)
        except ValueError:
            # This pattern does not fit the data; move on to the next one.
            continue
        else:
            return df

    # Every candidate was rejected.
    raise ValueError("No suitable format found for the 'Timestamp' column.")


def normalize(df):
    """Coerce every non-timestamp column to numbers and parse the timestamps.

    Values that cannot be read as numbers become NaN. The "Timestamp" column
    is then converted with ``custom_to_datetime``.

    Args:
        df: Frame with standardised column names. It is modified in place.

    Returns:
        The frame returned by ``custom_to_datetime``.
    """
    # Everything except the timestamp column is expected to be numeric.
    numeric_columns = [name for name in df.columns if name != "Timestamp"]
    as_numbers = df[numeric_columns].apply(pd.to_numeric, errors="coerce")
    df[numeric_columns] = as_numbers

    return custom_to_datetime(df)

# Convert the measurement columns to numbers and the timestamps to datetimes.
df = normalize(df)
df

Unnamed: 0,Timestamp,POA Irradiance,Meter Power,Inverter_1,Inverter_2,Inverter_3,Inverter_4,Inverter_5,Inverter_6,Inverter_7,...,Inverter_156,Inverter_157,Inverter_158,Inverter_159,Inverter_160,Inverter_161,Inverter_162,Inverter_163,Inverter_164,Inverter_165
0,2024-01-01 00:00:00,0.0,-13.87813,,,,,,,,...,,,,,,,,,,
1,2024-01-01 01:00:00,0.0,-13.85500,,,,,,,,...,,,,,,,,,,
2,2024-01-01 02:00:00,0.0,-13.84791,,,,,,,,...,,,,,,,,,,
3,2024-01-01 03:00:00,0.0,-13.87750,,,,,,,,...,,,,,,,,,,
4,2024-01-01 04:00:00,0.0,-13.82208,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2024-01-31 19:00:00,0.0,-15.23895,,,,,,,,...,,,,,,,,,,
740,2024-01-31 20:00:00,0.0,-15.22125,,,,,,,,...,,,,,,,,,,
741,2024-01-31 21:00:00,0.0,-15.04375,,,,,,,,...,,,,,,,,,,
742,2024-01-31 22:00:00,0.0,-15.15271,,,,,,,,...,,,,,,,,,,


In [9]:
# For each row where POA Irradiance exceeds a threshold, average the positive
# inverter readings that lie at or above the row's 80th percentile.
# NOTE(review): an earlier description also mentioned inverters "working non-stop
# for at least an hour"; no such check is implemented here.
def compute_avg(df, poa_threshold=50, percentile=80):
    """Average the top-performing inverter readings for each sufficiently sunny row.

    For every row whose "POA Irradiance" is greater than ``poa_threshold``, the
    strictly positive values of the "Inverter_*" columns are collected, their
    ``percentile``-th percentile is computed, and the mean of the values at or
    above that percentile is stored for the row.

    Args:
        df: Frame with a "POA Irradiance" column and numeric "Inverter_*" columns.
        poa_threshold: Rows at or below this irradiance, or with missing
            irradiance, are skipped. Defaults to 50.
        percentile: Percentile (0-100) that marks the lower bound of the "top"
            readings. Defaults to 80, i.e. the top 20%.

    Returns:
        A float64 Series on ``df.index``; rows that were skipped or have no
        positive inverter reading hold NaN.
    """
    inverter_cols = [col for col in df.columns if col.startswith("Inverter_")]
    averages = np.full(len(df), np.nan)
    if not inverter_cols:
        return pd.Series(averages, index=df.index, dtype="float64")

    readings = df[inverter_cols].to_numpy(dtype="float64", na_value=np.nan)
    # Missing irradiance compares as False, so those rows are skipped.
    sunny = (
        df["POA Irradiance"].gt(poa_threshold).fillna(False).to_numpy(dtype=bool)
    )
    # Only strictly positive readings take part; the rest are masked out as NaN.
    positive = np.where(readings > 0, readings, np.nan)
    usable = sunny & ~np.isnan(positive).all(axis=1)

    if usable.any():
        candidates = positive[usable]
        cutoff = np.nanpercentile(candidates, percentile, axis=1, keepdims=True)
        top = np.where(candidates >= cutoff, candidates, np.nan)
        # Each usable row keeps at least its maximum, so the mean is defined.
        averages[usable] = np.nanmean(top, axis=1)

    return pd.Series(averages, index=df.index, dtype="float64")

# Per-row benchmark value; process_inverter reads this notebook-level variable.
avg_top_20_series = compute_avg(df)
avg_top_20_series

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
739   NaN
740   NaN
741   NaN
742   NaN
743   NaN
Length: 744, dtype: float64

In [10]:
def new_copy(df):
    """Return a deep copy of the frame with "Inverter_*" columns renamed to "INV_*".

    Columns whose name does not start with "Inverter_" keep their name, and the
    input frame is left untouched.

    Args:
        df: Frame to copy.

    Returns:
        An independent copy with the renamed inverter columns.
    """
    short_names = {
        name: name.replace("Inverter", "INV")
        for name in df.columns
        if name.startswith("Inverter_")
    }
    return df.copy(deep=True).rename(columns=short_names)


def process_inverter(df_new, avg_top_20=None):
    """Replace missing or non-positive inverter readings with the row's benchmark.

    The benchmark is stored in a new "Avg_Top_20%" column. In every "INV_*"
    column, values that are NaN or <= 0 are overwritten with the benchmark of
    the same row, but only where that benchmark is not NaN.

    Args:
        df_new: Frame with "INV_*" columns and "POA Irradiance". It is modified
            in place.
        avg_top_20: Series of per-row benchmark values aligned with
            ``df_new.index``. When omitted, the notebook-level
            ``avg_top_20_series`` is used, which keeps existing calls working.

    Returns:
        The same ``df_new`` object. As a side effect, the first rows with
        "POA Irradiance" > 0 are printed for the inverter columns.
    """
    if avg_top_20 is None:
        # Backward-compatible fallback to the variable defined earlier in the notebook.
        avg_top_20 = avg_top_20_series

    df_new["Avg_Top_20%"] = avg_top_20
    inverter_cols = [col for col in df_new.columns if col.startswith("INV_")]

    # Both are the same for every inverter column, so compute them once.
    benchmark = df_new["Avg_Top_20%"]
    has_benchmark = benchmark.notna()

    for col in inverter_cols:
        mask = (df_new[col].le(0) | df_new[col].isna()) & has_benchmark
        df_new.loc[mask, col] = benchmark[mask]

    print(df_new[df_new["POA Irradiance"] > 0][inverter_cols].head())

    return df_new


# Work on an independent copy so the measured values in `df` stay untouched,
# then fill the gaps in the copy.
df_new = new_copy(df)
df_new = process_inverter(df_new)

        INV_1      INV_2      INV_3      INV_4      INV_5      INV_6  \
7    0.123000   0.176528   0.168750   0.065194   0.126194   0.197056   
8    3.839833   4.048021   3.992563   3.516438   3.763188   4.120250   
9   10.744840  10.792870  10.790870  10.505790  10.652310  10.941440   
10  19.137290  18.979590  19.057860  18.961830  19.003390  19.167060   
11  33.183810  32.619650  32.783310  32.859150  32.729920  32.547880   

        INV_7      INV_8      INV_9     INV_10  ...    INV_156    INV_157  \
7    0.085833   0.113306   0.159722   0.119444  ...   0.196139   0.134583   
8    3.608229   3.739021   3.971521   3.790917  ...   4.148667   3.713438   
9   10.630390  10.733790  10.852020  10.894350  ...  10.570560   9.424582   
10  19.059830  18.981650  19.239860  19.079540  ...  19.326250  18.770900   
11  32.877230  34.346890  32.118080  35.561250  ...  31.522020  30.583670   

      INV_158    INV_159    INV_160    INV_161    INV_162    INV_163  \
7    0.154333   0.099417   0.114

In [11]:
def sum_inv(df, col_name, inv_starter):
    """Add a column with the row-wise total of all columns sharing a name prefix.

    Args:
        df: Frame to extend. It is modified in place.
        col_name: Name of the column that receives the totals.
        inv_starter: Prefix that selects the columns to add up.

    Returns:
        The same ``df`` object with ``col_name`` set.
    """
    matching = list(filter(lambda name: name.startswith(inv_starter), df.columns))
    row_totals = df[matching].sum(axis=1)
    df[col_name] = row_totals
    return df

# Row-wise inverter totals: measured values in `df`, gap-filled values in `df_new`.
df = sum_inv(df, "Actual Sum", "Inverter")
df_new = sum_inv(df_new, "Expected Sum", "INV")
# NOTE(review): writes a scratch CSV (index column included) next to the
# notebook — looks like a debugging aid; confirm it is still needed.
df_new.to_csv("./test.csv")


  df[col_name] = df[inverter_cols].sum(axis=1)


In [12]:
selected_1 = df[["Timestamp", "POA Irradiance", "Actual Sum"]]
selected_2 = df_new[["Timestamp", "Expected Sum"]]
# df_new was created as a copy of df, so both frames share the same row index.
# Joining on that index pairs each row with its own expected value exactly once;
# an inner merge on "Timestamp" would multiply rows whenever a timestamp repeats.
df_merged = selected_1.join(selected_2[["Expected Sum"]])
# Share of the gap-filled (expected) total that was actually measured, in percent.
df_merged["Availability %"] = (
    df_merged["Actual Sum"] / df_merged["Expected Sum"] * 100
).round(2)
# One output folder per month, one CSV per site.
output_dir = f"../output/{year_month}"
os.makedirs(output_dir, exist_ok=True)
df_merged.to_csv(f"{output_dir}/{site_name}.csv", index=False)

In [13]:
# Mean of the per-row availability percentages; Series.mean skips NaN rows by default.
availability = df_merged["Availability %"].mean()
availability

98.02674922600619

In [14]:
import pandas as pd

# Empty summary table; update_summary (defined below) adds or refreshes one row
# per site and month.
summary = pd.DataFrame(columns=["Site Name", "Year-Month", "Availability"])


def update_summary(summary, site_name, year_month, availability):
    """Insert or refresh the availability of one site for one month.

    Args:
        summary: Frame with "Site Name", "Year-Month" and "Availability" columns.
        site_name: Site the value belongs to.
        year_month: Month the value belongs to.
        availability: Availability value to store.

    Returns:
        The summary with the value stored. An existing row for the same site
        and month is updated in place and the same object is returned;
        otherwise a new frame with one extra row is returned.
    """
    mask = (summary["Site Name"] == site_name) & (summary["Year-Month"] == year_month)
    if mask.any():
        summary.loc[mask, "Availability"] = availability
        return summary

    new_row = pd.DataFrame(
        [
            {
                "Site Name": site_name,
                "Year-Month": year_month,
                "Availability": availability,
            }
        ]
    )

    if summary.empty:
        # Concatenating onto an empty frame triggers a pandas FutureWarning, so
        # return the new row directly, keeping the summary's column order.
        columns = list(summary.columns) + [
            col for col in new_row.columns if col not in summary.columns
        ]
        return new_row.reindex(columns=columns)

    return pd.concat([summary, new_row], ignore_index=True)


# Record this site's monthly availability in the summary table and display it.
summary = update_summary(summary, site_name, year_month, availability)
summary

  summary = pd.concat([summary, new_row], ignore_index=True)


Unnamed: 0,Site Name,Year-Month,Availability
0,Chiloquin Solar Farm,2024-01,98.026749
