In [4]:
import pandas as pd
import numpy as np
import os

In [5]:
def detect_separator(line):
    if "," in line:
        return ","

    return ";"


def read_site(file_path):
    lines = []

    # Read only the first 10 lines into the 'lines' list
    with open(file_path, "r") as f:
        for _ in range(10):
            line = f.readline()
            if not line:
                break
            lines.append(line)

    # The first line where the word "timpestamps" (in any case) appears is the header line
    header_index = next(
        (i for i, line in enumerate(lines) if "timestamp" in line.lower()), None
    )

    if header_index is None:
        error_message = "Header not found in the file."
        print(error_message)
        exit()

    separator = detect_separator(lines[header_index])
    # The index of the found header is exactly the number of rows to skip when reading the data
    df = pd.read_csv(file_path, skiprows=header_index, header=0, sep=separator)
    # print(df.loc[0,:])

    # Check whether the second row in the dataframe is an extra unit row
    if pd.isna(df.iloc[0, 0]):
        df.drop(index=0, inplace=True)
        df.reset_index(
            drop=True, inplace=True
        )  # Resetting the index after dropping the row

    return df

In [6]:
file_path = "../data/2024-01/2024-01_Chiloquin Solar Farm.csv"
df = read_site(file_path)
df

Unnamed: 0,Timestamp,POA*,"Sungrow 60kW Inverter - 1.1, Line kW","Sungrow 60kW Inverter - 1.2, Line kW","Sungrow 60kW Inverter - 1.3, Line kW","Sungrow 60kW Inverter - 1.4, Line kW","Sungrow 60kW Inverter - 1.5, Line kW","Sungrow 60kW Inverter - 1.6, Line kW","Sungrow 60kW Inverter - 1.7, Line kW","Sungrow 60kW Inverter - 1.8, Line kW",...,"Sungrow 60kW Inverter - 11.7, Line kW","Sungrow 60kW Inverter - 11.8, Line kW","Sungrow 60kW Inverter - 11.9, Line kW","Sungrow 60kW Inverter - 11.10, Line kW","Sungrow 60kW Inverter - 11.11, Line kW","Sungrow 60kW Inverter - 11.12, Line kW","Sungrow 60kW Inverter - 11.13, Line kW","Sungrow 60kW Inverter - 11.14, Line kW","Sungrow 60kW Inverter - 11.15, Line kW",Production meter active power
0,2024-01-01 00:00:00,0,,,,,,,,,...,,,,,,,,,,-13.87813
1,2024-01-01 01:00:00,0,,,,,,,,,...,,,,,,,,,,-13.855
2,2024-01-01 02:00:00,0,,,,,,,,,...,,,,,,,,,,-13.84791
3,2024-01-01 03:00:00,0,,,,,,,,,...,,,,,,,,,,-13.8775
4,2024-01-01 04:00:00,0,,,,,,,,,...,,,,,,,,,,-13.82208
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2024-01-31 19:00:00,0,,,,,,,,,...,,,,,,,,,,-15.23895
740,2024-01-31 20:00:00,0,,,,,,,,,...,,,,,,,,,,-15.22125
741,2024-01-31 21:00:00,0,,,,,,,,,...,,,,,,,,,,-15.04375
742,2024-01-31 22:00:00,0,,,,,,,,,...,,,,,,,,,,-15.15271


In [7]:
site_name = file_path.split("_")[-1].replace(".csv", "")
year_month = file_path.split("_")[0].split("/")[-1]
year_month

'2024-01'

In [8]:
def find_keywords(column, keywords_list):
    for keywords in keywords_list:
        if all(keyword.lower() in column.lower() for keyword in keywords):
            return True
    return False


def column_basic(df):
    keyword_mapping = {
        "Timestamp": [["timestamp"]],
        "POA Irradiance": [["poa"]],
        "Meter Power": [["meter", "power"], ["electric", "power"]]
    }

    rename_mapping = {}
    for new_name, keywords_list in keyword_mapping.items():
        found = False
        for col in df.columns:
            found = find_keywords(col, keywords_list)
            if found:
                rename_mapping[col] = new_name
                break
        if not found:
            df[new_name] = np.nan

    df.rename(columns=rename_mapping, inplace=True)

    return df

name_mapping = {}
def column_inverter(df):
    known_columns = {
        "Timestamp",
        "POA Irradiance",
        "Meter Power",
    }
    inverter_index = 1

    for col in df.columns:
        if col not in known_columns:
            new_name = "Inverter_" + str(inverter_index)
            df.rename(columns={col: new_name}, inplace=True)
            # Used for renaming cols to their original names in the end of the processing
            name_mapping[new_name] = col
            inverter_index += 1

    return df


def column_reorder(df):
    inverter_columns = sorted(
        (col for col in df.columns if "Inverter" in col),
        key=lambda s: int(s.split("_")[1]),
    )

    columns_order = [
        "Timestamp",
        "POA Irradiance",
        "Meter Power"
    ] + inverter_columns
    df = df[columns_order]

    return df


def rename(df):
    return (
        df.pipe(column_basic)
        .pipe(column_inverter)
        .pipe(column_reorder)
    )

df = rename(df)
df

Unnamed: 0,Timestamp,POA Irradiance,Meter Power,Inverter_1,Inverter_2,Inverter_3,Inverter_4,Inverter_5,Inverter_6,Inverter_7,...,Inverter_156,Inverter_157,Inverter_158,Inverter_159,Inverter_160,Inverter_161,Inverter_162,Inverter_163,Inverter_164,Inverter_165
0,2024-01-01 00:00:00,0,-13.87813,,,,,,,,...,,,,,,,,,,
1,2024-01-01 01:00:00,0,-13.855,,,,,,,,...,,,,,,,,,,
2,2024-01-01 02:00:00,0,-13.84791,,,,,,,,...,,,,,,,,,,
3,2024-01-01 03:00:00,0,-13.8775,,,,,,,,...,,,,,,,,,,
4,2024-01-01 04:00:00,0,-13.82208,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2024-01-31 19:00:00,0,-15.23895,,,,,,,,...,,,,,,,,,,
740,2024-01-31 20:00:00,0,-15.22125,,,,,,,,...,,,,,,,,,,
741,2024-01-31 21:00:00,0,-15.04375,,,,,,,,...,,,,,,,,,,
742,2024-01-31 22:00:00,0,-15.15271,,,,,,,,...,,,,,,,,,,


In [9]:
# Convert the datetime string to a datetime object
def custom_to_datetime(df):
    formats = [
        "%m/%d/%Y %H:%M:%S",
        "%m/%d/%y %H:%M:%S",
        "%m/%d/%y %H:%M",
        "%m/%d/%Y %I:%M:%S %p",
        "%m-%d-%Y %H:%M:%S",
        "%m-%d-%y %H:%M:%S",
        "%m-%d-%Y %H:%M",
        "%m-%d-%y %H:%M",
        "%Y-%m-%d %H:%M:%S",
        "%d/%m/%Y %H:%M:%S",
        "%m/%d/%Y %H:%M",
        "%Y-%m-%d %H:%M",
    ]

    for fmt in formats:
        try:
            df["Timestamp"] = pd.to_datetime(df["Timestamp"], format=fmt)
            return df

        except ValueError:  # if the format doesn't match, continue to the next format
            continue

    # Quit the program if no suitable format is found
    raise ValueError("No suitable format found for the 'Timestamp' column.")


def normalize(df):
    cols_to_convert = df.columns[df.columns != "Timestamp"]
    # Convert all values in columns except for Timestamp to numbers
    df[cols_to_convert] = df[cols_to_convert].apply(pd.to_numeric, errors="coerce")
    df = custom_to_datetime(df)

    return df

df = normalize(df)
df

Unnamed: 0,Timestamp,POA Irradiance,Meter Power,Inverter_1,Inverter_2,Inverter_3,Inverter_4,Inverter_5,Inverter_6,Inverter_7,...,Inverter_156,Inverter_157,Inverter_158,Inverter_159,Inverter_160,Inverter_161,Inverter_162,Inverter_163,Inverter_164,Inverter_165
0,2024-01-01 00:00:00,0.0,-13.87813,,,,,,,,...,,,,,,,,,,
1,2024-01-01 01:00:00,0.0,-13.85500,,,,,,,,...,,,,,,,,,,
2,2024-01-01 02:00:00,0.0,-13.84791,,,,,,,,...,,,,,,,,,,
3,2024-01-01 03:00:00,0.0,-13.87750,,,,,,,,...,,,,,,,,,,
4,2024-01-01 04:00:00,0.0,-13.82208,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2024-01-31 19:00:00,0.0,-15.23895,,,,,,,,...,,,,,,,,,,
740,2024-01-31 20:00:00,0.0,-15.22125,,,,,,,,...,,,,,,,,,,
741,2024-01-31 21:00:00,0.0,-15.04375,,,,,,,,...,,,,,,,,,,
742,2024-01-31 22:00:00,0.0,-15.15271,,,,,,,,...,,,,,,,,,,


In [10]:
# Calculate the average energy produced by the best 20% of inverters
# that have been working non-stop for at least an hour in conditions where POA Irradiance > 50
def compute_avg(df):
    inverter_cols = [col for col in df.columns if col.startswith("Inverter_")]
    # Initialize a Series to store the average value of top 20% inverter values when POA Irradiance > 50
    avg_top_20_series = pd.Series(index=df.index, dtype="float64")
    for index, row in df.iterrows():
        if pd.notnull(row["POA Irradiance"]) and row["POA Irradiance"] > 50:
            candidate_values = row[inverter_cols].dropna()
            positive_candidates = candidate_values[candidate_values > 0]
            if not positive_candidates.empty:
                # 80% of the values in positive_values are less than or equal to the 80th percentile value
                percentile_80 = np.percentile(positive_candidates, 80)
                # Select values greater than or equal to the 80th percentile
                top_20 = positive_candidates[positive_candidates >= percentile_80]
                if not top_20.empty:
                    avg_top_20_series.at[index] = top_20.mean()


    return avg_top_20_series

avg_top_20_series = compute_avg(df)
avg_top_20_series

0     NaN
1     NaN
2     NaN
3     NaN
4     NaN
       ..
739   NaN
740   NaN
741   NaN
742   NaN
743   NaN
Length: 744, dtype: float64

In [39]:
def validate_datetime_input(datetime_str, time_point):
    """
    Validates and returns the datetime object for a given date or datetime string.
    """
    date_time_formats = [
        "%m/%d/%Y %H:%M:%S",
        "%m/%d/%y %H:%M:%S",
        "%m/%d/%y %H:%M",
        "%m/%d/%Y %I:%M:%S %p",
        "%m-%d-%Y %H:%M:%S",
        "%m-%d-%y %H:%M:%S",
        "%m-%d-%Y %H:%M",
        "%m-%d-%y %H:%M",
        "%Y-%m-%d %H:%M:%S",
        "%d/%m/%Y %H:%M:%S",
        "%m/%d/%Y %H:%M",
        "%Y-%m-%d %H:%M"
    ]

    date_only_formats = [
        "%Y-%m-%d",  # Date with century and hyphen separator (e.g., 2023-06-30)
        "%m/%d/%Y",  # U.S. date format with century and slash separator (e.g., 06/30/2023)
        "%m-%d-%Y",  # U.S. date format with century and hyphen separator (e.g., 06-30-2023)
        "%d/%m/%Y",  # Rest-of-world date format with century and slash separator (e.g., 30/06/2023)
        "%d-%m-%Y",  # Rest-of-world date format with century and hyphen separator (e.g., 30-06-2023)
        "%Y/%m/%d",  # ISO-like date format with century and slash separator (e.g., 2023/06/30)
        "%Y%m%d",  # Compact date format with century (e.g., 20230630)
        "%m/%d/%y",  # U.S. date format without century and slash separator (e.g., 06/30/23)
        "%m-%d-%y",  # U.S. date format without century and hyphen separator (e.g., 06-30-23)
        "%d/%m/%y",  # Rest-of-world date format without century and slash separator (e.g., 30/06/23)
        "%d-%m-%y",  # Rest-of-world date format without century and hyphen separator (e.g., 30-06-23)
        "%y/%m/%d",  # ISO-like date format without century and slash separator (e.g., 23/06/30)
        "%y%m%d",  # Compact date format without century (e.g., 230630)
        "%b %d, %Y",  # Date with textual month, day and century (e.g., Jun 30, 2023)
        "%d %b %Y",  # Date with day, textual month and century (e.g., 30 Jun 2023)
        "%b %d, %y",  # Date with textual month, day without century (e.g., Jun 30, 23)
        "%d %b %y",  # Date with day, textual month without century (e.g., 30 Jun 23)
        "%B %d, %Y",  # Date with full textual month, day and century (e.g., June 30, 2023)
        "%d %B %Y",  # Date with day, full textual month and century (e.g., 30 June 2023)
        "%B %d, %y",  # Date with full textual month, day without century (e.g., June 30, 23)
        "%d %B %y",  # Date with day, full textual month without century (e.g., 30 June 23)
    ]

    # Try parsing the string using date only formats
    for fmt in date_only_formats:
        try:
            datetime_obj = pd.to_datetime(datetime_str, format=fmt, errors="raise")
            if time_point == "end":
                # Adjust to the end of the day for "end" time points
                datetime_obj += pd.Timedelta(days=1)
            return datetime_obj
        except ValueError:
            continue

    # If not successful, try parsing using date-time formats
    for fmt in date_time_formats:
        try:
            datetime_obj = pd.to_datetime(datetime_str, format=fmt, errors="raise")
            return datetime_obj
        except ValueError:
            continue

    raise ValueError(f"Can't parse the {time_point} datetime.")

def prompt_for_outages():
    outages = []
    outage_count = 0

    while True:
        print(
            f"\nPlease enter information for exclusive outage #{outage_count + 1}. Enter '0' to stop."
        )
        start = input("Enter the start date (YYYY-MM-DD) or date and time (YYYY-MM-DD HH:MM): ")
        if start == "0":
            break 

        end = input(
            "Enter the end date (YYYY-MM-DD) or date and time (YYYY-MM-DD HH:MM): "
        )
        if end == "0":
            break

        inverter_id = input("Enter the inverter ID (e.g., 1, 2, 3): ")
        if inverter_id == "0":
            break

        try:
            # Validate start datetime
            start_datetime = validate_datetime_input(start, "start")

            # Validate end datetime
            end_datetime = validate_datetime_input(end, "end")

            # Validate that end is after start
            if end_datetime <= start_datetime:
                raise ValueError("End date/time must be after the start date/time.")

            # Validate inverter ID
            inverter_num = int(inverter_id)
            if inverter_num < 1:
                raise ValueError("Inverter ID must be a positive integer.")

        except ValueError as e:
            print(
                f"Invalid input: {e} Please try again!"
            )
            continue  # If any validation fails, restart the loop for this outage

        # Add validated outage information to the list
        outages.append((start_datetime, end_datetime, inverter_num))
        outage_count += 1

    return outages

In [34]:
prompt_for_outages()


Please enter information for exclusive outage #1. Enter '0' to stop.

Please enter information for exclusive outage #2. Enter '0' to stop.


[(Timestamp('2020-11-04 00:00:00'), Timestamp('2020-11-06 00:00:00'), 4)]

In [40]:
def read_outages_from_csv(site_name):
    filename = f"ExclusiveOutages_{site_name}.csv"
    file_dir = os.path.join("../data/exclusions", filename)
    try:
        df = pd.read_csv(file_dir, )
    except FileNotFoundError:
        raise ValueError(f"File {filename} not found.")

    outages = []
    for index, row in df.iterrows():
        try:
            start = row.iloc[0]  # The first column is start_time
            end = row.iloc[1]  # The second column is end_time
            inverter_id = row.iloc[2]  # The third column is inverter_id

            # Validate start datetime
            start_datetime = validate_datetime_input(start, "start")

            # Validate end datetime
            end_datetime = validate_datetime_input(end, "end")

            # Validate that end is after start
            if end_datetime <= start_datetime:
                raise ValueError("End date/time must be after the start date/time.")

            # Validate inverter ID
            inverter_num = int(inverter_id)
            if inverter_num < 1:
                raise ValueError("Inverter ID must be a positive integer.")

            outages.append((start_datetime, end_datetime, inverter_num))

        except ValueError as e:
            print(f"{site_name} @ Row {index + 1} : Invalid input - {e}")
            continue

    return outages


In [41]:
site_name = "Dairy Solar"
filename = f"ExclusiveOutages_{site_name}.csv"
file_dir = os.path.join("../data/exclusions", filename)
df = pd.read_csv(file_dir)
df

Unnamed: 0,start_time,end_time,inverter_id
0,6/30/23,7/6/23,2


In [42]:
outages = read_outages_from_csv("Dairy Solar")
outages

[(Timestamp('2023-06-30 00:00:00'), Timestamp('2023-07-07 00:00:00'), 2)]

In [None]:
def mark_exclusive_outages(df, outages):
    """
    Fill cells of exclusive outages with "Exclusive Outage".

    Parameters:
    - df: Original DataFrame
    - outages: List of tuples, each tuple contains start time, end time, and inverter ID.

    Returns:
    - DataFrame copy with specific cells filled with NaN.
    """

    df_copy = df.copy(deep=True)

    for start, end, inverter_id in outages:
        start_time = pd.to_datetime(start)
        end_time = pd.to_datetime(end)
        inverter_col = f"Inverter_{inverter_id}"

        # Fill the cells with "Exclusive Outage"
        df_copy.loc[
            (df_copy["Timestamp"] >= start_time) & (df_copy["Timestamp"] < end_time),
            inverter_col,
        ] = np.nan

    return df_copy

In [14]:
def new_copy(df):
    df_new = df.copy(deep=True)
    df_new = df_new.rename(
        columns=lambda x: (
            x.replace("Inverter", "INV") if x.startswith("Inverter_") else x
        )
    )
    return df_new


def process_inverter(df_new):
    df_new["Avg_Top_20%"] = avg_top_20_series
    inverter_cols = [col for col in df_new.columns if col.startswith("INV_")]

    for col in inverter_cols:
        mask = ((df_new[col] <= 0) | (pd.isnull(df_new[col]))) & pd.notnull(
            df_new["Avg_Top_20%"]
        )
        df_new.loc[mask, col] = df_new.loc[mask, "Avg_Top_20%"]

    print(df_new[df_new["POA Irradiance"] > 0][inverter_cols].head())

    return df_new


df_new = new_copy(df)
df_new = process_inverter(df_new)

        INV_1      INV_2      INV_3      INV_4      INV_5      INV_6  \
7    0.123000   0.176528   0.168750   0.065194   0.126194   0.197056   
8    3.839833   4.048021   3.992563   3.516438   3.763188   4.120250   
9   10.744840  10.792870  10.790870  10.505790  10.652310  10.941440   
10  19.137290  18.979590  19.057860  18.961830  19.003390  19.167060   
11  33.183810  32.619650  32.783310  32.859150  32.729920  32.547880   

        INV_7      INV_8      INV_9     INV_10  ...    INV_156    INV_157  \
7    0.085833   0.113306   0.159722   0.119444  ...   0.196139   0.134583   
8    3.608229   3.739021   3.971521   3.790917  ...   4.148667   3.713438   
9   10.630390  10.733790  10.852020  10.894350  ...  10.570560   9.424582   
10  19.059830  18.981650  19.239860  19.079540  ...  19.326250  18.770900   
11  32.877230  34.346890  32.118080  35.561250  ...  31.522020  30.583670   

      INV_158    INV_159    INV_160    INV_161    INV_162    INV_163  \
7    0.154333   0.099417   0.114

In [15]:
def sum_inv(df, col_name, inv_starter):
    inverter_cols = [col for col in df.columns if col.startswith(inv_starter)]
    df[col_name] = df[inverter_cols].sum(axis=1)
    return df

df = sum_inv(df, "Actual Sum", "Inverter")
df_new = sum_inv(df_new, "Expected Sum", "INV")
df_new.to_csv("./test.csv")


  df[col_name] = df[inverter_cols].sum(axis=1)


In [16]:
selected_1 = df[["Timestamp", "POA Irradiance", "Actual Sum"]]
selected_2 = df_new[["Timestamp", "Expected Sum"]]
df_merged = pd.merge(selected_1, selected_2, on="Timestamp", how="inner")
df_merged["Availability %"] = (
    df_merged["Actual Sum"] / df_merged["Expected Sum"] * 100
)
os.makedirs(f"../output/{year_month}", exist_ok=True)
df_merged.to_csv(f"../output/{year_month}/{site_name}.csv", index=False)

In [17]:
availability = df_merged["Availability %"].mean()
availability

98.02673027618691

In [18]:
import pandas as pd

summary = pd.DataFrame(columns=["Site Name", "Year-Month", "Availability"])


def update_summary(summary, site_name, year_month, availability):
    mask = (summary["Site Name"] == site_name) & (summary["Year-Month"] == year_month)
    if summary[mask].empty:
        new_row = pd.DataFrame(
            {
                "Site Name": site_name,
                "Year-Month": year_month,
                "Availability": availability,
            },
            index=[0],
        )
        summary = pd.concat([summary, new_row], ignore_index=True)
    else:
        summary.loc[mask, "Availability"] = availability

    return summary


summary = update_summary(summary, site_name, year_month, availability)
summary

  summary = pd.concat([summary, new_row], ignore_index=True)


Unnamed: 0,Site Name,Year-Month,Availability
0,Chiloquin Solar Farm,2024-01,98.02673
