In [3]:
import pandas as pd

In [4]:
#  Read in the data file
def read_site(file_path):
    lines = []

    # Read only the first 10 lines into the 'lines' list
    with open(file_path, "r") as f:
        for _ in range(10):
            line = f.readline()
            if not line:
                break
            lines.append(line)

    # The first line where the word "timpestamps" (in any case) appears is the header line
    header_index = next(
        (i for i, line in enumerate(lines) if "timestamp" in line.lower()), None
    )

    if header_index is None:
        error_message = "Header not found in the file."
        print(error_message)
        exit()

    separator = detect_separator(lines[header_index])
    # The index of the found header is exactly the number of rows to skip when reading the data
    df = pd.read_csv(file_path, skiprows=header_index, header=0, sep=separator)
    # print(df.loc[0,:])

    # Check whether the second row in the dataframe is an extra unit row
    if pd.isna(df.iloc[0, 0]):
        df.drop(index=0, inplace=True)
        df.reset_index(
            drop=True, inplace=True
        )  # Resetting the index after dropping the row

    return df


def detect_separator(line):
    if "," in line:
        return ","

    return ";"


df = read_site("../data/processed/2023-10-01-2023-10-31_Gastonia Monthly.csv")
df

Unnamed: 0,Timestamp,002 - Gastonia Solar Projects 2 - IRRADIANCE_POA (W/m^2),002 - Gastonia Solar Projects 2 - T_AMB (°C),002 - Gastonia Solar Projects 2 - WIND_SPEED (m/s),MTR01 - Meter 01 - AC_POWER (kW),MTR01 - Meter 01 - AC_VOLTAGE_A (V),INV01 - Inverter 1.01.01 - AC_POWER (kW),INV02 - Inverter 1.01.02 - AC_POWER (kW),INV03 - Inverter 1.01.03 - AC_POWER (kW),INV04 - Inverter 1.02.04 - AC_POWER (kW),...,INV28 - Inverter 2.11.28 - AC_POWER (kW),INV29 - Inverter 2.12.29 - AC_POWER (kW),INV30 - Inverter 2.12.30 - AC_POWER (kW),INV31 - Inverter 2.12.31 - AC_POWER (kW),INV32 - Inverter 2.13.32 - AC_POWER (kW),INV33 - Inverter 2.13.33 - AC_POWER (kW),INV34 - Inverter 2.13.34 - AC_POWER (kW),INV35 - Inverter 2.14.35 - AC_POWER (kW),INV36 - Inverter 2.14.36 - AC_POWER (kW),INV37 - Inverter 2.14.37 - AC_POWER (kW)
0,10-01-2023 00:00:00,0.0,18.212587,0.400000,-4.59,7585.451602,,,,,...,,,,,,,,,,
1,10-01-2023 00:05:00,0.0,18.343514,0.400000,-4.59,7572.292682,,,,,...,,,,,,,,,,
2,10-01-2023 00:10:00,0.0,18.400000,0.400000,-4.59,7577.048046,,,,,...,,,,,,,,,,
3,10-01-2023 00:15:00,0.0,18.219646,0.039491,-4.59,7576.975400,,,,,...,,,,,,,,,,
4,10-01-2023 00:20:00,0.0,18.159545,0.000000,-4.59,7574.154573,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8924,10-31-2023 23:40:00,0.0,8.754156,1.511189,-4.63,7583.775803,,,,,...,,,,,,,,,,
8925,10-31-2023 23:45:00,0.0,8.600000,1.528940,-4.63,7587.824759,,,,,...,,,,,,,,,,
8926,10-31-2023 23:50:00,0.0,8.600000,1.471307,-4.63,7596.869025,,,,,...,,,,,,,,,,
8927,10-31-2023 23:55:00,0.0,8.518093,1.009009,-4.63,7594.237386,,,,,...,,,,,,,,,,


In [5]:
import numpy as np


def find_keywords(column, keywords_list):
    for keywords in keywords_list:
        print("Checking keywords:", keywords)
        if all(keyword.lower() in column.lower() for keyword in keywords):
            return True
    return False


def column_others(df):
    keyword_mapping = {
        "Timestamp": [["timestamp"]],
        "POA Irradiance": [["poa"]],
        "Meter Power": [["meter", "power"], ["electric", "power"]],
        # "Meter Power": ["meter", "power"],
    }

    rename_mapping = {}
    for new_name, keywords_list in keyword_mapping.items():
        print(f"Finding:  {new_name}")
        found = False
        for col in df.columns:
            print(f"current column name:  {col}")
            found = find_keywords(col, keywords_list)
            print(found)
            if found:
                print(f"keywords found in:  {col}")
                rename_mapping[col] = new_name
                break
        if not found:
            df[new_name] = np.nan

    df.rename(columns=rename_mapping, inplace=True)

    return df


df = column_others(df)
df

Finding:  Timestamp
current column name:  Timestamp
Checking keywords: ['timestamp']
True
keywords found in:  Timestamp
Finding:  POA Irradiance
current column name:  Timestamp
Checking keywords: ['poa']
False
current column name:  002 - Gastonia Solar Projects 2 - IRRADIANCE_POA (W/m^2)
Checking keywords: ['poa']
True
keywords found in:  002 - Gastonia Solar Projects 2 - IRRADIANCE_POA (W/m^2)
Finding:  Meter Power
current column name:  Timestamp
Checking keywords: ['meter', 'power']
Checking keywords: ['electric', 'power']
False
current column name:  002 - Gastonia Solar Projects 2 - IRRADIANCE_POA (W/m^2)
Checking keywords: ['meter', 'power']
Checking keywords: ['electric', 'power']
False
current column name:  002 - Gastonia Solar Projects 2 - T_AMB (°C)
Checking keywords: ['meter', 'power']
Checking keywords: ['electric', 'power']
False
current column name:  002 - Gastonia Solar Projects 2 - WIND_SPEED (m/s)
Checking keywords: ['meter', 'power']
Checking keywords: ['electric', 'pow

Unnamed: 0,Timestamp,POA Irradiance,002 - Gastonia Solar Projects 2 - T_AMB (°C),002 - Gastonia Solar Projects 2 - WIND_SPEED (m/s),Meter Power,MTR01 - Meter 01 - AC_VOLTAGE_A (V),INV01 - Inverter 1.01.01 - AC_POWER (kW),INV02 - Inverter 1.01.02 - AC_POWER (kW),INV03 - Inverter 1.01.03 - AC_POWER (kW),INV04 - Inverter 1.02.04 - AC_POWER (kW),...,INV28 - Inverter 2.11.28 - AC_POWER (kW),INV29 - Inverter 2.12.29 - AC_POWER (kW),INV30 - Inverter 2.12.30 - AC_POWER (kW),INV31 - Inverter 2.12.31 - AC_POWER (kW),INV32 - Inverter 2.13.32 - AC_POWER (kW),INV33 - Inverter 2.13.33 - AC_POWER (kW),INV34 - Inverter 2.13.34 - AC_POWER (kW),INV35 - Inverter 2.14.35 - AC_POWER (kW),INV36 - Inverter 2.14.36 - AC_POWER (kW),INV37 - Inverter 2.14.37 - AC_POWER (kW)
0,10-01-2023 00:00:00,0.0,18.212587,0.400000,-4.59,7585.451602,,,,,...,,,,,,,,,,
1,10-01-2023 00:05:00,0.0,18.343514,0.400000,-4.59,7572.292682,,,,,...,,,,,,,,,,
2,10-01-2023 00:10:00,0.0,18.400000,0.400000,-4.59,7577.048046,,,,,...,,,,,,,,,,
3,10-01-2023 00:15:00,0.0,18.219646,0.039491,-4.59,7576.975400,,,,,...,,,,,,,,,,
4,10-01-2023 00:20:00,0.0,18.159545,0.000000,-4.59,7574.154573,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8924,10-31-2023 23:40:00,0.0,8.754156,1.511189,-4.63,7583.775803,,,,,...,,,,,,,,,,
8925,10-31-2023 23:45:00,0.0,8.600000,1.528940,-4.63,7587.824759,,,,,...,,,,,,,,,,
8926,10-31-2023 23:50:00,0.0,8.600000,1.471307,-4.63,7596.869025,,,,,...,,,,,,,,,,
8927,10-31-2023 23:55:00,0.0,8.518093,1.009009,-4.63,7594.237386,,,,,...,,,,,,,,,,


In [6]:
def column_temperature(df):
    keywords = ["temperature", "amb", "°C"]
    temperature_cols = any(
        keyword in col.lower() for col in df.columns for keyword in keywords
    )
    temperature_cols = any([col for col in df.columns if keywords in col.lower()])
    if len(temperature_cols) == 1:
        df.rename(columns={temperature_cols[0]: "Temperature"}, inplace=True)
    elif len(temperature_cols) == 0:
        df["Temperature"] = np.nan
    else:
        # Check for columns with "ambient" in their name
        ambient = [col for col in temperature_cols if "ambient" in col.lower()]

        # If there's a column with "ambient", use that. Otherwise, use the first "temperature" column
        col_to_use = ambient[0] if ambient else temperature_cols[0]
        df.rename(columns={col_to_use: "Temperature"}, inplace=True)

        # Drop any other temperature columns to keep the dataframe clean
        cols_to_drop = [col for col in temperature_cols if col != col_to_use]
        df.drop(columns=cols_to_drop, inplace=True)

    return df


df = column_temperature(df)
df

Unnamed: 0,Timestamp,POA Irradiance,002 - Gastonia Solar Projects 2 - T_AMB (°C),002 - Gastonia Solar Projects 2 - WIND_SPEED (m/s),Meter Power,MTR01 - Meter 01 - AC_VOLTAGE_A (V),INV01 - Inverter 1.01.01 - AC_POWER (kW),INV02 - Inverter 1.01.02 - AC_POWER (kW),INV03 - Inverter 1.01.03 - AC_POWER (kW),INV04 - Inverter 1.02.04 - AC_POWER (kW),...,INV29 - Inverter 2.12.29 - AC_POWER (kW),INV30 - Inverter 2.12.30 - AC_POWER (kW),INV31 - Inverter 2.12.31 - AC_POWER (kW),INV32 - Inverter 2.13.32 - AC_POWER (kW),INV33 - Inverter 2.13.33 - AC_POWER (kW),INV34 - Inverter 2.13.34 - AC_POWER (kW),INV35 - Inverter 2.14.35 - AC_POWER (kW),INV36 - Inverter 2.14.36 - AC_POWER (kW),INV37 - Inverter 2.14.37 - AC_POWER (kW),Temperature
0,10-01-2023 00:00:00,0.0,18.212587,0.400000,-4.59,7585.451602,,,,,...,,,,,,,,,,
1,10-01-2023 00:05:00,0.0,18.343514,0.400000,-4.59,7572.292682,,,,,...,,,,,,,,,,
2,10-01-2023 00:10:00,0.0,18.400000,0.400000,-4.59,7577.048046,,,,,...,,,,,,,,,,
3,10-01-2023 00:15:00,0.0,18.219646,0.039491,-4.59,7576.975400,,,,,...,,,,,,,,,,
4,10-01-2023 00:20:00,0.0,18.159545,0.000000,-4.59,7574.154573,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8924,10-31-2023 23:40:00,0.0,8.754156,1.511189,-4.63,7583.775803,,,,,...,,,,,,,,,,
8925,10-31-2023 23:45:00,0.0,8.600000,1.528940,-4.63,7587.824759,,,,,...,,,,,,,,,,
8926,10-31-2023 23:50:00,0.0,8.600000,1.471307,-4.63,7596.869025,,,,,...,,,,,,,,,,
8927,10-31-2023 23:55:00,0.0,8.518093,1.009009,-4.63,7594.237386,,,,,...,,,,,,,,,,
