In [17]:
import pandas as pd

In [18]:
#  Read in the data file
def read_site(file_path):
    lines = []

    # Read only the first 10 lines into the 'lines' list
    with open(file_path, "r") as f:
        for _ in range(10):
            line = f.readline()
            if not line:
                break
            lines.append(line)

    # The first line where the word "timpestamps" (in any case) appears is the header line
    header_index = next(
        (i for i, line in enumerate(lines) if "timestamp" in line.lower()), None
    )

    if header_index is None:
        error_message = "Header not found in the file."
        print(error_message)
        exit()

    separator = detect_separator(lines[header_index])
    # The index of the found header is exactly the number of rows to skip when reading the data
    df = pd.read_csv(file_path, skiprows=header_index, header=0, sep=separator)
    # print(df.loc[0,:])

    # Check whether the second row in the dataframe is an extra unit row
    if pd.isna(df.iloc[0, 0]):
        df.drop(index=0, inplace=True)
        df.reset_index(
            drop=True, inplace=True
        )  # Resetting the index after dropping the row

    return df


def detect_separator(line):
    if "," in line:
        return ","

    return ";"


df = read_site("../data/processed/2023-10-01-2023-10-31_Silverton Monthly.csv")
df

Unnamed: 0,Timestamp,Weather station module temperature,Weather station ambient temperature,Max wind speed - VAISALA_1,Production meter L-N voltage,Production meter active power,"Inverter 1, AC Active Power (Total)",POA Sensor
0,10/1/2023 12:00:00 AM,40.78959,45.37811,1.287854,7.234267,-2.49375,0.0,0.0
1,10/1/2023 12:15:00 AM,40.86845,45.39113,1.762260,7.243093,-2.50365,0.0,0.0
2,10/1/2023 12:30:00 AM,40.44472,45.01901,0.995809,7.249640,-2.50494,0.0,0.0
3,10/1/2023 12:45:00 AM,39.76267,43.80235,1.330107,7.255577,-2.50780,0.0,0.0
4,10/1/2023 1:00:00 AM,39.19312,43.44142,1.304879,7.259396,-2.51140,0.0,0.0
...,...,...,...,...,...,...,...,...
2971,10/31/2023 10:45:00 PM,42.74689,44.74180,1.180729,7.256020,-2.53160,0.0,0.0
2972,10/31/2023 11:00:00 PM,42.58095,44.51686,0.600190,7.268523,-2.53876,0.0,0.0
2973,10/31/2023 11:15:00 PM,42.31520,44.43145,0.672945,7.268757,-2.53651,0.0,0.0
2974,10/31/2023 11:30:00 PM,42.91148,44.69661,0.601612,7.260537,-2.53137,0.0,0.0


In [19]:
import numpy as np


def find_keywords(column, keywords_list):
    for keywords in keywords_list:
        print("Checking keywords:", keywords)
        if all(keyword.lower() in column.lower() for keyword in keywords):
            return True
    return False


def column_others(df):
    keyword_mapping = {
        "Timestamp": [["timestamp"]],
        "POA Irradiance": [["poa"]],
        "Meter Power": [["meter", "power"], ["electric", "power"]],
        # "Meter Power": ["meter", "power"],
    }

    rename_mapping = {}
    for new_name, keywords_list in keyword_mapping.items():
        print(f"Finding:  {new_name}")
        found = False
        for col in df.columns:
            print(f"current column name:  {col}")
            found = find_keywords(col, keywords_list)
            print(found)
            if found:
                print(f"keywords found in:  {col}")
                rename_mapping[col] = new_name
                break
        if not found:
            df[new_name] = np.nan

    df.rename(columns=rename_mapping, inplace=True)

    return df


df = column_others(df)
df

Finding:  Timestamp
current column name:  Timestamp
Checking keywords: ['timestamp']
True
keywords found in:  Timestamp
Finding:  POA Irradiance
current column name:  Timestamp
Checking keywords: ['poa']
False
current column name:  Weather station module temperature
Checking keywords: ['poa']
False
current column name:  Weather station ambient temperature
Checking keywords: ['poa']
False
current column name:  Max wind speed - VAISALA_1
Checking keywords: ['poa']
False
current column name:  Production meter L-N voltage
Checking keywords: ['poa']
False
current column name:  Production meter active power
Checking keywords: ['poa']
False
current column name:  Inverter 1, AC Active Power (Total)
Checking keywords: ['poa']
False
current column name:  POA Sensor
Checking keywords: ['poa']
True
keywords found in:  POA Sensor
Finding:  Meter Power
current column name:  Timestamp
Checking keywords: ['meter', 'power']
Checking keywords: ['electric', 'power']
False
current column name:  Weather st

Unnamed: 0,Timestamp,Weather station module temperature,Weather station ambient temperature,Max wind speed - VAISALA_1,Production meter L-N voltage,Meter Power,"Inverter 1, AC Active Power (Total)",POA Irradiance
0,10/1/2023 12:00:00 AM,40.78959,45.37811,1.287854,7.234267,-2.49375,0.0,0.0
1,10/1/2023 12:15:00 AM,40.86845,45.39113,1.762260,7.243093,-2.50365,0.0,0.0
2,10/1/2023 12:30:00 AM,40.44472,45.01901,0.995809,7.249640,-2.50494,0.0,0.0
3,10/1/2023 12:45:00 AM,39.76267,43.80235,1.330107,7.255577,-2.50780,0.0,0.0
4,10/1/2023 1:00:00 AM,39.19312,43.44142,1.304879,7.259396,-2.51140,0.0,0.0
...,...,...,...,...,...,...,...,...
2971,10/31/2023 10:45:00 PM,42.74689,44.74180,1.180729,7.256020,-2.53160,0.0,0.0
2972,10/31/2023 11:00:00 PM,42.58095,44.51686,0.600190,7.268523,-2.53876,0.0,0.0
2973,10/31/2023 11:15:00 PM,42.31520,44.43145,0.672945,7.268757,-2.53651,0.0,0.0
2974,10/31/2023 11:30:00 PM,42.91148,44.69661,0.601612,7.260537,-2.53137,0.0,0.0
