In [98]:
import pandas as pd
from prettytable import PrettyTable

In [99]:
# Peek at the top of the export to locate the row that holds the real header.
lines = []
with open('2023-08-01-2023-08-31_Amethyst Monthly.csv', 'r') as f:
    while len(lines) < 10:  # never look past the first 10 lines
        line = f.readline()
        if line == '':  # end of file reached early
            break
        lines.append(line)

# The header is the first peeked line that mentions "timestamp" (case-insensitive).
header_row = None
for position, text in enumerate(lines):
    if "timestamp" in text.lower():
        header_row = position
        break
if header_row is None:
    raise ValueError("Header not found in the file.")

# Everything above the header row is skipped when loading the data.
rows_to_skip = header_row
df = pd.read_csv(
    '2023-08-01-2023-08-31_Amethyst Monthly.csv',
    header=0,
    skiprows=rows_to_skip,
)

df

Unnamed: 0,timestamp,"NC1 - Amethyst Solar - KZ (POA) - Measured Irradiance Other, avg, (W/m&#178;)","NC1 - Amethyst Solar - Weather Station - Measured Temperature Ambient, avg, (&#176;C)","NC1 - Amethyst Solar - Weather Station - Measured Wind Speed, avg, (m/s)","NC1 - Amethyst Solar - PV Meter - Measured AC Phase A Voltage, avg, (V)","NC1 - Amethyst Solar - PV Meter - Measured AC Power, avg, (kW)","NC1 - Amethyst Solar - Eaton Inverter #01 - Measured AC Power, avg, (kW)","NC1 - Amethyst Solar - Eaton Inverter #02 - Measured AC Power, avg, (kW)"
0,08/01/2023 00:00:00,-1.000,21.300,0.0,7450.000,-7.270,0.0,0.0
1,08/01/2023 00:15:00,-1.000,22.167,0.0,7466.667,-7.287,0.0,0.0
2,08/01/2023 00:30:00,-1.333,21.967,0.0,7476.667,-7.330,0.0,0.0
3,08/01/2023 00:45:00,-1.333,21.733,0.0,7453.333,-7.270,0.0,0.0
4,08/01/2023 01:00:00,-2.667,23.067,0.0,7473.333,-7.307,0.0,0.0
...,...,...,...,...,...,...,...,...
2971,08/31/2023 22:45:00,-3.000,19.700,0.0,7470.000,-7.340,0.0,0.0
2972,08/31/2023 23:00:00,-3.667,19.167,0.0,7463.333,-7.327,0.0,0.0
2973,08/31/2023 23:15:00,-2.000,19.633,0.0,7466.667,-7.293,0.0,0.0
2974,08/31/2023 23:30:00,-2.000,19.567,0.0,7470.000,-7.283,0.0,0.0


In [100]:
def change_column_names(df):
    """Rename raw export columns to the notebook's standard names.

    Each standard name is given to the first column (in frame order) that is
    not already claimed by another standard name and whose lower-cased header
    contains every keyword listed for that name. All columns positioned after
    'Meter Power' are then labelled 'Inverter_1', 'Inverter_2', ... in order.

    The new labels are written onto ``df`` itself (cells in this notebook rely
    on that side effect). Nothing is modified if validation fails.

    Args:
        df: DataFrame whose column headers should be standardized.

    Returns:
        A new DataFrame holding only the standard columns followed by the
        inverter columns.

    Raises:
        KeyError: if a standard column cannot be matched, or if a matched
            standard column sits after 'Meter Power' and would therefore be
            relabelled as an inverter.
    """
    keyword_mapping = {
        'Timestamp': ['timestamp'],
        'POA Irradiance': ['poa'],
        'Ambient Temperature': ['temperature', 'ambient'],
        'Wind Speed': ['wind speed'],
        'Meter Voltage': ['voltage'],
        'Meter Power': ['meter', 'power']
    }

    rename_mapping = {}
    for new_name, keywords in keyword_mapping.items():
        for col in df.columns:
            # A column may serve only one standard name; otherwise a later
            # match would overwrite (and lose) the earlier one.
            if col in rename_mapping:
                continue
            if all(keyword.lower() in str(col).lower() for keyword in keywords):
                rename_mapping[col] = new_name
                break

    matched_names = set(rename_mapping.values())
    unmatched = [name for name in keyword_mapping if name not in matched_names]
    if unmatched:
        raise KeyError(f"No column found for: {', '.join(unmatched)}")

    # Build the final labels positionally so the frame is updated in one step.
    new_columns = [rename_mapping.get(col, col) for col in df.columns]
    meter_power_index = new_columns.index("Meter Power")
    trailing_count = len(new_columns) - meter_power_index - 1
    inverter_names = [f'Inverter_{n}' for n in range(1, trailing_count + 1)]
    new_columns[meter_power_index + 1:] = inverter_names

    overwritten = [name for name in keyword_mapping if name not in new_columns]
    if overwritten:
        raise KeyError(
            "These columns are located after 'Meter Power' and would be "
            f"relabelled as inverters: {', '.join(overwritten)}"
        )

    df.columns = new_columns

    # Return only the columns that we're interested in; copy so the caller can
    # add columns to the result without a chained-assignment warning.
    required_cols = list(keyword_mapping) + inverter_names
    return df[required_cols].copy()


In [101]:
# change_column_names returns the frame restricted to the standard and
# inverter columns. Keep that result (as an independent copy) so that any
# extra columns in the export do not leak into the later, position-based cells.
df = change_column_names(df).copy()
df

Unnamed: 0,Timestamp,POA Irradiance,Ambient Temperature,Wind Speed,Meter Voltage,Meter Power,Inverter_1,Inverter_2
0,08/01/2023 00:00:00,-1.000,21.300,0.0,7450.000,-7.270,0.0,0.0
1,08/01/2023 00:15:00,-1.000,22.167,0.0,7466.667,-7.287,0.0,0.0
2,08/01/2023 00:30:00,-1.333,21.967,0.0,7476.667,-7.330,0.0,0.0
3,08/01/2023 00:45:00,-1.333,21.733,0.0,7453.333,-7.270,0.0,0.0
4,08/01/2023 01:00:00,-2.667,23.067,0.0,7473.333,-7.307,0.0,0.0
...,...,...,...,...,...,...,...,...
2971,08/31/2023 22:45:00,-3.000,19.700,0.0,7470.000,-7.340,0.0,0.0
2972,08/31/2023 23:00:00,-3.667,19.167,0.0,7463.333,-7.327,0.0,0.0
2973,08/31/2023 23:15:00,-2.000,19.633,0.0,7466.667,-7.293,0.0,0.0
2974,08/31/2023 23:30:00,-2.000,19.567,0.0,7470.000,-7.283,0.0,0.0


In [117]:
# Load the Agate Bay export.
# No rows are skipped here; other data files may need their own rows_to_skip.

rows_to_skip = []
df_2 = pd.read_csv(
    '2023-08-01-2023-08-31_Agate Bay Monthly.csv',
    header=0,
    skiprows=rows_to_skip,
)

change_column_names(df_2)

Unnamed: 0,Timestamp,POA Irradiance,Ambient Temperature,Wind Speed,Meter Voltage,Meter Power,Inverter_1,Inverter_2,Inverter_3,Inverter_4
0,8/1/2023 12:00:00 AM,-2.0,64.36333,0.863333,12.35889,-19.97367,0.0,0.0,0.0,0.0
1,8/1/2023 12:15:00 AM,-2.0,63.99000,0.740000,12.29711,-19.66233,0.0,0.0,0.0,0.0
2,8/1/2023 12:30:00 AM,-1.5,63.51333,1.443333,12.30100,-19.68500,0.0,0.0,0.0,0.0
3,8/1/2023 12:45:00 AM,-1.5,63.48000,0.816667,12.32422,-19.77967,0.0,0.0,0.0,0.0
4,8/1/2023 1:00:00 AM,-1.5,63.97667,0.793333,12.34722,-19.91733,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2971,8/31/2023 10:45:00 PM,0.0,61.81000,1.343333,12.33233,-19.84333,0.0,0.0,0.0,0.0
2972,8/31/2023 11:00:00 PM,0.0,62.15000,1.156667,12.34478,-19.92067,0.0,0.0,0.0,0.0
2973,8/31/2023 11:15:00 PM,0.0,61.96667,1.290000,12.31467,-19.74533,0.0,0.0,0.0,0.0
2974,8/31/2023 11:30:00 PM,0.5,61.85333,1.486667,12.32789,-19.81200,0.0,0.0,0.0,0.0


In [103]:
# TODO: a more advanced function is needed to detect and select the target
# columns, because column names vary between data files and some files
# contain extra columns.

# Positional rename: the six fixed measurements first, then one name per
# remaining column (treated as inverters).
fixed_names = [
    'Timestamp', 'POA Irradiance', 'Ambient Temperature',
    'Wind Speed', 'Meter Voltage', 'Meter Power',
]

num_inverters = len(df.columns) - len(fixed_names)
inverter_names = [f'Inverter_{number}' for number in range(1, num_inverters + 1)]

new_columns = fixed_names + inverter_names

df.columns = new_columns

df

Unnamed: 0,Timestamp,POA Irradiance,Ambient Temperature,Wind Speed,Meter Voltage,Meter Power,Inverter_1,Inverter_2,Inverter_3,Inverter_4
0,8/1/2023 12:00:00 AM,-2.0,64.36333,0.863333,12.35889,-19.97367,0.0,0.0,0.0,0.0
1,8/1/2023 12:15:00 AM,-2.0,63.99000,0.740000,12.29711,-19.66233,0.0,0.0,0.0,0.0
2,8/1/2023 12:30:00 AM,-1.5,63.51333,1.443333,12.30100,-19.68500,0.0,0.0,0.0,0.0
3,8/1/2023 12:45:00 AM,-1.5,63.48000,0.816667,12.32422,-19.77967,0.0,0.0,0.0,0.0
4,8/1/2023 1:00:00 AM,-1.5,63.97667,0.793333,12.34722,-19.91733,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
2971,8/31/2023 10:45:00 PM,0.0,61.81000,1.343333,12.33233,-19.84333,0.0,0.0,0.0,0.0
2972,8/31/2023 11:00:00 PM,0.0,62.15000,1.156667,12.34478,-19.92067,0.0,0.0,0.0,0.0
2973,8/31/2023 11:15:00 PM,0.0,61.96667,1.290000,12.31467,-19.74533,0.0,0.0,0.0,0.0
2974,8/31/2023 11:30:00 PM,0.5,61.85333,1.486667,12.32789,-19.81200,0.0,0.0,0.0,0.0


In [104]:
# Parse the timestamps and add a date column.
# The exports loaded in this notebook use two layouts: a 12-hour clock with
# AM/PM (e.g. "8/1/2023 12:15:00 AM") and a 24-hour clock
# (e.g. "08/01/2023 00:15:00"). Try each known layout instead of assuming one,
# so the cell works for whichever file `df` was loaded from.
timestamp_formats = ("%m/%d/%Y %I:%M:%S %p", "%m/%d/%Y %H:%M:%S")
for timestamp_format in timestamp_formats:
    try:
        df['Timestamp'] = pd.to_datetime(df['Timestamp'], format=timestamp_format)
        break
    except ValueError:
        continue  # layout did not match; try the next one
else:
    raise ValueError(
        f"'Timestamp' values match none of the known formats: {timestamp_formats}"
    )

df['Date'] = df['Timestamp'].dt.date
df


Unnamed: 0,Timestamp,POA Irradiance,Ambient Temperature,Wind Speed,Meter Voltage,Meter Power,Inverter_1,Inverter_2,Inverter_3,Inverter_4,Date
0,2023-08-01 00:00:00,-2.0,64.36333,0.863333,12.35889,-19.97367,0.0,0.0,0.0,0.0,2023-08-01
1,2023-08-01 00:15:00,-2.0,63.99000,0.740000,12.29711,-19.66233,0.0,0.0,0.0,0.0,2023-08-01
2,2023-08-01 00:30:00,-1.5,63.51333,1.443333,12.30100,-19.68500,0.0,0.0,0.0,0.0,2023-08-01
3,2023-08-01 00:45:00,-1.5,63.48000,0.816667,12.32422,-19.77967,0.0,0.0,0.0,0.0,2023-08-01
4,2023-08-01 01:00:00,-1.5,63.97667,0.793333,12.34722,-19.91733,0.0,0.0,0.0,0.0,2023-08-01
...,...,...,...,...,...,...,...,...,...,...,...
2971,2023-08-31 22:45:00,0.0,61.81000,1.343333,12.33233,-19.84333,0.0,0.0,0.0,0.0,2023-08-31
2972,2023-08-31 23:00:00,0.0,62.15000,1.156667,12.34478,-19.92067,0.0,0.0,0.0,0.0,2023-08-31
2973,2023-08-31 23:15:00,0.0,61.96667,1.290000,12.31467,-19.74533,0.0,0.0,0.0,0.0,2023-08-31
2974,2023-08-31 23:30:00,0.5,61.85333,1.486667,12.32789,-19.81200,0.0,0.0,0.0,0.0,2023-08-31


In [105]:
# Helper function that formats rows with missing values as a table;
# used when printing the details of rows that have missing values

def get_missing_info(missing_rows):
    """Build a PrettyTable with one table row per row of ``missing_rows``.

    The first cell of every table row ("Index") is the frame's index label
    plus 2 (presumably the matching row number in the source spreadsheet,
    counting the header line; confirm). The remaining cells are the row's
    values in column order.
    """
    report = PrettyTable()
    report.field_names = ["Index", *missing_rows.columns.tolist()]
    for label, values in missing_rows.iterrows():
        report.add_row([label + 2, *values.tolist()])

    return report


In [106]:
# Report rows without a 'POA Irradiance' reading and replace the gaps with -999

missing_rows = df[df['POA Irradiance'].isna()]
if missing_rows.empty:
    print("All good! The 'POA Irradiance' column has no missing values.")
else:
    # Render the report before filling so it reflects the rows as found.
    missing_info = get_missing_info(missing_rows)
    df['POA Irradiance'] = df['POA Irradiance'].fillna(-999)
    print(
        "Detected missing values in the 'POA Irradiance' column.\n"
        "These have been filled with a placeholder value of -999.\n"
        "Kindly document this discrepancy in the 'Data Issues' spreadsheet for further review.\n"
        f"Details of missing rows:\n{missing_info}"
    )


Detected missing values in the 'POA Irradiance' column.
These have been filled with a placeholder value of -999.
Kindly document this discrepancy in the 'Data Issues' spreadsheet for further review.
Details of missing rows:
+-------+---------------------+----------------+---------------------+------------+---------------+-------------+------------+------------+------------+------------+------------+
| Index |      Timestamp      | POA Irradiance | Ambient Temperature | Wind Speed | Meter Voltage | Meter Power | Inverter_1 | Inverter_2 | Inverter_3 | Inverter_4 |    Date    |
+-------+---------------------+----------------+---------------------+------------+---------------+-------------+------------+------------+------------+------------+------------+
|  775  | 2023-08-09 01:15:00 |      nan       |         nan         |    nan     |      nan      |     nan     |    nan     |    nan     |    nan     |    nan     | 2023-08-09 |
+-------+---------------------+----------------+------------

In [107]:
# Check missing values in Ambient Temperature and Wind Speed under the condition
# that the corresponding value for POA Irradiance is at least 100 (>= 100)
def check_and_replace_missing(df, condition_col, condition_value, col_to_check):
    """Fill gaps in ``col_to_check`` with -999 where ``condition_col`` is high enough.

    Only rows whose ``condition_col`` value is >= ``condition_value`` are
    inspected. Missing ``col_to_check`` values in those rows are overwritten
    in ``df`` with -999 and a report of the affected rows (as they looked
    before filling) is printed. If nothing is missing, a confirmation message
    is printed instead. Returns None.
    """
    needs_fill = (df[condition_col] >= condition_value) & df[col_to_check].isna()
    missing_rows = df[needs_fill]

    if missing_rows.empty:
        print(f"All good! The {col_to_check} column has no missing values when {condition_col} >= {condition_value}.")
        return

    df.loc[missing_rows.index, col_to_check] = -999
    missing_info = get_missing_info(missing_rows)
    print(
        f"Detected missing values in the {col_to_check} column when {condition_col} >= {condition_value}.\n"
        "These have been filled with a placeholder value of -999.\n"
        "There is no need to document this discrepancy in the 'Data Issues' spreadsheet.\n"
        f"Details of missing rows:\n{missing_info}"
    )

In [108]:
# Run the same daylight-only (POA Irradiance >= 100) gap check on both weather columns.
for weather_col in ('Ambient Temperature', 'Wind Speed'):
    check_and_replace_missing(df, 'POA Irradiance', 100, weather_col)

All good! The Ambient Temperature column has no missing values when POA Irradiance >= 100.
All good! The Wind Speed column has no missing values when POA Irradiance >= 100.


In [109]:
# Check missing values in Meter Power and auto-fill the missing value if possible.
# A gap is filled with the sum of the row's inverter readings when all of them
# are present; otherwise it is replaced by the -999 placeholder.
missing_meters = df[df['Meter Power'].isna()]
inverter_cols = [col for col in df.columns if col.startswith('Inverter_')]
filled = []
unfilled = []

for index, row in missing_meters.iterrows():
    if row[inverter_cols].notna().all():
        df.loc[index, 'Meter Power'] = row[inverter_cols].sum()
        filled.append(df.loc[index])
    else:
        df.loc[index, 'Meter Power'] = -999
        unfilled.append(df.loc[index])

# Pass the column labels explicitly so the frames keep their columns
# (notably 'Date', which the next cell groups by) even when they are empty.
filled_df = pd.DataFrame(filled, columns=df.columns)
unfilled_df = pd.DataFrame(unfilled, columns=df.columns)

if not filled_df.empty:
    print("The missing 'Meter Power' values in the following rows have been auto-filled based on the sum of inverter values.")
    print(get_missing_info(filled_df))
if not unfilled_df.empty:
    print("The missing 'Meter Power' values in the following rows cannot be auto-filled due to missing inverter values.\n"
          "These have been filled with a placeholder value of -999.\n"
          "Kindly document this discrepancy in the 'Data Issues' spreadsheet for further review.\n")
    print(get_missing_info(unfilled_df))

The missing 'Meter Power' values in the following rows cannot be auto-filled due to missing inverter values.These have been filled with a placeholder value of -999.
Kindly document this discrepancy in the 'Data Issues' spreadsheet for further review.

+-------+---------------------+----------------+---------------------+------------+---------------+-------------+------------+------------+------------+------------+------------+
| Index |      Timestamp      | POA Irradiance | Ambient Temperature | Wind Speed | Meter Voltage | Meter Power | Inverter_1 | Inverter_2 | Inverter_3 | Inverter_4 |    Date    |
+-------+---------------------+----------------+---------------------+------------+---------------+-------------+------------+------------+------------+------------+------------+
|  775  | 2023-08-09 01:15:00 |     -999.0     |         nan         |    nan     |      nan      |    -999.0   |    nan     |    nan     |    nan     |    nan     | 2023-08-09 |
|  1039 | 2023-08-11 19:15:00 | 

In [110]:
# Count, per calendar day, the rows whose 'Meter Power' could not be auto-filled.
missing_by_day = unfilled_df.groupby('Date').size()
missing_dates = list(missing_by_day.index)
missing_date_str = '\n'.join(
    f"{date}: {count} missing" for date, count in missing_by_day.items()
)
print(missing_date_str)

2023-08-09: 1 missing
2023-08-11: 10 missing
2023-08-19: 12 missing


In [111]:
def format_workorders(workorders):
    """Build a PrettyTable listing every row of ``workorders``.

    Each table row starts with the frame's index label (unchanged), followed
    by the row's values in column order.
    """
    listing = PrettyTable()
    listing.field_names = ["Index", *workorders.columns.tolist()]
    for label, values in workorders.iterrows():
        listing.add_row([label, *values.tolist()])
    return listing


In [112]:
# Load the work-order export and look for work orders raised on the days
# that have unfillable 'Meter Power' gaps.
workorder = pd.read_csv('WorkOrdersAdministration.csv',
                 skiprows=0, header=0)

workorder['Fault/Event Start - Date/Time'] = pd.to_datetime(workorder['Fault/Event Start - Date/Time'], format="%b %d, %Y %I:%M:%S %p")
workorder['Date'] = workorder['Fault/Event Start - Date/Time'].dt.date
# NOTE(review): the filter matches Site Name "Agate" while the message below
# reports "Agate Bay" - confirm which spelling the work-order export uses.
fetched_records = workorder[(workorder['Date'].isin(missing_dates)) & (workorder['Site Name'].isin(["Agate"]))]

site_name = "Agate Bay"
if fetched_records.empty:
    print(f"No work orders found for site {site_name} on the missing dates.")
else:
    # Format the records selected above (the previous name used here was undefined).
    formatted_table = format_workorders(fetched_records)
    print(formatted_table)


No work orders found for site Agate Bay on the missing dates.


In [113]:
# voltage_missing_count = df['Meter Voltage'].isna().sum()
# total_rows = len(df)
# voltage_missing_ratio = voltage_missing_count/total_rows
# if voltage_missing_ratio >= 0.5:
#     print()

In [114]:
# def check_missing_values(column_name, threshold_col='POA Sensor', threshold_val=100):
#     condition = (df[threshold_col] > threshold_val) & (df[column_name].isna())
#     if condition.any():
#         print(f"There are missing values in the {column_name} column when {threshold_col} is above {threshold_val} W/m2.")
#         missing_indices = df[condition].index
#         for index in missing_indices:
#             print(f"Missing value at Index: {index}")
#         df.loc[condition, column_name] = -999
#     else:
#         print(f"There are no missing values in the {column_name} column when {threshold_col} is above {threshold_val} W/m2.")

# check_missing_values('Weather station ambient temperature')
# check_missing_values('Max wind speed - Weather Station w/ Module & POA LP02')


In [115]:
# df.to_csv('investigate.csv', index=False)

In [116]:
# Derive the site name from the export's file name: take the part after the
# last underscore and remove the " Monthly.csv" ending.
filename = '2023-08-01-2023-08-31_Agate Bay Monthly.csv'
site_name = filename.rsplit('_', 1)[-1].replace(" Monthly.csv", "")
site_name


'Agate Bay'