In [None]:
import os
import pandas as pd
import numpy as np
import math
import re
import subprocess
import random

import plotly.express as px

from sklearn.metrics import mean_absolute_error, mean_squared_error,r2_score

from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

# Kalman Smoothing using R objects
import rpy2.robjects as robjects
# import R packages
from rpy2.robjects.packages import importr

# Impute TS
# importr() loads the R package `imputeTS` into the embedded R session; the
# handles below are then looked up by name through robjects.r[...].
imputeTS = importr('imputeTS') 
# NOTE(review): this lookup uses the underscore spelling (`na_kalman`) while the
# two below use dotted spellings -- confirm the installed imputeTS version
# exports all three names.
kalman_StructTs = robjects.r['na_kalman']
sea_decom = robjects.r['na.seadec']
# sea_split is not referenced by any cell visible in this notebook.
sea_split = robjects.r['na.seasplit']

import sys
module_path = re.sub(r'Notebooks','Python Scripts',os.getcwd())
sys.path.append(module_path)
from pv_modules import *

# Seed every random source the notebook draws from: numpy's global generator
# and the stdlib `random` module (missing_value_simulation picks the gap start
# with random.choice, which np.random.seed alone does not control).
np.random.seed(12)
random.seed(12)

In [None]:
def df_cleaner(path_list,file):
    """
    Load the first file in ``path_list`` and run it through the cleaning helpers.

    Args:
        path_list (list): Paths to read from; only ``path_list[0]`` is loaded.
        file (str): Data set label. ``'Irradiance'`` selects
            ``clean_irradiance_values``; any other value selects
            ``clean_deger_fixed_values``.

    Returns:
        The object returned by the selected value-cleaning helper.

    Raises:
        Exception: If the parsed file yields an empty dataframe.
    """
    # Fields may be separated by tabs or commas, hence the regex separator
    # (which needs the python parsing engine).
    raw = pd.read_csv(path_list[0], sep="\t|,", engine='python')

    # A file that was stored incorrectly or has no rows is rejected up front.
    if raw.empty:
        raise Exception("Loaded an empty dataframe")

    # The helpers below come from pv_modules; per the original notes they
    # reshape the timestamp/headers, fill gaps in the time intervals and
    # remove night-time values.
    stamped = reshape_df(raw, file)
    gap_filled, _ = add_missing_times(stamped)
    daytime = remove_night(gap_filled)

    value_cleaner = clean_irradiance_values if file == 'Irradiance' else clean_deger_fixed_values
    return value_cleaner(daytime)

In [None]:
def surrounding_month(month, year , cond):
    """
    Return the abbreviation and year of the month adjacent to ``month``/``year``.

    Args:
        month (int): Calendar month number (1-12).
        year (int): Calendar year.
        cond (str): ``'prev'`` selects the preceding month; any other value
            selects the following month.

    Returns:
        tuple: ``(abbreviation, year)`` where the abbreviation is the lowercase
        three-letter month name (e.g. ``'dec'``) and the year is shifted by one
        when the step crosses a year boundary.

    Raises:
        KeyError: If the neighbouring month number falls outside 0-13.
    """
    names = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
             'jul', 'aug', 'sep', 'oct', 'nov', 'dec')
    # Slots 0 and 13 are the wrap-around entries (December before January,
    # January after December).
    by_number = {number: names[(number - 1) % 12] for number in range(14)}

    step = -1 if cond == 'prev' else 1
    crosses_year = (month == 1) if step < 0 else (month == 12)
    if crosses_year:
        year += step

    return by_number[month + step], year

In [None]:
def missing_value_simulation(df):
    """
    Blank out one synthetic gap in a NaN-free copy of ``df``.

    The user is prompted for the gap size in seconds, which is converted to a
    number of rows using a 12-second average observation interval. Rows with
    NaN are dropped, then a start position is drawn at random among those whose
    first row and the row ``gap_length`` later share the same day-of-month.
    The gapped frame is also written to ``test_data.csv`` at the path derived
    from the working directory.

    Args:
        df (pd.DataFrame): Frame whose index elements expose ``.day``.

    Returns:
        tuple: ``(gapped_df, intact_df, gap_length)`` -- the frame with the gap
        set to NaN, the NaN-free reference frame, and the gap length in rows.

    Raises:
        ValueError: If the gap size is negative, or no start position leaves
            the whole gap inside a single day (the original looped forever or
            raised IndexError in that situation).
    """
    gap_length = int(int(input("Gap size (seconds): "))/ 12) # Avg observation interval
    if gap_length < 0:
        raise ValueError("Gap size must not be negative")

    copy_df = df.dropna()
    # Work on an explicit copy so the NaN assignment below never writes into a
    # frame derived from the caller's data.
    df = copy_df.copy()

    # Same candidate window as before (start < last position - gap_length), but
    # computed once and pre-filtered by the same-day rule instead of redrawing.
    days = [stamp.day for stamp in df.index]
    valid_starts = [
        start
        for start in range(max(len(df) - 1 - gap_length, 0))
        if days[start] == days[start + gap_length]
    ]
    if not valid_starts:
        raise ValueError(
            f"No position can hold a {gap_length}-row gap within a single day"
        )

    # Uses the stdlib `random` module; call random.seed() beforehand for a
    # reproducible gap position.
    selected_index = random.choice(valid_starts)

    df.iloc[selected_index:selected_index+gap_length, :] = np.nan

    datapath = re.sub(r'Notebooks|Python Scripts','Support Files/',os.getcwd())
    df.to_csv(datapath + 'test_data.csv')

    return df, copy_df, gap_length

In [None]:
def map_nan_gaps_indexes(pre_df, date):
    """
    Map each column to the row positions of its NaN runs that touch a month.

    For every column, rows whose index month equals ``date`` are scanned in
    order. Each NaN run found there is recorded from its first in-month row up
    to (but excluding) the first non-NaN row; a run never includes the row
    whose index label equals the last index label.

    Args:
        pre_df (pd.DataFrame): Frame whose index elements expose ``.month``.
        date (int | str): Month number; converted with ``int()``.

    Returns:
        dict: ``{column label: [row positions]}`` with positions in ascending
        order (an empty list for columns without recorded gaps).
    """
    n_rows = len(pre_df.index)
    if n_rows == 0:
        return {col_name: [] for col_name in pre_df.columns}

    target_month = int(date)
    # Precomputed once instead of one Timestamp / .iloc scalar lookup per step.
    in_month = [stamp.month == target_month for stamp in pre_df.index]
    is_last_label = np.asarray(pre_df.index == pre_df.index[-1])

    test_gaps = {}
    for col_pos, col_name in enumerate(pre_df.columns):
        is_nan = pre_df.iloc[:, col_pos].isna().to_numpy()
        gap_positions = []
        # Rows below run_end were already consumed by an earlier run. Rows are
        # visited in ascending order and runs only extend forward, so this
        # single bound replaces the original growing list of visited rows.
        run_end = 0
        for pos in range(n_rows):
            if not in_month[pos] or pos < run_end:
                continue
            cursor = pos
            while is_nan[cursor] and not is_last_label[cursor]:
                cursor += 1
            gap_positions.extend(range(pos, cursor))
            run_end = max(run_end, cursor)
        test_gaps[col_name] = gap_positions

    return test_gaps

In [None]:
def _load_neighbouring_month(file, month, year):
    """Load and clean the data for ``month``/``year``, dropping 'GlobalIR' when present."""
    # Every load starts from a fresh get_file_paths() result. The original fed
    # the output of the first path_function_extended call into the second one.
    # NOTE(review): assumes path_function_extended expects the list returned by
    # get_file_paths, as it does in main() -- confirm.
    path_list = path_function_extended(year, month, '', '', get_file_paths(file))
    return df_cleaner(path_list, file).drop(columns='GlobalIR', errors='ignore')


def identify_missing_data_beg_end_month(df, file):
    """
    Load the neighbouring month(s) when the edges of ``df`` are mostly missing.

    The first ~5 days and the last ~5 days of rows (estimated from the row
    count and the day-of-month of the last row) are inspected. When more than
    5 % of the cells in a window are NaN, the previous / following month is
    loaded through ``df_cleaner``.

    Args:
        df (pd.DataFrame): One month of data with a timestamp index.
        file (str): Data set label passed to the path helpers and ``df_cleaner``.

    Returns:
        tuple: ``(load_beg, load_end, month_i)``. Each of the first two entries
        is the loaded frame or ``False`` when nothing was loaded; ``month_i``
        is the month number of the first row.

    Notes:
        Hitting the first (January 2021) or last (February 2023) observed month
        now only skips that side. The original returned immediately, which
        skipped the end-of-month check or discarded an already loaded
        previous month.
    """
    first_stamp = df.iloc[0].name
    last_day = df.iloc[-1].name.day
    month_i = df.index[0].month

    beg_threshold = int(len(df) / last_day * 5)
    end_threshold = int(len(df) / last_day * (last_day - 4))

    def _nan_percentage(window):
        return window.isna().sum().sum() / window.size * 100

    load_beg = False
    load_end = False

    # Check for missing data in the beginning of the month
    if _nan_percentage(df.iloc[0:beg_threshold, :]) > 5:
        if month_i == 1 and df.index[0].year == 2021:
            print("January 2021 is the first month observed, can't join prior month.")
        else:
            month, year = surrounding_month(first_stamp.month, first_stamp.year, 'prev')
            load_beg = _load_neighbouring_month(file, month, year)

    # Check for missing data at the end of the month (the slice stops before
    # the final row, as in the original).
    if _nan_percentage(df.iloc[end_threshold:-1, :]) > 5:
        if month_i == 2 and df.index[0].year == 2023:
            # The guard tests for 2023; the original message said 2021.
            print("February 2023 is the last month observed, can't join following month.")
        else:
            month, year = surrounding_month(first_stamp.month, first_stamp.year, '')
            load_end = _load_neighbouring_month(file, month, year)

    return load_beg, load_end, month_i

In [None]:
def calculate_imputation_errors(imputed_df, copy_df, test_gaps):
    """
    Score imputed values against the reference values at the gap positions.

    Args:
        imputed_df (pd.DataFrame): Frame after imputation.
        copy_df (pd.DataFrame): Reference frame holding the true values.
        test_gaps (dict): Maps each column label of ``imputed_df`` to the row
            positions (used with ``.iloc``) that are evaluated.

    Returns:
        pd.DataFrame: One row per column with the metrics ``mae``, ``mse``,
        ``rmse`` and ``r2``.

    Side effects:
        Prints the metrics per column and the final table, and shows a bar
        chart of the R2 scores.
    """
    metric_names = ['mae','mse','rmse','r2']
    scores_by_column = {}

    for col in imputed_df.columns:
        gap_rows = test_gaps[col]
        predicted = imputed_df[col].iloc[gap_rows]
        actual = copy_df[col].iloc[gap_rows]

        mae = mean_absolute_error(actual, predicted)
        mse = mean_squared_error(actual, predicted)
        rmse = np.sqrt(mse)
        r2 = r2_score(actual, predicted)

        scores_by_column[col] = [mae, mse, rmse, r2]

        print(f"For {col}")
        print(f"Mean Absolute Error: {mae}")
        print(f"Mean Squared Error: {mse}")
        print(f"Root Mean Squared Error: {rmse}")
        print(f"R2 Score: {r2} \n")

    # Columns of the dict become rows after the transpose: one row per data column.
    error_df = pd.DataFrame(scores_by_column).T
    error_df.columns = metric_names

    print(error_df)

    px.bar(error_df, x = error_df.index, y = error_df['r2']).show()

    return error_df

In [None]:
def main(year, month, year_2, month_2, file):
    """
    Validate the user inputs and load the two cleaned data frames.

    Args:
        year (str): Must contain four consecutive digits.
        month (str): Must contain three consecutive letters.
        year_2 (str): Same rule as ``year``, for the second data set.
        month_2 (str): Same rule as ``month``, for the second data set.
        file (str): Used as a regex pattern that must match one of
            'Irradiance', 'Deger' or 'Fixed'.

    Returns:
        tuple: ``(df, df_2, file, year_2, month_2, year, month)`` where the two
        frames are the cleaned results for the first and second path.

    Raises:
        Exception: On the first input that fails validation. Inputs are checked
            in the order year, month, month_2, year_2, file.
    """
    year_pattern, month_pattern = r'\d{4}', r'[A-Za-z]{3}'
    ordered_checks = (
        (year_pattern, year),
        (month_pattern, month),
        (month_pattern, month_2),
        (year_pattern, year_2),
    )
    for pattern, value in ordered_checks:
        if not re.search(pattern, value):
            raise Exception(f"Incorret Input: {value}")

    known_files = ['Irradiance','Deger','Fixed']
    if not any(re.search(fr'{file}', known) for known in known_files):
        raise Exception(f"Incorret Input: File")

    path_list = path_function_extended(year, month, year_2, month_2, get_file_paths(file))
    df = df_cleaner([path_list[0]], file)
    df_2 = df_cleaner([path_list[1]], file)
    return df, df_2,file, year_2, month_2, year, month

# = Load all the Data = #
# The prompts run in keyword order (year, month, second year, second month,
# file). main() returns the two cleaned frames followed by the echoed inputs,
# which are unpacked here into notebook-level names used by later cells.
test_df, source_df, file, year_2, month_2, year, month = main(year = input("Year (format: YYYY): "),month = input("Month (format: jul): "),
     year_2 = input("Second Year (format: YYYY): "),month_2 = input("Second Month (format: jul): "),
     file = input("File (opt: Irradiance/Deger/Fixed): "))

In [None]:
def preprocessesing(test_df, source_df, file):
    """
    Build the gapped test frame, its gap-free reference and the gap positions.

    A synthetic gap is inserted into a copy of ``test_df``. If the start or end
    of the month is mostly missing, the neighbouring month(s) are concatenated
    around both the gapped frame and the reference frame, and the NaN runs of
    the test month are then mapped per column.

    Args:
        test_df (pd.DataFrame): Cleaned frame to insert the gap into.
        source_df (pd.DataFrame): Not used by this function.
        file (str): Data set label forwarded to the neighbouring-month loader.

    Returns:
        tuple: ``(test_df, base_df, test_gaps, syn_gap)`` -- gapped frame,
        reference frame, per-column gap positions, and gap length in rows.
    """
    test_df, base_df, syn_gap = missing_value_simulation(test_df.copy())

    beg_df, end_df, test_month = identify_missing_data_beg_end_month(test_df.copy(), file)

    # The loader returns False for a side it did not load, so keep only the
    # entries that are real frames.
    leading = [beg_df] if isinstance(beg_df, pd.DataFrame) else []
    trailing = [end_df] if isinstance(end_df, pd.DataFrame) else []

    if leading or trailing:
        test_df = pd.concat(leading + [test_df] + trailing, axis=0, ignore_index=False)
        base_df = pd.concat(leading + [base_df] + trailing, axis=0, ignore_index=False)

    test_gaps = map_nan_gaps_indexes(test_df.copy(), test_month)

    # The original return statement was missing the comma before syn_gap,
    # which is a SyntaxError.
    return test_df, base_df, test_gaps, syn_gap
    
# Rebinds test_df to the frame returned by preprocessesing (gap inserted, and
# neighbouring months attached when they were loaded). base_df is the matching
# reference frame, test_gaps the per-column gap positions, syn_gap the gap
# length in rows.
test_df, base_df, test_gaps, syn_gap = preprocessesing(test_df,source_df,file)

In [None]:
# Collects one entry per imputation method; the cells below store each
# method's error table (as a dict) under a label such as 'LOCF'.
imputation_dict = {}

In [None]:
def fill_forward(df, df_copy = base_df.copy(), test_gaps=test_gaps):
    """
    Impute gaps by carrying the last observation forward and score the result.

    Args:
        df (pd.DataFrame): Frame with gaps; modified in place (callers pass a copy).
        df_copy (pd.DataFrame): Reference frame with the true values. The
            default is captured when this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    # First and last rows are overwritten with 0 before filling, as in the
    # original -- presumably so a leading gap has a value to carry forward.
    df.iloc[[-1,0],:] = 0

    for col in df.columns:
        if not df[col].isna().any():
            continue
        # Series.ffill() replaces the deprecated fillna(method='ffill').
        df[col] = df[col].ffill()

    error_df = calculate_imputation_errors(df, df_copy, test_gaps)

    # NOTE(review): the check looks for 'DiffuseIR' but the plot uses
    # 'DirectIR' -- confirm both columns always appear together.
    if 'DiffuseIR' in df.columns:
        px.scatter(df,x=df.index,y='DirectIR').show()

    return error_df
    
# Score forward filling on a copy so test_df keeps its gaps for later methods.
imputation_dict['LOCF'] = fill_forward(test_df.copy()).to_dict()

In [None]:
def nearest(df, df_copy = base_df, test_gaps=test_gaps):
    """
    Impute gaps with pandas nearest-value interpolation and score the result.

    Args:
        df (pd.DataFrame): Frame with gaps; modified in place (callers pass a copy).
        df_copy (pd.DataFrame): Reference frame with the true values. The
            default is captured when this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    # First and last rows are overwritten with 0 before interpolating.
    df.iloc[[-1,0],:] = 0

    columns_with_gaps = [name for name in df.columns if df[name].isna().sum().sum()]
    for name in columns_with_gaps:
        df[name] = df[name].interpolate(method='nearest')

    error_df = calculate_imputation_errors(df, df_copy, test_gaps)

    # The plot is only drawn for frames that carry a 'DiffuseIR' column.
    has_diffuse = 'DiffuseIR' in df.columns
    if has_diffuse:
        px.scatter(df,x=df.index,y='DirectIR').show()

    return error_df

# Score nearest-value interpolation on a copy of the gapped frame.
imputation_dict['Nearest Neighbor'] = nearest(test_df.copy()).to_dict()

In [None]:
def kalman(df, df_copy = base_df, test_gaps=test_gaps):
    """
    Impute every column through the R handle ``kalman_StructTs`` with
    ``model="StructTS"`` and score the result.

    Args:
        df (pd.DataFrame): Frame with gaps; modified in place (callers pass a copy).
        df_copy (pd.DataFrame): Reference frame with the true values. The
            default is captured when this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    for name in df.columns:
        # Each column is handed to R as a float vector.
        r_vector = robjects.FloatVector(df[name].values.tolist())
        df[name] = kalman_StructTs(r_vector, model = "StructTS")

    error_df = calculate_imputation_errors(df, df_copy, test_gaps)

    # The plot is only drawn for frames that carry a 'DiffuseIR' column.
    has_diffuse = 'DiffuseIR' in df.columns
    if has_diffuse:
        px.scatter(df,x=df.index,y='DirectIR').show()

    return error_df

# Score the StructTS Kalman imputation on a copy of the gapped frame.
imputation_dict['Kalman Smoothing'] = kalman(test_df.copy()).to_dict()

In [None]:
def ARIMA(df, df_copy = base_df, test_gaps=test_gaps):
    """
    Impute every column through the R handle ``kalman_StructTs`` with
    ``model="auto.arima"`` and score the result.

    Args:
        df (pd.DataFrame): Frame with gaps; modified in place (callers pass a copy).
        df_copy (pd.DataFrame): Reference frame with the true values. The
            default is captured when this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    for name in df.columns:
        # Each column is handed to R as a float vector.
        r_vector = robjects.FloatVector(df[name].values.tolist())
        df[name] = kalman_StructTs(r_vector, model = "auto.arima")

    error_df = calculate_imputation_errors(df, df_copy, test_gaps)

    # The plot is only drawn for frames that carry a 'DiffuseIR' column.
    has_diffuse = 'DiffuseIR' in df.columns
    if has_diffuse:
        px.scatter(df,x=df.index,y='DirectIR').show()

    return error_df

# Score the auto.arima variant of the Kalman imputation on a copy of the gapped frame.
imputation_dict['ARIMA'] = ARIMA(test_df.copy()).to_dict()

In [None]:
def seasonal_decom(df, df_copy = base_df, test_gaps=test_gaps):
    """
    Impute every column through the R handle ``sea_decom`` with
    ``algorithm="kalman"`` and score the result.

    Args:
        df (pd.DataFrame): Frame with gaps; modified in place (callers pass a copy).
        df_copy (pd.DataFrame): Reference frame with the true values. The
            default is captured when this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    for name in df.columns:
        # Each column is handed to R as a float vector.
        r_vector = robjects.FloatVector(df[name].values.tolist())
        df[name] = sea_decom(r_vector, algorithm = "kalman")

    error_df = calculate_imputation_errors(df, df_copy, test_gaps)

    # The plot is only drawn for frames that carry a 'DiffuseIR' column.
    has_diffuse = 'DiffuseIR' in df.columns
    if has_diffuse:
        px.scatter(df,x=df.index,y='DirectIR').show()

    return error_df

# Score the seasonal-decomposition imputation on a copy of the gapped frame.
imputation_dict['Seasonal Decomposition'] = seasonal_decom(test_df.copy()).to_dict()

In [None]:
def zoo_functions(df):
    """
    Export ``df`` for R and run the ``r_imputation.R`` script on it.

    The frame gets its first and last rows set to 0 and a ``seconds`` column
    (seconds from each timestamp to the last one), is written to ``R_df_.csv``
    in the R support directory, and the script is run from that directory.

    Args:
        df (pd.DataFrame): Frame with a timestamp index; modified in place
            (callers pass a copy).

    Raises:
        FileNotFoundError: If the ``Rscript`` executable cannot be found.
        subprocess.CalledProcessError: If the script exits with a non-zero
            status. The original ignored the exit code, so later cells could
            silently read stale CSV files.
    """
    df.iloc[[-1,0],:] = 0

    final_stamp = df.index[-1]
    df['seconds'] = [(final_stamp - stamp).total_seconds() for stamp in df.index]

    df.index.name = 'Timestamp'

    datapath = re.sub(r'Notebooks|Python Scripts','Support_Scripts/R',os.getcwd())

    df.to_csv(datapath + '/R_df_.csv')

    # Argument list + cwd= instead of a shell string + os.chdir: paths that
    # contain spaces work, and the notebook's working directory is never
    # changed, so it cannot be left pointing at the R directory after a failure.
    subprocess.run(["Rscript", datapath + "/r_imputation.R"], cwd=datapath, check=True)

def zoo_spline(df, df_copy = base_df, test_gaps=test_gaps):
    """
    Score the values stored in ``spline_df.csv`` in the R support directory.

    Args:
        df: Ignored; it is replaced by the frame read from the CSV file.
        df_copy (pd.DataFrame): Reference frame; its index becomes the
            'Timestamp' index of the loaded frame. The default is captured when
            this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    r_output_dir = re.sub(r'Notebooks|Python Scripts','Support_Scripts/R/',os.getcwd())

    # The CSV rows are paired with the reference index by position.
    df = (
        pd.read_csv(r_output_dir + 'spline_df.csv')
        .assign(Timestamp=df_copy.index)
        .set_index('Timestamp')
    )

    error_df = calculate_imputation_errors(df, df_copy, test_gaps)

    # The plot is only drawn for frames that carry a 'DiffuseIR' column.
    has_diffuse = 'DiffuseIR' in df.columns
    if has_diffuse:
        px.scatter(df,x=df.index,y='DirectIR').show()

    return error_df

def forecast_interpolate(df, df_copy = base_df, test_gaps=test_gaps):
    """
    Score the values stored in ``forecast_int_df.csv`` in the R support directory.

    Args:
        df: Ignored; it is replaced by the frame read from the CSV file.
        df_copy (pd.DataFrame): Reference frame; its index becomes the
            'Timestamp' index of the loaded frame. The default is captured when
            this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    r_output_dir = re.sub(r'Notebooks|Python Scripts','Support_Scripts/R/',os.getcwd())

    # The CSV rows are paired with the reference index by position.
    df = (
        pd.read_csv(r_output_dir + 'forecast_int_df.csv')
        .assign(Timestamp=df_copy.index)
        .set_index('Timestamp')
    )

    error_df = calculate_imputation_errors(df, df_copy, test_gaps)

    # The plot is only drawn for frames that carry a 'DiffuseIR' column.
    has_diffuse = 'DiffuseIR' in df.columns
    if has_diffuse:
        px.scatter(df,x=df.index,y='DirectIR').show()

    return error_df

def zoo_interpolation(df, df_copy = base_df, test_gaps=test_gaps):
    """
    Score the values stored in ``interpolation_df.csv`` in the R support directory.

    Args:
        df: Ignored; it is replaced by the frame read from the CSV file.
        df_copy (pd.DataFrame): Reference frame; its index becomes the
            'Timestamp' index of the loaded frame. The default is captured when
            this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    r_output_dir = re.sub(r'Notebooks|Python Scripts','Support_Scripts/R/',os.getcwd())

    # The CSV rows are paired with the reference index by position.
    df = (
        pd.read_csv(r_output_dir + 'interpolation_df.csv')
        .assign(Timestamp=df_copy.index)
        .set_index('Timestamp')
    )

    error_df = calculate_imputation_errors(df, df_copy, test_gaps)

    # The plot is only drawn for frames that carry a 'DiffuseIR' column.
    has_diffuse = 'DiffuseIR' in df.columns
    if has_diffuse:
        px.scatter(df,x=df.index,y='DirectIR').show()

    return error_df

def forecast_auto_arima(df, df_copy = base_df, test_gaps=test_gaps):
    """
    Score the values stored in ``forecast_arima.csv`` in the R support directory.

    Args:
        df: Ignored; it is replaced by the frame read from the CSV file.
        df_copy (pd.DataFrame): Reference frame; its index becomes the
            'Timestamp' index of the loaded frame. The default is captured when
            this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    r_output_dir = re.sub(r'Notebooks|Python Scripts','Support_Scripts/R/',os.getcwd())

    # The CSV rows are paired with the reference index by position.
    df = (
        pd.read_csv(r_output_dir + 'forecast_arima.csv')
        .assign(Timestamp=df_copy.index)
        .set_index('Timestamp')
    )

    error_df = calculate_imputation_errors(df, df_copy, test_gaps)

    # The plot is only drawn for frames that carry a 'DiffuseIR' column.
    has_diffuse = 'DiffuseIR' in df.columns
    if has_diffuse:
        px.scatter(df,x=df.index,y='DirectIR').show()

    return error_df

# Export the gapped frame and run the R script first; the scoring functions
# below read CSV files from the same R support directory, so this call must
# come before them.
zoo_functions(test_df.copy())

# The auto.arima result is currently not scored.
# imputation_dict['Forecast Auto.Arima'] = forecast_auto_arima(df.copy()).to_dict()
# Each function ignores the frame passed in and scores the CSV it loads.
imputation_dict['Zoo Interpolation'] = zoo_interpolation(test_df.copy()).to_dict()
imputation_dict['Zoo Spline'] = zoo_spline(test_df.copy()).to_dict()
imputation_dict['Forecast Interpolation'] = forecast_interpolate(test_df.copy()).to_dict()

In [None]:
def interpolate(df, df_copy = base_df, test_gaps=test_gaps):
    """
    Impute gaps with pandas time-weighted interpolation and score the result.

    Args:
        df (pd.DataFrame): Frame with gaps; modified in place (callers pass a copy).
        df_copy (pd.DataFrame): Reference frame with the true values. The
            default is captured when this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    columns_with_gaps = [name for name in df.columns if df[name].isna().sum().sum()]

    for name in columns_with_gaps:
        # limit_direction='both' also fills NaN runs at either end of the column.
        df[name] = df[name].interpolate(method='time', limit_direction='both')

    return calculate_imputation_errors(df, df_copy, test_gaps)

# Score pandas time interpolation on a copy of the gapped frame.
imputation_dict['Python Interpolate'] = interpolate(test_df.copy()).to_dict()

In [None]:
# NOTE(review): these imports sit mid-notebook; `robjects` is already imported
# in the first cell (as `rpy2.robjects`).
from rpy2 import robjects
from rpy2.robjects import pandas2ri
# Rebinds the notebook-level name kalman_StructTs, which the first cell bound
# to 'na_kalman', to the dotted R name. Anything executed after this cell that
# uses kalman_StructTs sees this binding.
kalman_StructTs = robjects.r['na.kalman']

def sm_interpolate(df, df_copy = base_df, test_gaps=test_gaps):
    """
    Impute ``df`` through the R function ``na.interp`` and score the result.

    Args:
        df (pd.DataFrame): Frame with gaps, converted to an R data frame.
        df_copy (pd.DataFrame): Reference frame with the true values. The
            default is captured when this cell is executed.
        test_gaps (dict): Column label -> row positions to score. The default
            is captured when this cell is executed.

    Returns:
        pd.DataFrame: Error metrics per column from ``calculate_imputation_errors``.
    """
    # Convert the DataFrame to an R data frame
    pandas2ri.activate()
    r_data_frame = pandas2ri.py2rpy(df)

    # Load the imputeTS package
    robjects.r('library(imputeTS)')

    # NOTE(review): confirm that `na.interp` is provided by a package loaded in
    # the R session and that it accepts an `option` argument.
    imputed_data_frame = robjects.r('na.interp')(r_data_frame, option='arima')

    # Convert back with rpy2py, the counterpart of py2rpy used above. The
    # original called ri2py, which belongs to the older py2ri/ri2py naming and
    # does not exist alongside py2rpy.
    imputed_data_frame = pandas2ri.rpy2py(imputed_data_frame)

    error_df = calculate_imputation_errors(imputed_data_frame, df_copy, test_gaps)

    return error_df

# imputation_dict['SM Interpolate'] = sm_interpolate(test_df.copy()).to_dict()

In [None]:
def performance(impute_dict):
    """
    Print the best-scoring imputation method per data column and return all scores.

    Args:
        impute_dict (dict): ``{method: {metric: {column: value}}}`` with the
            metrics 'mae', 'mse', 'rmse' and 'r2'.

    Returns:
        pd.DataFrame: Rows indexed by ``(method, metric)``, one column per data
        column.

    Side effects:
        For every data column, prints the method with the highest R2 and the
        methods with the lowest MAE and RMSE.
    """
    flattened = {
        (method, metric): per_column
        for method, by_metric in impute_dict.items()
        for metric, per_column in by_metric.items()
    }
    imputation_df = pd.DataFrame.from_dict(flattened).T

    def _best(metric, other_metrics, highest):
        # Keep only the rows of one metric by dropping the other three from
        # the second index level, then take the best value and its row label.
        rows = imputation_df.drop(other_metrics, axis=0, level=1)
        best_value = rows.max(axis=0) if highest else rows.min(axis=0)
        best_label = rows.idxmax(axis=0) if highest else rows.idxmin(axis=0)
        table = pd.DataFrame(best_value, columns = [metric])
        table['Method'] = best_label.values
        return table

    r2_df = _best('r2', ['mae','mse','rmse'], True)
    mae_df = _best('mae', ['r2','mse','rmse'], False)
    rmse_df = _best('rmse', ['r2','mse','mae'], False)

    for row in r2_df.index:
        print(row)
        # 'Method' holds (method, metric) labels; [0] extracts the method name.
        print(f"Optimal Imputation Method for {row}: {r2_df.loc[row]['Method'][0]}, R2 score: {round(r2_df.loc[row]['r2'],5)}")
        print(f"Optimal Imputation Method for {row}: {mae_df.loc[row]['Method'][0]}, MAE score: {round(mae_df.loc[row]['mae'],5)}")
        print(f"Optimal Imputation Method for {row}: {rmse_df.loc[row]['Method'][0]}, RMSE score: {round(rmse_df.loc[row]['rmse'],5)}\n")

    return imputation_df

# NOTE(review): `test`, `nan_gap_i` and `nan_gap_f` are not assigned in any cell
# visible in this notebook; unless they are defined elsewhere, this cell raises
# NameError on a fresh kernel -- confirm where they come from.
if test == 'Synthetic':
    # syn_gap is a row count (the prompted seconds divided by the 12-second
    # average observation interval), so it is converted back to seconds here.
    # The reported range is the first and last gap position of the first column.
    print(f"\nImputation methods for {file} and error metrics in {year}, {month} with a {syn_gap * 12} second gap from {test_df.index[test_gaps[test_df.columns[0]][0]]} - {test_df.index[test_gaps[test_df.columns[0]][-1]]}.\n")

    
print(f"\nImputation methods for {file} and error metrics in {year}, {month} modeled with NaN values in {year_2}, {month_2}.\nIncluding gaps larger than {nan_gap_i} and smaller than {nan_gap_f} seconds.\n")
# A shallow copy is passed, so performance() cannot rebind entries of imputation_dict.
performance(imputation_dict.copy())