$$\large \text{Packages & Specs} $$

In [1]:
import os
import pandas as pd
import numpy as np
import pvlib
import re
import threading
import queue

import plotly.express as px

from sklearn.preprocessing import MinMaxScaler
from pykalman import KalmanFilter
from sklearn.impute import KNNImputer

# Kalman Smoothing using R objects
import rpy2.robjects as robjects
# import R packages
from rpy2.robjects.packages import importr
# Load the R package `imputeTS`; the handles below look R objects up by
# name through `robjects.r`.
imputeTS = importr('imputeTS') 
# Both Kalman handles resolve to the same R object, `na.kalman`.
# NOTE(review): presumably the StructTS / auto.arima variants are chosen
# through an argument at call time -- confirm against the call sites.
kalman_StructTs = robjects.r['na.kalman']
kalman_auto_arima = robjects.r['na.kalman']
# Handles for the R objects `na.seadec` and `na.seasplit`.
sea_decom = robjects.r['na.seadec']
sea_split = robjects.r['na.seasplit']

$$\large \text{Loading all possible data paths} $$

In [2]:
def execute(file):
    """Collect the paths of all data files stored for one data set.

    The data root is the current working directory with ``Notebooks``
    replaced by ``Data/``.  Every directory in that root whose name has no
    dot is treated as a year folder; inside each year folder the
    sub-folder named exactly ``file`` is searched for files ending in
    ``.csv`` or ``.xlsx``.

    Parameters
    ----------
    file : str
        Name of the data-set folder to look for inside each year folder.

    Returns
    -------
    list of str
        ``<data root><year>/<file>/<name>`` for every matching file, in the
        order ``os.scandir`` yields them (not sorted).

    Notes
    -----
    Assumes the working directory path contains ``Notebooks``; otherwise
    the substitution is a no-op and the derived root has no trailing slash.
    """
    # Anchored extension match.  The previous pattern used character
    # classes, so any name containing one of ". c s v x l" was accepted.
    data_file = re.compile(r'\.(csv|xlsx)$')
    datapath = re.sub(r'Notebooks', 'Data/', os.getcwd())

    path_list = []
    for year_entry in os.scandir(datapath):
        # Names with a dot are stray files / hidden entries, not year folders.
        if '.' in year_entry.name or not year_entry.is_dir():
            continue
        year_path = datapath + f"{year_entry.name}"
        for set_entry in os.scandir(year_path):
            if set_entry.name != file or not set_entry.is_dir():
                continue
            month_path = year_path + f"/{set_entry.name}/"
            for data_entry in os.scandir(month_path):
                if data_file.search(data_entry.name):
                    path_list.append(month_path + f"{data_entry.name}")
    return path_list

$$\Large \text{Data Pre-Processing} $$

$$\large \text{Reshaping dataframe with timestamp index and feature} $$

In [4]:
def reshape_df(df):
    """Return a copy of ``df`` indexed by a sorted, UTC-localized timestamp.

    The timestamp of each row is parsed from the string
    ``"<DayID>T<TimeID>"``.  The ``DayID`` and ``TimeID`` columns are
    dropped from the result and the new index is named ``date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the columns ``DayID`` and ``TimeID``.
        It is left untouched (the previous version rewrote both columns
        and added a ``date`` column to the caller's frame).

    Returns
    -------
    pandas.DataFrame
        New frame with a tz-aware (``Etc/UTC``) ``DatetimeIndex``, sorted
        in ascending order.
    """
    stamps = pd.to_datetime(
        df['DayID'].astype(str) + 'T' + df['TimeID'].astype(str)
    )
    # ``drop`` returns a new frame, so replacing its index below cannot
    # leak back into the caller's object.
    reshaped = df.drop(columns=['DayID', 'TimeID'])
    reshaped.index = pd.DatetimeIndex(stamps, name='date').tz_localize(tz='Etc/UTC')
    return reshaped.sort_index()

$$\large \text{Creating rows with NaN values where time gaps are larger than 21 seconds between observations} $$

In [5]:
def add_missing_times(df):
    """Insert all-NaN rows wherever the time series has gaps.

    Three kinds of gaps are filled with timestamps spaced 11 seconds apart:

    * between two consecutive observations that are more than 21 seconds
      apart,
    * from the start of the month up to the first observation, when the
      first observation is later than hour 1 of day 1,
    * from the last observation up to 23:59:59 of the last day of the
      month, when the last observation is earlier than hour 23 of that day.

    Timestamps that already exist in ``df`` are never re-inserted.  Each
    generated range starts on an existing observation, which the previous
    version duplicated as an all-NaN row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex`` sorted in ascending order, all of
        whose rows belong to the same month.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the filler rows, sorted by index.  An empty frame is
        returned unchanged.
    """
    if df.empty:
        return df

    stamps = df.index
    first, last = stamps[0], stamps[-1]
    one_second = pd.Timedelta(1, 's')
    pieces = []

    # Gaps between consecutive observations, found without a Python loop
    # over every timestamp.
    gap_seconds = (stamps[1:] - stamps[:-1]).total_seconds()
    for pos in np.flatnonzero(gap_seconds > 21):
        pieces.append(pd.date_range(start=stamps[pos],
                                    end=stamps[pos + 1] - one_second,
                                    freq='11s'))

    # Missing values at the beginning of the month.
    if first > first.replace(day=1, hour=1):
        print("Month found with missing values at the beginning of the month.")
        print('Time:', first)
        pieces.append(pd.date_range(
            start=first.replace(day=1, hour=0, minute=0, second=0),
            end=first - one_second,
            freq='11s'))

    # Missing values at the end of the month.  Day 28 plus 4 days always
    # lands in the next month; stepping back by that day-of-month yields
    # the last day of the current month.
    next_month = first.replace(day=28, hour=0, minute=0, second=0) + pd.Timedelta(4, 'd')
    last_day = next_month - pd.Timedelta(next_month.day, 'd')
    if last < last_day.replace(hour=23, minute=0):
        print("Month found with missing values at the end of the month.")
        print('Time:', last)
        pieces.append(pd.date_range(
            start=last,
            end=last_day.replace(hour=23, minute=59, second=59),
            freq='11s'))

    if not pieces:
        return df.sort_index()

    # ``difference`` de-duplicates the generated stamps and removes the
    # ones that coincide with real observations.
    new_times = pieces[0].append(pieces[1:]).difference(stamps)
    if new_times.empty:
        return df.sort_index()

    # A float NaN filler keeps numeric columns numeric after the concat.
    filler = pd.DataFrame(np.nan, index=new_times, columns=df.columns)
    return pd.concat([df, filler], axis=0).sort_index()

$$\large \text{Removing night time observations, and irregular variable values} $$

In [6]:
def remove_night(df):
    """Keep only rows recorded while the sun is at or above the horizon.

    The solar position is computed with pvlib (``pyephem`` method) for
    every timestamp in ``df.index`` at a fixed site.  Rows whose zenith
    angle exceeds 90 are dropped, and every remaining value equal to 0 is
    replaced with NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds the observation times.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with zeros turned into NaN.
    """
    site = {'latitude': 49.102, 'longitude': 6.215, 'altitude': 220}
    sun = pvlib.solarposition.get_solarposition(
        time=df.index, method='pyephem', **site)
    is_daytime = sun['zenith'] <= 90
    daylight_rows = df[is_daytime]
    return daylight_rows.replace(0, np.nan)

def remove_bad_temps(df):
    """Blank out ``Temperature`` readings below 0 or above 60.

    The column is overwritten in place on ``df``; offending values become
    NaN.  The same frame object is returned.
    """
    temps = df['Temperature']
    out_of_range = (temps < 0) | (temps > 60)
    df['Temperature'] = np.where(out_of_range, np.nan, temps)
    return df

def remove_bad_wind_speeds(df):
    """Blank out ``WindSpeed`` readings below 0 or above 100.

    The column is overwritten in place on ``df``; offending values become
    NaN.  The same frame object is returned.
    """
    speeds = df['WindSpeed']
    out_of_range = (speeds > 100) | (speeds < 0)
    df['WindSpeed'] = np.where(out_of_range, np.nan, speeds)
    return df

def remove_bad_dni(df):
    """Blank out ``DirectIR`` readings below 0 or above 2000.

    The column is overwritten in place on ``df``; offending values become
    NaN.  The same frame object is returned.
    """
    direct = df['DirectIR']
    out_of_range = (direct < 0) | (direct > 2000)
    df['DirectIR'] = np.where(out_of_range, np.nan, direct)
    return df

def remove_bad_dhi(df):
    """Blank out ``DiffuseIR`` readings below 0 or above 2000.

    The column is overwritten in place on ``df``; offending values become
    NaN.  The same frame object is returned.
    """
    diffuse = df['DiffuseIR']
    out_of_range = (diffuse < 0) | (diffuse > 2000)
    df['DiffuseIR'] = np.where(out_of_range, np.nan, diffuse)
    return df

def remove_neg(df):
    """Replace every negative entry of ``df`` with NaN, in place.

    Returns the same frame object that was passed in.
    """
    negative = df < 0
    df[negative] = np.nan
    return df

In [None]:
def knn(df):
    """Fill missing values column by column with a 7-nearest-neighbour imputer.

    Two helper features are added to ``df``: ``Seconds`` (time elapsed
    since midnight of the row's day) and ``Day`` (day of the month).  For
    every column that contains NaN, the column is min-max scaled together
    with the two helper features, imputed with a distance-weighted
    ``KNNImputer`` and scaled back.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``DatetimeIndex``.  It is modified in place; the
        ``Seconds`` and ``Day`` helper columns remain on the frame.

    Returns
    -------
    pandas.DataFrame
        The same frame, returned so the function can be used like the
        other cleaning helpers in this file (``df = knn(df)``).  The
        previous version returned ``None``.
    """
    stamps = df.index
    # Vectorized replacements for the former per-row Python loops.
    df['Seconds'] = np.asarray((stamps - stamps.normalize()).total_seconds())
    df['Day'] = np.asarray(stamps.day)

    for col in df.columns:
        if not df[col].isna().any():
            continue
        features = df[[col, 'Day', 'Seconds']]
        scaler = MinMaxScaler()
        scaled = scaler.fit_transform(features)
        imputer = KNNImputer(n_neighbors=7, weights='distance')
        restored = scaler.inverse_transform(imputer.fit_transform(scaled))
        # Column 0 is ``col``; assigning positionally avoids an index
        # re-alignment of the imputed values.
        df[col] = restored[:, 0]

    return df

In [None]:
def imputation(df):
    # Work in progress: walks every column and locates runs of consecutive
    # NaN values, then branches on how long each run lasts in seconds.
    # NOTE(review): this function is not valid Python as written -- the
    # `elif` branch below has no body -- and it reads `knn_imputations`,
    # `test_gaps` and `copy_df`, none of which are assigned in this function
    # or in the visible part of the file.  It also has no return statement.
    for col in range(len(df.columns)):
        # Row positions of this column already attributed to a NaN run.
        inx_ind = []
        for index in range(len(df.index)):
            # Skip rows that were consumed by an earlier run.
            if index in inx_ind: continue
            # c counts the NaN cells found starting at row `index`.
            c = 0
            # NOTE(review): `NaN != x` is always True, so the second operand
            # cannot stop the loop once the first one holds; a NaN run that
            # reaches the last row would index past the end -- confirm intent.
            while np.isnan(df.iloc[index+c,col]) and df.iloc[index+c,col] != df.iloc[-1,col]:
                inx_ind += [index+c]
                c += 1
            # No NaN at this row: nothing to classify.
            if not c: continue
            # Seconds between the first NaN row and the first row after the run.
            dt = (df.index[index+c] - df.index[index]).total_seconds()
            if dt <= 200:
                # NOTE(review): appends every position collected so far for
                # this column, not only the current run -- confirm intent.
                knn_imputations += inx_ind
            elif dt <= 10000:
                
    # Drops the rows at the positions listed in `test_gaps` from both frames.
    if test_gaps:
        df = df.drop(df.iloc[test_gaps].index, axis=0)
        copy_df = copy_df.drop(copy_df.iloc[test_gaps].index, axis=0)

$$\large \text{DataFrame cleaner: takes in a file path and loads the preprocessed data} $$

In [None]:
class Imputer():
    """Clean one month of raw measurements and write the result to CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data as read from the monthly file.
    month : str
        Month name; used (lower-cased) to build the output file name.
    year : str
        Year; used to build the output directory.
    file : str
        Data-set name.  The sensor range checks are only applied when it
        equals ``'Irradiance'``.
    """

    def __init__(self,df,month,year,file):
        self.df = df
        self.month = month
        self.year = year
        self.file = file
        super().__init__()

    def run(self):
        """Run the cleaning pipeline on ``self.df`` and save it.

        The cleaned frame is written to
        ``<cwd>/data/<year>/<month>/clean_<month lower-cased>.csv``.
        """
        # ==== reshaping df for timestamp & adjusted headers ==== #
        self.df = reshape_df(self.df)

        # === filling gaps in time intervals === #
        self.df = add_missing_times(self.df)

        # ==== Using PvLib to remove nighttime values === #
        self.df = remove_night(self.df)

        # === Removing negative values === #
        self.df = remove_neg(self.df)

        if self.file == 'Irradiance':
            # The last three steps used to operate on an undefined
            # ``df_load`` (NameError); every step works on ``self.df``.

            # === Removing misread Temps === #
            self.df = remove_bad_temps(self.df)

            # === Removing misread Wind Speeds === #
            self.df = remove_bad_wind_speeds(self.df)

            # === Removing misread dni === #
            self.df = remove_bad_dni(self.df)

            # === Removing misread dhi === #
            self.df = remove_bad_dhi(self.df)

        # One queue entry per column, consumed by four threads.
        # NOTE(review): `wrapper_targetFunc` and `interpolate` are not
        # defined in the visible part of the file -- confirm they exist.
        q = queue.Queue()

        for col in self.df.columns:
            q.put_nowait(col)

        for _ in range(4):
            threading.Thread(target=wrapper_targetFunc,args=(interpolate, q, self.df)).start()
        q.join()

        # NOTE(review): the range checks above use the column names
        # 'DirectIR' / 'DiffuseIR'; presumably the columns are renamed to
        # 'dni' / 'dhi' somewhere outside this view -- confirm.
        self.df['ghi'] = self.df['dhi'] + self.df['dni']

        self.df = wind_speed_to_abs(self.df)

        cwd = os.getcwd()
        datapath = cwd + "/data/" + self.year + '/' + self.month
        file = self.month.lower() + '.csv'
        self.df.to_csv(datapath + "/clean_" + file)

In [None]:
class Worker(threading.Thread):
    """Thread that cleans the monthly files whose paths arrive on a queue.

    Parameters
    ----------
    queue : queue.Queue
        Queue of file paths (strings) to process.
    file : str
        Data-set name forwarded to ``Imputer``.
    """

    def __init__(self, queue, file):
        threading.Thread.__init__(self)
        self.queue = queue
        self.file = file

    def run(self):
        """Process queued paths forever; the loop never exits on its own."""
        while True:
            file_path = self.queue.get()
            try:
                self.clean_file(file_path)
            except Exception as exc:
                # Report and move on: one bad file must not kill the worker,
                # otherwise the remaining queue items are never consumed.
                print(f"Failed to clean {file_path}: {exc!r}")
            finally:
                # Always acknowledge the item so ``queue.join()`` can return
                # even when cleaning failed.
                self.queue.task_done()

    def clean_file(self, file_path):
        """Read one monthly CSV and run the cleaning pipeline on it.

        Raises
        ------
        ValueError
            If ``file_path`` does not look like
            ``.../<4-digit year>/<folder>/<month>.csv``.
        """
        # Case-insensitive: folder names such as "Irradiance" start with a
        # capital letter, which the former lower-case-only classes rejected.
        match = re.search(r"/(\d{4})/[a-z]*/([a-z]*)\.csv", file_path,
                          flags=re.IGNORECASE)
        if match is None:
            raise ValueError(f"Unrecognised data file path: {file_path}")
        year, month = match.group(1, 2)
        # Read in file
        df = pd.read_csv(file_path, sep="\t|,", engine='python')
        # Perform data cleaning.  The instance used to be created and then
        # discarded without ever running the pipeline.
        Imputer(df, month, year, self.file).run()

# Driver: clean every monthly file of the chosen data set with a small pool
# of worker threads and report the wall-clock run time.
from datetime import datetime  # was used below without ever being imported

starttime = datetime.now()

# initializing queue
q = queue.Queue()

# The closing parenthesis of this call was missing (SyntaxError).
file = input("File (opt: Irradiance/Deger/Fixed): ")

for i in range(4): # change 4 to the number of threads you want to use
    t = Worker(q, file)
    # Daemon threads: the workers loop forever, so they must not keep the
    # interpreter alive once the queue has been drained.
    t.daemon = True
    t.start()

# adding file paths to the queue
file_paths = execute(file)

for file_path in file_paths:
    q.put(file_path)

q.join()  # Wait until every queued path has been processed

endtime = datetime.now()

runtime = endtime - starttime

# str(runtime) gives the same text as timedelta.__str__(runtime) without
# needing the (never imported) timedelta name.
print("Start:",starttime,"\nEnd:",endtime,"\nRun Time:",str(runtime))