$$\large \text{Packages & Specs} $$

In [None]:
import os
import sys
import pandas as pd
import numpy as np
import re
import json

import plotly.express as px

import scipy

# Build the helper-module directory by replacing "Notebooks" with
# "Python Scripts" in the current working directory, then make it importable.
# NOTE(review): re.sub rewrites every occurrence of "Notebooks" in the path;
# this assumes the notebook is launched from inside a "Notebooks" folder.
module_path = re.sub(r'Notebooks','Python Scripts',os.getcwd())
sys.path.append(module_path)
# Star import: presumably supplies the helpers used below that are not defined
# in this file (reshape_df, add_missing_times, remove_night, ...) - TODO confirm.
from pv_modules import *

$$\large \text{Dataframe Cleaner} $$

In [None]:
def df_cleaner(path_list,file):
    """Load every file in ``path_list``, clean it and combine the results.

    Each path is read with ``pd.read_csv`` (tab- or comma-separated) and then
    passed through the project helpers ``reshape_df``, ``add_missing_times``,
    ``remove_night``, a value cleaner and ``time_features``, in that order.
    Those helpers are not defined in this file; their exact behaviour is
    assumed from their names and the original inline comments.

    Parameters
    ----------
    path_list : iterable of paths accepted by ``pd.read_csv``.
    file : str
        Dataset type. ``'Irradiance'`` selects ``clean_irradiance_values``;
        any other value selects ``clean_deger_fixed_values``.

    Returns
    -------
    df : pd.DataFrame
        All cleaned frames concatenated row-wise and sorted by index.
    missing_intervals : list
        The intervals returned by ``add_missing_times`` for every file,
        in file order.
    outlier_output : pd.DataFrame
        The reshaped but otherwise unprocessed frames concatenated row-wise
        (used for the pre-processing outlier report).

    Raises
    ------
    Exception
        If a path loads as an empty dataframe.
    """
    cleaned_frames = []
    raw_frames = []
    missing_intervals = []

    for path in path_list:

        df_load = pd.read_csv(path,sep="\t|,",engine='python')

        if df_load.empty:
            raise Exception(f"The path: {path} loaded an empty dataframe.")

        # ==== reshaping df for timestamp & adjusted headers ==== #
        df_load = reshape_df(df_load,file)

        # === keep a copy of the reshaped frame for the pre-processing outlier report === #
        # The copy protects it from any in-place changes made by the later steps.
        raw_frames.append(df_load.copy())

        # === filling gaps in time intervals === #
        df_load, m_intervals = add_missing_times(df_load)

        # ==== using PvLib to remove night-time values ==== #
        df_load = remove_night(df_load)

        if file == 'Irradiance':
            df_load = clean_irradiance_values(df_load)
        else:
            df_load = clean_deger_fixed_values(df_load)

        # === Time Features === #
        df_load = time_features(df_load)

        cleaned_frames.append(df_load)
        missing_intervals += m_intervals

    # Nothing to combine: return empty frames, as the loop-free path always did.
    if not cleaned_frames:
        return pd.DataFrame(), missing_intervals, pd.DataFrame()

    # Concatenate and sort once instead of re-copying and re-sorting the
    # accumulated data after every file.
    df = pd.concat(cleaned_frames,axis=0,ignore_index=False).sort_index()
    outlier_output = pd.concat(raw_frames,axis=0,ignore_index=False)

    return df, missing_intervals, outlier_output

$$\Large \text{Summary of NaN Values} $$

In [None]:
def _print_missing_by(df, missing, key, ordered=False, suffix=""):
    """Print the NaN count and percentage for each distinct value of ``df[key]``."""
    groups = df[key].unique()
    if ordered:
        groups = sorted(groups)
    for value in groups:
        subset = missing[df[key] == value]
        n_miss = subset.sum().sum()
        perc = round(n_miss / subset.size * 100,3)
        print(f"{value}, Missing: {n_miss} ({perc}%){suffix}")


def summarize_nan(df):
    """Print a report of the NaN values in ``df``.

    The ``day``, ``month`` and ``year`` columns are treated as grouping keys
    and excluded from every NaN count. The report contains, in order:

    * the share of NaN cells that sit in rows where every remaining column is
      NaN (reported as "System Outage"), the remaining share (reported as
      "MAR") and the total share;
    * the NaN count and percentage per column;
    * the NaN count and percentage per distinct ``day`` value;
    * per ``month`` (sorted) and per ``year`` - only when ``df`` holds more
      than one distinct month.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``day``, ``month`` and ``year`` columns.

    Returns
    -------
    None
    """
    # Boolean NaN mask of the measured columns, computed once and reused.
    missing = df.drop(['day','month','year'],axis=1).isna()

    total_nan = missing.sum().sum()
    total_values = missing.size
    # Rows in which every measured column is NaN.
    mt_count = missing.all(axis=1).sum()
    t_perc = round(total_nan/total_values * 100,3)
    mt_perc = round(mt_count*missing.shape[1]/total_values * 100,3)

    print(f"\nPercentage of NaN values due to System Outage: {mt_perc}% \n")

    print(f"Precentage of MAR NaN values: {round(t_perc-mt_perc,3)}% \n")

    print(f"Precentage of Total NaN values: {t_perc}%")

    print("\n Missing values by column")

    for col in missing.columns:
        n_miss = missing[col].sum()
        perc = round(n_miss / df.shape[0] * 100,3)
        print(f"{col}, Missing: {n_miss} ({perc}%)")

    print("\n Missing values by day")
    _print_missing_by(df, missing, 'day')

    # With a single month the month/year breakdowns are skipped.
    if len(df['month'].unique()) == 1: return

    print("\n Missing values by month")
    _print_missing_by(df, missing, 'month', ordered=True)

    print("\n Missing values by year")
    _print_missing_by(df, missing, 'year', suffix=" \n")

$$\large \text{Scatter plot of time intervals that were recorded as missing times} $$

In [None]:
def mt_fig(missing_intervals):
    """Print summary statistics of the missing-time intervals and plot them.

    Parameters
    ----------
    missing_intervals : list
        Rows of three values that are loaded into the columns ``seconds``,
        ``start_time`` and ``end_time``.
        NOTE(review): presumably the list produced by ``add_missing_times``
        - confirm against that helper.

    Prints the number, minimum and maximum of intervals whose z-score on
    ``seconds`` exceeds 2 (if any), then the mean, median and mode, and shows
    a plotly scatter of ``seconds`` against ``start_time``.

    Returns
    -------
    None
    """
    interval_df = pd.DataFrame(missing_intervals,
                               columns=['seconds','start_time','end_time'])

    # Without any interval the statistics are undefined and mode()[0] would
    # raise, so report the situation and skip the figure.
    if interval_df.empty:
        print("\tNo missing time intervals were recorded.")
        return

    arr = interval_df['seconds']
    z_scores = np.abs((arr - arr.mean()) / arr.std())
    threshold = 2
    flagged = arr[z_scores > threshold]
    if len(flagged):
        print(f"\tNumber of outliers {len(flagged)}, min: {flagged.min()}, max: {flagged.max()} \n")
    print(f'\tThe average missing time: {arr.mean()}')
    print(f'\tThe median missing time: {arr.median()}')
    print(f'\tThe mode missing time: {arr.mode()[0]}')
    # NOTE(review): the message says "without outliers", but the figure below
    # plots every interval, flagged ones included.
    print("Scatter plot of the system outage (in seconds) over time without outliers:")
    px.scatter(interval_df, x='start_time',y='seconds',
               hover_data=['start_time','end_time'],
               title='Intervals in Time of Missing Observations').show()

$$\large \text{Scatter plot of all variables over time} $$

In [None]:
def col_fig(df):
    """Show one plotly scatter per column of ``df``, plotted against the index.

    The ``day``, ``month`` and ``year`` columns are not plotted; ``df`` must
    contain all three. Each figure is titled with the column name.
    """
    plotted = df.drop(columns=['day','month','year'])
    for name in plotted.columns:
        label = f'{name}'
        figure = px.scatter(df, x=df.index, y=label, title=label)
        figure.show()

$$\large \text{Scatter plot of timestamps} $$

In [None]:
def timestamps_fig(df):
    """Show a scatter of each row's time of day, in seconds since midnight.

    The seconds are derived from ``df.index``, whose entries must support
    ``replace(hour=..., minute=..., second=..., microsecond=...)`` and
    subtraction (e.g. pandas Timestamps). Rows containing any NaN are left
    out of the figure.

    The caller's frame is not modified: the ``Seconds`` column is added to a
    copy rather than written into ``df``.

    Returns
    -------
    None
    """
    seconds = [
        (time - time.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds()
        for time in df.index
    ]
    # assign() returns a new frame, so the input keeps its original columns.
    plot_df = df.assign(Seconds=seconds).dropna(axis=0)
    px.scatter(plot_df, y='Seconds').show()

$$\large \text{Correlation Matrix} $$

In [None]:
def corr_matrix(df):
    """Show a heatmap of pairwise Spearman correlations between the columns.

    The ``day``, ``month`` and ``year`` columns are dropped, then every row
    containing a NaN is dropped, and ``scipy.stats.spearmanr`` is evaluated
    for each ordered pair of the remaining columns.

    Returns
    -------
    None
    """
    # Import the submodule explicitly: a bare ``import scipy`` does not
    # guarantee that ``scipy.stats`` is loaded on every SciPy version.
    from scipy import stats

    features = df.drop(['day','month','year'],axis=1).dropna()

    rows = []
    for feat1 in features.columns:
        row = []
        for feat2 in features.columns:
            # Only the coefficient is displayed; the p-value is not needed.
            corr, _ = stats.spearmanr(features[feat1], features[feat2])
            row.append(corr)
        rows.append(row)

    matrix = pd.DataFrame(rows, index = features.columns, columns = features.columns)
    px.imshow(matrix,text_auto=True,title="Correlation Matrix").show()

$$\large \text{Summary of outliers by variable} $$

In [None]:
def outliers(df,pre):
    """Print, per column, the values whose absolute z-score exceeds 3.

    Parameters
    ----------
    df : pd.DataFrame
        Columns must support ``mean()`` and ``std()``.
    pre : bool
        When truthy, the ``day``, ``month`` and ``year`` columns are dropped
        before the analysis. ``summary()`` passes ``pre=True`` for the
        post-processed frame (which carries those columns) and ``pre=False``
        for the pre-processed one, so the flag also selects the wording of
        the "no outliers" message.

    For every column with at least one flagged value, prints the count and
    the smallest and largest flagged value. If no column has any, prints a
    single message naming the processing stage.

    Returns
    -------
    None
    """
    stage = "post-processing" if pre else "pre-processing"
    if pre:
        df = df.drop(['day','month','year'],axis=1)

    threshold = 3
    found_any = False
    for col in df.columns:
        arr = df[col]
        z_scores = np.abs((arr - arr.mean()) / arr.std())
        # NaN z-scores compare False, so NaN values are never flagged.
        flagged = arr[z_scores > threshold]
        if len(flagged):
            found_any = True
            print(f"{col} number of outliers {len(flagged)}, min: {flagged.min()}, max: {flagged.max()} \n")

    if not found_any:
        print(f"There were no outliers found {stage}. \n")

$$\large \text{Figure of NaN Gaps} $$

In [None]:
def nan_gaps(df):
    """Find runs of consecutive NaN values per column, report and plot them.

    Every column of ``df`` is scanned from top to bottom. A run starts at a
    NaN cell and extends while the cell is NaN, its index label differs from
    the last index label, and its index ``.day`` equals that of the run's
    first row. Each run is recorded as
    ``[duration in seconds, start label, end label, column name]`` where the
    end label is the first row after the run. A NaN in the very last row is
    therefore never counted.

    Parameters
    ----------
    df : pd.DataFrame
        Index entries must provide ``.day`` and support subtraction
        (e.g. a DatetimeIndex); cells must be accepted by ``np.isnan``.

    Prints the table of runs, the runs whose duration z-score exceeds 2
    (count, min, max), the mean, median and mode of the durations, and shows
    a plotly scatter of the durations.

    Returns
    -------
    None
    """
    timestamps = df.index
    n_rows = len(timestamps)

    gaps = []
    for col_pos, col_name in enumerate(df.columns):
        row = 0
        while row < n_rows:
            run = 0
            while (np.isnan(df.iloc[row + run, col_pos])
                   and timestamps[row + run] != timestamps[-1]
                   and timestamps[row + run].day == timestamps[row].day):
                run += 1
            if not run:
                row += 1
                continue
            end = timestamps[row + run]
            dt = (end - timestamps[row]).total_seconds()
            gaps.append([dt, timestamps[row], end, col_name])
            # Jump past the rows already covered by this run instead of
            # remembering them in a list and testing membership on every row.
            row += run

    nan_df = pd.DataFrame(gaps, columns = ['Seconds', 'Start Time', 'End Time', 'Column'])

    print(nan_df)

    # Without any gap the statistics are undefined and mode()[0] would raise.
    if nan_df.empty:
        print("\n\tNo NaN gaps were found.")
        return

    arr = nan_df['Seconds']
    z_scores = np.abs((arr - arr.mean()) / arr.std())
    threshold = 2
    flagged = arr[z_scores > threshold]

    if len(flagged):
        print(f"\n\tNumber of outliers {len(flagged)}, min: {flagged.min()}, max: {flagged.max()}")
    print(f'\tThe average of the missing times: {arr.mean()}')
    print(f'\tThe median of the missing times: {arr.median()}')
    print(f'\tThe mode of the missing times: {arr.mode()[0]}')
    print("Scatter plot of the system outage (in seconds) over time without outliers:")
    px.scatter(nan_df, x = nan_df.index, y = 'Seconds', hover_data = ['Start Time','End Time', 'Column']
                ,title = 'Scatter plot of the NaN gaps (in seconds) over time:').show()

$$\large \text{Summary Function Calls} $$

In [None]:
def summary(path_list,file,update=False):
    """Clean the files in ``path_list`` and print/plot the full data summary.

    Runs ``df_cleaner`` and then, in order: the NaN summary, the outlier
    report for the pre-processed data, the outlier report for the
    post-processed data, the NaN-gap report, the missing-interval report,
    one scatter per variable and the correlation matrix.

    Parameters
    ----------
    path_list : iterable of file paths, forwarded to ``df_cleaner``.
    file : str
        Dataset type, forwarded to ``df_cleaner``.
    update : bool, optional
        Not used by this function; kept for interface compatibility.

    Returns
    -------
    None
    """
    df, missing_intervals, outlier_output = df_cleaner(path_list,file)

    # printing Summary of NaN Values
    print("\nSummary of NaN Values")
    summarize_nan(df)

    # printing outliers pre-processing
    # (pre=False: this frame is analysed as-is, no day/month/year columns are dropped)
    print("\n\nSummary of outliers (if any) for pre-processed data:\n")
    outliers(outlier_output,pre = False)

    # printing outliers post-processing
    # (pre=True: the day/month/year columns are dropped before the analysis)
    print("\n\nSummary of outliers (if any) for post-processed data:\n")
    outliers(df,pre = True)

    # Figure of nan gaps
    print('\n\nScatter plot of the NaN gaps (in seconds) over time:\n')
    # nan_gaps prints its own report and returns None, so its result is not
    # printed (doing so emitted a stray "None").
    nan_gaps(df)

    # Summary of Missing Intervals
    print('\n\nSummary of Missing Intervals:\n')
    mt_fig(missing_intervals)

    # figures of all variables plotted over time
    print("\n\nFigure's of all variables plotted over time:\n")
    col_fig(df)

    # figure of correlation matrix
    print("\n\nCorrelation Matrix:\n")
    corr_matrix(df)

$$\large \text{Main Function} $$

In [None]:
def main(year,month,file):
    """Validate the user input and run ``summary`` on the matching files.

    Parameters
    ----------
    year : str
        Four digits, or empty together with ``month`` to skip the filter.
    month : str
        Three letters (e.g. ``jul``), or empty together with ``year``.
    file : str
        Exactly one of ``'Irradiance'``, ``'Deger'`` or ``'Fixed'``.

    When both ``year`` and ``month`` are empty, every path returned by
    ``get_file_paths(file)`` is summarised; otherwise the paths are first
    narrowed with ``path_function(year, month, path_list, file)``.

    Raises
    ------
    ValueError
        If ``file``, ``year`` or ``month`` does not have the expected form.
    """
    # Exact membership: the input must not be interpreted as a regex, and a
    # partial or empty name must not pass (later code compares ``file`` with
    # ``'Irradiance'`` by equality).
    if file not in ('Irradiance','Deger','Fixed'):
        raise ValueError("Incorret Input: File")

    if not year and not month:
        path_list = get_file_paths(file)
        summary(path_list,file)
        return

    # fullmatch: the whole string must have the advertised format, not merely
    # contain it somewhere.
    if not year or not re.fullmatch(r'\d{4}',year):
        raise ValueError("Incorret Input: Year")
    if not month or not re.fullmatch(r'[A-Za-z]{3}',month):
        raise ValueError("Incorret Input: Month")

    path_list = get_file_paths(file)
    path_list = path_function(year,month,path_list,file)
    summary(path_list,file)

# Notebook entry point: prompt for the year, month and file type (the keyword
# arguments are evaluated left to right, so the prompts appear in that order)
# and run the summary. Leaving both year and month blank makes main() skip the
# year/month filtering step.
main(year = input("Year (format: YYYY): "), month = input("Month (format: jul): "),
     file = input("File (opt: Irradiance/Deger/Fixed): "))