# 0. Libraries Import

In [1]:
# ========================================================
# = Libraries import
# ========================================================
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

In [2]:
# One font size shared by every text element listed below.
FONT_SIZE = 16
# rcParams-style overrides, every key pinned to FONT_SIZE.
# NOTE(review): this dict is only built here; no visible cell applies it — confirm whether it is used elsewhere.
rc = dict.fromkeys(
    (
        'font.size', 'axes.labelsize', 'legend.fontsize',
        'axes.titlesize', 'xtick.labelsize', 'ytick.labelsize',
    ),
    FONT_SIZE,
)
# Bold weight for the default font.
plt.rc('font', weight='bold')

# 1 Read Data

## 1.1. Read Meta Data Table

In [None]:
# Raw metadata tables: one for sites, one for monitors.
# df_monitors provides the 'manufacturerApi', 'source' and 'pvSizeWatt' columns
# that later cells of this notebook read.
df_sites = pd.read_csv('../input_data/SITE_nodeType_20230630.csv')
df_monitors = pd.read_csv('../input_data/MNTR_ddb_20230630.csv')
# df_monitors.head()

## 1.2. Find SMA Monitors

In [6]:
# Keep the 'source' entries of the monitors whose manufacturer API is SMA
sma_rows = df_monitors['manufacturerApi'] == 'SMA'
df_SMA = df_monitors.loc[sma_rows, 'source']
# A source is split on '|'; the second piece is kept as the monitor ID
source_pieces = df_SMA.str.split('|')
SMA_monitor_list = source_pieces.str[1].values
print(len(SMA_monitor_list))

775


In [None]:
# Preprocessed measurements for the SMA monitors; the first CSV column becomes the index.
# Later cells split the column labels on ':' — the first piece is used as the metric name
# and the last piece is matched against sources such as 'MNTR|5541098'.
df_sma = pd.read_csv('../preprocessed_data/monitors_DCdata/df_SMA_2.csv', index_col=0)
# df_sma.head()

# 2 Parameter Setting

In [11]:
# When working with MONITORS, we determine a time period:
time_start = '2022-05-01'
time_end = '2023-05-01'

# Threshold values for fault detection.
# Each one is compared against the absolute difference between two channels
# after every channel has been divided by its own maximum.
Uthreshold_diff = 0.2  # voltage channels
Ithreshold_diff = 0.2  # current channels
Pthreshold_diff = 0.2  # power channels
# Minimum number of consecutive above-threshold samples for a difference to count as significant.
time_threshold = 3

# Reference grid of 5-minute timestamps from time_start to time_end (both included),
# intended for checking the missing data.
time_index5min = pd.date_range(start=pd.to_datetime(time_start), end=pd.to_datetime(time_end), freq='5min').tz_localize(None)
time_index5min

DatetimeIndex(['2022-05-01 00:00:00', '2022-05-01 00:05:00',
               '2022-05-01 00:10:00', '2022-05-01 00:15:00',
               '2022-05-01 00:20:00', '2022-05-01 00:25:00',
               '2022-05-01 00:30:00', '2022-05-01 00:35:00',
               '2022-05-01 00:40:00', '2022-05-01 00:45:00',
               ...
               '2023-04-30 23:15:00', '2023-04-30 23:20:00',
               '2023-04-30 23:25:00', '2023-04-30 23:30:00',
               '2023-04-30 23:35:00', '2023-04-30 23:40:00',
               '2023-04-30 23:45:00', '2023-04-30 23:50:00',
               '2023-04-30 23:55:00', '2023-05-01 00:00:00'],
              dtype='datetime64[ns]', length=105121, freq='5T')

In [12]:
# Distinct monitor sources found in the downloaded table:
# the text after the last ':' of every column label.
sma_read_list = list({column_label.split(':')[-1] for column_label in df_sma.columns})
# Spot-check that one known monitor is part of the download
if any(source == 'MNTR|5541098' for source in sma_read_list):
    print('yes')

yes


# 3 Single Monitor: Identify significant differences between AC phases and between DC MPPTs

In [13]:
# ========================================================
# = Select elements in a array or list with a string
# ========================================================
def select_elements_with_string(iterable, search_string):
    """
    Select elements from the iterable that include the search_string.

    Args:
        iterable (list or numpy.ndarray): The strings to filter.
        search_string (str): The substring a kept element must contain.

    Returns:
        list or numpy.ndarray: Same container type as the input, holding only
        the matching elements in their original order.

    Raises:
        ValueError: If the iterable is neither a list nor a NumPy array.
    """
    if isinstance(iterable, list):
        return [item for item in iterable if search_string in item]
    if isinstance(iterable, np.ndarray):
        # Force a boolean mask: for an empty input, np.array([]) would be an
        # empty float array, which NumPy refuses to use as an index.
        mask = np.array([search_string in item for item in iterable], dtype=bool)
        return iterable[mask]
    raise ValueError("Input iterable type not supported")


In [None]:
# ========================================================
# = Read raw data & preprocess data
# ========================================================

def read_preprocess_monitor(df, MID, pvsize):
    '''
    Extract the metrics of one monitor from the wide table and clean them.

    df: wide table; its index must be named 'time' and its column labels are
        split on ':' (first piece = metric name, last piece = monitor source)
    MID: monitor to extract, either the bare ID ('6905111') or the full
         source ('MNTR|6905111')
    pvsize: PV size; a row is treated as an outlier when any power metric
            (name containing '.W') exceeds 1.2 * pvsize

    return: (df_monitor, df_pre, metrics_name_list) where
            df_monitor is the untouched monitor data plus 'minute', 'hour' and 'date' columns,
            df_pre is the cleaned copy (outlier rows blanked, trimmed to the first/last
            valid 'Gen.W' sample, gaps forward-filled),
            metrics_name_list is the list of metric column names.
            None is returned when the monitor has no power metric, no 'Gen.W'
            column, or no valid 'Gen.W' sample.
    '''
    ## == read data for the single monitor
    # Match the source part exactly. A regex suffix match on the ID would also
    # pick up monitors whose ID merely ends with the same digits, and a '|'
    # inside MID would be read as a regex alternation.
    wanted = str(MID)
    sources = [str(label).rsplit(':', 1)[-1] for label in df.columns]
    belongs = np.array(
        [src == wanted or src.endswith('|' + wanted) for src in sources],
        dtype=bool,
    )
    # Work on a copy so the renaming below never touches the caller's table
    df_monitor = df.loc[:, belongs].copy()
    df_monitor.columns = [str(label).split(':')[0] for label in df_monitor.columns]
    df_monitor = df_monitor.reset_index()
    df_monitor['time'] = pd.to_datetime(df_monitor['time'].values)

    ## some columns related with the time
    metrics_name_list = df_monitor.columns.to_list()[1:]
    power_metrics_list = [name for name in metrics_name_list if '.W' in name]
    if not power_metrics_list:
        return None
    df_monitor['minute'] = df_monitor['time'].dt.minute
    df_monitor['hour'] = df_monitor['time'].dt.hour
    df_monitor['date'] = df_monitor['time'].dt.date
    df_monitor['date'] = df_monitor['date'].astype(pd.StringDtype())

    ## == processing data ====##
    # ========================================================
    # = Processing outliers based on the PV size
    # ========================================================
    df_pre = df_monitor.copy()
    # A row is discarded (all metrics set to NaN) as soon as one power metric is too high
    too_high = (df_pre[power_metrics_list] > 1.2 * pvsize).any(axis=1)
    df_pre.loc[too_high, metrics_name_list] = np.nan

    # ========================================================
    # = Filling up the missing data
    # ========================================================
    if 'Gen.W' not in df_pre.columns:
        return None
    first_valid_idx = df_pre['Gen.W'].first_valid_index()
    last_valid_idx = df_pre['Gen.W'].last_valid_index()
    if first_valid_idx is None:
        # 'Gen.W' holds no usable sample at all, so there is nothing to analyse
        return None
    # Label-based slice (inclusive on both ends), then forward-fill the gaps
    df_pre = df_pre.loc[first_valid_idx:last_valid_idx].ffill()
    return df_monitor, df_pre, metrics_name_list

## 3.1. Find Significant difference

In [15]:
import itertools

In [16]:
# ========================================================
# = only consecutive significant points should be picked up
# ========================================================
def consecutive_comparison(df, threshold_value, diff_name, thred_time):
    '''
    Flag the samples that belong to a sufficiently long run of above-threshold values.

    df: table holding the column `diff_name`; it is modified in place (one column added)
    threshold_value: threshold for significant identification, different values for
                     different metrics (power, voltage, and current)
    diff_name: name of the difference column to test
    thred_time: minimum number of consecutive above-threshold rows for a run to count

    return: the same `df`, with a new boolean column `diff_name + '_significant'`
    '''
    # The intermediate results stay in local Series: writing them into `df` as
    # scratch columns ('period', 'duration', ...) and dropping them afterwards
    # would silently delete same-named columns the caller already has.
    exceeds = df[diff_name] > threshold_value
    # A new run starts wherever the flag changes (the first row always starts one)
    run_id = exceeds.astype(int).diff().ne(0).cumsum()
    # Number of flagged rows in each run (0 for runs of below-threshold rows)
    run_length = exceeds.groupby(run_id).transform('sum')
    df[diff_name + '_significant'] = exceeds & (run_length >= thred_time)
    return df

In [None]:
def at_least_two_elements_in_list(my_list, other_list):
    """Return True when two or more entries of my_list are contained in other_list.

    Repeated entries of my_list are counted once per occurrence.
    """
    shared_entries = [entry for entry in my_list if entry in other_list]
    return len(shared_entries) >= 2

In [31]:
# ========================================================
# = Comparison between different phases and MPPTs
# ========================================================
def combination_compare(df, Uthreshold_value, Ithreshold_value, Pthreshold_value, DC_AC, thred_time, metrics_name_list):
    '''
    Compare every pair of channels on one side of the inverter, for power, voltage and current.

    df: table holding the metric columns; new columns are added to it
    Uthreshold_value, Ithreshold_value, Pthreshold_value: thresholds for voltage, current and power
    DC_AC: "AC" compares the 'Inv.AC.<metric>.Ph' columns; any other value compares
           the 'Inv.DC.<metric>.MPTT' columns
    thred_time: passed on to consecutive_comparison
    metrics_name_list: metric column names to search in

    For each metric with at least two channels, every pair gets a '<group>(i vs j)_diff'
    column (absolute difference of the channels, each divided by its own maximum) and a
    matching '..._significant' column; '<group>_significant' is True where any pair is.

    return: the table with the added columns
    '''
    # (metric letter, unit letter, threshold), in the order the columns are created
    metric_specs = (
        ('P', 'W', Pthreshold_value),
        ('U', 'V', Uthreshold_value),
        ('I', 'A', Ithreshold_value),
    )
    # Search pattern in the column names and label used for the created columns
    if DC_AC == 'AC':
        search_template, label_template = 'Inv.AC.{}.Ph', 'AC.{}{}.Ph'
    else:
        search_template, label_template = 'Inv.DC.{}.MPTT', 'DC.{}{}.MPPT'

    for metric, metric_per, threshold_value in metric_specs:
        channels = select_elements_with_string(iterable=metrics_name_list,
                                               search_string=search_template.format(metric))
        compare_metric = label_template.format(metric, metric_per)
        if len(channels) < 2:
            # nothing to compare against
            continue
        # Every channel is scaled by its own maximum before the comparison
        peaks = [df[channel].max() for channel in channels]
        pair_flag_columns = []
        for (pos_a, channel_a), (pos_b, channel_b) in itertools.combinations(enumerate(channels), 2):
            diff_name = '{}({} vs {})_diff'.format(compare_metric, pos_a+1, pos_b+1)
            pair_flag_columns.append(diff_name+'_significant')
            scaled_a = df[channel_a]/peaks[pos_a]
            scaled_b = df[channel_b]/peaks[pos_b]
            df[diff_name] = (scaled_a - scaled_b).abs()
            df = consecutive_comparison(df=df, threshold_value=threshold_value,
                                        diff_name=diff_name, thred_time=thred_time)
        # A sample is significant for the group when any pair flags it
        df[compare_metric+'_significant'] = df[pair_flag_columns].any(axis=1)
    return df

In [32]:
# ========================================================
# = Comparison between different phases and MPPTs
# ========================================================
def compare_diff(df, Uthreshold_value, Ithreshold_value, Pthreshold_value, thred_time, metrics_name_list):
    """Run combination_compare for the 'AC' side and then the 'DC' side, and return the resulting table."""
    # Arguments that are identical for both sides
    shared_arguments = {
        'Uthreshold_value': Uthreshold_value,
        'Ithreshold_value': Ithreshold_value,
        'Pthreshold_value': Pthreshold_value,
        'thred_time': thred_time,
        'metrics_name_list': metrics_name_list,
    }
    compared = df
    for side in ('AC', 'DC'):
        compared = combination_compare(df=compared, DC_AC=side, **shared_arguments)
    return compared

In [19]:
def _significance_column(metric_name):
    """Return the name of the '*_significant' column used to colour a metric, or None."""
    if 'AC.' in metric_name:
        if 'P.' in metric_name or 'Gen.W' in metric_name:
            return 'AC.PW.Ph_significant'
        if 'U.' in metric_name:
            return 'AC.UV.Ph_significant'
        return 'AC.IA.Ph_significant'
    if 'DC' in metric_name:
        if 'P.' in metric_name:
            return 'DC.PW.MPPT_significant'
        if 'U.' in metric_name:
            return 'DC.UV.MPPT_significant'
        if 'I.' in metric_name:
            return 'DC.IA.MPPT_significant'
    return None


def plot_results(metrics_name_list, df_plot, save_path, save_name):
    """
    Draw one subplot per metric (stacked vertically) and save the figure.

    Args:
        metrics_name_list: names of the columns of df_plot to draw, one subplot each.
        df_plot: table with a 'time_str' column (x axis), the metric columns and,
            optionally, the '*_significant' columns used to colour the points.
        save_path: directory the image is written to.
        save_name: file name of the image.

    Returns:
        None. Nothing is drawn or saved when metrics_name_list is empty.
    """
    n_rows = len(metrics_name_list)
    if n_rows == 0:
        # plt.subplots cannot create a figure with zero rows
        return
    # squeeze=False keeps `axes` two-dimensional, so indexing also works for a single metric
    fig, axes = plt.subplots(nrows=n_rows, figsize=(26, 4*n_rows), squeeze=False)
    for i, icol in enumerate(metrics_name_list):
        ax = axes[i, 0]
        compare_metric = _significance_column(icol)
        print(icol, compare_metric)
        sns.lineplot(data=df_plot, x='time_str', y=icol, ax=ax)
        # The flag column only exists when at least two channels could be compared
        if compare_metric is not None and compare_metric in df_plot.columns:
            sns.scatterplot(data=df_plot, x='time_str', y=icol, hue=compare_metric, ax=ax)
        # keep every 288th tick only
        ax.set_xticks(ax.get_xticks()[::288])

    fig.savefig(save_path+'/{}'.format(save_name))
    # close this figure explicitly so it is released even if another one is current
    plt.close(fig)

In [20]:
def pltsignicant(metrics_name_list, df_sig, df_compare, save_path, MID):
    """
    Plot the days on which a monitor shows significant differences.

    Args:
        metrics_name_list: metric columns to draw (passed on to plot_results).
        df_sig: rows with at least one significant flag; needs a 'date' column and
            the '*_significant' columns.
        df_compare: full comparison table with 'time' and 'date' columns.
        save_path: root output directory; a sub-directory named after the groups
            that show significance is created inside it.
        MID: monitor ID, used for the image file names.

    At most the first 20 significant days are drawn in '<MID>.png'. When there are
    more than 20, a second image '<MID>_power_sig.png' shows up to 10 days with a
    significant DC power difference, if there are any.
    """
    cols_list_all = ['AC.PW.Ph_significant', 'AC.UV.Ph_significant', 'AC.IA.Ph_significant',
                     'DC.PW.MPPT_significant', 'DC.UV.MPPT_significant', 'DC.IA.MPPT_significant']
    # Group columns that are present and contain at least one True
    active_cols = [col for col in cols_list_all if col in df_sig.columns and df_sig[col].any()]

    # Directory name: one piece per marker found among the active columns, in this order.
    # NOTE(review): the 'DC' piece is appended without a separator (e.g. 'AC_PowerDC_Power');
    # kept as is so existing output folders keep their names.
    name_pieces = (
        ('AC', 'AC'), ('PW.Ph', '_Power'), ('UV.Ph', '_Voltage'), ('IA.Ph', '_Current'),
        ('DC', 'DC'), ('PW.MPPT', '_Power'), ('UV.MPPT', '_Voltage'), ('IA.MPPT', '_Current'),
    )
    file_name = ''.join(piece for marker, piece in name_pieces
                        if any(marker in column for column in active_cols))
    save_file = save_path + '/{}'.format(file_name)
    os.makedirs(save_file, exist_ok=True)

    date_significant_list = df_sig['date'].unique()
    print('significant days:', len(date_significant_list))
    df_plot_sig = df_compare[df_compare['date'].isin(date_significant_list)].copy()
    df_plot_sig['time_str'] = df_plot_sig['time'].dt.strftime('%Y-%m-%d %H:%M:%S')

    if len(date_significant_list) <= 20:
        plot_results(metrics_name_list, df_plot=df_plot_sig, save_path=save_file, save_name='{}.png'.format(MID))
        return

    first_days = df_plot_sig[df_plot_sig['date'].isin(date_significant_list[0:20])]
    plot_results(metrics_name_list, df_plot=first_days, save_path=save_file, save_name='{}.png'.format(MID))
    print('more than 20 days are experienced significant difference')

    # The DC power flag is missing when fewer than two MPPT power channels exist
    if 'DC.PW.MPPT_significant' not in df_sig.columns:
        return
    date_sigpower_list = df_sig.loc[df_sig['DC.PW.MPPT_significant'] == True, 'date'].unique()
    # slicing already copes with fewer than 10 days
    df_plot_sigP = df_compare[df_compare['date'].isin(date_sigpower_list[0:10])].copy()
    if len(df_plot_sigP) != 0:
        # build the labels from the frame that is actually plotted
        df_plot_sigP['time_str'] = df_plot_sigP['time'].dt.strftime('%Y-%m-%d %H:%M:%S')
        plot_results(metrics_name_list, df_plot=df_plot_sigP, save_path=save_file, save_name='{}_power_sig.png'.format(MID))
    
        

## 3.2. Loop for all monitors

In [33]:
# Root directory for all the plots of this section
save_path = 'plots/SMA_significant_diff/'
os.makedirs(save_path, exist_ok=True)

for i, MIDname in enumerate(sma_read_list):
    MID = MIDname.split('|')[1]
    print('='*50)
    pv_size_values = df_monitors.loc[df_monitors['source']== MIDname, 'pvSizeWatt'].values
    if len(pv_size_values) == 0:
        # No metadata row means no PV size for the outlier limit: skip instead of failing on [0]
        print('{}: MID: {} has no entry in df_monitors, skipped'.format(i, MIDname))
        continue
    pv_size = pv_size_values[0]
    print('{}: MID: {}, pv_size: {}'.format(i, MIDname, pv_size))
    # read & preprocess data
    df_read_preprocess = read_preprocess_monitor(df=df_sma, MID=MID, pvsize=pv_size)
    if df_read_preprocess is None:
        continue
    df_raw, df_pre, metrics_name_list = df_read_preprocess
    print('metrics_name_list:', metrics_name_list)
    df_compare = df_pre.copy()
    df_compare = compare_diff(df=df_compare, Uthreshold_value=Uthreshold_diff, Ithreshold_value=Ithreshold_diff,
                              Pthreshold_value=Pthreshold_diff, thred_time=time_threshold, metrics_name_list=metrics_name_list)

    # find significant dates
    # copy() so the columns added below go into an independent table
    filter_df = df_compare.filter(regex='_significant').copy()
    comapre_columns_list = filter_df.columns.to_list()
    filter_df['time'] = df_compare['time'].values
    filter_df['date'] = filter_df['time'].dt.date.astype(str)
    # keep the rows where at least one flag is raised
    df_sig = filter_df[filter_df[comapre_columns_list].any(axis=1)]
    # df_sig
    if len(df_sig) != 0:
        pltsignicant(metrics_name_list = metrics_name_list, df_sig=df_sig, df_compare=df_compare,
                     save_path=save_path, MID=MID)

0: MID: MNTR|895040, pv_size: 27770
Gen.W
Inv.AC.P.Ph1.W
Inv.AC.P.Ph2.W
Inv.AC.P.Ph3.W
Inv.DC.P.MPTT1.W
Inv.DC.P.MPTT2.W
metrics_name_list: ['Gen.W', 'Inv.AC.P.Ph1.W', 'Inv.AC.P.Ph2.W', 'Inv.AC.P.Ph3.W', 'Inv.AC.U.Ph1.V', 'Inv.AC.U.Ph2.V', 'Inv.AC.U.Ph3.V', 'Inv.AC.I.Ph1.A', 'Inv.AC.I.Ph2.A', 'Inv.AC.I.Ph3.A', 'Inv.DC.P.MPTT1.W', 'Inv.DC.P.MPTT2.W', 'Inv.DC.U.MPTT1.V', 'Inv.DC.U.MPTT2.V', 'Inv.DC.I.MPTT1.A', 'Inv.DC.I.MPTT2.A', 'Inv.DC.R.Ohm']
significant days: 5
Gen.W None
Inv.AC.P.Ph1.W AC.PW.Ph_significant
Inv.AC.P.Ph2.W AC.PW.Ph_significant
Inv.AC.P.Ph3.W AC.PW.Ph_significant
Inv.AC.U.Ph1.V AC.UV.Ph_significant
Inv.AC.U.Ph2.V AC.UV.Ph_significant
Inv.AC.U.Ph3.V AC.UV.Ph_significant
Inv.AC.I.Ph1.A AC.IA.Ph_significant
Inv.AC.I.Ph2.A AC.IA.Ph_significant
Inv.AC.I.Ph3.A AC.IA.Ph_significant
Inv.DC.P.MPTT1.W DC.PW.MPPT_significant
Inv.DC.P.MPTT2.W DC.PW.MPPT_significant
Inv.DC.U.MPTT1.V DC.UV.MPPT_significant
Inv.DC.U.MPTT2.V DC.UV.MPPT_significant
Inv.DC.I.MPTT1.A DC.IA.MPPT_significa

: 