In [1]:
import pandas as pd
import csv
import os
import numpy as np
import pytz
import datetime
from xlsxwriter.workbook import Workbook
from xlsxwriter import Workbook
import time
from scipy import integrate, stats
import matplotlib.pyplot as plt
from functools import reduce
from sklearn import preprocessing
import re

# %matplotlib inline

In [2]:
Working_dir = os.getcwd()
Data_dir = os.path.join(Working_dir, 'Raw data')
Data_prep_dir = os.path.join(Working_dir, 'Preprocessed data')
Output_dir = os.path.join(Working_dir, 'Analysis')
Plots_dir = os.path.join(Output_dir, 'Plots')

In [3]:
nodes_file_names = ['May2018_idle.csv', 'June2018_idle.csv', 'July2018_idle.csv', \
                    'September2018.csv', 'October2018.csv', 'November2018.csv', \
                    'December2018.csv' , 'January2019.csv']
nodes_df_names = [item.split('.')[0] for item in nodes_file_names]
nodes_stats_ = 'nodes_stats_'
node_df_may = pd.read_csv( os.path.join( Output_dir, nodes_stats_ + 'May2018_idle.csv'), delimiter=";", header=0, index_col=0 )
node_names = node_df_may.columns.values
summary_index = ['total_dcenergy', 'sys_energy', \
                'cpu_energy', 'mem_energy', 'other_energy', \
                'cpu_en_percent', 'mem_en_percent', \
                'other_en_percent', 'sys_util', 'mem_util', 'cpu_util', \
                'other_util','sys_power_max', 'cpu_power_max',\
                'mem_power_max', 'dcenergy_error_percent', \
                'present_in_months', \
                'avg_exh_temp', 'avg_inlet_temp', \
                'avg_cpu1_temp', 'avg_cpu2_temp', \
                'idle_time_sec', 'working_time_sec', \
                'idle_en_integral']

In [4]:
nodes_df_dict = {}
for file_name, df_name in zip(nodes_file_names, nodes_df_names):
    nodes_df_dict[df_name] = pd.read_csv(os.path.join( Output_dir, nodes_stats_ + file_name), \
                                         delimiter=";", header=0, index_col=0)
    to_drop_ix = np.unique(np.append(\
                                     np.where(nodes_df_dict[df_name].loc['dcenergy_error_percent', :].isna())[0], \
                                     np.where(nodes_df_dict[df_name].loc['dcenergy_error_percent', :] > 5.)[0]))
    to_drop = nodes_df_dict[df_name].iloc[:,to_drop_ix].columns.values
    nodes_df_dict[df_name].drop(columns=to_drop, inplace=True)

In [5]:
nodes_no_drop_df_dict = {}
for file_name, df_name in zip(nodes_file_names, nodes_df_names):
    nodes_no_drop_df_dict[df_name] = pd.read_csv(os.path.join( Output_dir, nodes_stats_ + file_name), \
                                         delimiter=";", header=0, index_col=0)

In [6]:
data_prep_df = {}
for file_name, df_name in zip(nodes_file_names, nodes_df_names):
    # Read preprocessed data
    data_prep_df[df_name] = pd.read_csv( os.path.join( Data_prep_dir, file_name), delimiter=";", header=0 )

## Preprocessing - all the months

http://benalexkeen.com/working-with-timezones-in-python/

In [16]:
# winter_dt = datetime.datetime(2017, 1, 1, 14, 0)
# print(winter_dt.strftime("Winter Local Time: %H:%M"))

# summer_dt = datetime.datetime(2017, 7, 1, 14, 0)
# print(summer_dt.strftime("Summer Local Time: %H:%M"))

# local_tz = pytz.timezone('Europe/Berlin')
# target_tz = pytz.timezone('UTC')

# winter_dt = local_tz.localize(winter_dt)
# summer_dt = local_tz.localize(summer_dt)

# winter_dt = target_tz.normalize(winter_dt)
# summer_dt = target_tz.normalize(summer_dt)

# print(winter_dt.strftime("Winter UTC Time: %H:%M"))
# print(summer_dt.strftime("Summer UTC Time: %H:%M"))

Winter Local Time: 14:00
Summer Local Time: 14:00
Winter UTC Time: 13:00
Summer UTC Time: 12:00


In [17]:
def fmt_converter(x):
    fmt_summer = '%a %d %b %H:%M:%S CEST %Y'
    fmt_winter = '%a %d %b %H:%M:%S CET %Y'
    
    datetime_result = datetime.datetime.now()

    if 'CEST' in x:
        datetime_result = datetime.datetime.strptime(x, fmt_summer)
    else: 
        datetime_result = datetime.datetime.strptime(x, fmt_winter)
    return datetime_result

def utc_converter(x):
    local_tz = pytz.timezone('Europe/Rome')
    target_tz = pytz.timezone('UTC')
    datetime_init = fmt_converter(x)
    datetime_localized = local_tz.localize(datetime_init)
    datetime_result = target_tz.normalize(datetime_localized)
    return datetime_result

def unix_converter(x):
    return utc_converter(x).timestamp()

In [19]:
# for file_name in nodes_file_names:
#     node_df = pd.read_csv( os.path.join( Data_dir, file_name), delimiter=";", header=0 )
#     node_df.dcenergy = node_df.dcenergy.apply(lambda x: float(x.replace(",", ".")) )
#     node_df.loc[:,'timestamp_py'] = node_df.tempo.apply(fmt_converter)
#     node_df.loc[:,'unix_timestamp'] = node_df.tempo.apply(unix_converter)
#     node_df.iloc[:,3:-1] = node_df.iloc[:, 3:-1].\
#         apply(lambda y: y.\
#                 apply(lambda x: \
#                       float(x.replace(",", ".")) if type(x)==str else x ))
#     node_df.drop(columns='timestamp_measure', inplace=True)
#     # node_df.to_csv(path_or_buf=os.path.join( Data_prep_dir, file_name), sep=';', index=False)

------
## Energy analysis

In [74]:
# def forth_integral(node, time, power=['cpu', 'mem', 'sys']):
#     power_kw = node[power + '_power'].values/1000.0
#     return(np.dot(power_kw, time))

In [11]:
def forth_integral(y_values, time):
    return(np.dot(y_values, time))

In [12]:
def error_percentage(a, b):
    if a > 0.:
        return(abs(a-b) / a * 100.)
    else: 
        return(abs(a-b) / b * 100.)

In [13]:
def percentage(a, b):
    return((a / b) * 100.)

## Create dictionary of dataframes to summarize info for every node
One dictionary entry = one dataframe = one month $\ni  \{$ all nodes - columns $\}$

### 1. Create and fill total energy column 
- total_dcenergy
   
   energy difference between last and first measurement for every node, since energy meter shows incremental observations

In [26]:
nodes_df_dict_init = {}
for file_name, df_name in zip(nodes_file_names, nodes_df_names):
    # Read preprocessed data
    node_df = data_prep_df[df_name].copy()
    
    # Create df
    nodes_df_dict_init[df_name] = pd.DataFrame(columns = list(node_df.nodename.unique()), index = [], data=0)
    
    # Energy meter difference
    for node in nodes_df_dict_init[df_name].columns.values:
        nodes_df_dict_init[df_name].loc['total_dcenergy', node] = \
            node_df[node_df.nodename == node]['dcenergy'].values[-1] - \
            node_df[node_df.nodename == node]['dcenergy'].values[0]

### 2. Calculate integrals
- System, CPU and memory energy use as integral of corresponding power measurements
- Portion of CPU, memory and other components in energy use - percentage

In [47]:
for df_name in nodes_df_names[3:4]:
    node_df = data_prep_df[df_name].copy()
    
    for node in nodes_df_dict_init[df_name].columns.values:
        one_node_df = node_df[node_df.nodename == node].copy()
        
        # Read time difference between consecutive measurements and convert from seconds to hours
        time_difference = (one_node_df['unix_timestamp'][1:].values -\
            one_node_df['unix_timestamp'][:-1].values)/3600.0

        for util_type in ['cpu', 'mem', 'sys']:
            # Integrate system power over time
            nodes_df_dict_init[df_name].loc[util_type + '_energy', node] = \
                forth_integral(y_values=one_node_df.iloc[1:,:][util_type + '_power'].values/1000.0,\
                               time=time_difference)
            
#     # Integral of sys power vs dcenergy meter error in %
        nodes_df_dict_init[df_name].loc['dcenergy_error_percent', node] = \
            error_percentage(nodes_df_dict_init[df_name].loc['total_dcenergy',node], \
                             nodes_df_dict_init[df_name].loc['sys_energy', node])
            
    # Energy used for purposes other than CPU and memory
    nodes_df_dict_init[df_name].loc['other_energy', :] = nodes_df_dict_init[df_name].loc['sys_energy', :] - \
        nodes_df_dict_init[df_name].loc['cpu_energy', :] - nodes_df_dict_init[df_name].loc['mem_energy', :]
    print(df_name)

### 3. What portion of energy goes for CPU, memory, other components

In [69]:
# Energy portion used for CPU and memory in overall the system energy consumption
for df_name in nodes_df_names:
    for util_type in ['cpu', 'mem', 'other']:
        nodes_df_dict_init[df_name].loc[util_type + '_en_percent', :] = \
            percentage(nodes_df_dict_init[df_name].loc[util_type + '_energy',:], \
                       nodes_df_dict_init[df_name].loc['sys_energy', :])

### 4. Max observed power utilization of CPU, memory and system

In [19]:
for df_name in nodes_df_names:
    node_df = data_prep_df[df_name].copy()
    
    for node in nodes_df_dict_init[df_name].columns.values:
        one_node_df = node_df[node_df.nodename == node].copy()
        
        # Find max power utilization by CPU, mem and system
        for util_type in ['cpu', 'mem', 'sys']:
            max_val = one_node_df.loc[:,util_type + '_power'].max() 
            nodes_df_dict_init[df_name].loc[util_type + '_power_max', node] = max_val

### 5. Average exhaust, inlet, CPU 1, CPU 2 temperature

In [148]:
for df_name in nodes_df_names:
    node_df = data_prep_df[df_name]
    
    for node in nodes_df_dict_init[df_name].columns.values:
        one_node_df = node_df[node_df.nodename == node].copy()
        one_node_right_temp_df = one_node_df[(one_node_df.exh_temp>=one_node_df.amb_temp) &\
                                             (one_node_df.exh_temp>0.) & (one_node_df.amb_temp>0.) &\
                                             (one_node_df.cpu1_temp>0.) & (one_node_df.cpu2_temp>0.)].copy()
        one_node_right_temp_df.columns = one_node_right_temp_df.columns.str.replace('amb_temp', 'inlet_temp')
        for temp_type in ['exh', 'inlet', 'cpu1', 'cpu2']:
            nodes_df_dict_init[df_name].loc['avg_'+temp_type+'_temp', node] = one_node_right_temp_df[temp_type+'_temp'].mean()

### 6. Idle mode
$\leq 200$ kWh of power use is idle mode


If sys_power level is registered to be $p^1_{{minus\;200}}$ for timestamp $t^1$, then it will be assumed that this power level was observed from the previous timestamp $t^0$ until $t^1$
- duration
- energy integral

In [32]:
for df_name in nodes_df_names:
    node_df = data_prep_df[df_name]
    
    for node in nodes_df_dict_init[df_name].columns.values:
        one_node_df = node_df[node_df.nodename == node].copy()
        
        idle_sys_power_ix = np.trim_zeros(np.where(one_node_df.sys_power <= 200.)[0]).tolist()
        
        time_b = one_node_df.iloc[idle_sys_power_ix,:].unix_timestamp
        time_a = one_node_df.iloc[[x-1 for x in idle_sys_power_ix],:].unix_timestamp
        
        nodes_df_dict_init[df_name].loc['idle_time_sec',node] = \
            (time_b.values - time_a.values).sum()
            
        nodes_df_dict_init[df_name].loc['working_time_sec',node] = \
            one_node_df.unix_timestamp.max() - one_node_df.unix_timestamp.min()
            
        nodes_df_dict_init[df_name].loc['idle_en_integral',node] = \
            forth_integral( node=one_node_df.iloc[idle_sys_power_ix,:], \
                            time=( time_b.values - time_a.values )/3600., power='sys' )

---------------------------
### Branch. Bins of energy levels

In [7]:
bins = np.arange(0,500,50)

In [8]:
df_name = nodes_df_names[0]

node_df = data_prep_df[df_name].copy()

for node in nodes_df_dict[df_name].columns.values[:1]:
    one_node_df = node_df[node_df.nodename == node].copy()
    groups = one_node_df.groupby(pd.cut(one_node_df.sys_power, bins))
    
energy_level_index = groups.sys_power.sum().index

In [15]:
energy_levels_df_dict = {}

for df_name in nodes_df_names:
    node_df = data_prep_df[df_name].copy()
    energy_levels_df_dict[df_name] = pd.DataFrame(columns = node_names, \
                                                  index = energy_level_index, \
                                                  data=0)

    for node in nodes_df_dict[df_name].columns.values:
        one_node_df = node_df[node_df.nodename == node].copy()

        groups = one_node_df.groupby(pd.cut(one_node_df.sys_power, bins))
        energy_levels_df_dict[df_name].loc[:, node] = groups.sys_power.count().values

In [47]:
nodes_idle_energy_summary_df = pd.DataFrame(columns = node_names, index = energy_level_index, data=0)
for df_name in nodes_df_names:
    en_node_df = energy_levels_df_dict[df_name]
    nodes_idle_energy_summary_df = en_node_df.replace([np.inf, -np.inf, np.nan], 0).\
                                        add(nodes_idle_energy_summary_df).fillna(en_node_df)
all_entries = nodes_idle_energy_summary_df.sum().sum()
nodes_idle_energy_summary_df.loc[:, 'No_entries_in_total_for_all_months'] = nodes_idle_energy_summary_df.sum(axis=1)
nodes_idle_energy_summary_df.loc[:, 'Percent_entries_in_total_for_all_months'] = \
    nodes_idle_energy_summary_df.filter(regex=('^cresco6x')).sum(axis=1) / all_entries * 100.

nodes_idle_energy_summary_df.Percent_entries_in_total_for_all_months = \
    nodes_idle_energy_summary_df.Percent_entries_in_total_for_all_months.round(2)
nodes_idle_energy_summary_df.to_csv( path_or_buf=os.path.join( Output_dir, 'nodes_idle_energy_summary.csv' ), sep=';' )
nodes_idle_energy_summary_df.iloc[:, -2:]

Unnamed: 0_level_0,No_entries_in_total_for_all_months,Percent_entries_in_total_for_all_months
sys_power,Unnamed: 1_level_1,Unnamed: 2_level_1
"(0, 50]",783,0.02
"(50, 100]",0,0.0
"(100, 150]",1358059,32.3
"(150, 200]",1467779,34.91
"(200, 250]",211751,5.04
"(250, 300]",994356,23.65
"(300, 350]",138883,3.3
"(350, 400]",28591,0.68
"(400, 450]",4802,0.11


In [42]:
energy_by_month = pd.DataFrame(data=0, index=nodes_df_names, columns=['total_dcenergy_MWh', 'total_sys_energy_MWh'])

for df_name in nodes_df_names:
    energy_by_month.loc[df_name, 'total_dcenergy_MWh'] = nodes_df_dict[df_name].loc['total_dcenergy', :].sum()
    energy_by_month.loc[df_name, 'total_sys_energy_MWh'] = nodes_no_drop_df_dict[df_name].loc['sys_energy', :].sum()
    energy_by_month.loc[df_name, 'idle_energy_MWh'] = nodes_no_drop_df_dict[df_name].loc['idle_en_integral', :].sum()

energy_by_month = energy_by_month/1000.

energy_by_month.loc[:, 'idle_energy_percent_MWh'] = energy_by_month.idle_energy_MWh / \
                                                                energy_by_month.total_sys_energy_MWh * 100.
energy_by_month = energy_by_month.round(3)
energy_by_month.to_csv( path_or_buf=os.path.join( Output_dir, 'energy_summary_by_month.csv' ), sep=';')
energy_by_month

Unnamed: 0,total_dcenergy_MWh,total_sys_energy_MWh,idle_energy_MWh,idle_energy_percent_MWh
May2018_idle,14.693,15.761,14.542,92.266
June2018_idle,24.173,27.456,25.584,93.182
July2018_idle,21.624,24.611,21.279,86.46
September2018,22.969,25.461,17.674,69.415
October2018,27.048,32.49,12.853,39.559
November2018,26.54,31.381,13.149,41.9
December2018,33.023,37.567,7.881,20.98
January2019,19.393,21.489,9.386,43.675


#### Branch finished
------------

### 7. System air flow

In [77]:
for df_name in nodes_df_names:
    node_df = data_prep_df[df_name]
    
    for node in nodes_no_drop_df_dict[df_name].columns.values:
        one_node_df = node_df[node_df.nodename == node].copy()
        
        # Read time difference between consecutive measurements 
        # and convert from seconds to minutes
        time_difference = (one_node_df['unix_timestamp'][1:].values -\
            one_node_df['unix_timestamp'][:-1].values) / 60.0
        
        nodes_no_drop_df_dict[df_name].loc['sys_air_flow_CFM', node] = \
            forth_integral(y_values=one_node_df.iloc[1:,:]['sysairflow'].values, \
                           time=time_difference)


### -1. Save stats

In [79]:
# nodes_df_dict_init
for file_name, df_name in zip(nodes_file_names, nodes_df_names):
    nodes_no_drop_df_dict[df_name].to_csv( path_or_buf=os.path.join( Output_dir, nodes_stats_ + file_name), sep=';' )

-------
### All calculations for nodes stats in one loop

In [63]:
nodes_df_dict_init = {}
data_prep_df       = {}

for file_name, df_name in zip(nodes_file_names, nodes_df_names):
    
    data_prep_df[df_name]       = pd.read_csv( os.path.join( Data_prep_dir, file_name), delimiter=";", header=0 )
    node_df                     = data_prep_df[df_name].copy()
    nodes_df_dict_init[df_name] = pd.DataFrame(columns = list(node_df.nodename.unique()), \
                                               index = ['total_dcenergy', 'sys_energy', \
                                                        'cpu_energy', 'mem_energy', 'other_energy', \
                                                        'cpu_en_percent', 'mem_en_percent', \
                                                        'other_en_percent', 'sys_util', 'mem_util', 'cpu_util', \
                                                        'other_util','sys_power_max', 'cpu_power_max',\
                                                        'mem_power_max', 'dcenergy_error_percent'], 
                                               data = 0)
    
    for node in nodes_df_dict_init[df_name].columns.values:
        one_node_df = node_df[node_df.nodename == node].copy()
        
        # Time difference for power integral
        time_difference = (one_node_df['unix_timestamp'][1:].values -\
            one_node_df['unix_timestamp'][:-1].values)/3600.0

        for util_type in ['cpu', 'mem', 'sys']:
            # Integrate system power over time
            nodes_df_dict_init[df_name].loc[util_type + '_energy', node] = \
                forth_integral(y_values=one_node_df.iloc[1:,:][util_type + '_power'].values/1000.0,\
                               time=time_difference)
#                 forth_integral(node=one_node_df.iloc[1:,:], time=time_difference), power=util_type)


            # Find max power utilization by CPU, mem and system
            max_val = one_node_df.loc[:,util_type + '_power'].max() 
            if nodes_df_dict_init[df_name].loc[util_type + '_power_max', node] < max_val:
                nodes_df_dict_init[df_name].loc[util_type + '_power_max', node] = max_val
        
        # Energy meter increase
        nodes_df_dict_init[df_name].loc['total_dcenergy', node] = \
            node_df[node_df.nodename == node]['dcenergy'].values[-1] - \
            node_df[node_df.nodename == node]['dcenergy'].values[0]
    
        # Integral of sys power vs dcenergy meter error in %
        nodes_df_dict_init[df_name].loc['dcenergy_error_percent', node] = \
            error_percentage(nodes_df_dict_init[df_name].loc['total_dcenergy',node], \
                             nodes_df_dict_init[df_name].loc['sys_energy', node])
            
        # Average exhaust, inlet, CPU 1, CPU 2 temperature
        one_node_right_temp_df = one_node_df[(one_node_df.exh_temp>=one_node_df.amb_temp) &\
                                             (one_node_df.exh_temp>0.) & (one_node_df.amb_temp>0.) &\
                                             (one_node_df.cpu1_temp>0.) & (one_node_df.cpu2_temp>0.)].copy()
        one_node_right_temp_df.columns = one_node_right_temp_df.columns.str.replace('amb_temp', 'inlet_temp')
        for temp_type in ['exh', 'inlet', 'cpu1', 'cpu2']:
            nodes_df_dict_init[df_name].loc['avg_'+temp_type+'_temp', node] = one_node_right_temp_df[temp_type+'_temp'].mean()
            
        # Idle stats
        idle_sys_power_ix = np.trim_zeros(np.where(one_node_df.sys_power <= 200.)[0]).tolist()
        
        time_b = one_node_df.iloc[idle_sys_power_ix,:].unix_timestamp
        time_a = one_node_df.iloc[[x-1 for x in idle_sys_power_ix],:].unix_timestamp
        
        nodes_df_dict_init[df_name].loc['idle_time_sec',node] = \
            (time_b.values - time_a.values).sum()
            
        nodes_df_dict_init[df_name].loc['working_time_sec',node] = \
            one_node_df.unix_timestamp.max() - one_node_df.unix_timestamp.min()
            
        nodes_df_dict_init[df_name].loc['idle_en_integral',node] = \
            forth_integral( y_values=one_node_df.iloc[idle_sys_power_ix,:].values/1000.0, \#node=one_node_df.iloc[idle_sys_power_ix,:], \
                            time=( time_b.values - time_a.values )/3600., power='sys' )
            
    # Energy used for purposes other than CPU and memory
    nodes_df_dict_init[df_name].loc['other_energy', :] = nodes_df_dict_init[df_name].loc['sys_energy', :] - \
        nodes_df_dict_init[df_name].loc['cpu_energy', :] - nodes_df_dict_init[df_name].loc['mem_energy', :]
        
    # Energy portion used for CPU and memory in overall the system energy consumption
    for util_type in ['cpu', 'mem', 'other']:
        nodes_df_dict_init[df_name].loc[util_type + '_en_percent', :] = \
            percentage(nodes_df_dict_init[df_name].loc[util_type + '_energy',:], \
                       nodes_df_dict_init[df_name].loc['sys_energy', :])
    # Save
    nodes_df_dict_init[df_name].to_csv( path_or_buf=os.path.join( Output_dir, nodes_stats_ + file_name), sep=';' )    
    print(df_name)

  """


May2018_idle
June2018_idle
July2018_idle
September2018
October2018
November2018
December2018
January2019


--------
## Summary for all nodes over all months
- Resultant dataframe is self-descriptive by its row names
- Each column corresponds to one node characterized by its name
- Last two columns are added for average or total summation over all nodes 

In [156]:
rows_to_calc_sum = ['total_dcenergy', 'sys_energy', 'cpu_energy', 'mem_energy', 'other_energy', 'idle_en_integral']
rows_to_calc_avg = ['cpu_en_percent', 'mem_en_percent', \
                    'other_en_percent', 'sys_power_max', 'cpu_power_max',\
                    'mem_power_max', 'dcenergy_error_percent',\
                    'sys_util', 'mem_util', 'cpu_util', 'other_util', \
                    'avg_exh_temp', 'avg_inlet_temp', \
                    'avg_cpu1_temp', 'avg_cpu2_temp', \
                    'idle_time_sec', 'working_time_sec']
rows_en_percent = ['cpu_en_percent', 'mem_en_percent']

nodes_summary_df = pd.DataFrame(columns = node_names, index = summary_index, data=0)

nodes_summary_no_drop_df = nodes_summary_df.copy()

In [157]:
for df_name in nodes_df_names:
    node_df = nodes_no_drop_df_dict[df_name].copy()
    
    # Drop columns where the relative error between 
    # dcenergy and system power integral is high
    to_drop_ix = np.unique(np.append(\
                                     np.where(node_df.loc['dcenergy_error_percent', :].isna())[0], \
                                     np.where(node_df.loc['dcenergy_error_percent', :] > 5.)[0]))
    to_drop = node_df.iloc[:,to_drop_ix].columns.values
    
    # Add counter of node presence in months to further count average
    node_df.loc['present_in_months', :] = 1
    
    # Fill summary with and without columns to drop: dcenergy based and sys_power integral based datasets
    nodes_summary_no_drop_df = node_df.replace([np.inf, -np.inf, np.nan], 0).add(nodes_summary_no_drop_df).fillna(node_df)
    
    node_df.loc[:,to_drop] = 0
    nodes_summary_df = node_df.add(nodes_summary_df).fillna(node_df)
    
nodes_summary_no_drop_df.loc['total_dcenergy', :] = 0

In [158]:
# Calc total and avg over all nodes for summary dfs
def summary_calc(df):
    df.loc[:,'total_all_nodes'] = df.loc[rows_to_calc_sum,:].sum(axis=1)
    df.loc[:,'total_all_nodes'].fillna(0, inplace=True)

    df.loc[rows_to_calc_avg,:'cresco6x216'] = df.loc[rows_to_calc_avg,:'cresco6x216'] / \
                                                                df.loc['present_in_months',:'cresco6x216']
    df.loc[:,'avg_all_nodes']  = df.loc[rows_to_calc_avg,:'cresco6x216'].mean(axis=1)
    df.loc['other_en_percent','avg_all_nodes'] = 100.0 - df.loc[rows_en_percent,'avg_all_nodes'].sum()
    df.loc[:,'avg_all_nodes'].fillna(0, inplace=True)

In [159]:
summary_calc(nodes_summary_df)
summary_calc(nodes_summary_no_drop_df)

In [163]:
nodes_summary_df.to_csv(path_or_buf=os.path.join( Output_dir, 'nodes_summary_all_months.csv'), sep=";")
nodes_summary_no_drop_df.to_csv(path_or_buf=os.path.join( Output_dir, 'nodes_summary_all_months_no_drop.csv'), sep=";") 

--------------
## Availability of energy and system power data

In [7]:
nodes_summary_df = pd.read_csv(os.path.join( Output_dir, 'nodes_summary_all_months.csv'), \
                               delimiter=";", header=0, index_col=0)
nodes_summary_no_drop_df = pd.read_csv(os.path.join( Output_dir, 'nodes_summary_all_months_no_drop.csv'), \
                                delimiter=";", header=0, index_col=0) 

#### Average of the relative error between integral of system power and energy meter  

In [8]:
nodes_summary_df.loc['dcenergy_error_percent', :].mean()

0.6585077240383141

#### Information of sys_power present for each month

In [9]:
analysis_by_month = pd.DataFrame(data=0, index=nodes_df_names, columns=['dcenergy_present', 'sys_power_present'])

for df_name in nodes_df_names:
    month_df = nodes_no_drop_df_dict[df_name].filter(regex=('energy'), axis=0)
    analysis_by_month.loc[df_name, 'dcenergy_present'] = nodes_df_dict[df_name].shape[1]
    analysis_by_month.loc[df_name, 'sys_power_present'] = len(month_df.columns.values) - month_df.loc[:, month_df.isna().any()].shape[1]
analysis_by_month

Unnamed: 0,dcenergy_present,sys_power_present
May2018_idle,198,214
June2018_idle,188,214
July2018_idle,187,214
September2018,192,214
October2018,178,214
November2018,180,213
December2018,186,213
January2019,193,213


In [10]:
analysis_by_month.dcenergy_present.mean(), analysis_by_month.sys_power_present.mean()

(187.75, 213.625)

#### Percent of energy measurements close to sys_power integral

In [11]:
# Check if the nodes excluded because of sys_power missing data are present in dataset working with dcenergy
for df_name in nodes_df_names:
    month_df = nodes_no_drop_df_dict[df_name].filter(regex=('energy'), axis=0)
    nan_cols_sys_energy = month_df.loc[:, month_df.isna().any()].columns.values
    for col in nan_cols_sys_energy:
        if col in nodes_df_dict[df_name].columns.values:
            print(df_name,  month_df.loc[:, month_df.isna().any()].columns.values )
            
print(analysis_by_month.dcenergy_present.min()/216.*100, analysis_by_month.dcenergy_present.max()/216.*100)

82.4074074074074 91.66666666666666


-------------
## Errors in DC energy meter over months concern the following nodes

In [50]:
dcenergy_error_nodes_dict = {}
dcenergy_all_error_nodes = []
for file_name, df_name in zip(nodes_file_names, nodes_df_names):
    dcenergy_error_nodes_dict[df_name] = np.where(nodes_no_drop_df_dict[df_name].loc["dcenergy_error_percent", :] > 5.)[0]
    
    node_df = data_prep_df[df_name]
    
    for node in nodes_no_drop_df_dict[df_name].iloc[:, dcenergy_error_nodes_dict[df_name] ].columns.values:
        int_name = int(node.split('x')[-1])
        if not(int_name in dcenergy_all_error_nodes):
            dcenergy_all_error_nodes.append(int_name)

In [51]:
print(list(set(dcenergy_all_error_nodes)))

[1, 2, 4, 5, 7, 9, 10, 11, 12, 13, 15, 16, 18, 20, 21, 23, 24, 26, 27, 29, 30, 31, 32, 33, 34, 35, 37, 38, 39, 40, 42, 44, 45, 47, 52, 53, 54, 55, 58, 59, 60, 61, 64, 65, 66, 67, 68, 71, 72, 73, 77, 78, 79, 82, 83, 91, 92, 93, 94, 96, 98, 100, 101, 102, 105, 107, 108, 112, 115, 117, 118, 119, 120, 121, 124, 127, 131, 134, 135, 137, 139, 140, 141, 146, 150, 151, 154, 158, 160, 162, 163, 164, 169, 170, 172, 173, 175, 176, 177, 179, 181, 182, 187, 191, 195, 197, 198, 203, 204, 206, 208, 210, 211, 213, 215, 216]


In [None]:
with open(os.path.join( Output_dir,'dcenergy_error_nodes.txt'), "w") as f:
    f.write(str(list(set(dcenergy_all_error_nodes))))

In [None]:
with open(os.path.join( Output_dir,'dcenergy_error_nodes.csv'),'w') as f:
    w = csv.writer(f)
    w.writerows(dcenergy_error_nodes_dict.keys(), dcenergy_error_nodes_dict)

In [65]:
# %matplotlib inline
plt.ioff()
for file_name, df_name in zip(nodes_file_names, nodes_df_names):
    nodes_stats = nodes_no_drop_df_dict[df_name]
    node_df     = data_prep_df[df_name]
    for node in nodes_stats.iloc[:, dcenergy_error_nodes_dict[df_name]].columns.values:
        int_name = int(node.split('x')[-1])
        fig = node_df[node_df.nodename == node].\
            plot(x='timestamp_py', y='dcenergy', title=df_name+'\nNode '+str(int_name)+' dc_energy').get_figure()
        fig.savefig( os.path.join( Plots_dir, 'Error dcenergy', 'Error_dc_en_' + df_name + '_' + node + '.png' ), dpi=500 )

---------
## Max CPU, system and memory power utilization of nodes over all months

In [84]:
node_max_power = pd.DataFrame(columns = node_names, \
                              index = ['cpu_power_max', 'mem_power_max', 'sys_power_max'], \
                              data=0)
for file_name in nodes_file_names:
    nodes_stats = pd.read_csv( os.path.join( Output_dir, nodes_stats_+file_name), \
                              delimiter=";", header=0, index_col=0 )
    for node in node_names:
        for util_type in ['cpu', 'mem', 'sys']:
            max_val = nodes_stats.loc[util_type + '_power_max', node]
            if node_max_power.loc[util_type + '_power_max', node] < max_val:
                node_max_power.loc[util_type + '_power_max', node] = max_val

In [85]:
node_max_power.to_csv( path_or_buf=os.path.join( Output_dir, 'node_max_power_all_months.csv'), sep=';') 

In [86]:
node_max_power = pd.read_csv( os.path.join( Output_dir, 'node_max_power_all_months.csv'), \
                              delimiter=";", header=0, index_col=0 )
node_max_power

Unnamed: 0,cresco6x001,cresco6x002,cresco6x003,cresco6x004,cresco6x005,cresco6x006,cresco6x007,cresco6x008,cresco6x009,cresco6x010,...,cresco6x207,cresco6x208,cresco6x209,cresco6x210,cresco6x211,cresco6x212,cresco6x213,cresco6x214,cresco6x215,cresco6x216
cpu_power_max,270.0,290.0,290.0,320.0,280.0,340.0,340.0,340.0,340.0,350.0,...,350.0,340.0,340.0,340.0,340.0,350.0,340.0,340.0,270.0,250.0
mem_power_max,44.0,44.0,43.0,42.0,44.0,44.0,44.0,45.0,41.0,44.0,...,45.0,45.0,43.0,46.0,42.0,44.0,47.0,42.0,18.0,27.0
sys_power_max,350.0,360.0,360.0,360.0,340.0,430.0,430.0,430.0,440.0,430.0,...,440.0,440.0,440.0,440.0,430.0,440.0,440.0,430.0,320.0,300.0


In [87]:
print('Max system power use observed: \t', node_max_power.loc['sys_power_max',:].max(), \
      '\nMax CPU power use observed: \t', node_max_power.loc['cpu_power_max',:].max(), \
      '\nMax memory power use observed: \t', node_max_power.loc['mem_power_max',:].max())

Max system power use observed: 	 450.0 
Max CPU power use observed: 	 350.0 
Max memory power use observed: 	 255.0


## Correlation between temperature and power utilization or load
### Temperature, fans speed and CPU power trends - September

In [None]:
for df_name in nodes_df_names:
    year = re.findall(r'\d+', df_name)[0]
    month = df_name.split(year)[0]
    plot_energy_cons(data=nodes_df_dict[df_name].iloc[5:8,:].mean(axis=1), \
                     node='', \
                     month=month, year=year, avg_flag=True)

In [24]:
nodes_stats_Sept = pd.read_csv(os.path.join( Output_dir, nodes_stats_ + 'September2018.csv'), \
                                         delimiter=";", header=0, index_col=0)
to_drop_ix = np.unique(np.append(\
                                     np.where(nodes_stats_Sept.loc["dcenergy_error_percent", :].isna())[0], \
                                     np.where(nodes_stats_Sept.loc["dcenergy_error_percent", :] > 5.)[0]))
to_drop = nodes_stats_Sept.iloc[:,to_drop_ix].columns.values
node_df_Sept = pd.read_csv(os.path.join( Data_prep_dir, 'September2018.csv'), \
                                         delimiter=";", header=0)
for node in to_drop:
    node_df_Sept.drop(index=node_df_Sept[node_df_Sept.nodename == node].index)

In [25]:
node_Sept_norm = node_df_Sept[node_df_Sept.nodename == node_df_Sept.nodename.unique()[0]].copy()

node_Sept_norm.loc[:, ['cpu_power', 'cpu1_temp', 'cpu2_temp', 'exh_temp']] = \
    preprocessing.normalize(node_Sept_norm.loc[:, ['cpu_power','cpu1_temp', 'cpu2_temp', 'exh_temp']], norm='max', axis=0)
node_Sept_norm.exh_temp = round(node_Sept_norm.exh_temp, 1)

### Spearman correlation coefficient

In [26]:
stats.spearmanr(a=node_df_Sept.exh_temp.values, b=node_df_Sept.cpu_power.values)

SpearmanrResult(correlation=0.3486975817138365, pvalue=0.0)

### Pearson correlation coefficient

In [27]:
print(stats.pearsonr(node_Sept_norm.cpu_power, node_Sept_norm.cpu1_temp), \
      stats.pearsonr(node_Sept_norm.cpu_power, node_Sept_norm.cpu2_temp), \
      stats.pearsonr(node_Sept_norm.cpu_power, node_Sept_norm.exh_temp))

(0.39054727848038256, 1.6573117150516118e-110) (0.4455773885153568, 4.1237857999012705e-147) (0.2811746778478721, 6.179197441727707e-56)


In [28]:
print(stats.pearsonr(node_Sept_norm.cpu_power, node_Sept_norm.exh_temp))

(0.2811746778478721, 6.179197441727707e-56)


## Further analysis
- normalize and correlate fans and CPU power ranges
- investigate on temperature fluctuations
- think of how to work with all the nodes

#### Rename files

In [69]:
# for filename in nodes_file_names: 
#     dst = 'nodes_stats_' + filename
#     src = 'new_nodes_stats_' + filename

#     # rename() function will 
#     # rename all the files 
#     os.rename(os.path.join( Output_dir, src ), os.path.join( Output_dir, dst )) 

# src = 'new_nodes_summary_all_months.csv'
# dst = 'nodes_summary_all_months.csv'
# os.rename(os.path.join( Output_dir, src ), os.path.join( Output_dir, dst ))

# src = 'new_nodes_summary_all_months_no_drop.csv'
# dst = 'nodes_summary_all_months_no_drop.csv'
# os.rename(os.path.join( Output_dir, src ), os.path.join( Output_dir, dst )) 