In [14]:
import pandas as pd


In [15]:
# Load the feature names from features.csv. header=None makes pandas treat the
# first line of the file as data, so the single column gets the label 0.
features = pd.read_csv('../features.csv', header=None)
feature_list = features.to_numpy().tolist()
unique_features = features.iloc[:, 0].unique().tolist()

# NOTE(review): the first entry printed by the grouping cell is the literal
# '0'. That looks like a header row written by to_csv being read as a feature
# name -- confirm, and if so read the file with header=0 instead.
for label, items in (
    ('Number of features: ', feature_list),
    ('Number of unique features: ', unique_features),
):
    print(label, len(items))


Number of features:  4110
Number of unique features:  4110


In [16]:
def get_str_prfx(s, pct):
    """Return the leading fraction ``pct`` of ``s``.

    The cut position is ``int(len(s) * pct)``, i.e. the product truncated to an
    integer, so the returned prefix can be empty when ``pct`` is small.
    Inputs of length 0 or 1 are returned unchanged.
    """
    if len(s) <= 1:
        return s
    cut = int(len(s) * pct)
    return s[:cut]

In [17]:
# Bucket feature names by prefix: every name is reduced to its first
# int(len(name) * threshold) characters, and names that yield the same prefix
# end up in the same group (keyed by that prefix).
# NOTE(review): the cut-off in use is 0.9, not 75% -- confirm the intended value.
feature_groups = {}
threshold = 0.9
# Prefix of the first feature; not referenced again within this cell.
frst_str = get_str_prfx(unique_features[0], threshold)

for feat in unique_features:
    prfx = get_str_prfx(feat, threshold)
    members = feature_groups.get(prfx)
    if members is None:
        # First time this prefix is seen: start a new group with this feature.
        print('Creating new group: ', prfx, ' with feature: ', feat)
        feature_groups[prfx] = [feat]
    else:
        print('Adding feature: ', feat, ' to group: ', prfx)
        members.append(feat)

print('Number of feature groups: ', len(feature_groups))

Creating new group:  0  with feature:  0
Creating new group:  breaker_coun  with feature:  breaker_counts
Creating new group:  fuse_coun  with feature:  fuse_counts
Creating new group:  switch_coun  with feature:  switch_counts
Creating new group:  transformer_coun  with feature:  transformer_counts
Creating new group:  recloser_coun  with feature:  recloser_counts
Creating new group:  pole_coun  with feature:  pole_counts
Creating new group:  poly_ewk  with feature:  poly_ewkt
Creating new group:  point_ewk  with feature:  point_ewkt
Creating new group:  grid_i  with feature:  grid_id
Creating new group:  outage_cou  with feature:  outage_count
Creating new group:  SimStartDa  with feature:  SimStartDate
Creating new group:  outage_start_ti  with feature:  outage_start_time
Creating new group:  outage_end_ti  with feature:  outage_end_time
Creating new group:  weather_start_ti  with feature:  weather_start_time
Creating new group:  weather_end_ti  with feature:  weather_end_time
Creat

In [18]:
# List the size of every prefix group that holds at least two features and
# count how many such groups exist.
ct = 0

for key, value in feature_groups.items():
    size = len(value)
    if size <= 1:
        # Single-feature groups are not reported.
        continue
    print(key, ' : ', size)
    ct += 1

print('Number of feature groups with more than 1 feature: ', ct)

10 metre wind speed_m s**-1 (max)_lambert_level 10 m_  :  2
10 metre wind speed_m s**-1 (max)_lambert_level 10 m_c  :  2
10 metre wind speed_m s**-1 (max)_lambert_level 10   :  3
10 metre wind speed_m s**-1 (max)_lambert_level 10 m  :  2
Wind speed (gust)_m s**-1 (max)_lambert_level 10 m  :  6
221_221 (min)_lambert_levels 0-40000-filtered_2-me  :  4
HOURLY_WET_SNOW_ACCUM_RATE m/h-filtered_8-mi  :  4
AIR_DENSITY kg/m3-filtered_2-mean  :  4
Snow depth_m (instant)_lambert_level 0-filtered_8-m  :  8
4_4 (max)_lambert_level 0-filtered_2-mea  :  4
HOURLY_SNOW_DEPTH_WATER_EQUIV_ACCUM_RATE m-2/h-filtered_16-m  :  6
Snow depth_m (instant)_lambert_level 0-filtered_8-me  :  4
Total cloud cover_% (instant)_lambert_level 0-filtered_16-m  :  6
2 metre dewpoint temperature_K (instant)_lambert_level 2 m-filtered_2  :  8
2 metre dewpoint temperature_K (instant)_lambert_level 2 m-filtered_2-  :  6
Total precipitation_kg m-2 (accum)_lambert_level 0-filtered_8-m  :  7
Total cloud cover_% (instant)_lambert

In [19]:
# Read the outage dataset from parquet.
data = pd.read_parquet(path='../outage_data.parquet', engine='pyarrow')

# Replace each listed non-numeric column with its integer category codes.
# Only 'poly_ewkt' and 'event_type' are encoded; 'point_ewkt' is dropped below.
non_numerical_columns = ['poly_ewkt', 'event_type']

for column in non_numerical_columns:
    data[column] = data[column].astype('category').cat.codes

# For every UTC datetime column add <name>_year, <name>_month, <name>_day and
# <name>_hour columns. The original datetime columns are left in place.
datetime_features = list(data.select_dtypes(include = "datetime64[ns, UTC]").columns)
datetime_parts = (("_year", "year"), ("_month", "month"), ("_day", "day"), ("_hour", "hour"))
for i in datetime_features:
    stamps = data[i].dt
    for suffix, part in datetime_parts:
        data[i+suffix] = getattr(stamps, part)

# Remove the 'point_ewkt' column from the frame.
data.drop(columns = ['point_ewkt'], inplace = True)

In [20]:
# For every prefix group keep one representative feature.
#  - Groups with a single member keep that member as-is (no lookup in `data`).
#  - Groups with several members keep the member whose correlation with
#    'outage_count' is highest.
# NOTE(review): the comparison uses the signed correlation, so a strongly
# negative correlation loses to a weakly positive one. If magnitude is what
# matters, compare abs(corr) instead -- confirm intent.

import math

selected_features = []
labels = data['outage_count'].copy()

for key, value in feature_groups.items():
    if len(value) > 1:
        highest_corr = - math.inf
        # Reset per group: without this, a group in which no correlation is
        # comparable would silently re-append the previous group's winner
        # (or raise NameError if it is the first multi-feature group).
        selected_feature = None
        print("Finding the best feature for group: ", key)
        for feat in value:
            corr = labels.corr(data[feat])
            # A NaN correlation never compares greater, so such features are
            # skipped here.
            if corr > highest_corr:
                highest_corr = corr
                selected_feature = feat

        if selected_feature is None:
            # No member produced a usable correlation; keep the first member
            # so the group is still represented in the selection.
            selected_feature = value[0]
            print('No valid correlation for group: ', key, ' keeping first feature: ', selected_feature)
        else:
            print('Selected feature: ', selected_feature, ' with correlation: ', highest_corr)
        selected_features.append(selected_feature)
    else:
        print('Selected feature: ', value[0])
        selected_features.append(value[0])

display(selected_features)

Selected feature:  0
Selected feature:  breaker_counts
Selected feature:  fuse_counts
Selected feature:  switch_counts
Selected feature:  transformer_counts
Selected feature:  recloser_counts
Selected feature:  pole_counts
Selected feature:  poly_ewkt
Selected feature:  point_ewkt
Selected feature:  grid_id
Selected feature:  outage_count
Selected feature:  SimStartDate
Selected feature:  outage_start_time
Selected feature:  outage_end_time
Selected feature:  weather_start_time
Selected feature:  weather_end_time
Selected feature:  event_type
Selected feature:  x
Selected feature:  y
Finding the best feature for group:  10 metre wind speed_m s**-1 (max)_lambert_level 10 m_
Selected feature:  10 metre wind speed_m s**-1 (max)_lambert_level 10 m_cogt_5  with correlation:  0.012201150002195063
Finding the best feature for group:  10 metre wind speed_m s**-1 (max)_lambert_level 10 m_c
Selected feature:  10 metre wind speed_m s**-1 (max)_lambert_level 10 m_cogt_18  with correlation:  -0.010

['0',
 'breaker_counts',
 'fuse_counts',
 'switch_counts',
 'transformer_counts',
 'recloser_counts',
 'pole_counts',
 'poly_ewkt',
 'point_ewkt',
 'grid_id',
 'outage_count',
 'SimStartDate',
 'outage_start_time',
 'outage_end_time',
 'weather_start_time',
 'weather_end_time',
 'event_type',
 'x',
 'y',
 '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_cogt_5',
 '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_cogt_18',
 '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_gt_5',
 '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_gt_18',
 'Wind speed (gust)_m s**-1 (max)_lambert_level 10 m_gt_17',
 '221_221 (min)_lambert_levels 0-40000-filtered_2-mean_var',
 '221_221 (min)_lambert_levels 0-40000-filtered_2-mean_mean',
 'HOURLY_WET_SNOW_ACCUM_RATE m/h-filtered_8-min_sum',
 'HOURLY_WET_SNOW_ACCUM_RATE m/h-filtered_8-min_mean',
 'AIR_DENSITY kg/m3-filtered_2-mean_var',
 'AIR_DENSITY kg/m3-filtered_2-mean_mean',
 'Snow depth_m (instant)_lambert_level 0-filtered_8-min_min',
