In [9]:
import pandas as pd
import math
from difflib import SequenceMatcher

In [10]:
# Read the feature names from features.csv. The file is parsed without a
# header row, so the names are taken from the column labelled 0.
features = pd.read_csv('../features.csv', header=None)

# De-duplicated names (order of first appearance) and the raw rows as nested lists.
unique_features = features.loc[:, 0].unique().tolist()
feature_list = features.values.tolist()

print('Number of features: ', len(feature_list))
print('Number of unique features: ', len(unique_features))


Number of features:  4110
Number of unique features:  4110


In [11]:
# Load the outage dataset from parquet.
data = pd.read_parquet(path='../src/data/outage_data.parquet', engine='pyarrow')

# Ordinal-encode the non-numerical 'poly_ewkt' and 'event_type' columns by
# replacing each value with its integer category code.
# ('point_ewkt' is not encoded; it is dropped at the end of this cell.)
non_numerical_columns = ['poly_ewkt', 'event_type']
for column in non_numerical_columns:
    data[column] = data[column].astype('category').cat.codes

# Expand every UTC datetime column into separate year / month / day / hour
# columns. The original datetime columns are kept alongside the new ones.
datetime_features = data.select_dtypes(include="datetime64[ns, UTC]").columns.tolist()
for dt_col in datetime_features:
    parts = data[dt_col].dt
    data[dt_col + "_year"] = parts.year
    data[dt_col + "_month"] = parts.month
    data[dt_col + "_day"] = parts.day
    data[dt_col + "_hour"] = parts.hour

data = data.drop(columns=['point_ewkt'])

In [12]:
# Target variable: the 'outage_count' column of the loaded dataset.
target = data['outage_count']

In [13]:
def group_features(features, similarity_threshold=80):
    """Greedily cluster feature names by string similarity.

    Walks ``features`` in order. Each name that has not been assigned yet
    opens a new group and becomes its anchor; every other unassigned name
    whose ``SequenceMatcher`` ratio against the anchor (scaled to 0-100) is
    strictly greater than ``similarity_threshold`` joins that group.
    Candidates are compared with the anchor only, never with the other
    members of the group.

    Args:
        features: sequence of feature names (strings).
        similarity_threshold: similarity percentage (0-100) that must be
            exceeded for a name to join the anchor's group.

    Returns:
        A list of groups, each a list of names with the anchor first. Every
        input element appears in exactly one group.
    """
    clusters = []
    assigned = set()

    for anchor_idx, anchor in enumerate(features):
        if anchor_idx in assigned:
            continue

        # The anchor seeds a fresh cluster; marking it as assigned up front
        # also keeps it from being compared with itself below.
        assigned.add(anchor_idx)
        members = [anchor]

        for cand_idx, candidate in enumerate(features):
            if cand_idx in assigned:
                continue

            score = SequenceMatcher(None, anchor, candidate).ratio() * 100
            if score > similarity_threshold:
                members.append(candidate)
                assigned.add(cand_idx)

        clusters.append(members)

    return clusters


In [14]:
# Cluster the unique feature names by string similarity and list every group.
groups = group_features(unique_features)
for i, members in enumerate(groups):
    print("Group {}: {}".format(i, members))

Group 0: ['0']
Group 1: ['breaker_counts']
Group 2: ['fuse_counts']
Group 3: ['switch_counts']
Group 4: ['transformer_counts']
Group 5: ['recloser_counts']
Group 6: ['pole_counts']
Group 7: ['poly_ewkt']
Group 8: ['point_ewkt']
Group 9: ['grid_id']
Group 10: ['outage_count']
Group 11: ['SimStartDate']
Group 12: ['outage_start_time']
Group 13: ['outage_end_time']
Group 14: ['weather_start_time']
Group 15: ['weather_end_time']
Group 16: ['event_type']
Group 17: ['x']
Group 18: ['y']
Group 19: ['10 metre wind speed_m s**-1 (max)_lambert_level 10 m_cogt_5', '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_cogt_9', '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_cogt_13', '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_cogt_18', '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_gt_5', '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_gt_9', '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_gt_13', '10 metre wind speed_m s**-1 (max)_lambert_level 10 m_gt_18', 'Wi

In [15]:
# For every group of similar features keep the single member whose absolute
# correlation with 'outage_count' is highest. Groups with only one member are
# kept unchanged.
selected_features = []
labels = data['outage_count'].copy()

for group in groups:
    if len(group) > 1:
        # Reset the winner for each group. Without this, a group whose
        # correlations are all NaN would never pass the `>` test below and
        # would silently re-append the winner of the previous group.
        selected_feature = None
        highest_corr = -math.inf
        for feat in group:
            corr = abs(labels.corr(data[feat]))
            print('Correlation between ', feat, ' and outage_count: ', corr)
            # A NaN correlation compares False here and is therefore skipped.
            if corr > highest_corr:
                highest_corr = corr
                selected_feature = feat

        if selected_feature is None:
            # No member had a usable correlation: fall back to the first
            # member so the group is still represented exactly once.
            selected_feature = group[0]
            highest_corr = math.nan

        print('Selected feature: ', selected_feature, ' with correlation: ', highest_corr)
        selected_features.append(selected_feature)
    else:
        print('Selected feature: ', group[0])
        selected_features.append(group[0])

display(selected_features)

Selected feature:  0
Selected feature:  breaker_counts
Selected feature:  fuse_counts
Selected feature:  switch_counts
Selected feature:  transformer_counts
Selected feature:  recloser_counts
Selected feature:  pole_counts
Selected feature:  poly_ewkt
Selected feature:  point_ewkt
Selected feature:  grid_id
Selected feature:  outage_count
Selected feature:  SimStartDate
Selected feature:  outage_start_time
Selected feature:  outage_end_time
Selected feature:  weather_start_time
Selected feature:  weather_end_time
Selected feature:  event_type
Selected feature:  x
Selected feature:  y


NameError: name 'math' is not defined

In [None]:
# Report how many features are currently in the selection.
print('Number of selected features: ', len(selected_features))

Number of selected features:  82


In [None]:
# Drop the '0' entry and 'point_ewkt' from the selection. A filter is used
# instead of list.remove() so the cell can be re-run safely: remove() raises
# ValueError once the entries are already gone.
selected_features = [
    feature for feature in selected_features
    if feature not in ('0', 'point_ewkt')
]

In [None]:
# Display the current selection (a bare last expression is rendered as the cell output).
selected_features

['breaker_counts',
 'fuse_counts',
 'switch_counts',
 'transformer_counts',
 'recloser_counts',
 'pole_counts',
 'poly_ewkt',
 'grid_id',
 'outage_count',
 'SimStartDate',
 'outage_start_time',
 'outage_end_time',
 'weather_start_time',
 'weather_end_time',
 'event_type',
 'x',
 'y',
 '(10 metre wind speed_m s**-1 (max)_lambert_level 10 m-filtered_32-max-15)^1',
 '(Wind speed (gust)_m s**-1 (max)_lambert_level 10 m-filtered_32-max-15)^2',
 '221_221 (min)_lambert_levels 0-40000-filtered_16-min_sum',
 'HOURLY_WET_SNOW_ACCUM_RATE m/h-filtered_32-max_max',
 'AIR_DENSITY kg/m3-filtered_2-max_min',
 'Surface pressure_Pa (instant)_lambert_level 0-filtered_8-max_sum',
 '4_4 (max)_lambert_level 0-filtered_2-mean_mean',
 'HOURLY_SNOW_DEPTH_WATER_EQUIV_ACCUM_RATE m-2/h-filtered_32-max_max',
 'Total cloud cover_% (instant)_lambert_level 0-filtered_24-max_var',
 '2 metre temperature_K (instant)_lambert_level 2 m-filtered_2-mean_var',
 'Total precipitation_kg m-2 (accum)_lambert_level 0_mean',
 'Sur

In [None]:
import scipy.stats as stats
import matplotlib.pyplot as plt

# Significance level for the Pearson correlation test.
alpha = 0.05

# Float-typed selected features whose correlation with the target is not
# statistically significant at `alpha`.
insignificant_cols = []
# Perform Pearson correlation test
for col in data[selected_features].select_dtypes(include="float").columns:
    correlation_coefficient, p_value = stats.pearsonr(data[col], target)
    # Written as `not (p <= alpha)` rather than `p > alpha` on purpose: a NaN
    # p-value (undefined correlation) fails every comparison, so with `>` such
    # a column would be silently kept as if it were significant.
    if not (p_value <= alpha):
        insignificant_cols.append(col)
    

In [None]:
# Display the columns collected in insignificant_cols.
insignificant_cols

['(10 metre wind speed_m s**-1 (max)_lambert_level 10 m-filtered_24-mean-15)^1']

In [None]:
# Keep only the features that were not flagged as insignificant.
selected_features = [
    feat for feat in selected_features
    if feat not in insignificant_cols
]