In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import OrdinalEncoder


In [2]:
# Read the outage records from the parquet file (pyarrow engine).
df = pd.read_parquet(path='..//outage_data.parquet', engine='pyarrow')

# Remove duplicate entries in 2019: discard every row that is both a
# 'thunderstorm' event and has a SimStartDate later than 2019-01-01.
after_2019_start = df['SimStartDate'] > '2019-01-01'
is_thunderstorm = df['event_type'] == 'thunderstorm'
df = df.loc[~(after_2019_start & is_thunderstorm)]

In [None]:
# Feature preparation: drop the redundant geometry column, ordinal-encode the
# remaining string columns, and expand every timestamp into integer features.

# 'point_ewkt' is dropped instead of encoded (already in lat and lon columns).
# errors='ignore' keeps this cell re-runnable once the column is already gone;
# without it a second execution raises KeyError.
df = df.drop(columns=['point_ewkt'], errors='ignore')

# Apply ordinal encoding to the 'poly_ewkt' and 'event_type' columns.
non_numerical_columns = ['poly_ewkt', 'event_type']
encoder = OrdinalEncoder()
df[non_numerical_columns] = encoder.fit_transform(df[non_numerical_columns])
# The original (misspelled) variable name is kept as an alias in case another
# cell still refers to it.
enconder = encoder

# Convert each datetime column to separate year, month, day and hour columns.
# Minute and second are not extracted.
datetime_parts = ['year', 'month', 'day', 'hour']

# Maps each source datetime column to the names of the four columns derived
# from it, in the same order as datetime_parts. 'SimStartDate' uses its own
# CamelCase naming; the other columns follow '<stem>_<part>'.
datetime_feature_names = {
    'SimStartDate': ['SimStartYear', 'SimStartMonth', 'SimStartDay', 'SimStartHour'],
}
for stem in ['outage_start', 'outage_end', 'weather_start', 'weather_end']:
    datetime_feature_names[stem + '_time'] = [stem + '_' + part for part in datetime_parts]

# Insertion order is preserved, so the new columns are appended in a fixed,
# reproducible order (this matters for the feature matrix column order).
for source_column, feature_columns in datetime_feature_names.items():
    for part, feature_column in zip(datetime_parts, feature_columns):
        df[feature_column] = getattr(df[source_column].dt, part)


In [8]:
# Temporal hold-out: rows with SimStartDate on or after Nov 1, 2018 (15 storms)
# go to test_df; all earlier rows go to train_df.
holdout_start = '2018-11-01'
test_df = df.loc[df['SimStartDate'] >= holdout_start]
train_df = df.loc[df['SimStartDate'] < holdout_start]

# The raw datetime columns are not needed for training, so remove them from
# both frames.
raw_datetime_columns = [
    'SimStartDate',
    'outage_start_time',
    'outage_end_time',
    'weather_start_time',
    'weather_end_time',
]
train_df = train_df.drop(columns=raw_datetime_columns)
test_df = test_df.drop(columns=raw_datetime_columns)

# Target is 'outage_count'; every other remaining column is a feature.
y = train_df['outage_count']
X = train_df.drop(columns=['outage_count'])

# Random 60/40 split of the pre-cutoff rows. Note that X_test / y_test come
# from this random split of train_df, not from the held-out test_df above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)


In [15]:
# Hyperparameter grid search (disabled -- uncomment the lines below to run it).
#params = {'min_samples_leaf':[1,3,10],'n_estimators':[100,1000],
#          'max_features':[0.1,0.5,1.],'max_samples':[0.5,None]}
#
#model = RandomForestRegressor()
#grid_search = GridSearchCV(model,params,cv=3)
#grid_search.fit(X_train,y_train)

In [9]:
# Fit a random forest regressor on the training split (fixed seed for
# reproducibility).
rf_model = RandomForestRegressor(n_estimators=100, max_depth=None, min_samples_split=2, random_state=0)
rf_model.fit(X_train, y_train)

test_preds = rf_model.predict(X_test)

# Evaluate with regression metrics. The forest outputs continuous values, so
# the fraction of predictions that exactly equal the target ("accuracy") says
# nothing useful about fit quality; use R^2, MAE and RMSE instead.
y_true = np.asarray(y_test, dtype=float)
residuals = y_true - test_preds
test_mae = np.mean(np.abs(residuals))
test_rmse = np.sqrt(np.mean(residuals ** 2))
ss_res = np.sum(residuals ** 2)
ss_tot = np.sum((y_true - y_true.mean()) ** 2)
# R^2 is undefined when the target is constant (zero total variance).
test_r2 = 1.0 - ss_res / ss_tot if ss_tot > 0 else float('nan')

# Exact-match fraction, kept only so that any cell still reading `test_acc`
# keeps working. It is not a meaningful quality measure for a regressor.
test_acc = np.sum(test_preds==y_test)/len(y_test)

print('Test set R^2 is {:.3f}'.format(test_r2))
print('Test set MAE is {:.3f}'.format(test_mae))
print('Test set RMSE is {:.3f}'.format(test_rmse))

In [None]:
def calc_feature_importances(model,feat_names,num_to_show):
    """Rank a fitted model's features by importance, then display and plot the top ones.

    Parameters
    ----------
    model : object
        Fitted estimator exposing a ``feature_importances_`` array.
    feat_names : sequence
        Feature names, indexable by integer position; position ``k`` names the
        feature whose importance is ``model.feature_importances_[k]``.
    num_to_show : int
        Number of top-ranked rows to show in the table and in the bar chart.

    Returns
    -------
    pandas.DataFrame
        One row per feature, ordered from most to least important, with the
        columns 'Feat Index', 'Feature' and 'Importance'.
    """
    scores = model.feature_importances_
    # argsort is ascending; reverse it so the most important feature comes first.
    order = np.argsort(scores)[::-1]

    RF_ranking = pd.DataFrame()
    RF_ranking['Feat Index'] = order
    RF_ranking['Feature'] = [feat_names[position] for position in order]
    RF_ranking['Importance'] = np.asarray(scores)[order]

    # Table of the highest-ranked features.
    top_rows = RF_ranking.iloc[:num_to_show]
    display(top_rows)

    # Horizontal bar chart of the same rows, passed to the plotter in reversed
    # order (lowest of the shown importances first).
    top_rows[::-1].plot(
        kind='barh',
        x='Feature',
        y='Importance',
        figsize=(12,7),
        legend=False,
        title='RF Feature Importance',
    )
    plt.show()
    return RF_ranking

In [None]:
# Rank all features of the fitted forest; show and plot the top 30.
top_feats = 30
ranking = calc_feature_importances(model=rf_model, feat_names=X.columns, num_to_show=top_feats)