# Your Title Here

**Name(s)**: Ethan Cao

**Website Link**: (your website link)

## Code

In [115]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer

### Framing the Problem

In [116]:
def combine_times(date_col_name, time_col_name, new_col_name, df):
    df = df.copy()
    df[new_col_name] = df[date_col_name] + pd.to_timedelta(df[time_col_name].astype(str))
    return df

data = pd.read_excel("outage.xlsx", skiprows=[0,1,2,3,4,6], index_col=1).iloc[:,1:]
data = combine_times("OUTAGE.START.DATE", 'OUTAGE.START.TIME', 'OUTAGE.START.DATETIME', data)
data = combine_times("OUTAGE.RESTORATION.DATE", "OUTAGE.RESTORATION.TIME", "OUTAGE.RESTORATION.DATETIME", data)

In [117]:
data['CAUSE.CATEGORY'].unique()

array(['severe weather', 'intentional attack',
       'system operability disruption', 'equipment failure',
       'public appeal', 'fuel supply emergency', 'islanding'],
      dtype=object)

In [118]:
data

Unnamed: 0_level_0,YEAR,MONTH,U.S._STATE,POSTAL.CODE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CLIMATE.CATEGORY,OUTAGE.START.DATE,OUTAGE.START.TIME,...,POPDEN_URBAN,POPDEN_UC,POPDEN_RURAL,AREAPCT_URBAN,AREAPCT_UC,PCT_LAND,PCT_WATER_TOT,PCT_WATER_INLAND,OUTAGE.START.DATETIME,OUTAGE.RESTORATION.DATETIME
OBS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2011,7.0,Minnesota,MN,MRO,East North Central,-0.3,normal,2011-07-01,17:00:00,...,2279.0,1700.5,18.2,2.14,0.60,91.592666,8.407334,5.478743,2011-07-01 17:00:00,2011-07-03 20:00:00
2,2014,5.0,Minnesota,MN,MRO,East North Central,-0.1,normal,2014-05-11,18:38:00,...,2279.0,1700.5,18.2,2.14,0.60,91.592666,8.407334,5.478743,2014-05-11 18:38:00,2014-05-11 18:39:00
3,2010,10.0,Minnesota,MN,MRO,East North Central,-1.5,cold,2010-10-26,20:00:00,...,2279.0,1700.5,18.2,2.14,0.60,91.592666,8.407334,5.478743,2010-10-26 20:00:00,2010-10-28 22:00:00
4,2012,6.0,Minnesota,MN,MRO,East North Central,-0.1,normal,2012-06-19,04:30:00,...,2279.0,1700.5,18.2,2.14,0.60,91.592666,8.407334,5.478743,2012-06-19 04:30:00,2012-06-20 23:00:00
5,2015,7.0,Minnesota,MN,MRO,East North Central,1.2,warm,2015-07-18,02:00:00,...,2279.0,1700.5,18.2,2.14,0.60,91.592666,8.407334,5.478743,2015-07-18 02:00:00,2015-07-19 07:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1530,2011,12.0,North Dakota,ND,MRO,West North Central,-0.9,cold,2011-12-06,08:00:00,...,2192.2,1868.2,3.9,0.27,0.10,97.599649,2.401765,2.401765,2011-12-06 08:00:00,2011-12-06 20:00:00
1531,2006,,North Dakota,ND,MRO,West North Central,,,NaT,,...,2192.2,1868.2,3.9,0.27,0.10,97.599649,2.401765,2.401765,NaT,NaT
1532,2009,8.0,South Dakota,SD,RFC,West North Central,0.5,warm,2009-08-29,22:54:00,...,2038.3,1905.4,4.7,0.30,0.15,98.307744,1.692256,1.692256,2009-08-29 22:54:00,2009-08-29 23:53:00
1533,2009,8.0,South Dakota,SD,MRO,West North Central,0.5,warm,2009-08-29,11:00:00,...,2038.3,1905.4,4.7,0.30,0.15,98.307744,1.692256,1.692256,2009-08-29 11:00:00,2009-08-29 14:01:00


In [119]:
data['IS.HURRICANE'] = data['HURRICANE.NAMES'].isna() == False

In [120]:
data['MONTH.START'] = data['OUTAGE.START.DATETIME'].apply(lambda x : x.month)

In [193]:
data.columns

Index(['YEAR', 'MONTH', 'U.S._STATE', 'POSTAL.CODE', 'NERC.REGION',
       'CLIMATE.REGION', 'ANOMALY.LEVEL', 'CLIMATE.CATEGORY',
       'OUTAGE.START.DATE', 'OUTAGE.START.TIME', 'OUTAGE.RESTORATION.DATE',
       'OUTAGE.RESTORATION.TIME', 'CAUSE.CATEGORY', 'CAUSE.CATEGORY.DETAIL',
       'HURRICANE.NAMES', 'OUTAGE.DURATION', 'DEMAND.LOSS.MW',
       'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE',
       'TOTAL.PRICE', 'RES.SALES', 'COM.SALES', 'IND.SALES', 'TOTAL.SALES',
       'RES.PERCEN', 'COM.PERCEN', 'IND.PERCEN', 'RES.CUSTOMERS',
       'COM.CUSTOMERS', 'IND.CUSTOMERS', 'TOTAL.CUSTOMERS', 'RES.CUST.PCT',
       'COM.CUST.PCT', 'IND.CUST.PCT', 'PC.REALGSP.STATE', 'PC.REALGSP.USA',
       'PC.REALGSP.REL', 'PC.REALGSP.CHANGE', 'UTIL.REALGSP', 'TOTAL.REALGSP',
       'UTIL.CONTRI', 'PI.UTIL.OFUSA', 'POPULATION', 'POPPCT_URBAN',
       'POPPCT_UC', 'POPDEN_URBAN', 'POPDEN_UC', 'POPDEN_RURAL',
       'AREAPCT_URBAN', 'AREAPCT_UC', 'PCT_LAND', 'PCT_WATER_TOT',
       'PCT

In [122]:
data['IS.HURRICANE'].sum()

72

In [195]:
features = ['POSTAL.CODE', 'ANOMALY.LEVEL', 'OUTAGE.DURATION', 'MONTH', 'CAUSE.CATEGORY']

In [196]:
data['ANOMALY.LEVEL'] = data['ANOMALY.LEVEL'].fillna(0)

In [217]:
px.bar(data,x='CAUSE.CATEGORY',y='CUSTOMERS.AFFECTED')

### Baseline Model

In [255]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.compose import ColumnTransformer



data_class = data[features].dropna()

X = data_class[['POSTAL.CODE', 'ANOMALY.LEVEL', 'MONTH']]
y = data_class['CAUSE.CATEGORY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
col_trans = ColumnTransformer([
    ('one-hot', OneHotEncoder(handle_unknown='ignore'), ['POSTAL.CODE'])],
    remainder = 'passthrough')
pl = Pipeline([
    ('col_trans', col_trans),
    ('forest', RandomForestClassifier())
])

pl.fit(X_train, y_train)
prediction = pl.predict(X_test)


In [256]:
accuracy_score(y_test, prediction)

0.75

In [257]:
import pprint as pp

### Final Model

In [267]:
features = ['POSTAL.CODE', 'ANOMALY.LEVEL', 'OUTAGE.DURATION', 'MONTH','CUSTOMERS.AFFECTED', 'CAUSE.CATEGORY']
data_class = data[features].dropna()

X = data_class[['POSTAL.CODE', 'ANOMALY.LEVEL', 'MONTH','OUTAGE.DURATION','CUSTOMERS.AFFECTED']]
y = data_class['CAUSE.CATEGORY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
col_trans = ColumnTransformer([
    ('one-hot', OneHotEncoder(handle_unknown='ignore'), ['POSTAL.CODE'])],
    remainder = 'passthrough')
pl = Pipeline([
    ('col_trans', col_trans),
    ('forest', RandomForestClassifier())
])

pl.fit(X_train, y_train)
prediction = pl.predict(X_test)


In [268]:
param_grid = {"forest__max_depth": np.append(np.arange(5,30),None)}
search = GridSearchCV(pl, param_grid,cv=5)
search.fit(X_train,y_train)


The least populated class in y has only 3 members, which is less than n_splits=5.



GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('col_trans',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('one-hot',
                                                                         OneHotEncoder(handle_unknown='ignore'),
                                                                         ['POSTAL.CODE'])])),
                                       ('forest', RandomForestClassifier())]),
             param_grid={'forest__max_depth': array([5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
       23, 24, 25, 26, 27, 28, 29, None], dtype=object)})

In [269]:
search.best_params_

{'forest__max_depth': 22}

In [270]:
search.cv_results_['mean_test_score']

array([0.81516624, 0.82819808, 0.83884897, 0.85070442, 0.85070442,
       0.86017892, 0.86254579, 0.86489856, 0.86493378, 0.86254579,
       0.86730065, 0.86492674, 0.86848408, 0.86609608, 0.86846999,
       0.86965342, 0.8649197 , 0.87202733, 0.86253874, 0.8649197 ,
       0.86728656, 0.85778388, 0.8649197 , 0.86965342, 0.86966047,
       0.86730065])

In [226]:
accuracy_score(y_test, prediction)

0.8962264150943396

### Fairness Analysis