# Your Title Here

**Name(s)**: Ethan Cao

**Website Link**: (your website link)

## Code

In [45]:
import pandas as pd
import numpy as np
import os

import plotly.express as px
pd.options.plotting.backend = 'plotly'
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer

### Framing the Problem

In [46]:
def combine_times(date_col_name, time_col_name, new_col_name, df):
    df = df.copy()
    df[new_col_name] = df[date_col_name] + pd.to_timedelta(df[time_col_name].astype(str))
    return df

data = pd.read_excel("outage.xlsx", skiprows=[0,1,2,3,4,6], index_col=1).iloc[:,1:]
data = combine_times("OUTAGE.START.DATE", 'OUTAGE.START.TIME', 'OUTAGE.START.DATETIME', data)
data = combine_times("OUTAGE.RESTORATION.DATE", "OUTAGE.RESTORATION.TIME", "OUTAGE.RESTORATION.DATETIME", data)

In [47]:
data['CAUSE.CATEGORY'].unique()

array(['severe weather', 'intentional attack',
       'system operability disruption', 'equipment failure',
       'public appeal', 'fuel supply emergency', 'islanding'],
      dtype=object)

In [None]:
data

In [48]:
data['IS.HURRICANE'] = data['HURRICANE.NAMES'].isna() == False

In [49]:
data['MONTH.START'] = data['OUTAGE.START.DATETIME'].apply(lambda x : x.month)

In [50]:
data[data['MONTH.START'].isna()]

Unnamed: 0_level_0,YEAR,MONTH,U.S._STATE,POSTAL.CODE,NERC.REGION,CLIMATE.REGION,ANOMALY.LEVEL,CLIMATE.CATEGORY,OUTAGE.START.DATE,OUTAGE.START.TIME,...,POPDEN_RURAL,AREAPCT_URBAN,AREAPCT_UC,PCT_LAND,PCT_WATER_TOT,PCT_WATER_INLAND,OUTAGE.START.DATETIME,OUTAGE.RESTORATION.DATETIME,IS.HURRICANE,MONTH.START
OBS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
240,2000,,Texas,TX,FRCC,South,,,NaT,,...,15.2,3.35,0.58,97.258336,2.742036,2.090873,NaT,NaT,False,
340,2000,,Alabama,AL,SERC,Southeast,,,NaT,,...,40.4,4.36,0.99,96.613888,3.386112,2.018314,NaT,NaT,False,
366,2000,,Illinois,IL,SERC,Central,,,NaT,,...,28.6,7.11,1.12,95.864558,4.135442,1.415893,NaT,NaT,False,
767,2000,,North Carolina,NC,SERC,Southeast,,,NaT,,...,73.5,9.48,2.11,90.336127,9.663873,7.52894,NaT,NaT,False,
888,2000,,Delaware,DE,RFC,Northeast,,,NaT,,...,97.3,20.88,6.21,78.30454,21.69546,3.656087,NaT,NaT,False,
1319,2000,,Virginia,VA,SERC,Southeast,,,NaT,,...,53.3,6.75,0.97,92.320281,7.679719,2.997078,NaT,NaT,False,
1507,2002,,Kansas,KS,SPP,South,,,NaT,,...,9.1,1.19,0.42,99.369212,0.632004,0.632004,NaT,NaT,False,
1531,2006,,North Dakota,ND,MRO,West North Central,,,NaT,,...,3.9,0.27,0.1,97.599649,2.401765,2.401765,NaT,NaT,False,
1534,2000,,Alaska,AK,ASCC,,,,NaT,,...,0.4,0.05,0.02,85.761154,14.238846,2.901182,NaT,NaT,False,


In [51]:
data['IS.HURRICANE'].sum()

72

In [52]:
data['DURATION_OUTAGE'] = (data['OUTAGE.RESTORATION.DATE'] - data['OUTAGE.START.DATE']).astype(int)

In [53]:
features = ['POSTAL.CODE', 'ANOMALY.LEVEL', 'DURATION_OUTAGE', 'MONTH.START', 'CAUSE.CATEGORY']

In [54]:
data['ANOMALY.LEVEL'] = data['ANOMALY.LEVEL'].fillna(0)

In [55]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.compose import ColumnTransformer



data_class = data[features].dropna()

X = data_class[['POSTAL.CODE', 'ANOMALY.LEVEL', 'DURATION_OUTAGE', 'MONTH.START']]
y = data_class['CAUSE.CATEGORY']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
col_trans = ColumnTransformer([
    ('one-hot', OneHotEncoder(handle_unknown='ignore'), ['POSTAL.CODE'])],
    remainder = 'passthrough')
pl = Pipeline([
    ('col_trans', col_trans),
    ('forest', RandomForestClassifier())
])

pl.fit(X_train, y_train)
prediction = pl.predict(X_test)


In [56]:
accuracy_score(y_test, prediction)

0.6918032786885245

### Baseline Model

In [50]:
# Load the dataset
# Remove rows with missing target variable (OUTAGE.DURATION)
# data = data.dropna(subset=['OUTAGE.DURATION'])

col_trans = ColumnTransformer([
    ('one-hot', OneHotEncoder(handle_unknown='ignore'), ['POSTAL.CODE'])
    ], remainder='passthrough')

pl = Pipeline([
    ('col-trans', col_trans),
    ('lin-reg', LinearRegression())
])


# Select relevant features and target variable
selected_features_and_target = data[['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE','POSTAL.CODE','OUTAGE.DURATION']].dropna()
X = selected_features_and_target.drop(columns=['OUTAGE.DURATION'])
y = selected_features_and_target['OUTAGE.DURATION']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# Model Training without Feature Engineering
pl.fit(X_train, y_train)

# Predictions on the test set
y_pred = pl.predict(X_test)

In [44]:
pl.score(X_test, y_test)

0.13123736961286414

In [45]:
np.sqrt(mean_squared_error(y_test, y_pred))

3662.341219753386

In [46]:
import pprint as pp

### Final Model

In [48]:
col_trans = ColumnTransformer([
    ('one-hot', OneHotEncoder(handle_unknown='ignore'), ['POSTAL.CODE']),
    ('num-features', make_pipeline(StandardScaler(),PolynomialFeatures()), ['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE']),
    ('time-features', FunctionTransformer(lambda x: pd.DataFrame(x['OUTAGE.START.DATETIME'].apply(lambda y: y.hour))),['OUTAGE.START.DATETIME'] )
    ], remainder='passthrough')

# Model Training with Feature Engineering using Pipelines and GridSearchCV
model = make_pipeline(
    col_trans,
    RandomForestRegressor()
)

param_grid = {
    'columntransformer__num-features__polynomialfeatures__degree': [2,3,4,5]
}
selected_features_and_target = data[['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE','POSTAL.CODE','OUTAGE.START.DATETIME','OUTAGE.DURATION']].dropna()
X = selected_features_and_target.drop(columns=['OUTAGE.DURATION'])
y = selected_features_and_target['OUTAGE.DURATION']

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score (Cross-Validation):", np.sqrt(-grid_search.best_score_))

Best Parameters: {'columntransformer__num-features__polynomialfeatures__degree': 2}
Best Score (Cross-Validation): 5917.572035673535


In [49]:
col_trans = ColumnTransformer([
    ('one-hot', OneHotEncoder(handle_unknown='ignore'), ['POSTAL.CODE']),
    ('num-features', make_pipeline(StandardScaler(),PolynomialFeatures(degree=2)), ['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE']),
    ('time-features', FunctionTransformer(lambda x: pd.DataFrame(x['OUTAGE.START.DATETIME'].apply(lambda y: y.hour))),['OUTAGE.START.DATETIME'] )
    ], remainder='passthrough')



# Model Training with Feature Engineering using Pipelines and GridSearchCV
model = make_pipeline(
    col_trans,
    RandomForestRegressor()
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

model.fit(X_train, y_train)
prediction = model.predict(X_test)
(model.score(X_test, y_test))
    



-1609677104.7184784

In [108]:
temp = pd.Series(np.array(sorted(scores))) 
temp = temp[temp > 0]
temp.mean()

0.25223583174084446

### Fairness Analysis

In [None]:
# TODO