# Predicting Power Outage Duration

**Name(s)**: Ethan Cao

**Website Link**: (your website link)

## Code

In [50]:
import os

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, FunctionTransformer

pd.options.plotting.backend = 'plotly'

### Framing the Problem

In [51]:
def combine_times(date_col_name, time_col_name, new_col_name, df):
    """Return a copy of ``df`` with a combined date-and-time column added.

    The values of ``time_col_name`` are converted to strings and parsed as
    timedeltas, then added to the values of ``date_col_name``. The sum is
    stored under ``new_col_name``. The input frame is left unmodified.

    Parameters
    ----------
    date_col_name : label of the column holding the date part.
    time_col_name : label of the column holding the time-of-day part.
    new_col_name : label under which the combined values are stored.
    df : pandas.DataFrame containing both source columns.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` carrying the extra ``new_col_name`` column.
    """
    # Parse the time-of-day values as offsets from midnight.
    time_offsets = pd.to_timedelta(df[time_col_name].astype(str))
    combined = df.copy()
    combined[new_col_name] = combined[date_col_name] + time_offsets
    return combined

# Read the outage workbook. Sheet rows 0-4 and 6 are skipped, so row 5 supplies
# the column names (presumably the skipped rows hold notes/units — confirm
# against the workbook). Column 1 becomes the index, and the first of the
# remaining columns is then discarded.
data = pd.read_excel(
    "outage.xlsx",
    skiprows=[0, 1, 2, 3, 4, 6],
    index_col=1,
).iloc[:, 1:]

# Merge each date/time column pair into a single combined column.
data = combine_times(
    date_col_name="OUTAGE.START.DATE",
    time_col_name='OUTAGE.START.TIME',
    new_col_name='OUTAGE.START.DATETIME',
    df=data,
)
data = combine_times(
    date_col_name="OUTAGE.RESTORATION.DATE",
    time_col_name="OUTAGE.RESTORATION.TIME",
    new_col_name="OUTAGE.RESTORATION.DATETIME",
    df=data,
)

In [114]:
# List the distinct values that appear in the CAUSE.CATEGORY column.
data['CAUSE.CATEGORY'].unique()

array(['severe weather', 'intentional attack',
       'system operability disruption', 'equipment failure',
       'public appeal', 'fuel supply emergency', 'islanding'],
      dtype=object)

In [None]:
# Preview the first rows only; displaying the whole frame floods the cell output.
data.head()

### Baseline Model

In [99]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline

from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, FunctionTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier

# Baseline preprocessing: one-hot encode POSTAL.CODE (categories not seen
# during fit are ignored) and pass every other column through unchanged.
col_trans = ColumnTransformer([
    ('one-hot', OneHotEncoder(handle_unknown='ignore'), ['POSTAL.CODE'])
    ], remainder='passthrough')

# Baseline model: the preprocessing above followed by a linear regression.
pl = Pipeline([
    ('col-trans', col_trans),
    ('lin-reg', LinearRegression())
])


# Select relevant features and target variable; rows with a missing value in
# any of these columns (including the target, OUTAGE.DURATION) are dropped.
selected_features_and_target = data[['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE','POSTAL.CODE','OUTAGE.DURATION']].dropna()
X = selected_features_and_target.drop(columns=['OUTAGE.DURATION'])
y = selected_features_and_target['OUTAGE.DURATION']

# Split the data into train and test sets (10% held out). The fixed
# random_state keeps the split, and therefore the reported score and RMSE,
# identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

# Model Training without Feature Engineering
pl.fit(X_train, y_train)

# Predictions on the test set
y_pred = pl.predict(X_test)

In [100]:
# Score of the fitted baseline pipeline on the held-out test split
# (R^2, the default `score` of the final LinearRegression step).
pl.score(X_test, y_test)

0.13688010761893388

In [101]:
# Root-mean-squared error of the baseline predictions on the test split,
# expressed in the same units as OUTAGE.DURATION.
np.sqrt(mean_squared_error(y_test, y_pred))

3672.567230209743

In [102]:
import pprint as pp

### Final Model

In [103]:
# Preprocessing for the final model:
#   - POSTAL.CODE            -> one-hot encoded (categories unseen at fit time are ignored)
#   - five numeric columns   -> standardized, then expanded with polynomial terms
#   - OUTAGE.START.DATETIME  -> hour of the day taken from the timestamp
col_trans = ColumnTransformer([
    ('one-hot', OneHotEncoder(handle_unknown='ignore'), ['POSTAL.CODE']),
    ('num-features', make_pipeline(StandardScaler(),PolynomialFeatures()), ['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE']),
    # `.dt.hour` extracts the hour for the whole column at once instead of
    # calling `.hour` on each timestamp individually.
    ('time-features', FunctionTransformer(lambda x: x['OUTAGE.START.DATETIME'].dt.hour.to_frame()), ['OUTAGE.START.DATETIME'])
    ], remainder='passthrough')

# Model Training with Feature Engineering using Pipelines and GridSearchCV.
# The fixed random_state makes the forest, and so the search result printed
# below, reproducible across kernel restarts.
model = make_pipeline(
    col_trans,
    RandomForestRegressor(random_state=42)
)

# Only the degree of the polynomial expansion is tuned.
param_grid = {
    'columntransformer__num-features__polynomialfeatures__degree': [2,3,4,5]
}

# Rows with a missing value in any selected feature or in the target are dropped.
selected_features_and_target = data[['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE','POSTAL.CODE','OUTAGE.START.DATETIME','OUTAGE.DURATION']].dropna()
X = selected_features_and_target.drop(columns=['OUTAGE.DURATION'])
y = selected_features_and_target['OUTAGE.DURATION']

# NOTE(review): the search is fitted on all of X and y, so rows that are later
# held out for testing also take part in choosing the degree — confirm this is
# acceptable for the reported test score.
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X, y)

# Best parameters and best score. The scorer is negative MSE, so the sign is
# flipped before taking the square root to report an RMSE.
print("Best Parameters:", grid_search.best_params_)
print("Best Score (Cross-Validation):", np.sqrt(-grid_search.best_score_))

Best Parameters: {'columntransformer__num-features__polynomialfeatures__degree': 2}
Best Score (Cross-Validation): 4718.595280345606


In [113]:
# Same preprocessing as the grid-search pipeline, with the polynomial degree
# fixed at 2 (the value the grid search reported as best).
col_trans = ColumnTransformer([
    ('one-hot', OneHotEncoder(handle_unknown='ignore'), ['POSTAL.CODE']),
    ('num-features', make_pipeline(StandardScaler(),PolynomialFeatures(degree=2)), ['DEMAND.LOSS.MW', 'CUSTOMERS.AFFECTED', 'RES.PRICE', 'COM.PRICE', 'IND.PRICE']),
    # `.dt.hour` extracts the hour for the whole column at once instead of
    # calling `.hour` on each timestamp individually.
    ('time-features', FunctionTransformer(lambda x: x['OUTAGE.START.DATETIME'].dt.hour.to_frame()), ['OUTAGE.START.DATETIME'])
    ], remainder='passthrough')

# Final model: feature engineering followed by a random forest. No grid search
# runs in this cell. The fixed random_state values (here and in the split
# below) make the reported test score reproducible across kernel restarts.
model = make_pipeline(
    col_trans,
    RandomForestRegressor(random_state=42)
)

# Hold out 10% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

model.fit(X_train, y_train)
prediction = model.predict(X_test)

# Score of the fitted pipeline on the held-out rows, shown as the cell output.
model.score(X_test, y_test)
    



-0.2507572267035598

In [108]:
# NOTE(review): `scores` is not defined in any cell visible in this notebook,
# so this cell raises NameError on a fresh kernel — restore the cell that
# builds it.
# Sort the scores, keep only the strictly positive ones, and average those.
# NOTE(review): discarding the non-positive scores before averaging biases the
# mean upward — confirm this is intended before reporting the number.
temp = pd.Series(np.array(sorted(scores))) 
temp = temp[temp > 0]
temp.mean()

0.25223583174084446

### Fairness Analysis

In [None]:
# TODO