# 05.02 - Modeling Setup - Custom Scoring

 + Imports & Setup
 + Load Test Data
 + Custom Scoring Metric

## Imports & Setup

In [1]:
import pathlib
from datetime import datetime
import math
import sys

import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt
plt.style.use('grayscale')
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates

from sklearn.metrics import mean_absolute_error

sys.path.append("..")
from src.models.models import SetTempAsPower, SK_Prophet

%matplotlib inline

# Paths are derived from the current working directory: the project root is
# taken to be its parent, and the cleaned data lives under data/05-clean.
PROJECT_DIR = pathlib.Path.cwd().parent.resolve()
CLEAN_DATA_DIR = PROJECT_DIR.joinpath('data', '05-clean')

# Widen pandas' display limits so wide frames are not truncated in output.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

## Load Test Data

In [2]:
# Load the cleaned data, keep 1994-1995, and reduce it to one row per day
# holding the daily maximum of every column.
df = (
    pd.read_csv(CLEAN_DATA_DIR / 'clean-cut.csv', parse_dates=True, index_col=0)
    .loc['1994':'1995']
    .resample('D')
    .max()
    # Just select a reasonable subset of data to test the model wrappers
    .loc[:, ['temp', 'dew_point_temp', 'week_of_year', 'daily_peak']]
    .rename(columns={'temp': 'temp_max'})
)

# Split off the target column; pop() removes it from df, so X (the same
# object as df) holds only the feature columns afterwards.
y = df.pop('daily_peak')
X = df

X.head()

Unnamed: 0,temp_max,dew_point_temp,week_of_year
1994-01-01,2.8,1.1,52.0
1994-01-02,1.7,0.5,52.0
1994-01-03,-10.3,-12.6,1.0
1994-01-04,-7.4,-11.5,1.0
1994-01-05,-7.2,-10.7,1.0


In [3]:
# Show the first rows of the target series (daily_peak) as a sanity check.
y.head()

1994-01-01    16892.0
1994-01-02    18947.0
1994-01-03    21923.0
1994-01-04    21457.0
1994-01-05    22082.0
Freq: D, Name: daily_peak, dtype: float64

In [4]:
# Show the last rows of the target series to confirm the range ends in 1995.
y.tail()

1995-12-27    19260.0
1995-12-28    19014.0
1995-12-29    18635.0
1995-12-30    18132.0
1995-12-31    17333.0
Freq: D, Name: daily_peak, dtype: float64

## Custom Scoring Metric

In [5]:
def bound_precision(y_actual: pd.Series, y_predicted: pd.Series, n_to_check=5):
    """
    Score how well a prediction identifies the positions of the highest actual values.

    Both inputs are compared by position, not by index label: each is given a
    fresh 0..n-1 index, sorted from high to low and cut off at n_to_check.
    A "hit" is a position that appears in both top-n_to_check sets.

    Parameters
    ----------
    y_actual : pd.Series or 1-D array-like
        Observed values.
    y_predicted : pd.Series or 1-D array-like
        Predicted values, aligned position-by-position with y_actual.
    n_to_check : int, default 5
        How many of the highest values to compare. Must be at least 1.

    Returns
    -------
    float
        Number of hits divided by n_to_check, i.e. a value between 0.0 and 1.0.
        If the inputs are shorter than n_to_check the denominator is still
        n_to_check, so the score cannot reach 1.0 in that case.

    Raises
    ------
    ValueError
        If n_to_check is smaller than 1, or if the inputs differ in length.

    Notes
    -----
    Equal values at the cut-off are not treated specially; which of them makes
    the top-n_to_check set depends on the ordering produced by sort_values.
    """
    if n_to_check < 1:
        raise ValueError(
            "n_to_check must be a positive integer, got {!r}".format(n_to_check)
        )

    # reset_index(drop=True) returns new objects with a positional index, so
    # the caller's data is never modified and no explicit copy is required.
    # Wrapping in pd.Series first also lets plain lists/ndarrays be passed.
    actual = pd.Series(y_actual).reset_index(drop=True)
    predicted = pd.Series(y_predicted).reset_index(drop=True)

    # The comparison is positional, so differing lengths would be meaningless.
    if len(actual) != len(predicted):
        raise ValueError(
            "y_actual and y_predicted must have the same length, got {} and {}".format(
                len(actual), len(predicted)
            )
        )

    top_actual = set(actual.sort_values(ascending=False).head(n_to_check).index)
    top_predicted = set(predicted.sort_values(ascending=False).head(n_to_check).index)

    return len(top_actual & top_predicted) / n_to_check

# Worked example. The two series carry non-overlapping date indexes;
# bound_precision compares them by position, so the labels do not matter.
# Top-3 actual positions are {4, 5, 6}; the predicted top 3 contains position 0,
# position 4 and one of the tied positions 5/6, giving 2 hits out of 3.
y_act = pd.Series(
    [11, 12, 13, 14, 15, 16, 17, 11, 12],
    index=pd.date_range(start='2019-01-01', periods=9),
)
y_pred = pd.Series(
    [18, 11, 13, 14, 16, 15, 15, 14, 11],
    index=pd.date_range(start='2019-03-20', periods=9),
)
b_prec = bound_precision(y_act, y_pred, n_to_check=3)
b_prec

0.6666666666666666

In [6]:
# Work on deep copies of the loaded features/target so this cell does not
# share data with X and y.
X_m = X.copy(deep=True)
y_m = y.copy(deep=True)

# Chronological hold-out: fit on 1994, evaluate on 1995.
# Rows are selected with .loc and a partial date string. Plain `frame['1994']`
# on a DataFrame is a column lookup in pandas >= 2.0 (row selection through
# [] with a date string was deprecated and then removed), so .loc is required.
X_train, y_train = X_m.loc['1994'], y_m.loc['1994']
X_test, y_test = X_m.loc['1995'], y_m.loc['1995']

# Baseline wrapper configured with the temp_max column.
set_temp_as_power = SetTempAsPower(col='temp_max')
set_temp_as_power.fit(X_train, y_train)
preds = set_temp_as_power.predict(X_test)

# Report the predictions, the mean absolute error, and the custom
# bound_precision score (default n_to_check) on the 1995 hold-out.
print(preds)
print()
print(mean_absolute_error(y_test, preds))
print()
print(bound_precision(y_test, preds))

1995-01-01    17382.451087
1995-01-02    17019.626812
1995-01-03    16446.746377
1995-01-04    15511.041667
1995-01-05    15740.193841
                  ...     
1995-12-27    16103.018116
1995-12-28    16561.322464
1995-12-29    16733.186594
1995-12-30    16885.954710
1995-12-31    17325.163043
Freq: D, Length: 365, dtype: float64

3465.6429720071465

0.0


In [7]:
# Fit the SK_Prophet wrapper on the 1994 split and predict 1995.
# temp_max is passed under `regressors` with an empty tuple
# (presumably meaning "no extra options" -- confirm in src.models.models).
sk_prophet = SK_Prophet(regressors={'temp_max': ()})
sk_prophet.fit(X_train, y_train)
preds = sk_prophet.predict(X_test)

# Predictions, mean absolute error and bound_precision, separated by blank lines.
print(
    preds,
    mean_absolute_error(y_test, preds),
    bound_precision(y_test, preds),
    sep='\n\n',
)

INFO:numexpr.utils:NumExpr defaulting to 4 threads.
INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.

Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.



date
1995-01-01    17080.619191
1995-01-02    19061.688157
1995-01-03    19220.941450
1995-01-04    19549.236353
1995-01-05    19388.603381
                  ...     
1995-12-27    28064.347694
1995-12-28    27862.681812
1995-12-29    27201.035735
1995-12-30    25291.638565
1995-12-31    24654.362881
Name: yhat, Length: 365, dtype: float64

4260.874951529598

0.2
