In [1]:
# Meter type this run handles; it is spliced into the train/test CSV file
# names that are read below and into the submission file name written at the end.
energy_type = 'steam'

In [2]:
# work with data
import pandas as pd
import numpy as np

# ML models
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, RidgeCV
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_log_error, r2_score

# Seed constant. NOTE(review): no cell visible in this notebook passes it to
# an estimator or splitter — confirm whether it is still needed.
random_state = 42

# technical issues
import os
%matplotlib inline

In [3]:
# Training table for the selected energy type, read from the 'tables' directory.
train_data = pd.read_csv(os.path.join('tables', 'train_data_{}.csv'.format(energy_type)))
train_data

Unnamed: 0,meter_reading,month,day,hour,site_id,primary_use,square_feet,year_built,floor_count
0,0.00,1,1,0,6,0,13512,1966.0,4.0
1,0.00,1,1,1,6,0,13512,1966.0,4.0
2,0.00,1,1,2,6,0,13512,1966.0,4.0
3,0.00,1,1,3,6,0,13512,1966.0,4.0
4,0.00,1,1,4,6,0,13512,1966.0,4.0
...,...,...,...,...,...,...,...,...,...
2209675,1561.87,12,31,18,6,1,143110,1976.0,1.0
2209676,1552.10,12,31,19,6,1,143110,1976.0,1.0
2209677,1574.81,12,31,20,6,1,143110,1976.0,1.0
2209678,1531.23,12,31,21,6,1,143110,1976.0,1.0


In [4]:
# Test table for the selected energy type, read from the 'tables' directory.
test_data = pd.read_csv(os.path.join('tables', 'test_data_{}.csv'.format(energy_type)))
test_data

Unnamed: 0,row_id,month,day,hour,site_id,primary_use,square_feet,year_built,floor_count
0,16340505,1,1,1,6,0,13512,1966.0,4.0
1,16340663,1,1,2,6,0,13512,1966.0,4.0
2,16340665,1,1,3,6,0,13512,1966.0,4.0
3,16340745,1,1,4,6,0,13512,1966.0,4.0
4,16340903,1,1,5,6,0,13512,1966.0,4.0
...,...,...,...,...,...,...,...,...,...
5676475,41692933,11,14,10,15,9,99541,1993.0,1.0
5676476,41693184,11,14,11,15,9,99541,1993.0,1.0
5676477,41697093,5,9,5,15,9,99541,1993.0,1.0
5676478,41697343,5,9,6,15,9,99541,1993.0,1.0


## ML Models

### Preprocessing

In [5]:
def make_pipeline(model):
    """Wrap `model` in a pipeline that preprocesses the input columns first.

    Numeric columns ('square_feet', 'year_built') are median-imputed and then
    standardized. Categorical columns ('month', 'day', 'hour', 'site_id',
    'primary_use') have missing values filled with the constant 0 and are then
    one-hot encoded with ``handle_unknown='ignore'``. Columns that appear in
    neither list are not handed to any transformer here.

    Parameters
    ----------
    model : estimator
        Final estimator, installed as the 'regressor' step (so its
        hyper-parameters are addressed as ``regressor__<name>``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with the steps 'preprocessor' and 'regressor'.
    """
    scaled_numeric = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])
    encoded_categorical = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    # (name, transformer, columns) triples consumed by the ColumnTransformer.
    column_steps = [
        ('num', scaled_numeric, ['square_feet', 'year_built']),
        ('cat', encoded_categorical, ['month', 'day', 'hour', 'site_id', 'primary_use']),
    ]

    return Pipeline(steps=[
        ('preprocessor', ColumnTransformer(transformers=column_steps)),
        ('regressor', model),
    ])


def manual_split(X, y, train_size=0.8):
    """Split X and y into a leading train part and a trailing validation part.

    No shuffling is done: the first ``int(train_size * X.shape[0])`` rows form
    the training part and the remaining rows form the validation part.

    Parameters
    ----------
    X : sliceable with a ``shape`` attribute (e.g. DataFrame or ndarray)
        Feature rows.
    y : sliceable
        Targets aligned row-for-row with ``X``.
    train_size : float, default 0.8
        Fraction of rows assigned to the training part.

    Returns
    -------
    tuple
        ``(X_train, X_valid, y_train, y_valid)``.
    """
    split_at = int(train_size * X.shape[0])
    X_train, X_valid = X[:split_at], X[split_at:]
    # Slice the `y` argument for both halves. The previous version took the
    # validation half from the notebook-global `target`, which returned the
    # wrong data (or raised NameError when no such global existed).
    y_train, y_valid = y[:split_at], y[split_at:]
    return X_train, X_valid, y_train, y_valid


def run_grid_search(X, y, model, param_grid):
    """Grid-search `param_grid` for `model` inside the preprocessing pipeline.

    The model is wrapped with ``make_pipeline`` and tuned with a 5-fold
    ``GridSearchCV`` scored by R^2, using all available cores. The best CV
    score and the best parameters are printed.

    Parameters
    ----------
    X, y
        Training features and targets passed to ``GridSearchCV.fit``.
    model : estimator
        Regressor placed at the end of the pipeline.
    param_grid : dict
        Grid over pipeline parameters (e.g. ``regressor__alpha``).

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search = GridSearchCV(
        estimator=make_pipeline(model),
        param_grid=param_grid,
        scoring='r2',
        cv=5,
        n_jobs=-1,
    )
    search.fit(X, y)

    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)
    return search

def full_procedure(X_train, X_test, y_train, model, param_grid, with_grid_search=False):
    """Fit `model` on the training data and return predictions for `X_test`, floored at 0.

    Parameters
    ----------
    X_train, y_train
        Training features and targets.
    X_test
        Features to predict for.
    model : estimator
        Regressor placed at the end of the preprocessing pipeline.
    param_grid : dict
        Grid handed to ``run_grid_search``; only used when
        ``with_grid_search`` is true.
    with_grid_search : bool, default False
        If true, tune with ``run_grid_search``; otherwise fit the plain
        pipeline from ``make_pipeline`` once.

    Returns
    -------
    Predictions for ``X_test`` with every negative value replaced by 0.
    """
    if with_grid_search:
        fitted = run_grid_search(X_train, y_train, model, param_grid)
    else:
        fitted = make_pipeline(model=model)
        fitted.fit(X_train, y_train)

    y_pred = fitted.predict(X_test)
    # Negative predictions are overwritten with 0, in place.
    y_pred[y_pred < 0] = 0
    return y_pred

In [6]:
def FeaturesTargetSplit(data):
    """Separate the 'meter_reading' target column from the remaining columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains a 'meter_reading' column.

    Returns
    -------
    tuple
        ``(features, target)``: ``features`` is ``data`` without the
        'meter_reading' column and ``target`` is that column.
    """
    label_column = 'meter_reading'
    return data.drop(columns=[label_column]), data[label_column]

### Linear Model

In [7]:
# Empty table with the submission columns: the test row identifier and the
# predicted meter reading for that row.
submission = pd.DataFrame(columns=['row_id', 'meter_reading'])

In [8]:
# np.logspace(0, 1, 1) yields the single value 1.0, so the grid search below
# only cross-validates alpha=1.0 instead of comparing several candidates.
param_grid_linear = {'regressor__alpha': np.logspace(0, 1, 1)}

# Fit one Lasso model per primary_use value seen in the training data and
# predict the test rows that share that value. Each group's predictions are
# collected in a list and concatenated once after the loop: DataFrame.append
# was removed in pandas 2.0 and copies the whole frame on every call.
# NOTE(review): test rows whose primary_use never occurs in train_data get no
# prediction here — confirm that cannot happen for this data set.
group_results = []
for primary_use in train_data['primary_use'].unique():
    train_subset = train_data[train_data['primary_use'] == primary_use]
    test_subset = test_data[test_data['primary_use'] == primary_use]

    features, target = FeaturesTargetSplit(train_subset)
    y_pred = full_procedure(features, test_subset.drop(['row_id'], axis=1), target,
                            Lasso(), param_grid_linear, with_grid_search=True)

    # .values drops the test index so row_id and y_pred pair up positionally.
    group_results.append(pd.DataFrame(
        {'row_id': test_subset['row_id'].values, 'meter_reading': y_pred},
        columns=['row_id', 'meter_reading']))
    print(primary_use, ': OK')

# Rebuilding `submission` from scratch keeps the cell idempotent: running it
# again no longer stacks a second copy of every prediction onto the table.
if group_results:
    submission = pd.concat(group_results, ignore_index=True)
else:
    submission = pd.DataFrame(columns=['row_id', 'meter_reading'])

Best parameter (CV score=-0.177):
{'regressor__alpha': 1.0}
0 : OK
Best parameter (CV score=-0.073):
{'regressor__alpha': 1.0}
6 : OK
Best parameter (CV score=0.189):
{'regressor__alpha': 1.0}
4 : OK
Best parameter (CV score=-0.237):
{'regressor__alpha': 1.0}
1 : OK
Best parameter (CV score=-2.792):
{'regressor__alpha': 1.0}
9 : OK
Best parameter (CV score=-8.119):
{'regressor__alpha': 1.0}
3 : OK
Best parameter (CV score=-35.943):
{'regressor__alpha': 1.0}
2 : OK
Best parameter (CV score=-2.594):
{'regressor__alpha': 1.0}
8 : OK
Best parameter (CV score=-0.475):
{'regressor__alpha': 1.0}
12 : OK
Best parameter (CV score=-0.012):
{'regressor__alpha': 1.0}
5 : OK
Best parameter (CV score=-2.523):
{'regressor__alpha': 1.0}
15 : OK




Best parameter (CV score=-351.950):
{'regressor__alpha': 1.0}
7 : OK
Best parameter (CV score=-0.699):
{'regressor__alpha': 1.0}
13 : OK
Best parameter (CV score=0.255):
{'regressor__alpha': 1.0}
14 : OK


In [9]:
# Inspect the combined predictions gathered from all primary_use groups.
submission

Unnamed: 0,row_id,meter_reading
0,16340505,308.043488
1,16340663,308.043488
2,16340665,308.043488
3,16340745,308.043488
4,16340903,308.043488
...,...,...
5676475,41692908,789.467114
5676476,41693159,789.467114
5676477,41697067,474.575463
5676478,41697317,554.529908


In [10]:
def TruncFour(number):
    """Cut `number` off after four decimal places.

    The value is scaled by 10000, its fractional part is discarded with
    ``int`` (so it moves toward zero rather than being rounded), and the
    result is scaled back down to a float.
    """
    scale = 10000
    whole_units = int(number * scale)
    return whole_units / float(scale)
# Truncate every prediction to four decimal places before export.
# NOTE(review): this overwrites the column in place, so the untruncated
# predictions are no longer available once this cell has run.
submission['meter_reading'] = submission['meter_reading'].apply(TruncFour)

In [11]:
# Spot-check the first rows after truncation.
submission.head()

Unnamed: 0,row_id,meter_reading
0,16340505,308.0434
1,16340663,308.0434
2,16340665,308.0434
3,16340745,308.0434
4,16340903,308.0434


In [12]:
# Write the submission for this energy type to the working directory,
# comma-separated and without the DataFrame index column.
submission.to_csv('./submission_{}.csv'.format(energy_type), sep=',', index=False)