In [1]:
# Energy/meter type handled by this notebook; it is spliced into the input and output CSV file names.
energy_type = 'hotWater'

In [2]:
# work with data
import pandas as pd
import numpy as np

# ML models
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, RidgeCV
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_log_error, r2_score

# Seed constant for reproducibility.
# NOTE(review): no code visible in this notebook passes it to an estimator or splitter — confirm it is still needed.
random_state = 42

# OS path helpers and inline plotting for the notebook
import os
%matplotlib inline

In [3]:
# Load the training table for the selected energy type from the 'tables' directory.
train_data = pd.read_csv(
    os.path.join('tables', 'train_data_' + energy_type + '.csv')
)
train_data

Unnamed: 0,meter_reading,month,day,hour,site_id,primary_use,square_feet,year_built,floor_count
0,0.000,1,1,0,1,0,5374,1966.0,4.0
1,10.000,1,1,1,1,0,5374,1966.0,4.0
2,10.000,1,1,2,1,0,5374,1966.0,4.0
3,10.000,1,1,3,1,0,5374,1966.0,4.0
4,0.000,1,1,4,1,0,5374,1966.0,4.0
...,...,...,...,...,...,...,...,...,...
1217138,133.282,12,31,19,11,0,81390,1966.0,4.0
1217139,131.019,12,31,20,11,0,81390,1966.0,4.0
1217140,122.936,12,31,21,11,0,81390,1966.0,4.0
1217141,125.765,12,31,22,11,0,81390,1966.0,4.0


In [4]:
# Load the test table (rows to predict) for the selected energy type from the 'tables' directory.
test_data = pd.read_csv(
    os.path.join('tables', 'test_data_' + energy_type + '.csv')
)
test_data

Unnamed: 0,row_id,month,day,hour,site_id,primary_use,square_feet,year_built,floor_count
0,2260082,1,1,0,1,0,5374,1966.0,4.0
1,2260145,1,1,1,1,0,5374,1966.0,4.0
2,2260208,1,1,2,1,0,5374,1966.0,4.0
3,2260271,1,1,3,1,0,5374,1966.0,4.0
4,2260334,1,1,4,1,0,5374,1966.0,4.0
...,...,...,...,...,...,...,...,...,...
2540395,41692715,11,14,10,15,0,171008,1930.0,4.0
2540396,41692965,11,14,11,15,0,171008,1930.0,4.0
2540397,41696866,5,9,5,15,0,171008,1930.0,4.0
2540398,41697116,5,9,6,15,0,171008,1930.0,4.0


## ML Models

### Preprocessing

In [5]:
def make_pipeline(model):
    """Wrap `model` in a preprocessing + regression pipeline.

    Numeric columns ('square_feet', 'year_built') are median-imputed and
    standardized. Categorical columns ('month', 'day', 'hour', 'site_id',
    'primary_use') get missing values filled with 0 and are one-hot encoded;
    categories unseen during fit are ignored at predict time.

    Columns not listed here are not routed to any transformer (with
    ColumnTransformer's default `remainder` setting they are dropped).

    Parameters
    ----------
    model : estimator
        Regressor used as the final 'regressor' step.

    Returns
    -------
    Pipeline
        Unfitted pipeline with steps 'preprocessor' and 'regressor'.
    """
    num_columns = ['square_feet', 'year_built']
    cat_columns = ['month', 'day', 'hour', 'site_id', 'primary_use']

    # Fill gaps with the median, then bring numeric columns to a common scale.
    num_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])

    # Fill gaps with the constant 0, then expand each category into indicator columns.
    cat_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

    column_router = ColumnTransformer([
        ('num', num_steps, num_columns),
        ('cat', cat_steps, cat_columns),
    ])

    return Pipeline([
        ('preprocessor', column_router),
        ('regressor', model),
    ])


def manual_split(X, y, train_size=0.8):
    """Split X and y sequentially (no shuffling) into train and validation parts.

    The first `train_size` fraction of rows (rounded down) goes to the
    training part and the remainder to the validation part.

    Parameters
    ----------
    X : array-like with a `.shape` attribute, sliceable by row
        Feature rows.
    y : sliceable sequence
        Targets aligned with the rows of X.
    train_size : float, default 0.8
        Fraction of rows assigned to the training part.

    Returns
    -------
    tuple
        (X_train, X_valid, y_train, y_valid)
    """
    # Compute the cut position once so all four slices are guaranteed to agree.
    split_at = int(train_size * X.shape[0])
    X_train, X_valid = X[:split_at], X[split_at:]
    # The validation target must come from the `y` argument; it was previously
    # sliced from a global named `target`, which is unrelated to this call.
    y_train, y_valid = y[:split_at], y[split_at:]
    return X_train, X_valid, y_train, y_valid


def run_grid_search(X, y, model, param_grid):
    """Cross-validate `model` inside the preprocessing pipeline over `param_grid`.

    Runs a 5-fold grid search scored by R^2 on all available cores, prints
    the best CV score and the best parameter combination, and returns the
    fitted search object.

    Parameters
    ----------
    X, y : training features and target.
    model : estimator placed as the pipeline's 'regressor' step.
    param_grid : dict
        Grid keyed by pipeline parameter names (e.g. 'regressor__alpha').

    Returns
    -------
    GridSearchCV
        The fitted grid search.
    """
    search = GridSearchCV(
        estimator=make_pipeline(model),
        param_grid=param_grid,
        scoring='r2',
        cv=5,
        n_jobs=-1,
    )
    search.fit(X, y)

    print("Best parameter (CV score=%0.3f):" % search.best_score_)
    print(search.best_params_)

    return search

def full_procedure(X_train, X_test, y_train, model, param_grid, with_grid_search=False):
    """Fit `model` on the training data and return non-negative test predictions.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_test : features to predict for.
    model : estimator used as the pipeline's regressor.
    param_grid : dict
        Grid for the grid search; only used when `with_grid_search` is True.
    with_grid_search : bool, default False
        If True, tune via `run_grid_search`; otherwise fit a plain pipeline.

    Returns
    -------
    array
        Predictions for X_test with negative values replaced by 0.
    """
    if not with_grid_search:
        fitted = make_pipeline(model=model)
        fitted.fit(X_train, y_train)
    else:
        fitted = run_grid_search(X_train, y_train, model, param_grid)

    predictions = fitted.predict(X_test)
    # Clamp negative predictions to 0 in place.
    below_zero = predictions < 0
    predictions[below_zero] = 0

    return predictions

In [6]:
def FeaturesTargetSplit(data):
    """Separate a frame into its feature columns and the 'meter_reading' target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'meter_reading' column.

    Returns
    -------
    tuple
        (features, target): `data` without 'meter_reading', and the
        'meter_reading' column itself. `data` is left unmodified.
    """
    label_columns = ['meter_reading']
    y = data[label_columns[0]]
    X = data.drop(columns=label_columns)

    return X, y

### Linear Model

In [7]:
# Empty submission table that fixes the two output columns: row_id and meter_reading.
submission = pd.DataFrame(columns=['row_id', 'meter_reading'])

In [8]:
# np.logspace(0, 1, 1) yields the single value 1.0, so the grid search below
# only cross-validates alpha=1.0; widen the grid to actually tune alpha.
param_grid_linear = {'regressor__alpha': np.logspace(0, 1, 1)}

# Fit one Lasso model per primary_use and predict the matching test rows.
# NOTE: test rows whose primary_use never occurs in train_data receive no prediction.
per_use_results = []
for primary_use in train_data['primary_use'].unique():
    # Filter each table once per category instead of re-filtering test_data twice.
    train_subset = train_data[train_data['primary_use'] == primary_use]
    test_subset = test_data[test_data['primary_use'] == primary_use]

    features, target = FeaturesTargetSplit(train_subset)
    y_pred = full_procedure(
        features,
        test_subset.drop(['row_id'], axis=1),
        target,
        Lasso(),
        param_grid_linear,
        with_grid_search=True,
    )

    # y_pred is positional, so pair it with the raw row_id values of the same subset.
    per_use_results.append(pd.DataFrame({
        'row_id': test_subset['row_id'].to_numpy(),
        'meter_reading': y_pred,
    }))
    print(primary_use, ': OK')

# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every
# iteration; concatenate once instead. Rebuilding `submission` from scratch
# also keeps this cell idempotent when it is re-run.
if per_use_results:
    submission = pd.concat(per_use_results, ignore_index=True)

Best parameter (CV score=-0.067):
{'regressor__alpha': 1.0}
0 : OK
Best parameter (CV score=-3.574):
{'regressor__alpha': 1.0}
9 : OK
Best parameter (CV score=-3.372):
{'regressor__alpha': 1.0}
6 : OK
Best parameter (CV score=-0.719):
{'regressor__alpha': 1.0}
4 : OK
Best parameter (CV score=-0.418):
{'regressor__alpha': 1.0}
1 : OK
Best parameter (CV score=-25.511):
{'regressor__alpha': 1.0}
2 : OK




Best parameter (CV score=-108684.825):
{'regressor__alpha': 1.0}
13 : OK
Best parameter (CV score=-2.254):
{'regressor__alpha': 1.0}
3 : OK


In [9]:
# Inspect the assembled submission table.
submission

Unnamed: 0,row_id,meter_reading
0,2260082,164.222585
1,2260145,164.222585
2,2260208,164.222585
3,2260271,164.222585
4,2260334,164.222585
...,...,...
2540395,37198257,123.162248
2540396,41613273,392.066889
2540397,41613561,392.066889
2540398,41613849,37.657821


In [10]:
def TruncFour(number):
    """Truncate `number` toward zero to four decimal places and return a float."""
    # int() drops the fractional part of the scaled value (no rounding).
    shifted = int(number * 10000)
    return shifted / 10000.0
# Truncate every prediction to four decimal places (element-wise via TruncFour).
submission['meter_reading'] = submission['meter_reading'].apply(TruncFour)

In [11]:
# Spot-check the first rows of the submission table.
submission.head()

Unnamed: 0,row_id,meter_reading
0,2260082,164.2225
1,2260145,164.2225
2,2260208,164.2225
3,2260271,164.2225
4,2260334,164.2225


In [12]:
# Write the submission as a comma-separated file named after energy_type, without the index column.
submission.to_csv('./submission_' + energy_type + '.csv', sep=',', index=False)