In [None]:
# Meter type to model; this string is spliced into the input and output CSV file names used in this notebook.
energy_type = 'electricity'

In [None]:
# work with data
import pandas as pd
import numpy as np

# ML models
from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, RidgeCV
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_log_error, r2_score

# Seed kept for reproducibility; NOTE(review): none of the cells visible here pass it to an estimator or splitter.
random_state = 42

# technical issues
import os
%matplotlib inline

In [None]:
# Read the training table for the selected meter type from the local 'tables' directory.
train_data = pd.read_csv(os.path.join('tables', 'train_data_' + energy_type + '.csv'))
train_data

In [None]:
# Read the test table for the selected meter type from the local 'tables' directory.
test_data = pd.read_csv(os.path.join('tables', 'test_data_' + energy_type + '.csv'))
test_data

## ML Models

### Preprocessing

In [None]:
def make_pipeline(model):
    """Wrap `model` in a pipeline that preprocesses the raw feature columns first.

    Numeric columns ('square_feet', 'year_built') are median-imputed and then
    standardised. Categorical columns ('month', 'day', 'hour', 'site_id',
    'primary_use') have missing values filled with the constant 0 and are then
    one-hot encoded; the encoder is configured with handle_unknown='ignore'.

    Parameters
    ----------
    model : estimator
        Object placed as the final 'regressor' step of the pipeline.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps 'preprocessor' and 'regressor'.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    # Each entry: (name, transformer, columns it is applied to).
    column_blocks = [
        ('num', Pipeline(steps=numeric_steps), ['square_feet', 'year_built']),
        ('cat', Pipeline(steps=categorical_steps),
         ['month', 'day', 'hour', 'site_id', 'primary_use']),
    ]
    preprocessor = ColumnTransformer(transformers=column_blocks)

    return Pipeline(steps=[('preprocessor', preprocessor), ('regressor', model)])


def manual_split(X, y, train_size=0.8):
    """Split X and y by position into a leading training part and a trailing validation part.

    No shuffling is performed: the first `train_size` fraction of rows becomes
    the training set and the remaining rows become the validation set.

    Parameters
    ----------
    X : sliceable with a `.shape` attribute (e.g. DataFrame or ndarray)
        Feature rows.
    y : sliceable
        Targets aligned with the rows of X.
    train_size : float, default 0.8
        Fraction of rows assigned to the training part; the cut index is
        int(train_size * X.shape[0]), i.e. truncated toward zero.

    Returns
    -------
    tuple
        (X_train, X_valid, y_train, y_valid)
    """
    # One cut point shared by X and y so the two stay aligned.
    split_at = int(train_size * X.shape[0])
    X_train, X_valid = X[:split_at], X[split_at:]
    # Slice the `y` argument for both halves; the validation half previously
    # read an unrelated global named `target`.
    y_train, y_valid = y[:split_at], y[split_at:]
    return X_train, X_valid, y_train, y_valid


def run_grid_search(X, y, model, param_grid):
    """Grid-search `model` inside the preprocessing pipeline and report the winner.

    The model is wrapped with make_pipeline, then searched with 5-fold
    cross-validation scored by R^2, using all available cores (n_jobs=-1).
    The best CV score and the best parameters are printed.

    Parameters
    ----------
    X, y : training features and targets passed to `fit`.
    model : estimator handed to make_pipeline.
    param_grid : dict
        Grid passed unchanged to GridSearchCV. Because the search runs over
        the pipeline, keys address pipeline steps (e.g. 'regressor__alpha').

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search = GridSearchCV(
        estimator=make_pipeline(model),
        param_grid=param_grid,
        scoring='r2',
        n_jobs=-1,
        cv=5,
    )
    search.fit(X, y)

    score_line = "Best parameter (CV score=%0.3f):" % search.best_score_
    print(score_line)
    print(search.best_params_)

    return search

def full_procedure(X_train, X_test, y_train, model, param_grid, with_grid_search=False):
    """Fit `model` on the training data and return non-negative predictions for X_test.

    Parameters
    ----------
    X_train, y_train : training features and targets.
    X_test : features to predict on.
    model : estimator wrapped by make_pipeline (directly, or via run_grid_search).
    param_grid : dict
        Only used when `with_grid_search` is True; forwarded to run_grid_search.
    with_grid_search : bool, default False
        True  -> fit through run_grid_search and predict with the object it returns.
        False -> fit a single pipeline built by make_pipeline.

    Returns
    -------
    array
        Output of `predict`, with every negative value replaced by 0 in place.
    """
    if with_grid_search:
        fitted = run_grid_search(X_train, y_train, model, param_grid)
    else:
        fitted = make_pipeline(model=model)
        fitted.fit(X_train, y_train)

    y_pred = fitted.predict(X_test)
    # Clamp negative predictions to zero.
    y_pred[y_pred < 0] = 0
    return y_pred

In [None]:
def FeaturesTargetSplit(data):
    """Separate the 'meter_reading' column from the remaining columns.

    Parameters
    ----------
    data : DataFrame containing a 'meter_reading' column.

    Returns
    -------
    tuple
        (features, target): `features` is `data` without 'meter_reading',
        `target` is the 'meter_reading' column. `data` itself is not modified.
    """
    label = 'meter_reading'
    y = data[label]
    X = data.drop(columns=[label])
    return X, y

### Linear Model

In [None]:
# Start from an empty submission table holding the two output columns.
submission = pd.DataFrame(columns=['row_id', 'meter_reading'])

In [None]:
# Grid for the Lasso regularisation strength. np.logspace(0, 1, 1) yields the
# single value 1.0, so the search below cross-validates exactly one candidate.
param_grid_linear = {'regressor__alpha': np.logspace(0, 1, 1)}

# Fit one model per building category and predict the matching test rows.
# NOTE(review): the loop iterates over categories found in train_data only, so
# test rows whose primary_use never occurs in training receive no prediction.
per_use_results = []
for primary_use in train_data['primary_use'].unique():
    train_subset = train_data[train_data['primary_use'] == primary_use]
    test_subset = test_data[test_data['primary_use'] == primary_use]

    if test_subset.empty:
        # Nothing to predict for this category; skip instead of fitting a
        # model and then failing on an empty prediction input.
        print(primary_use, ': no test rows, skipped')
        continue

    features, target = FeaturesTargetSplit(train_subset)
    y_pred = full_procedure(
        features,
        test_subset.drop(['row_id'], axis=1),
        target,
        Lasso(),
        param_grid_linear,
        with_grid_search=True,
    )

    # Pair each prediction with its row_id by position.
    per_use_results.append(pd.DataFrame({
        'row_id': test_subset['row_id'].to_numpy(),
        'meter_reading': y_pred,
    }))
    print(primary_use, ': OK')

# Assemble the submission once, after the loop. DataFrame.append no longer
# exists in pandas >= 2.0, and growing a frame inside a loop is quadratic.
# Rebuilding from the collected pieces also keeps this cell idempotent:
# re-running it does not duplicate rows.
if per_use_results:
    submission = pd.concat(per_use_results, ignore_index=True)
else:
    submission = pd.DataFrame(columns=['row_id', 'meter_reading'])

In [None]:
# Display the assembled submission table.
submission

In [None]:
def TruncFour(number):
    """Truncate `number` toward zero to four decimal places.

    The result is the multiple of 1/10000 (as a float) with the largest
    magnitude that does not exceed |number|. A plain int(number * 10000)
    can miss by one step because the floating-point product may land just
    below (or on) an integer boundary: 0.0003 * 10000 evaluates to
    2.9999999999999996, which would turn 0.0003 into 0.0002. The step count
    is therefore corrected by comparing the candidate results with `number`
    itself, so values already on the four-decimal grid are returned unchanged.

    Parameters
    ----------
    number : int or float

    Returns
    -------
    float
        `number` truncated to four decimals.

    Raises
    ------
    ValueError, OverflowError
        Propagated from int() when `number * 10000` is NaN or infinite.
    """
    steps = int(number * 10000)
    direction = 1 if number >= 0 else -1
    if abs((steps + direction) / 10000.0) <= abs(number):
        # Product was rounded to just below the next integer: one more step
        # still fits within |number|.
        steps += direction
    elif abs(steps / 10000.0) > abs(number):
        # Product was rounded up onto an integer: step back so the result
        # never exceeds |number|.
        steps -= direction
    return steps / 10000.0
# Apply the four-decimal truncation to every predicted reading, overwriting the column.
submission['meter_reading'] = submission['meter_reading'].apply(TruncFour)

In [None]:
# Preview the first rows of the submission table.
submission.head()

In [None]:
# Write the submission for the selected meter type to the working directory, omitting the index column.
submission.to_csv('./submission_' + energy_type + '.csv', sep=',', index=False)