In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.preprocessing import OneHotEncoder, StandardScaler, FunctionTransformer, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV, RidgeCV
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_log_error, r2_score


random_state = 1

In [2]:
%%time
building_metadata = pd.read_csv('data' + os.sep + 'building_metadata.csv')

Wall time: 19 ms


In [3]:
%%time
train_data = pd.read_csv('data' + os.sep + 'train.csv')

Wall time: 34 s


In [None]:
# %%time
# test = pd.read_csv('data' + os.sep + 'test.csv')

In [None]:
# weather_train = pd.read_csv('data' + os.sep + 'weather_train.csv')

In [None]:
# weather_test = pd.read_csv('data' + os.sep + 'weather_test.csv')

In [None]:
# rename timestamps columns
# train.columns = ['building_id', 'meter', 'ti   mestamp_meter', 'meter_reading']
# test.columns = ['row_id', 'building_id', 'meter', 'timestamp_meter']

# weather_train.columns = ['site_id', 'timestamp_weather', 'air_temperature', 'cloud_coverage', 'dew_temperature', \
#                         'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
# weather_test.columns = ['site_id', 'timestamp_weather', 'air_temperature', 'cloud_coverage', 'dew_temperature', \
#                         'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']

In [None]:
# test_data = building_metadata.copy()
# test_data = test_data.join(test.set_index('building_id'), on='building_id', how='inner')
# test_data.head()

In [None]:
# test_data.shape

In [4]:
%%time
train_data = train_data.join(building_metadata.set_index('building_id'), on='building_id', how='inner')
train_data.head()

Wall time: 4.9 s


Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count
0,0,0,2016-01-01 00:00:00,0.0,0,Education,7432,2008.0,
2301,0,0,2016-01-01 01:00:00,0.0,0,Education,7432,2008.0,
4594,0,0,2016-01-01 02:00:00,0.0,0,Education,7432,2008.0,
6893,0,0,2016-01-01 03:00:00,0.0,0,Education,7432,2008.0,
9189,0,0,2016-01-01 04:00:00,0.0,0,Education,7432,2008.0,


In [5]:
train_data.shape

(20216100, 9)

In [6]:
#train_data_electricity = train_data[train_data['meter'] == 0]
#train_data_chilledWater = train_data[train_data['meter'] == 1]
train_data_steam = train_data[train_data['meter'] == 2]
#train_data_hotWater = train_data[train_data['meter'] == 3]

In [7]:
def ConvertDate(train_data):
    # Convert date to datetime format
    train_data['timestamp'] = pd.to_datetime(train_data['timestamp'])
    
    # Extract and store year, month, day, hour
    train_data['year'] = train_data.loc[:,'timestamp'].dt.year
    train_data['month'] = train_data.loc[:,'timestamp'].dt.month
    train_data['day'] = train_data.loc[:,'timestamp'].dt.day
    train_data['hour'] = train_data.loc[:,'timestamp'].dt.hour
    
    train_data.drop(['timestamp'], axis=1, inplace=True)

In [8]:
#ConvertDate(train_data_electricity)
#ConvertDate(train_data_chilledWater)
ConvertDate(train_data_steam)
#ConvertDate(train_data_hotWater)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: ht

In [9]:
def CreateMeanMeterReading(train_data, buildings_number):
    new_columns = list(train_data.columns)
    new_columns.append('meter_reading_mean')

    train = pd.DataFrame(columns=new_columns)

    building_ids = train_data['building_id'].unique()[1:buildings_number]
    train_data_cutted = train_data[train_data['building_id'] == train_data['building_id'].unique()[0]]
    for building_id in building_ids:
        train_data_cutted = train_data_cutted.append(train_data[train_data['building_id'] == building_id], ignore_index=True)
    
    for building_id in train_data_cutted['building_id'].unique():
        buildingID = train_data_cutted[train_data_cutted['building_id'] == building_id]
        for month_id in buildingID['month'].unique():
            buildingIDmonthID = buildingID[buildingID['month'] == month_id]
            for day_id in buildingIDmonthID['day'].unique():
                buildingIDmonthIDdayID = buildingIDmonthID[buildingIDmonthID['day'] == day_id]
                train = train.append(buildingIDmonthIDdayID[0:1], ignore_index=True)
                train.set_value(train.shape[0]-1, 'meter_reading_mean', buildingIDmonthIDdayID['meter_reading'].mean())
    
    train = train.drop(['hour', 'year', 'building_id', 'floor_count', 'meter_reading', 'meter'], axis=1)
    train['primary_use'] = LabelEncoder().fit_transform(train['primary_use'])
    train = train.apply(pd.to_numeric)
    return train

In [10]:
#train_electricity = CreateMeanMeterReading(train_data_electricity, 10)
#train_chilledWater = CreateMeanMeterReading(train_data_chilledWater, 10)
train_steam = CreateMeanMeterReading(train_data_steam, 10)
#train_hotWater = CreateMeanMeterReading(train_data_hotWater, 10)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [11]:
target = train_steam['meter_reading_mean']
train_steam.drop(['meter_reading_mean'], axis=1, inplace=True)

In [12]:
train_steam['year_built'].fillna('2016', inplace=True)

In [13]:
def make_pipeline(model):
    numeric_features = ['day', 'month', 'square_feet', 'year_built']
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_features = ['site_id', 'primary_use']
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])

    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', model)])
    return pipe


def manual_split(X, y, train_size=0.8):
    X_train, X_valid = X[:int(train_size*X.shape[0])], X[int(train_size*X.shape[0]):]
    y_train, y_valid = y[:int(train_size*X.shape[0])], y[int(train_size*X.shape[0]):]
    return X_train, X_valid, y_train, y_valid


def run_grid_search(X, y, model, param_grid):
    pipe = make_pipeline(model)
    grid_search = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, scoring='r2')
    grid_search.fit(X, y)
    print("Best parameter (CV score=%0.3f):" % grid_search.best_score_)
    print(grid_search.best_params_)
    
    return grid_search

def full_procedure(X_train, X_test, y_train, model, param_grid, with_grid_search=False):
    if with_grid_search:
        grid_search = run_grid_search(X_train, y_train, model, param_grid)
        y_pred = grid_search.predict(X_test)
        y_pred[y_pred<0] = 0
    else:
        pipe = make_pipeline(model=model)
        pipe.fit(X_train, y_train)
        y_pred = pipe.predict(X_test)
        y_pred[y_pred<0] = 0
        
    return y_pred

In [22]:
X_train, X_test, y_train, y_test = manual_split(train_steam, target)
param_grid_linear = {'regressor__alpha': np.logspace(5, 8, 7)}

y_pred = full_procedure(X_train, X_test, y_train, Lasso(), param_grid_linear)
print(r2_score(y_pred, y_test))

y_pred = full_procedure(X_train, X_test, y_train, Lasso(), param_grid_linear, with_grid_search=True)
print(r2_score(y_pred, y_test))

-1.3505371208604529
Best parameter (CV score=-13.392):
{'regressor__alpha': 100000.0}
-2.269903964240811e+31




In [18]:
# gridsearching parameters
param_grid_linear = {'regressor__alpha': np.logspace(5, 8, 7)}
param_grid_forest = {
    'regressor__bootstrap': [True],
    'regressor__max_depth': [80, 90, 100, 110],
    'regressor__max_features': [2, 3],
    'regressor__min_samples_leaf': [3, 4, 5],
    'regressor__min_samples_split': [8, 10, 12],
    'regressor__n_estimators': [100, 200, 300, 1000]
}
models = [Lasso(random_state=random_state), Ridge(random_state=random_state), 
          RandomForestRegressor(random_state=random_state)]
grid_search_results = []
for i, model in enumerate(models):
    if(i<2):
        param_grid = param_grid_linear
    else:
        param_grid = param_grid_forest
    grid_search = run_grid_search(X_train, y_train, model, param_grid=param_grid)
    grid_search_results.append(grid_search)



Best parameter (CV score=-13.392):
{'regressor__alpha': 100000.0}




Best parameter (CV score=-13.393):
{'regressor__alpha': 100000000.0}
Best parameter (CV score=-47.763):
{'regressor__bootstrap': True, 'regressor__max_depth': 80, 'regressor__max_features': 3, 'regressor__min_samples_leaf': 5, 'regressor__min_samples_split': 12, 'regressor__n_estimators': 100}




At this point it's the best that we could have done for LinearRegression(). But for 3 other models there is still some place for improvement. Let's tune hyperparameters for them.

In [19]:
X_train, X_valid, y_train, y_valid = manual_split(train_steam, target, train_size=0.8)

#dummy models run
models = [LinearRegression(), Lasso(random_state=random_state), Ridge(random_state=random_state), 
          RandomForestRegressor(random_state=random_state, n_estimators=100, n_jobs=-1)]
for model in models:
    model.fit(X_train, y_train)
    print(model, "score: %.3f" % r2_score(model.predict(X_valid), y_valid))

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False) score: -8.846
Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=1,
      selection='cyclic', tol=0.0001, warm_start=False) score: -8.887
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=1, solver='auto', tol=0.001) score: -8.851
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                      oob_score=False, random_state=1, verbose=0,
                      warm_start=False) score: -0.254


In [20]:
#applying feature transformations
models = [Lasso(random_state=random_state), Ridge(random_state=random_state), 
          RandomForestRegressor(random_state=random_state)]

for model in models:
    pipe = make_pipeline(model=model)
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_valid)
    y_pred[y_pred<0] = 0
    print(model, "score: %.3f" % r2_score(y_pred, y_valid))

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=1,
      selection='cyclic', tol=0.0001, warm_start=False) score: -1.351
Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=1, solver='auto', tol=0.001) score: -1.331
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=1, verbose=0,
                      warm_start=False) score: -0.245


