# SETUP

In [None]:
# Install the Kaggle command-line client (quietly) so the later `kaggle` shell calls work.
! pip install -q kaggle

In [None]:
from google.colab import files
# files.upload() returns the uploaded files' contents. The trailing semicolon
# stops the notebook from echoing that return value, which would otherwise
# write the Kaggle API key from kaggle.json into the saved cell output.
files.upload();

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [None]:
# Restrict the credentials file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the 'ashrae-energy-prediction' competition archive with the Kaggle CLI.
# No target directory is given, so the zip is saved to the current working directory.
! kaggle competitions download -c 'ashrae-energy-prediction'

Downloading ashrae-energy-prediction.zip to /content
 97% 367M/379M [00:06<00:00, 41.8MB/s]
100% 379M/379M [00:06<00:00, 63.0MB/s]


In [None]:
!unzip  /content/ashrae-energy-prediction.zip

Archive:  /content/ashrae-energy-prediction.zip
  inflating: building_metadata.csv   
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
  inflating: weather_test.csv        
  inflating: weather_train.csv       


# IMPORTING DATA

In [None]:
import pandas as pd
import numpy as np
# Load the five competition CSVs extracted into /content, in the same order
# as the names on the left-hand side.
train, test, metadata, weather_test, weather_train = map(
    pd.read_csv,
    (
        '/content/train.csv',
        '/content/test.csv',
        '/content/building_metadata.csv',
        '/content/weather_test.csv',
        '/content/weather_train.csv',
    ),
)

# PREPROCESSING

In [None]:
# Attach building metadata (without floor_count / year_built) to every meter
# reading; a left join keeps all rows of `train`.
df1 = pd.merge(
    train,
    metadata.drop(columns=['floor_count','year_built']),
    how='left',
    on='building_id',
)

In [None]:
# Attach the weather observation for each (site_id, timestamp) pair, leaving
# out cloud_coverage and precip_depth_1_hr; a left join keeps all rows of df1.
df2 = pd.merge(
    df1,
    weather_train.drop(columns=['cloud_coverage','precip_depth_1_hr']),
    how='left',
    on=['site_id','timestamp'],
)

Drop every row that has a null value in any of the weather columns.

In [None]:
# For each weather column, collect the index labels of the rows where that
# column is null. x ends up with one Index per column, in the order listed.
x = []
for col in ['air_temperature','dew_temperature','wind_speed','sea_level_pressure','wind_direction']:
  null_mask = df2[col].isna()
  x.append(df2.index[null_mask.values])

In [None]:
def Union(lst1,lst2,lst3,lst4,lst5):
  """Return a list of the distinct elements found across the five iterables.

  Duplicates are collapsed; the order of the returned list is whatever the
  underlying set yields and should not be relied on.
  """
  merged = set()
  for group in (lst1, lst2, lst3, lst4, lst5):
    merged.update(group)
  return list(merged)

In [None]:
# Drop every row that is null in at least one weather column.
# dropna(subset=...) removes the same rows as dropping the union of the
# per-column null index labels (df2 has a unique index straight out of merge),
# without building a Python set of millions of labels first.
df2 = df2.dropna(
    subset=['air_temperature','dew_temperature','wind_speed','sea_level_pressure','wind_direction']
)

In [None]:
# Percentage of null values per column (count * 100 / number of rows).
(100 * df2.isna().sum()).div(df2.shape[0])

building_id           0.0
meter                 0.0
timestamp             0.0
meter_reading         0.0
site_id               0.0
primary_use           0.0
square_feet           0.0
air_temperature       0.0
dew_temperature       0.0
sea_level_pressure    0.0
wind_direction        0.0
wind_speed            0.0
dtype: float64

Now there are no null values

In [None]:
# Renumber the rows 0..n-1 after the null rows were removed.
# reset_index(drop=True) is the pandas idiom for this and does not need an
# explicit np.arange array with one entry per row.
df2.reset_index(drop=True, inplace=True)

In [None]:
# Parse the timestamp strings, derive calendar features from them, then drop
# the raw timestamp together with building_id.
df2['timestamp'] = pd.to_datetime(df2['timestamp'])
df2['month'] = df2.timestamp.dt.month
df2['day'] = df2.timestamp.dt.day
df2['time'] = df2.timestamp.dt.hour  # 'time' holds the hour of the reading
df2.drop(columns=['timestamp','building_id'], inplace=True)

In [None]:
# List the columns that remain after the timestamp features were added and
# timestamp / building_id were dropped.
df2.columns

Index(['meter', 'meter_reading', 'site_id', 'primary_use', 'square_feet',
       'air_temperature', 'dew_temperature', 'sea_level_pressure',
       'wind_direction', 'wind_speed', 'month', 'day', 'time'],
      dtype='object')

In [None]:
# Every column of df2 except the target is a model input.
input_cols = [
    'meter', 'site_id', 'primary_use', 'square_feet',
    'air_temperature', 'dew_temperature', 'sea_level_pressure',
    'wind_direction', 'wind_speed',
    'month', 'day', 'time',
]
# Kept as a one-element list, so df2[target_col] is a single-column DataFrame.
target_col = ['meter_reading']

In [None]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Columns passed to the model as-is. Note that the integer codes 'meter' and
# 'site_id' are treated as plain numeric features here.
numeric_cols = [
    'meter', 'site_id', 'square_feet',
    'air_temperature', 'dew_temperature', 'sea_level_pressure',
    'wind_direction', 'wind_speed',
    'month', 'day', 'time',
]
# Columns that are one-hot encoded before training.
categorical_cols = ['primary_use']

In [None]:
# Release the raw and intermediate frames; only df2 is used from here on.
del df1, weather_train, train

In [None]:
# One-hot encode the categorical column(s) and append the indicator columns to df2.
# - get_feature_names_out replaces get_feature_names, which newer scikit-learn
#   releases no longer provide; it yields the same '<column>_<category>' names.
# - The dense-output keyword was renamed between scikit-learn releases
#   (sparse -> sparse_output), so the encoder is left at its default sparse
#   output and densified with .toarray(), which works with either spelling.
encoder = OneHotEncoder(handle_unknown='ignore').fit(df2[categorical_cols])
encoded_cols = list(encoder.get_feature_names_out(categorical_cols))
df2[encoded_cols] = encoder.transform(df2[categorical_cols]).toarray()



In [None]:
# Display df2 (the notebook renders a truncated preview) to inspect the
# one-hot primary_use_* columns appended by the previous cell.
df2

Unnamed: 0,meter,meter_reading,site_id,primary_use,square_feet,air_temperature,dew_temperature,sea_level_pressure,wind_direction,wind_speed,...,primary_use_Office,primary_use_Other,primary_use_Parking,primary_use_Public services,primary_use_Religious worship,primary_use_Retail,primary_use_Services,primary_use_Technology/science,primary_use_Utility,primary_use_Warehouse/storage
0,0,0.000,0,Education,7432,25.0,20.0,1019.7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0.000,0,Education,2720,25.0,20.0,1019.7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,0.000,0,Education,5376,25.0,20.0,1019.7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0.000,0,Education,23685,25.0,20.0,1019.7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,0.000,0,Education,116607,25.0,20.0,1019.7,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17684649,0,8.750,15,Entertainment/public assembly,19619,1.7,-5.6,1008.5,180.0,8.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17684650,0,4.825,15,Education,4298,1.7,-5.6,1008.5,180.0,8.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17684651,0,0.000,15,Entertainment/public assembly,11265,1.7,-5.6,1008.5,180.0,8.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17684652,0,159.575,15,Lodging/residential,29775,1.7,-5.6,1008.5,180.0,8.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# TRAINING

In [None]:
# Keep only the last 100000 rows of df2 for training and validation.
df2 = df2.iloc[-100000:]

In [None]:
# 75% of the rows (rounded down) go to training; the remainder is validation.
train_size = int(df2.shape[0] * 0.75)
val_size = len(df2) - train_size

In [None]:
# Create training and validation sets
train_inputs = df2[numeric_cols + encoded_cols].head(train_size)

In [None]:
# Training target: meter_reading for the first train_size rows (single-column frame).
train_targets = df2.iloc[:train_size][target_col]

In [None]:
# Validation features: the last val_size rows, numeric plus one-hot columns.
val_inputs = df2.tail(val_size)[numeric_cols + encoded_cols]

In [None]:
# Validation target: meter_reading for every row after the first train_size rows.
val_targets = df2.iloc[train_size:][target_col]

In [None]:
import xgboost
from sklearn.metrics import mean_squared_error as mse

In [None]:
def rmse(targets,preds):
  """Return the root mean squared error between targets and preds.

  The square root is taken explicitly rather than passing squared=False to
  mean_squared_error, because that keyword is deprecated and then removed in
  newer scikit-learn releases. For a single target column, as used in this
  notebook, the result is the same.
  """
  return mse(targets,preds) ** 0.5

In [None]:
def loss(model,train_inputs,val_inputs,train_targets,val_targets):
  """Fit model on the training split and print its train and validation RMSE.

  model must provide fit(X, y) and predict(X). The model is fitted in place;
  nothing is returned.
  """
  model.fit(train_inputs,train_targets)
  # rmse is declared as rmse(targets, preds); pass the arguments in that order.
  train_loss = rmse(train_targets, model.predict(train_inputs))
  val_loss = rmse(val_targets, model.predict(val_inputs))
  print('train_loss : {} , val_loss : {}'.format(train_loss, val_loss))

In [None]:
from xgboost import XGBRegressor

In [None]:
# Baseline regressor: squared-error objective, all cores, fixed seed,
# every other hyperparameter left at the library default.
model = XGBRegressor(
    objective='reg:squarederror',
    n_jobs=-1,
    random_state=42,
)

In [None]:
# Fit the baseline model and print its train / validation RMSE.
loss(
    model,
    train_inputs=train_inputs,
    val_inputs=val_inputs,
    train_targets=train_targets,
    val_targets=val_targets,
)

train_loss : 1014.6527444537511 , val_loss : 956.4988081237847


# TUNING PARAMETERS

In [None]:
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# First search: number of boosting rounds x learning rate.
n_estimators = [50,100,200,300,400]
# 0.7 was missing from an otherwise evenly spaced 0.1..1 list. With it the grid
# has 5 x 10 = 50 candidates, matching the count reported by the recorded run.
learning_rate = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]

grid1 = {'n_estimators':n_estimators,'learning_rate':learning_rate}

In [None]:
# Exhaustive 2-fold search over grid1, scored by negative RMSE, using all cores.
xg_general = GridSearchCV(
    estimator=model,
    param_grid=grid1,
    scoring='neg_root_mean_squared_error',
    cv=2,
    n_jobs=-1,
    verbose=10,
)
xg_general.fit(train_inputs, train_targets);

Fitting 2 folds for each of 50 candidates, totalling 100 fits


In [None]:
# Show the estimator refitted with the best n_estimators / learning_rate pair.
xg_general.best_estimator_

XGBRegressor(learning_rate=0.5, n_estimators=200, n_jobs=-1,
             objective='reg:squarederror', random_state=42)

In [None]:
# Second search: tree-shape parameters.
max_depth = [5,7,8,10,13,16,18,20]
# min_samples_split / min_samples_leaf are scikit-learn tree parameters; XGBoost
# does not define parameters with those names, so searching over them only
# multiplied the number of fits without changing any model. min_child_weight
# is the XGBoost control for the minimum amount of data in a child node; with
# the squared-error objective it corresponds to a minimum number of rows.
min_child_weight = [50,100,200]

grid2 = {'max_depth': max_depth, 'min_child_weight': min_child_weight}

In [None]:
# Second search: tune the tree-shape parameters in grid2, starting from the
# best learning_rate / n_estimators found by the first search. The starting
# model is built from xg_general.best_params_ instead of hand-copied numbers,
# and it is the estimator actually handed to the search (previously `model`
# was constructed here but never used).
model = XGBRegressor(
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror',
    **xg_general.best_params_
)
xg_booster = GridSearchCV(
    estimator=model,
    param_grid=grid2,
    cv=2,
    verbose=10,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
xg_booster.fit(train_inputs, train_targets);

Fitting 2 folds for each of 81 candidates, totalling 162 fits


18 fits failed out of a total of 162.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/xgboost/sklearn.py", line 396, in fit
    callbacks=callbacks)
  File "/usr/local/lib/python3.7/dist-packages/xgboost/training.py", line 216, in train
    xgb_model=xgb_model, callbacks=callbacks)
  File "/usr/local/lib/python3.7/dist-packages/xgboost/training.py", line 74, in _train_internal
    bst.update(dtrain, i, obj)
  File "/usr/local/lib/python3.7/dist-packages/

In [None]:
# Show the estimator refitted with the best combination found over grid2.
xg_booster.best_estimator_

XGBRegressor(learning_rate=0.5, max_depth=8, min_samples_leaf=50,
             min_samples_split=250, n_estimators=200, n_jobs=-1,
             objective='reg:squarederror', random_state=42)

# RE-TRAINING

In [None]:
# Re-train with the parameters selected by the two grid searches.
# They are read from best_params_ rather than copied by hand, so this cell
# cannot drift from what the searches found. max_features, which was passed
# here before, is a scikit-learn tree parameter that XGBoost does not define,
# so it is no longer passed.
best_params = {**xg_general.best_params_, **xg_booster.best_params_}
model = XGBRegressor(
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror',
    **best_params
)
loss(model,train_inputs,val_inputs,train_targets,val_targets)

train_loss : 50.789677074109306 , val_loss : 624.2772309441815
