In [54]:
import gc
import os
from pathlib import Path
import random
import sys

from tqdm import tqdm_notebook as tqdm
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.display import display, HTML

# --- plotly ---
#from plotly import tools, subplots
#import plotly.offline as py
#py.init_notebook_mode(connected=True)
#import plotly.graph_objs as go
#import plotly.express as px
#import plotly.figure_factory as ff

# --- models ---
from sklearn import preprocessing
from sklearn.model_selection import KFold
import lightgbm as lgb
#import xgboost as xgb
#import catboost as cb

import gc
gc.collect()
from sklearn.metrics import mean_squared_error

In [55]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [56]:
%%time
root = Path('../input')

train_df = pd.read_csv(root/'train.csv')
test_df = pd.read_csv(root/'test.csv')
building_meta_df = pd.read_csv(root/'building_metadata.csv')
test_df["timestamp"] = pd.to_datetime(test_df["timestamp"])

Wall time: 26 s


In [57]:
# i'm now using my leak data station kernel to shortcut.
leak_df = pd.read_feather('../input/leak.feather')
leak_df.fillna(0, inplace=True)
print (leak_df.timestamp.min(), leak_df.timestamp.max())
leak_df = leak_df[(leak_df.timestamp.dt.year > 2016) & (leak_df.timestamp.dt.year < 2019)]
leak_df.loc[leak_df.meter_reading < 0, 'meter_reading'] = 0 # remove large negative values
leak_df = leak_df[leak_df.building_id!=245]

2016-01-01 00:00:00 2018-12-31 23:00:00


In [58]:
leak_df.meter.value_counts()

0.0    7389997
1.0    2856222
3.0     963600
2.0     727682
Name: meter, dtype: int64

In [59]:
print (leak_df.duplicated().sum())

0


In [60]:
print (len(leak_df) / len(train_df))

0.5904947541810736


In [61]:
! ls ../input

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [62]:
del train_df
gc.collect()

16

# Leak Validation for public kernels(not used leak data)

In [63]:
sample_submission1 = pd.read_csv('../sub/submission105-without_leak.csv', index_col=0) #1.05
sample_submission2 = pd.read_csv('../sub/submission_noleak1.06.csv', index_col=0) #1.06
sample_submission3 = pd.read_csv('../sub/fe2_lgbm107.csv', index_col=0) #1.07
sample_submission4 = pd.read_csv('../sub/ens-test1.csv', index_col=0) # 1.08

  mask |= (ar1 == a)


In [64]:
test_df['pred1'] = sample_submission1.meter_reading
test_df['pred2'] = sample_submission2.meter_reading
test_df['pred3'] = sample_submission3.meter_reading
test_df['pred4'] = sample_submission4.meter_reading
#test_df.loc[test_df.pred3<0, 'pred3'] = 0 

del  sample_submission1,  sample_submission2
gc.collect()

test_df = reduce_mem_usage(test_df)
leak_df = reduce_mem_usage(leak_df)

Memory usage of dataframe is 2545.02 MB
Memory usage after optimization is: 1232.74 MB
Decreased by 51.6%
Memory usage of dataframe is 455.38 MB
Memory usage after optimization is: 296.00 MB
Decreased by 35.0%


In [65]:
leak_df = leak_df.merge(test_df[['building_id', 'meter', 'timestamp', 'pred1', 'pred2','pred3', 'pred4','row_id']], on = ['building_id', 'meter', 'timestamp'], how = "left")


In [66]:
leak_df = leak_df.merge(building_meta_df[['building_id', 'site_id']], on='building_id', how='left')
leak_df['pred1_l1p'] = np.log1p(leak_df.pred1)
leak_df['pred2_l1p'] = np.log1p(leak_df.pred2)
leak_df['pred3_l1p'] = np.log1p(leak_df.pred3)
leak_df['pred4_l1p'] = np.log1p(leak_df.pred4)
leak_df['meter_reading_l1p'] = np.log1p(leak_df.meter_reading)

In [67]:
leak_df.head()

Unnamed: 0,building_id,meter,meter_reading,timestamp,pred1,pred2,pred3,pred4,row_id,site_id,pred1_l1p,pred2_l1p,pred3_l1p,pred4_l1p,meter_reading_l1p
0,0,0.0,173.3703,2017-01-01,170.013199,150.045303,177.359177,194.167404,0,0,5.141741,5.01758,5.183799,5.273858,5.161181
1,1,0.0,53.512718,2017-01-01,72.297775,73.2061,80.473015,91.141502,1,0,4.29453,4.306846,4.400272,4.523325,3.998434
2,2,0.0,6.143042,2017-01-01,6.235263,10.1051,5.687088,11.0957,2,0,1.978967,2.407404,1.900179,2.49285,1.966139
3,3,0.0,101.701469,2017-01-01,175.936752,217.688995,271.116943,291.00531,3,0,5.175792,5.38765,5.606232,5.676772,4.631826
4,4,0.0,1141.240723,2017-01-01,1062.235962,988.367981,1418.096313,1085.327637,4,0,6.969072,6.897066,7.257776,6.990558,7.040747


In [68]:
leak_df[leak_df.pred1_l1p.isnull()]

Unnamed: 0,building_id,meter,meter_reading,timestamp,pred1,pred2,pred3,pred4,row_id,site_id,pred1_l1p,pred2_l1p,pred3_l1p,pred4_l1p,meter_reading_l1p


In [69]:
#ashrae-kfold-lightgbm-without-leak-1-08
#sns.distplot(leak_df.pred1_l1p)
#sns.distplot(leak_df.meter_reading_l1p)

leak_score = np.sqrt(mean_squared_error(leak_df.pred1_l1p, leak_df.meter_reading_l1p))
print ('score1=', leak_score)

score1= 0.967886


In [70]:
#ashrae-half-and-half
#sns.distplot(leak_df.pred2_l1p)
#sns.distplot(leak_df.meter_reading_l1p)

leak_score = np.sqrt(mean_squared_error(leak_df.pred2_l1p, leak_df.meter_reading_l1p))
print ('score2=', leak_score)

score2= 0.9690552


In [71]:
#ashrae-half-and-half
#sns.distplot(leak_df.pred2_l1p)
#sns.distplot(leak_df.meter_reading_l1p)

leak_score = np.sqrt(mean_squared_error(leak_df.pred3_l1p, leak_df.meter_reading_l1p))
print ('score3=', leak_score)

score3= 0.9817643


In [72]:
leak_score = np.sqrt(mean_squared_error(leak_df.pred4_l1p, leak_df.meter_reading_l1p))
print ('score4=', leak_score)

score4= 0.9840345


# Leak Validation for Blending

A one idea how we can use LV usefull is blending. We probably can find best blending method without LB probing and it's means we can save our submission.

In [73]:
leak_df['mean_pred'] = np.mean(leak_df[['pred1', 'pred2','pred4']].values, axis=1)
leak_df['mean_pred_l1p'] = np.log1p(leak_df.mean_pred)
leak_score = np.sqrt(mean_squared_error(leak_df.mean_pred_l1p, leak_df.meter_reading_l1p))


#sns.distplot(leak_df.mean_pred_l1p)
#sns.distplot(leak_df.meter_reading_l1p)

print ('mean score=', leak_score)
#mean score= 0.9594506

mean score= 0.9594506


In [74]:
leak_df['median_pred'] = np.median(leak_df[['pred1', 'pred2','pred3','pred4']].values, axis=1)
leak_df['median_pred_l1p'] = np.log1p(leak_df.median_pred)
leak_score = np.sqrt(mean_squared_error(leak_df.median_pred_l1p, leak_df.meter_reading_l1p))

#sns.distplot(leak_df.median_pred_l1p)
#sns.distplot(leak_df.meter_reading_l1p)

print ('meadian score=', leak_score)

meadian score= 0.96367586


# Find Best Weight

In [84]:
v = 0.4* leak_df['pred1'].values + 0.2 * leak_df['pred2'].values + 0.00 * leak_df['pred3'].values +  0.4* leak_df['pred4'].values
vl1p = np.log1p(v)

print (np.sqrt(mean_squared_error(vl1p, leak_df.meter_reading_l1p)))

0.9587314


In [85]:
from scipy.stats import pearsonr
pearsonr(leak_df['pred1'],leak_df['pred4'])

(0.9549593864582946, 0.0)

# Submit

In [86]:
sample_submission = pd.read_feather(os.path.join(root, 'sample_submission.feather'))
sample_submission['meter_reading'] = 0.4 * test_df.pred1 +  0.2 * test_df.pred2 +  0.4 * test_df.pred4
sample_submission.loc[sample_submission.meter_reading < 0, 'meter_reading'] = 0

In [87]:
leak_df = leak_df[['meter_reading', 'row_id']].set_index('row_id').dropna()
#sample_submission.loc[leak_df.index, 'meter_reading'] = leak_df['meter_reading']

In [88]:
sample_submission.head()

Unnamed: 0,row_id,meter_reading
0,0,175.681305
1,1,80.016937
2,2,8.953405
3,3,230.314636
4,4,1056.699097


In [89]:
sample_submission.to_csv('../sub/ensemble_best_no_leak.csv.gz', compression='gzip',index=False, float_format='%.4f')

In [90]:
sample_submission.loc[leak_df.index, 'meter_reading'] = leak_df['meter_reading']

In [91]:
sample_submission.to_csv('../sub/ensemble_best_leak.csv.gz', compression='gzip',index=False, float_format='%.4f')

In [92]:
sample_submission.head()

Unnamed: 0,row_id,meter_reading
0,0,173.3703
1,1,53.512718
2,2,6.143042
3,3,101.701469
4,4,1141.240723
