In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
import os, gc
import random
import datetime

from tqdm import tqdm_notebook as tqdm

# matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from IPython.core.display import display, HTML
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)
display(HTML("<style>.container { width:90% !important; }</style>"))

In [3]:
building = pd.read_csv('building_metadata.csv')
weather = pd.read_csv('weather_train.csv')

train = pd.read_csv('train.csv')
train = train.merge(building, on='building_id', how='left')
train = train.merge(weather, on=['site_id', 'timestamp'], how='left')

train = train.query('not (timestamp >= "2016-09-27 23:00:00" & timestamp <= "2016-10-20 23:00:00" & building_id <= 1280 & building_id >= 1275 & meter == 0 )')
train = train.query('not (timestamp >= "2016-09-27 23:00:00" & timestamp <= "2016-10-20 23:00:00" & building_id <= 1300 & building_id >= 1294 & meter == 0 )')
train = train.query('not (timestamp < "2016-05-20 23:00:00" & building_id <= 104 & meter == 0 )')

test = pd.read_csv('test.csv')
test = test.merge(building, on='building_id', how='left')
test = test.merge(weather, on=['site_id', 'timestamp'], how='left')

del weather,building

In [4]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from bdateutil import isbday
import math
cal = calendar()

def get_features(df):
    df['date'] = df['timestamp'].str[:10]
    days_of_year = np.array(df['date'].iloc[0],dtype = np.datetime64)
    days_of_year = days_of_year + np.arange(df['date'].nunique())

    dr = pd.date_range(start=df['date'].iloc[0], end=df['date'].iloc[-1])
    holidays = cal.holidays(start=dr.min(), end=dr.max())
    bdays = {}
    for i in days_of_year:
        bdays[str(i)] = isbday(str(i), holidays = cal.holidays(start=dr.min(), end=dr.max()))

    df['date'] = df['date'].map(bdays)
    df['date'] = df['date'].map({True:1,False:0}).astype(np.uint8)
    
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['year'] = np.uint8(df['timestamp'].dt.year-2000)
    df['month'] = df['timestamp'].dt.month.astype(np.uint8)
    df['day'] = df['timestamp'].dt.day.astype(np.uint8)
    df['month'] = df['timestamp'].dt.month.astype(np.uint8)
    df['day_week'] = np.int8(df['timestamp'].dt.dayofweek)
    df['year_built'] = np.uint8(df['year_built']-1900)
    df['floor_count'] = np.uint8(df['floor_count'])
    
    del df['timestamp']
    return df

In [5]:
import math
get_features(train)
get_features(test)
print('Done')

Done


In [None]:
func = {'air_temperature':['mean', 'median', 'min', 'max'], 'cloud_coverage':['mean', 'median', 'min', 'max'], 'dew_temperature':['mean', 'median', 'min', 'max']}
g1 = train.groupby(['building_id', 'year', 'month','day']).agg(func)
g2 = test.groupby(['building_id', 'year', 'month','day']).agg(func)
g1.columns = ['_'.join(col) for col in g1.columns]
g2.columns = ['_'.join(col) for col in g2.columns]
train = train.merge(g1, on=['building_id', 'year', 'month','day'], how='left')
test = test.merge(g2, on=['building_id', 'year', 'month','day'], how='left')

train['primary_use'] = train['primary_use'].map(pd.concat([train['primary_use'],test['primary_use']]).value_counts())
test['primary_use'] = test['primary_use'].map(pd.concat([train['primary_use'],test['primary_use']]).value_counts())
gc.collect()

## Site_id

In [8]:
import warnings
warnings.filterwarnings("ignore")

num_folds = 3 #4,3,2

folds = KFold(n_splits = num_folds, shuffle = False, random_state = 42)
t = []
tr_oof = []
features = [c for c in train.columns if c not in ['meter','site_id','meter_reading','year']]
target = train['meter_reading']

for meter in train['meter'].unique():
    for site in train['site_id'].unique():
        
        train_small = train.loc[(train['meter'] == meter) & (train['site_id'] == site)]
        test_small = test.loc[(test['meter'] == meter) & (test['site_id'] == site)]
    
        if train_small.shape[0] != 0:
            
            print(meter,site)
            
            oof = np.zeros(len(train_small))
            predictions = np.zeros(len(test_small))
            target = np.log1p(train_small["meter_reading"])

            for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_small, train_small['month'])):
                tr_x, tr_y = train_small.iloc[trn_idx][features], target.iloc[trn_idx]
                vl_x, vl_y = train_small.iloc[val_idx][features], target.iloc[val_idx]
                clf = lgb.LGBMRegressor(n_estimators=6000,
                                               learning_rate=0.2,
                                               feature_fraction=0.9,
                                               subsample=0.2,
                                               subsample_freq=1,
                                               num_leaves=20,
                                               metric='rmse')
                clf.fit(tr_x, tr_y,eval_set=[(vl_x, vl_y)],early_stopping_rounds=150,verbose=2000)
                oof[val_idx] = clf.predict(train_small.iloc[val_idx][features])
                
                predictions += np.expm1(clf.predict(test_small[features]))

            test_small['meter_reading'] = predictions / num_folds
            test_small['meter_reading'] = test_small['meter_reading'].clip(train_small["meter_reading"].min(),train_small["meter_reading"].max())
            train_small['meter_reading_p'] = oof
            t.append(test_small[['meter_reading']])
            tr_oof.append(train_small[['meter_reading_p']])
    
    
t = pd.concat(t)
t = t.sort_index()
oof = pd.concat(tr_oof)
oof = oof.sort_index()
#print(np.sqrt(mean_squared_error(target, oof['meter_reading_p'])))

try:del test['meter_reading']
except:None
    
test = test.merge(t, left_index=True, right_index=True, how='left')

target = np.log1p(train["meter_reading"])
print(np.sqrt(mean_squared_error(target, oof['meter_reading_p'])))

0 1
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[47]	valid_0's rmse: 0.476027
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[61]	valid_0's rmse: 0.543
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[26]	valid_0's rmse: 0.739653
0 2
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[175]	valid_0's rmse: 0.466981
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[167]	valid_0's rmse: 0.557857
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[30]	valid_0's rmse: 0.627868
0 3
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[375]	valid_0's rmse: 0.497134
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteratio

Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[35]	valid_0's rmse: 2.01735
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[17]	valid_0's rmse: 1.75468
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[14]	valid_0's rmse: 2.00684
3 15
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[12]	valid_0's rmse: 0.827509
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[36]	valid_0's rmse: 0.81216
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[26]	valid_0's rmse: 0.873799
1 2
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[29]	valid_0's rmse: 1.65098
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[5

In [11]:
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(target, oof['meter_reading_p'])))

1.1380563230327096


In [9]:
t.shape, test.shape

((41697600, 1), (41697600, 21))

In [10]:
t[:10]

Unnamed: 0,meter_reading
0,211.008967
1,87.294143
2,11.383717
3,241.127782
4,1382.594036
5,18.194838
6,103.287796
7,505.982718
8,914.47447
9,290.823724


In [12]:
submission = pd.read_csv('sample_submission.csv')
submission['meter_reading'] = test['meter_reading']

submission.to_csv('site1.csv.gz',index=False, compression='gzip')

submission.head(10)

Unnamed: 0,row_id,meter_reading
0,0,211.008967
1,1,87.294143
2,2,11.383717
3,3,241.127782
4,4,1382.594036
5,5,18.194838
6,6,103.287796
7,7,505.982718
8,8,914.47447
9,9,290.823724


In [None]:
from sklearn.model_selection import train_test_split as train_valid_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error
import eli5

target_col = 'meter_reading'
y = train[target_col]
Xs = train.drop(target_col,axis=1)

X_train, X_valid, y_train, y_valid = train_valid_split(Xs, y, test_size=0.2, random_state=0)
del train
X_train.shape,X_valid.shape

In [None]:
%%time
model = KNeighborsRegressor(n_neighbors=2,n_jobs=-1)

model.fit(X_train.fillna(0),y_train)

In [None]:
def RMSE(actual,preds):
    return np.sqrt(mean_squared_error(actual,preds))

def get_evaluations(model):
    preds = model.predict(X_train.fillna(0))
    plt.hist(np.log1p(preds),bins=100)
    plt.show();
    print('train_rmse: ',RMSE(y_train,preds))
                    
    preds = model.predict(X_valid.fillna(0))
    plt.hist(np.log1p(preds),bins=100)
    plt.show()
    print('valid_rmse: ',RMSE(y_valid,preds))
    
get_evaluations(model)

In [None]:
eli5.show_weights(model,feature_names=list(X_train.columns))

In [None]:
#1.07: блендинг 2ух моделей:

for meter in train['meter'].unique():
    for site in train['site_id'].unique():
        .......
        
for meter in train['meter'].unique():
    for site in train['building_id'].unique():
        .......

In [None]:
#1.06: умножение предикта ~ на 0.95