In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing
import os, gc
import random
import datetime

from tqdm import tqdm_notebook as tqdm

# matplotlib and seaborn for plotting
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb

from IPython.core.display import display, HTML
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 1000)
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
train = pd.read_csv('train.csv')
building = pd.read_csv('building_metadata.csv')
weather = pd.read_csv('weather_train.csv')
train = train.merge(building, on='building_id', how='left')
train = train.merge(weather, on=['site_id', 'timestamp'], how='left')

test = pd.read_csv('test.csv')
test = test.merge(building, on='building_id', how='left')
test = test.merge(weather, on=['site_id', 'timestamp'], how='left')

del weather

In [3]:
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from bdateutil import isbday
import math

cal = calendar()

def get_features_for_train(df):
    
    df['date'] = df['timestamp'].str[:10]
    df['date2'] = df['timestamp'].str[:10]
    
    days_of_year = np.array(df['date'].iloc[0],dtype = np.datetime64)
    days_of_year = days_of_year + np.arange(df['date'].nunique())

    dr = pd.date_range(start=df['date'].iloc[0], end=df['date'].iloc[-1])
    holidays = cal.holidays(start=dr.min(), end=dr.max())
    bdays = {}
    for i in days_of_year:
        bdays[str(i)] = isbday(str(i), holidays = cal.holidays(start=dr.min(), end=dr.max()))

    df['date'] = df['date'].map(bdays)
    df['date'] = df['date'].map({True:1,False:0}).astype(np.uint8)
    
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['year'] = np.uint8(df['timestamp'].dt.year-2000)
    df['month'] = df['timestamp'].dt.month.astype(np.uint8)
    df['day'] = df['timestamp'].dt.day.astype(np.uint8)
    df['month'] = df['timestamp'].dt.month.astype(np.uint8)
    
    df['weekday'] = np.uint8(df['timestamp'].dt.weekday)
    df['day_week'] = np.int8(df['timestamp'].dt.dayofweek)
    
    df['week_ofyear'] = np.int8(df['timestamp'].dt.weekofyear)
    df['day_ofyear'] = np.int16(df['timestamp'].dt.dayofyear)

    df['week_month'] = df['timestamp'].dt.day/7
    df['week_month'] = df['week_month'].apply(lambda x: math.ceil(x)).astype(np.int8)

    # parse and cast columns to a smaller type
    
    #df.rename(columns={"square_feet": "log_square_feet"}, inplace=True)
    #df['log_square_feet'] = np.float16(np.log(df['log_square_feet']))
    
    df['year_built'] = np.uint8(df['year_built']-1900)
    df['floor_count'] = np.uint8(df['floor_count'])
    
    tr1 = df[['date','date2']].drop_duplicates()
    tr1.sort_values('date2',inplace=True)
    tr1['range'] = 0

    tr1.loc[tr1['date']==1,'range'] = range(1,252)
    tr1.loc[tr1['date']==0,'range'] = range(-1,-116,-1)
    
    tr_dict = dict(zip(tr1['date2'],tr1['range']))
    df['date2'] = df['date2'].map(tr_dict)
    
    df['date2'] = np.ceil(df['date2']/7)
    
    del df['timestamp']
    return df

def get_features_for_test(df):
    
    df['date'] = df['timestamp'].str[:10]
    df['date2'] = df['timestamp'].str[:10]
    df['date3'] = df['timestamp'].str[:4]
    days_of_year = np.array(df['date'].iloc[0],dtype = np.datetime64)
    days_of_year = days_of_year + np.arange(df['date'].nunique())

    dr = pd.date_range(start=df['date'].iloc[0], end=df['date'].iloc[-1])
    holidays = cal.holidays(start=dr.min(), end=dr.max())
    bdays = {}
    for i in days_of_year:
        bdays[str(i)] = isbday(str(i), holidays = cal.holidays(start=dr.min(), end=dr.max()))

    df['date'] = df['date'].map(bdays)
    df['date'] = df['date'].map({True:1,False:0}).astype(np.uint8)
    
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df['year'] = np.uint8(df['timestamp'].dt.year-2000)
    df['month'] = df['timestamp'].dt.month.astype(np.uint8)
    df['day'] = df['timestamp'].dt.day.astype(np.uint8)
    df['month'] = df['timestamp'].dt.month.astype(np.uint8)
    df['weekday'] = np.uint8(df['timestamp'].dt.weekday)
    df['day_week'] = np.int8(df['timestamp'].dt.dayofweek)
    df['week_ofyear'] = np.int8(df['timestamp'].dt.weekofyear)
    df['day_ofyear'] = np.int16(df['timestamp'].dt.dayofyear)
    df['week_month'] = df['timestamp'].dt.day/7
    df['week_month'] = df['week_month'].apply(lambda x: math.ceil(x)).astype(np.int8)

    #df.rename(columns={"square_feet": "log_square_feet"}, inplace=True)
    #df['log_square_feet'] = np.float16(np.log(df['log_square_feet']))
    
    df['year_built'] = np.uint8(df['year_built']-1900)
    df['floor_count'] = np.uint8(df['floor_count'])
    
    te1 = df[['date','date2','date3']].drop_duplicates()
    te1.sort_values('date2',inplace=True)
    te1['range'] = 0

    te1.loc[(te1['date3']=='2017') & (te1['date']==1),'range'] = range(1,251)
    te1.loc[(te1['date3']=='2017') & (te1['date']==0),'range'] = range(-1,-116,-1)

    te1.loc[(te1['date3']=='2018') & (te1['date']==1),'range'] = range(1,259)
    te1.loc[(te1['date3']=='2018') & (te1['date']==0),'range'] = range(-1,-108,-1)

    te1_dict = dict(zip(te1['date2'],te1['range']))
    df['date2'] = df['date2'].map(te1_dict)
    df['date2'] = np.ceil(df['date2']/7)
    
    
    del df['timestamp'], df['date3']
    return df

In [4]:
import math
get_features_for_train(train)
get_features_for_test(test)
print('Done')

Done


In [5]:
func = {'air_temperature':['mean', 'median', 'min', 'max'],
       'cloud_coverage':['mean', 'median', 'min', 'max'],
       'dew_temperature':['mean', 'median', 'min', 'max']}
g1 = train.groupby(['building_id', 'year', 'date2']).agg(func)
g2 = test.groupby(['building_id', 'year', 'date2']).agg(func)
g1.columns = ['_'.join(col) for col in g1.columns]
g2.columns = ['_'.join(col) for col in g2.columns]
train = train.merge(g1, on=['building_id', 'year', 'date2'], how='left')
test = test.merge(g2, on=['building_id', 'year', 'date2'], how='left')

In [6]:
del train['year'], test['year']

train['primary_use'] = train['primary_use'].rank()
test['primary_use'] = test['primary_use'].rank()

gc.collect()

150

In [8]:
train['primary_use'] = train['primary_use'].rank()
test['primary_use'] = test['primary_use'].rank()

In [24]:
features

['building_id',
 'primary_use',
 'square_feet',
 'year_built',
 'floor_count',
 'air_temperature',
 'cloud_coverage',
 'dew_temperature',
 'precip_depth_1_hr',
 'sea_level_pressure',
 'wind_direction',
 'wind_speed',
 'date',
 'date2',
 'month',
 'day',
 'weekday',
 'day_week',
 'week_ofyear',
 'day_ofyear',
 'week_month',
 'air_temperature_mean',
 'air_temperature_median',
 'air_temperature_min',
 'air_temperature_max',
 'cloud_coverage_mean',
 'cloud_coverage_median',
 'cloud_coverage_min',
 'cloud_coverage_max',
 'dew_temperature_mean',
 'dew_temperature_median',
 'dew_temperature_min',
 'dew_temperature_max']

## Site_id

In [14]:
import warnings
warnings.filterwarnings("ignore")

num_folds = 3 #4,3,2

folds = KFold(n_splits = num_folds, shuffle = False, random_state = 42)
t = []
tr_oof = []
a = 0


features = [c for c in train.columns if c not in ['meter','site_id','meter_reading']]
target = train['meter_reading']


for meter in train['meter'].unique():
    
    for site in train['site_id'].unique():
        
        train_small = train.loc[(train['meter'] == meter) & (train['site_id'] == site)]
        test_small = test.loc[(test['meter'] == meter) & (test['site_id'] == site)]
    
        if train_small.shape[0] != 0:
            
            print(meter,site)
            
            oof = np.zeros(len(train_small))
            predictions = np.zeros(len(test_small))
            target = np.log1p(train_small["meter_reading"])

            for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_small, train_small['month'])):
                tr_x, tr_y = train_small.iloc[trn_idx][features], target.iloc[trn_idx]
                vl_x, vl_y = train_small.iloc[val_idx][features], target.iloc[val_idx]
                clf = lgb.LGBMRegressor(n_estimators=6000,
                                               learning_rate=0.2,
                                               feature_fraction=0.9,
                                               subsample=0.2,
                                               subsample_freq=1,
                                               num_leaves=20,
                                               metric='rmse')
                clf.fit(tr_x, tr_y,eval_set=[(vl_x, vl_y)],early_stopping_rounds=150,verbose=2000)
                oof[val_idx] = clf.predict(train_small.iloc[val_idx][features])
                
                predictions += np.expm1(clf.predict(test_small[features]))

            test_small['meter_reading'] = predictions / num_folds
            test_small['meter_reading'] = test_small['meter_reading'].clip(train_small["meter_reading"].min(),train_small["meter_reading"].max())
            train_small['meter_reading_p'] = oof
            t.append(test_small[['meter_reading']])
            tr_oof.append(train_small[['meter_reading_p']])
    
    
t = pd.concat(t)
t = t.sort_index()
oof = pd.concat(tr_oof)
oof = oof.sort_index()
#print(np.sqrt(mean_squared_error(target, oof['meter_reading_p'])))

try:del test['meter_reading']
except:None
    
test = test.merge(t, left_index=True, right_index=True, how='left')

target = np.log1p(train["meter_reading"])
print(np.sqrt(mean_squared_error(target, oof['meter_reading_p'])))

0 0
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[18]	valid_0's rmse: 1.27107
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[1]	valid_0's rmse: 2.97117
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[8]	valid_0's rmse: 0.770393
0 1
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[56]	valid_0's rmse: 0.373565
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[85]	valid_0's rmse: 0.482562
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[20]	valid_0's rmse: 0.760627
0 2
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[118]	valid_0's rmse: 0.456179
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration i

Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[18]	valid_0's rmse: 2.01164
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[175]	valid_0's rmse: 1.71098
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[11]	valid_0's rmse: 2.02812
3 15
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.64463
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[12]	valid_0's rmse: 0.737775
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[21]	valid_0's rmse: 0.956537
1 0
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[114]	valid_0's rmse: 2.10812
Training until validation scores don't improve for 150 rounds.
Early stopping, best iteration is:
[

NameError: name 'mean_squared_error' is not defined

In [15]:
from sklearn.metrics import mean_squared_error
print(np.sqrt(mean_squared_error(target, oof['meter_reading_p'])))

1.1737216545786244


In [17]:
t.shape, test.shape

((41697600, 1), (41697600, 37))

In [22]:
t

Unnamed: 0,meter_reading
0,2.533826
1,2.701838
2,2.617433
3,2.580984
4,2.580984
5,2.508498
6,2.533506
7,2.554788
8,793.991243
9,2.558212


In [23]:
submission = pd.read_csv('sample_submission.csv')
submission['meter_reading'] = test['meter_reading']

submission.to_csv('site_id_v1.csv.gz',index=False, compression='gzip')

submission.head(10)

Unnamed: 0,row_id,meter_reading
0,0,2.533826
1,1,2.701838
2,2,2.617433
3,3,2.580984
4,4,2.580984
5,5,2.508498
6,6,2.533506
7,7,2.554788
8,8,793.991243
9,9,2.558212


In [25]:
from sklearn.model_selection import train_test_split as train_valid_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error
import eli5

target_col = 'meter_reading'
y = train[target_col]
Xs = train.drop(target_col,axis=1)

X_train, X_valid, y_train, y_valid = train_valid_split(Xs, y, test_size=0.2, random_state=0)
del train
X_train.shape,X_valid.shape

((16172880, 35), (4043220, 35))

In [None]:
%%time
model = KNeighborsRegressor(n_neighbors=2,n_jobs=-1)

model.fit(X_train.fillna(0),y_train)

In [None]:
def RMSE(actual,preds):
    return np.sqrt(mean_squared_error(actual,preds))

def get_evaluations(model):
    preds = model.predict(X_train.fillna(0))
    plt.hist(np.log1p(preds),bins=100)
    plt.show();
    print('train_rmse: ',RMSE(y_train,preds))
                    
    preds = model.predict(X_valid.fillna(0))
    plt.hist(np.log1p(preds),bins=100)
    plt.show()
    print('valid_rmse: ',RMSE(y_valid,preds))
    
get_evaluations(model)

In [None]:
eli5.show_weights(model,feature_names=list(X_train.columns))

In [None]:
#1.07: блендинг 2ух моделей:

for meter in train['meter'].unique():
    for site in train['site_id'].unique():
        .......
        
for meter in train['meter'].unique():
    for site in train['building_id'].unique():
        .......

In [None]:
#1.06: умножение предикта ~ на 0.95