In [1]:
# General imports
import numpy as np
import pandas as pd
import os, gc, sys, warnings, random, math, psutil, pickle

warnings.filterwarnings('ignore')

In [2]:
########################### Helpers
#################################################################################
## Seeder
# seed (int): value used to seed Python's `random` module and NumPy's global RNG
def seed_everything(seed=0):
    """Seed the global random number generators used by this notebook.

    Args:
        seed (int): value passed first to ``random.seed`` and then to
            ``np.random.seed``. Defaults to 0.

    Returns:
        None
    """
    # Same order as before: Python's `random` first, then NumPy.
    for reseed in (random.seed, np.random.seed):
        reseed(seed)
    
## Simple "Memory profilers" to see memory usage
def get_memory_usage():
    """Return this process's memory usage in GiB, rounded to two decimals.

    Takes the first field of ``psutil.Process(pid).memory_info()`` for the
    current PID (presumably the resident set size in bytes — confirm against
    the psutil documentation) and divides it by 2**30.
    """
    this_process = psutil.Process(os.getpid())
    first_field = this_process.memory_info()[0]
    return np.round(first_field / 2.**30, 2)
        
def sizeof_fmt(num, suffix='B'):
    """Format a byte count as a human-readable string with binary prefixes.

    Args:
        num: size to format (int or float); negative values keep their sign.
        suffix (str): unit appended after the prefix. Defaults to ``'B'``.

    Returns:
        str: ``num`` scaled by successive divisions by 1024 until its magnitude
        is below 1024, with one decimal place, e.g. ``'1.5KiB'``. Values that
        are still >= 1024 after the ``'Zi'`` step are reported in ``'Yi'``.
    """
    prefixes = ('', 'Ki', 'Mi', 'Gi', 'Ti', 'Pi', 'Ei', 'Zi')
    value = num
    idx = 0
    while idx < len(prefixes):
        if abs(value) < 1024.0:
            return "%3.1f%s%s" % (value, prefixes[idx], suffix)
        value = value / 1024.0
        idx += 1
    # Ran out of prefixes: report whatever is left in yobi-units.
    return "%.1f%s%s" % (value, 'Yi', suffix)

In [3]:
########################### Vars
#################################################################################
# Tunable constants first, then the one-off seeding call that uses them.
SEED = 42                   # base seed handed to seed_everything below
TARGET = 'meter_reading'    # name of the label column in the training frame
# Switches later cells between a hold-out run (True) and a full-data run (False).
# The name is spelled with a lowercase 'l'; later cells reference this exact spelling.
LOCAl_TEST = True

seed_everything(SEED)

In [4]:
# Load the pre-built training frame from a pickle under the user's home directory.
# NOTE(review): unpickling can execute arbitrary code — only load files produced by a trusted pipeline.
train_df = pd.read_pickle('~/Datas/ashre-energy/FE_train_V1_noNan.pkl')

In [5]:
########################### Trick to use kernel hdd to store results
#################################################################################

# Optional: spill the frames to disk and drop them from memory so they can be
# reloaded later. Save just test_df, or both if there is sufficient disk space.
# train_df.to_pickle('train_df.pkl')
# test_df.to_pickle('test_df.pkl')
# del train_df, test_df
# With the lines above commented out, this cell only forces a garbage-collection pass.
gc.collect()

15

In [6]:
########################### Check memory usage
#################################################################################
# List the ten largest names in the current namespace (by sys.getsizeof),
# biggest first, followed by the process-level figure from get_memory_usage().
for name, size in sorted(
        ((var_name, sys.getsizeof(var_value)) for var_name, var_value in locals().items()),
        key=lambda pair: pair[1],
        reverse=True,
)[:10]:
    print("{:>30}: {:>8}".format(name, sizeof_fmt(size)))
print('Memory in Gb', get_memory_usage())

                      train_df:   2.1GiB
                           _i2:   715.0B
                           _i6:   421.0B
                            _i:   370.0B
                           _i5:   370.0B
                           Out:   288.0B
                           _oh:   288.0B
                          _iii:   238.0B
                           _i3:   238.0B
                           _i1:   199.0B
Memory in Gb 2.26


In [7]:
########################### Model params
import lightgbm as lgb

# LightGBM configuration shared by every training run in this notebook.
# 'seed' starts at SEED; the training cell overwrites it per run.
lgb_params = dict(
    objective='regression',
    boosting_type='gbdt',
    metric='rmse',
    n_jobs=-1,
    learning_rate=0.05,
    num_leaves=2 ** 8,
    max_depth=-1,
    tree_learner='serial',
    colsample_bytree=0.7,
    subsample_freq=1,
    subsample=0.7,
    n_estimators=1500,
    max_bin=512,
    verbose=1,
    seed=SEED,
    early_stopping_rounds=150,
)

In [None]:
########################### Model
# Train one LightGBM booster per seed on log1p(TARGET), pickle each booster to
# disk, and record the path of every pickle in `models` so that the prediction
# cells can reload them.

# Models saving
model_filename = 'lgbm'
models = []  # paths of the pickled boosters, in training order

# Load train_df from hdd
# train_df = pd.read_pickle('train_df.pkl')

# Everything except the timestamp and the label is used as a feature.
remove_columns = ['timestamp',TARGET]
features_columns = [col for col in list(train_df) if col not in remove_columns]

if LOCAl_TEST:
    # Positional hold-out: the first `split_row` rows train, the remainder validates.
    split_row = 15000000
    train_part = train_df.iloc[:split_row]
    valid_part = train_df.iloc[split_row:]
    tr_data = lgb.Dataset(train_part[features_columns], label=np.log1p(train_part[TARGET]))
    vl_data = lgb.Dataset(valid_part[features_columns], label=np.log1p(valid_part[TARGET]))
    eval_sets = [tr_data,vl_data]
    del train_part, valid_part
else:
    # Full-data run: only the training set is available for evaluation.
    tr_data = lgb.Dataset(train_df[features_columns], label=np.log1p(train_df[TARGET]))
    eval_sets = [tr_data]

# Remove train_df from hdd
# os.system('rm ../../Datas/ashre-energy/train_df.pkl')

# Lets make 5 seeds mix model
for cur_seed in [42,43,44,45,46]:
    
    # Seed everything
    seed_everything(cur_seed)
    lgb_params['seed'] = cur_seed
    
    # NOTE(review): `verbose_eval` is reported as removed from lgb.train in
    # LightGBM 4.x (replaced by callbacks=[lgb.log_evaluation(100)]) — confirm
    # against the installed version before upgrading.
    estimator = lgb.train(
                lgb_params,
                tr_data,
                valid_sets = eval_sets,
                verbose_eval = 100,
            )

    # For CV you may add fold number
    # pickle.dump(estimator, open(model_filename + '__fold_' + str(i) + '.bin', "wb"))

    # Build the path once so the file written and the path recorded in `models`
    # are guaranteed to match (previously the saved name carried a 'V1' suffix
    # that the recorded name lacked, so the pickles could not be reloaded).
    model_path = model_filename + '__seed_' + str(cur_seed)  + 'V1.bin'
    with open(model_path, 'wb') as model_file:
        pickle.dump(estimator, model_file)
    models.append(model_path)

if not LOCAl_TEST:
    # Free the training data before the test frame is loaded for prediction.
    del tr_data, train_df
    gc.collect()

Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 0.937245	valid_1's rmse: 1.32297
[200]	training's rmse: 0.804626	valid_1's rmse: 1.25115
[300]	training's rmse: 0.740923	valid_1's rmse: 1.21984
[400]	training's rmse: 0.700257	valid_1's rmse: 1.2085
[500]	training's rmse: 0.668465	valid_1's rmse: 1.19996
[600]	training's rmse: 0.645216	valid_1's rmse: 1.19572
[700]	training's rmse: 0.6261	valid_1's rmse: 1.19277
[800]	training's rmse: 0.611352	valid_1's rmse: 1.19209
[900]	training's rmse: 0.597455	valid_1's rmse: 1.19238
Early stopping, best iteration is:
[759]	training's rmse: 0.617197	valid_1's rmse: 1.19104
Training until validation scores don't improve for 150 rounds
[100]	training's rmse: 0.941375	valid_1's rmse: 1.31344
[200]	training's rmse: 0.802247	valid_1's rmse: 1.23984
[300]	training's rmse: 0.737847	valid_1's rmse: 1.21468
[400]	training's rmse: 0.698465	valid_1's rmse: 1.20084
[500]	training's rmse: 0.669129	valid_1's rmse: 1.19253
[600

In [None]:
########################### Predict
#################################################################################
# Full-data run: average the expm1-transformed predictions of every saved model
# over the test frame and build the submission frame.
# NOTE(review): this branch never writes `submission` to disk within this cell —
# confirm that a later step saves it.
if not LOCAl_TEST:
    
    # Load test_df from hdd
    test_df = pd.read_pickle('test_df.pkl')
    
    # Keep only the model's feature columns, in training order
    test_df = test_df[features_columns]
    
    # Remove test_df from hdd (pure Python instead of shelling out to `rm`)
    if os.path.exists('test_df.pkl'):
        os.remove('test_df.pkl')
    
    # Read submission file
    submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')

    # Remove row_id for a while
    del submission['row_id']
    
    batch_size = 2000000
    n_rows = len(test_df)
    
    for model_path in models:
        print('Predictions for', model_path)
        # pickle.load can execute arbitrary code: only load model files that
        # were written by this notebook's training cell.
        with open(model_path, 'rb') as model_file:
            estimator = pickle.load(model_file)

        # Fill a preallocated array batch by batch instead of growing a Python
        # list of floats; stepping by batch_size never yields an empty batch.
        predictions = np.empty(n_rows, dtype=np.float64)
        for batch, start in enumerate(range(0, n_rows, batch_size)):
            print('Predicting batch:', batch)
            stop = start + batch_size
            # The model was trained on log1p(target); expm1 maps back.
            predictions[start:stop] = np.expm1(estimator.predict(test_df.iloc[start:stop]))
            
        submission['meter_reading'] += predictions
        
    # Average over models
    submission['meter_reading'] /= len(models)
    
    # Delete test_df
    del test_df
     
    # Fix negative values
    submission['meter_reading'] = submission['meter_reading'].clip(0,None)

    # Restore row_id
    submission['row_id'] = submission.index
    
    ########################### Check
    print(submission.iloc[:20])
    print(submission['meter_reading'].describe())

In [None]:
########################### Predict
#################################################################################
# Local-test run: average the expm1-transformed predictions of every saved model
# over the test frame, then write the submission CSV.
if LOCAl_TEST:
    
    # Load the test frame
    test_df  = pd.read_pickle('~/Datas/ashre-energy/FE_test_V1_noNan.pkl')
    
    # Keep only the model's feature columns, in training order
    test_df = test_df[features_columns]
    
    # The former `rm test_df.pkl` step was dropped: this cell loads a different
    # file and never creates or reads 'test_df.pkl', so deleting it was a stale
    # leftover that could only remove an unrelated file.
    
    # Read submission file
    submission = pd.read_csv('../../Datas/ashre-energy/sample_submission.csv')

    # Remove row_id for a while
    del submission['row_id']
    
    batch_size = 2000000
    n_rows = len(test_df)
    
    for model_path in models:
        print('Predictions for', model_path)
        # pickle.load can execute arbitrary code: only load model files that
        # were written by this notebook's training cell.
        with open(model_path, 'rb') as model_file:
            estimator = pickle.load(model_file)

        # Fill a preallocated array batch by batch instead of growing a Python
        # list of floats; stepping by batch_size never yields an empty batch.
        predictions = np.empty(n_rows, dtype=np.float64)
        for batch, start in enumerate(range(0, n_rows, batch_size)):
            print('Predicting batch:', batch)
            stop = start + batch_size
            # The model was trained on log1p(target); expm1 maps back.
            predictions[start:stop] = np.expm1(estimator.predict(test_df.iloc[start:stop]))
            
        submission['meter_reading'] += predictions
        
    # Average over models
    submission['meter_reading'] /= len(models)
    
    # Delete test_df
    del test_df
     
    # Fix negative values
    submission['meter_reading'] = submission['meter_reading'].clip(0,None)

    # Restore row_id
    submission['row_id'] = submission.index
    
    ########################### Check
    print(submission.iloc[:20])
    print(submission['meter_reading'].describe())
    submission.to_csv('~/Datas/ashre-energy/submission.csv', index=False)