In [1]:
import os
import pandas as pd
import numpy as np
import gc
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
try:
    import cPickle as pickle
except:
    import pickle
import time
from scipy.stats import skew, kurtosis

In [2]:
DATA_PATH = 'data/'
PRED_TEST_PATH = ''
PRED_TRAIN_PATH = ''
FOLDS_PATH = ''
MODEL_NAME = 'izmajlovkonstantin'

In [3]:
RANDOM_STATE = 22
np.random.seed(RANDOM_STATE)

In [4]:
train_df = pd.read_csv(f'{DATA_PATH}train.csv')
test_df = pd.read_csv(f'{DATA_PATH}test.csv')

In [5]:
y = np.log1p(train_df.target.values)
y.shape

(4459,)

In [6]:
columns_to_use = test_df.columns.tolist()
del columns_to_use[0] # Remove 'ID'
X = train_df[columns_to_use]
test = test_df[columns_to_use]

In [7]:
constant_columns = train_df.loc[:, (train_df == train_df.iloc[0]).all()].columns.tolist()
correlated_columns = ['bba402827',
 'd60ddde1b',
 '33ed23348',
 '912836770',
 'acc5b709d',
 'f8d75792f',
 '22c933b9b',
 'f333a5f60']
print(len(constant_columns))

256


In [8]:
def find_too_freq_values(thresh=None, constant_value=0): 
    cols_with_too_freq_values= []
    if thresh is None:
        thresh =0.98
    for column in train_df.columns:
        counts = train_df[column].value_counts()
        try:
            counts[constant_value]
        except KeyError:
            continue            
        value_fraction = counts[constant_value] / len(train_df)
        if value_fraction > thresh:
            cols_with_too_freq_values.append(column)
    return cols_with_too_freq_values

In [9]:
%%time
cols_with_too_freq_values = find_too_freq_values()
print(len(cols_with_too_freq_values))

2870
Wall time: 3.96 s


In [10]:
cols_to_remove = list(set(constant_columns)|set(correlated_columns))
len(cols_to_remove)

264

In [11]:
columns_to_use = test_df.columns.tolist()[1:] # Remove 'ID'
columns_to_use = [x for x in columns_to_use if x not in cols_to_remove]

In [12]:
X = train_df[columns_to_use]
test = test_df[columns_to_use]

In [13]:
from sklearn.decomposition import PCA, TruncatedSVD, FastICA, FactorAnalysis
from sklearn.random_projection  import SparseRandomProjection, GaussianRandomProjection

In [14]:
random_state = 13
feat_extractors_dict  = {'pca': PCA(n_components=0.9, random_state=random_state), 
                          'tsvd': TruncatedSVD(n_components = 50, n_iter=10, random_state=random_state), 
                          'fa': FactorAnalysis(n_components=50, random_state=random_state), 
                          'gauss': GaussianRandomProjection(n_components=50, eps=0.1, random_state=random_state), 
                          'srp': SparseRandomProjection(n_components=50, random_state=random_state)
                         }

def create_dim_reduction_feats(df, train=False):
    full_X_arr = []
    all_cols = []
    if train:
        for k, v in feat_extractors_dict.items():
            print(f'Process {k}')
            X_arr = v.fit_transform(df)
            n_components = X_arr.shape[1]
            curr_cols = [str(k)+f'{i}' for i in range(n_components)]
            all_cols += curr_cols
            if len(full_X_arr) ==0:
                full_X_arr = X_arr
            else:
                full_X_arr = np.hstack((full_X_arr, X_arr))
    else:
        for k, v in feat_extractors_dict.items():
            print(f'Process {k}')
            X_arr = v.transform(df)
            n_components = X_arr.shape[1]
            curr_cols = [str(k)+f'{i}' for i in range(n_components)]
            all_cols += curr_cols
            if len(full_X_arr) ==0:
                full_X_arr = X_arr
            else:
                full_X_arr = np.hstack((full_X_arr, X_arr))

    new_df = pd.DataFrame(full_X_arr, columns=all_cols, index=df.index)
    return new_df

In [15]:
%%time
train_dim_reduction = create_dim_reduction_feats(X, train=True)
test_dim_reduction = create_dim_reduction_feats(test)

Process pca
Process tsvd
Process fa
Process gauss
Process srp
Process pca
Process tsvd
Process fa
Process gauss
Process srp
Wall time: 1min 59s


In [20]:
def aggregate_row(row):
    non_zero_values = row.iloc[row.nonzero()]
#     print(non_zero_values)
    if len(non_zero_values.value_counts())>1:
        aggs = {'non_zero_mean': non_zero_values.mean(),
                'non_zero_std': non_zero_values.std(),
                'non_zero_max': non_zero_values.max(),
                'non_zero_min': non_zero_values.min(),
                'non_zero_sum': non_zero_values.sum(),
                'non_zero_skewness': skew(non_zero_values),
                'non_zero_kurtosis': kurtosis(non_zero_values),
                'non_zero_median': non_zero_values.median(),
                'non_zero_q1': np.percentile(non_zero_values, q=25),
                'non_zero_q3': np.percentile(non_zero_values, q=75),
                'non_zero_log_mean': np.log1p(non_zero_values.astype('float64')).mean(),
                'non_zero_log_std': np.log1p(non_zero_values.astype('float64')).std(),
                'non_zero_log_max': np.log1p(non_zero_values.astype('float64')).max(),
                'non_zero_log_min': np.log1p(non_zero_values.astype('float64')).min(),
                'non_zero_log_sum': np.log1p(non_zero_values.astype('float64')).sum(),
                'non_zero_log_skewness': skew(np.log1p(non_zero_values.astype('float64'))),
                'non_zero_log_kurtosis':  kurtosis(np.log1p(non_zero_values.astype('float64'))) ,
                'non_zero_log_median': np.log1p(non_zero_values.astype('float64')).median(),
                'non_zero_log_q1': np.percentile(np.log1p(non_zero_values.astype('float64')), q=25),
                'non_zero_log_q3': np.percentile(np.log1p(non_zero_values.astype('float64')), q=75),
                'non_zero_count': non_zero_values.count(),
                'non_zero_fraction': non_zero_values.count() / row.count()
                }
    else:
        aggs = {'non_zero_mean': -999999,
                'non_zero_std':-999999,
                'non_zero_max': -999999,
                'non_zero_min': -999999,
                'non_zero_sum': -999999,
                'non_zero_skewness': -999999,
                'non_zero_kurtosis': -999999,
                'non_zero_median': -999999,
                'non_zero_q1': -999999,
                'non_zero_q3':-999999,
                'non_zero_log_mean': -999999,
                'non_zero_log_std': -999999,
                'non_zero_log_max': -999999,
                'non_zero_log_min': -999999,
                'non_zero_log_sum': -999999,
                'non_zero_log_skewness': -999999,
                'non_zero_log_kurtosis': -999999,
                'non_zero_log_median': -999999,
                'non_zero_log_q1': -999999,
                'non_zero_log_q3':-999999,
                'non_zero_count': -999999,
                'non_zero_fraction': -999999
                }
            
    return pd.Series(aggs)

In [21]:
def transform(X):
    X_agg = X.apply(aggregate_row, axis=1)
    return X_agg

In [22]:
%%time
df_with_row_statistic_train = transform(train_df.iloc[:, 2:])
df_with_row_statistic_test = transform(test_df.iloc[:, 1:])

Wall time: 3min 33s


In [23]:
X_n = pd.concat((X,df_with_row_statistic_train),axis = 1)
X_n = pd.concat((X_n,train_dim_reduction),axis = 1)

In [24]:
test_n = pd.concat((test,df_with_row_statistic_test),axis = 1)
test_n = pd.concat((test_n,test_dim_reduction),axis = 1)

In [25]:
def generate_features(dframe,df):
    interactions = []
    for i in dframe.columns:
        for j in dframe.columns:
            if i==j:
                continue
            else:
                list_of_indexes = [i,j]
                list_of_indexes.sort()
                if list_of_indexes not in interactions:
                    interactions.append(list_of_indexes)
   
    for cols in interactions:
            col1 = cols[0]
            col2 = cols[1]

            name = col1 + "/" + (col2)
            feature_interactions.append(name)
            df = pd.concat([df, pd.Series(dframe.loc[:,col1] /(1+ dframe.loc[:,col2]), name=name)], axis=1)
                
    return df


log_feats =['26ab20ff9',
 '6786ea46d',
 '15ace8c9f',
 'b6fa5a5fd',
 'a72e0bf30',
 'fb387ea33',
 'f190486d6',
 '9fd594eec',
 '251d1aa17',
 '1c71183bb',
 '5e1085022',
 '5c6487af1',
 'b58127585',
 '37f57824c',
 '3bdee45be',
 '08e89cc54',
 '3e1100230',
 '91f701ba2',
 '66ace2992',
 'b791ce9aa']


logic = X_n.loc[:, log_feats]

feature_interactions = []

X_n = generate_features(logic,X_n)  

logic = test_n.loc[:, log_feats]

feature_interactions = []

test_n = generate_features(logic,test_n)   

In [54]:
from boostaroota import BoostARoota
br = BoostARoota(metric='rmse')
br.fit(X_n, y)

Round:  1  iteration:  1
Round:  1  iteration:  2
Round:  1  iteration:  3
Round:  1  iteration:  4
Round:  1  iteration:  5
Round:  1  iteration:  6
Round:  1  iteration:  7
Round:  1  iteration:  8
Round:  1  iteration:  9
Round:  1  iteration:  10
Round:  2  iteration:  1
Round:  2  iteration:  2
Round:  2  iteration:  3
Round:  2  iteration:  4
Round:  2  iteration:  5
Round:  2  iteration:  6
Round:  2  iteration:  7
Round:  2  iteration:  8
Round:  2  iteration:  9
Round:  2  iteration:  10
Round:  3  iteration:  1
Round:  3  iteration:  2
Round:  3  iteration:  3
Round:  3  iteration:  4
Round:  3  iteration:  5
Round:  3  iteration:  6
Round:  3  iteration:  7
Round:  3  iteration:  8
Round:  3  iteration:  9
Round:  3  iteration:  10
Round:  4  iteration:  1
Round:  4  iteration:  2
Round:  4  iteration:  3
Round:  4  iteration:  4
Round:  4  iteration:  5
Round:  4  iteration:  6
Round:  4  iteration:  7
Round:  4  iteration:  8
Round:  4  iteration:  9
Round:  4  iteration: 

<boostaroota.boostaroota.BoostARoota at 0x1e2291d1748>

In [55]:
remaining_vars = list(br.keep_vars_)

In [75]:
pd.DataFrame(remaining_vars).to_csv('remaining_vars_4.csv',index = False)

In [49]:
def run_lgb(train_X, train_y, val_X, val_y, test_X):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        'max_depth': 8, # -1,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.7,
        "feature_fraction" : 0.1, # 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 44,
        "verbosity" : -1,
        'num_threads' : 4,
        "seed": 44,
        "nthread" : 5
    }
    
    start_time = time.time()
    lgtrain = lgb.Dataset(train_X, label=train_y)
    lgval = lgb.Dataset(val_X, label=val_y)
    model = lgb.train(params, lgtrain, 5000, 
                      valid_sets=[lgtrain, lgval], 
                      early_stopping_rounds=100, 
                      verbose_eval=150)
    print('Model training done in {} seconds.'.format(time.time() - start_time))
    
    pred_test_y = np.expm1(model.predict(test_X, num_iteration=model.best_iteration))
    pred_oof_log = model.predict(val_X, num_iteration=model.best_iteration)
    return pred_test_y, pred_oof_log, model

In [50]:
def run_calculations(X, test, big_cv_folds, func_name = None):
    if not func_name:
        return print('The function to run is not defined')
    else:
        y_oof_20_preds = []
        fold_errors_20_preds =[]
        avg_test_pred_20_preds = []
        
        for ind, cv_folds in enumerate(big_cv_folds):
            print('Fitting big fold', ind+1, 'out of', len(big_cv_folds))
            y_oof = np.zeros((y.shape[0]))
            fold_errors =[]
            pred_test_list = []
            
            for i, (train_index, val_index) in enumerate(cv_folds):
                print('Fitting sub fold', i+1, 'out of', len(cv_folds))
                X_train, X_val  = X.iloc[train_index], X.iloc[val_index]
                y_train, y_val = y[train_index], y[val_index]

                # part to include additional functions
                if func_name == 'lgb':
                    pred_test_y, pred_oof_log, clf = run_lgb(X_train, y_train, X_val, y_val, test)
                else:
                    return print('The function to run is not correct')

                y_oof[val_index] = pred_oof_log
                curr_fe = np.sqrt(mean_squared_error(y_val, pred_oof_log))
                print(f'Fold error {curr_fe}')
                fold_errors.append(curr_fe)
                pred_test_list.append(list(pred_test_y))

            print('Total error', np.sqrt(mean_squared_error(y, y_oof)))
            total_fe_std = round(np.std(fold_errors), 5)
            print(f'Total std {total_fe_std}')
            avg_test_pred = np.mean(pred_test_list, axis=0)
            
            avg_test_pred_20_preds.append(avg_test_pred)
            fold_errors_20_preds.append(fold_errors)
            y_oof_20_preds.append(y_oof)
            
        return y_oof_20_preds, avg_test_pred_20_preds, fold_errors_20_preds

In [45]:
with open(f'{FOLDS_PATH}custom_cv.pkl', 'rb') as f:
        cv_folds = pickle.load(f)

In [63]:
X_n[remaining_vars].shape

(4459, 460)

In [58]:
%%time
y_oof_lgb, pred_test_list_lgb, fold_errors = run_calculations(X_n[remaining_vars], test_n[remaining_vars], cv_folds, 'lgb')

Fitting big fold 1 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.37333	valid_1's rmse: 1.48188
[300]	training's rmse: 1.16663	valid_1's rmse: 1.3859
[450]	training's rmse: 1.03466	valid_1's rmse: 1.35263
[600]	training's rmse: 0.93747	valid_1's rmse: 1.34076
[750]	training's rmse: 0.855625	valid_1's rmse: 1.33541
[900]	training's rmse: 0.785514	valid_1's rmse: 1.33193
[1050]	training's rmse: 0.726519	valid_1's rmse: 1.33053
[1200]	training's rmse: 0.670614	valid_1's rmse: 1.32983
[1350]	training's rmse: 0.623805	valid_1's rmse: 1.32913
[1500]	training's rmse: 0.580083	valid_1's rmse: 1.32845
Early stopping, best iteration is:
[1497]	training's rmse: 0.580998	valid_1's rmse: 1.32842
Model training done in 13.497329235076904 seconds.
Fold error 1.3284191745053764
Fitting sub fold 2 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.40385	valid_1's rmse: 1.4507
[30

[1200]	training's rmse: 0.780908	valid_1's rmse: 1.30117
Early stopping, best iteration is:
[1176]	training's rmse: 0.788456	valid_1's rmse: 1.30085
Model training done in 12.663126230239868 seconds.
Fold error 1.3008496250324584
Total error 1.327615896386753
Total std 0.04578
Fitting big fold 3 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.37842	valid_1's rmse: 1.47649
[300]	training's rmse: 1.17211	valid_1's rmse: 1.3815
[450]	training's rmse: 1.04226	valid_1's rmse: 1.3484
[600]	training's rmse: 0.945393	valid_1's rmse: 1.33613
[750]	training's rmse: 0.865403	valid_1's rmse: 1.33119
[900]	training's rmse: 0.795207	valid_1's rmse: 1.32875
[1050]	training's rmse: 0.732307	valid_1's rmse: 1.32736
[1200]	training's rmse: 0.675215	valid_1's rmse: 1.32683
Early stopping, best iteration is:
[1161]	training's rmse: 0.689209	valid_1's rmse: 1.32638
Model training done in 11.151170253753662 seconds.
Fold error 1.3

[450]	training's rmse: 1.08865	valid_1's rmse: 1.37334
[600]	training's rmse: 1.006	valid_1's rmse: 1.362
[750]	training's rmse: 0.937953	valid_1's rmse: 1.35602
[900]	training's rmse: 0.877597	valid_1's rmse: 1.35294
[1050]	training's rmse: 0.822751	valid_1's rmse: 1.35124
[1200]	training's rmse: 0.773188	valid_1's rmse: 1.35041
Early stopping, best iteration is:
[1193]	training's rmse: 0.775619	valid_1's rmse: 1.3503
Model training done in 12.609271049499512 seconds.
Fold error 1.3503034116641597
Total error 1.3293185634542535
Total std 0.01777
Fitting big fold 5 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.36876	valid_1's rmse: 1.49293
[300]	training's rmse: 1.16288	valid_1's rmse: 1.4022
[450]	training's rmse: 1.03233	valid_1's rmse: 1.37018
[600]	training's rmse: 0.937095	valid_1's rmse: 1.35863
[750]	training's rmse: 0.856618	valid_1's rmse: 1.35341
[900]	training's rmse: 0.788286	valid_1's rmse: 1.3

Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.37392	valid_1's rmse: 1.47585
[300]	training's rmse: 1.16791	valid_1's rmse: 1.38158
[450]	training's rmse: 1.03678	valid_1's rmse: 1.34788
[600]	training's rmse: 0.940775	valid_1's rmse: 1.33519
[750]	training's rmse: 0.861165	valid_1's rmse: 1.32991
[900]	training's rmse: 0.790906	valid_1's rmse: 1.32719
[1050]	training's rmse: 0.729732	valid_1's rmse: 1.32499
[1200]	training's rmse: 0.674522	valid_1's rmse: 1.32367
[1350]	training's rmse: 0.625171	valid_1's rmse: 1.32262
[1500]	training's rmse: 0.579936	valid_1's rmse: 1.32209
Early stopping, best iteration is:
[1515]	training's rmse: 0.57563	valid_1's rmse: 1.32202
Model training done in 13.227648973464966 seconds.
Fold error 1.3220196656169263
Fitting sub fold 2 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.40052	valid_1's rmse: 1.49732
[300]	training's rmse: 1.22357	valid_1's rmse: 1.39132
[45

[1050]	training's rmse: 0.828167	valid_1's rmse: 1.31985
[1200]	training's rmse: 0.778281	valid_1's rmse: 1.31859
[1350]	training's rmse: 0.732687	valid_1's rmse: 1.31721
[1500]	training's rmse: 0.691095	valid_1's rmse: 1.31607
[1650]	training's rmse: 0.65155	valid_1's rmse: 1.31481
[1800]	training's rmse: 0.614615	valid_1's rmse: 1.3144
Early stopping, best iteration is:
[1781]	training's rmse: 0.619062	valid_1's rmse: 1.31428
Model training done in 17.400453329086304 seconds.
Fold error 1.3142804099985645
Total error 1.3295313930748704
Total std 0.0186
Fitting big fold 9 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.36575	valid_1's rmse: 1.49502
[300]	training's rmse: 1.15759	valid_1's rmse: 1.40672
[450]	training's rmse: 1.0251	valid_1's rmse: 1.37598
[600]	training's rmse: 0.927601	valid_1's rmse: 1.36571
[750]	training's rmse: 0.846726	valid_1's rmse: 1.3614
[900]	training's rmse: 0.775496	valid_1's rm

[450]	training's rmse: 1.08499	valid_1's rmse: 1.39592
[600]	training's rmse: 1.00148	valid_1's rmse: 1.38277
[750]	training's rmse: 0.932549	valid_1's rmse: 1.37688
[900]	training's rmse: 0.873165	valid_1's rmse: 1.37409
[1050]	training's rmse: 0.820457	valid_1's rmse: 1.37172
[1200]	training's rmse: 0.770823	valid_1's rmse: 1.36995
[1350]	training's rmse: 0.726154	valid_1's rmse: 1.36849
[1500]	training's rmse: 0.684575	valid_1's rmse: 1.36801
[1650]	training's rmse: 0.644357	valid_1's rmse: 1.36711
Early stopping, best iteration is:
[1645]	training's rmse: 0.645612	valid_1's rmse: 1.36702
Model training done in 15.933378219604492 seconds.
Fold error 1.3670188238686782
Total error 1.3220313559644286
Total std 0.05458
Fitting big fold 11 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.364	valid_1's rmse: 1.50687
[300]	training's rmse: 1.15982	valid_1's rmse: 1.41031
[450]	training's rmse: 1.0325	valid_1's rm

[1650]	training's rmse: 0.649925	valid_1's rmse: 1.35031
[1800]	training's rmse: 0.614091	valid_1's rmse: 1.34944
[1950]	training's rmse: 0.581717	valid_1's rmse: 1.34896
[2100]	training's rmse: 0.54905	valid_1's rmse: 1.34843
[2250]	training's rmse: 0.519015	valid_1's rmse: 1.34828
Early stopping, best iteration is:
[2198]	training's rmse: 0.528977	valid_1's rmse: 1.34806
Model training done in 20.06234121322632 seconds.
Fold error 1.348057946304623
Total error 1.3247837470685224
Total std 0.01391
Fitting big fold 13 out of 20
Fitting sub fold 1 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.36707	valid_1's rmse: 1.49396
[300]	training's rmse: 1.15823	valid_1's rmse: 1.40193
[450]	training's rmse: 1.02519	valid_1's rmse: 1.36903
[600]	training's rmse: 0.92678	valid_1's rmse: 1.35676
[750]	training's rmse: 0.84687	valid_1's rmse: 1.35081
[900]	training's rmse: 0.77787	valid_1's rmse: 1.34806
[1050]	training's rmse: 0.716789	valid_1's rm

[600]	training's rmse: 1.01327	valid_1's rmse: 1.33963
[750]	training's rmse: 0.944583	valid_1's rmse: 1.33149
[900]	training's rmse: 0.884664	valid_1's rmse: 1.32696
[1050]	training's rmse: 0.832736	valid_1's rmse: 1.32477
[1200]	training's rmse: 0.784111	valid_1's rmse: 1.32299
[1350]	training's rmse: 0.739319	valid_1's rmse: 1.32061
[1500]	training's rmse: 0.698702	valid_1's rmse: 1.31897
[1650]	training's rmse: 0.659531	valid_1's rmse: 1.31824
[1800]	training's rmse: 0.623401	valid_1's rmse: 1.31741
[1950]	training's rmse: 0.589344	valid_1's rmse: 1.31679
[2100]	training's rmse: 0.557277	valid_1's rmse: 1.31566
[2250]	training's rmse: 0.526967	valid_1's rmse: 1.31501
[2400]	training's rmse: 0.498321	valid_1's rmse: 1.31467
[2550]	training's rmse: 0.47204	valid_1's rmse: 1.31458
Early stopping, best iteration is:
[2505]	training's rmse: 0.479775	valid_1's rmse: 1.3145
Model training done in 24.71419644355774 seconds.
Fold error 1.3145003389057641
Total error 1.3283620535946883
Total

[1950]	training's rmse: 0.620619	valid_1's rmse: 1.27794
Early stopping, best iteration is:
[1974]	training's rmse: 0.615267	valid_1's rmse: 1.27783
Model training done in 24.019810914993286 seconds.
Fold error 1.2778268462864748
Fitting sub fold 4 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.39546	valid_1's rmse: 1.47874
[300]	training's rmse: 1.21184	valid_1's rmse: 1.39009
[450]	training's rmse: 1.09905	valid_1's rmse: 1.36208
[600]	training's rmse: 1.01821	valid_1's rmse: 1.35081
[750]	training's rmse: 0.953112	valid_1's rmse: 1.3451
[900]	training's rmse: 0.895831	valid_1's rmse: 1.3416
[1050]	training's rmse: 0.843766	valid_1's rmse: 1.33884
[1200]	training's rmse: 0.796168	valid_1's rmse: 1.33799
[1350]	training's rmse: 0.751738	valid_1's rmse: 1.33711
[1500]	training's rmse: 0.711177	valid_1's rmse: 1.3365
[1650]	training's rmse: 0.67169	valid_1's rmse: 1.33592
Early stopping, best iteration is:
[1673]	training's rmse: 0.66608

[600]	training's rmse: 1.02324	valid_1's rmse: 1.33695
[750]	training's rmse: 0.956392	valid_1's rmse: 1.32801
[900]	training's rmse: 0.899109	valid_1's rmse: 1.32352
[1050]	training's rmse: 0.847142	valid_1's rmse: 1.32145
[1200]	training's rmse: 0.798086	valid_1's rmse: 1.31922
[1350]	training's rmse: 0.753602	valid_1's rmse: 1.31737
[1500]	training's rmse: 0.712468	valid_1's rmse: 1.31709
Early stopping, best iteration is:
[1534]	training's rmse: 0.703343	valid_1's rmse: 1.31663
Model training done in 19.1785147190094 seconds.
Fold error 1.3166259346330778
Fitting sub fold 5 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.39238	valid_1's rmse: 1.4849
[300]	training's rmse: 1.20743	valid_1's rmse: 1.38794
[450]	training's rmse: 1.09432	valid_1's rmse: 1.3539
[600]	training's rmse: 1.01126	valid_1's rmse: 1.33976
[750]	training's rmse: 0.942444	valid_1's rmse: 1.3323
[900]	training's rmse: 0.881686	valid_1's rmse: 1.32946
[1050]	trainin

Early stopping, best iteration is:
[1584]	training's rmse: 0.707885	valid_1's rmse: 1.27763
Model training done in 19.63164210319519 seconds.
Fold error 1.2776275078106472
Fitting sub fold 4 out of 5
Training until validation scores don't improve for 100 rounds.
[150]	training's rmse: 1.39429	valid_1's rmse: 1.50087
[300]	training's rmse: 1.21255	valid_1's rmse: 1.39889
[450]	training's rmse: 1.10151	valid_1's rmse: 1.3607
[600]	training's rmse: 1.01956	valid_1's rmse: 1.3465
[750]	training's rmse: 0.951831	valid_1's rmse: 1.33819
[900]	training's rmse: 0.892624	valid_1's rmse: 1.33507
[1050]	training's rmse: 0.838002	valid_1's rmse: 1.33323
[1200]	training's rmse: 0.787748	valid_1's rmse: 1.33077
[1350]	training's rmse: 0.742686	valid_1's rmse: 1.32965
[1500]	training's rmse: 0.701872	valid_1's rmse: 1.32853
Early stopping, best iteration is:
[1477]	training's rmse: 0.708296	valid_1's rmse: 1.32819
Model training done in 18.655680894851685 seconds.
Fold error 1.328185234095416
Fitting

In [None]:
print('Total error',np.mean(([np.mean(x) for x in fold_errors])))
print('Total std ',np.mean(([np.std (x) for x in fold_errors])))

In [60]:
print('Length of test predictions:', len(pred_test_list_lgb))
avg_pred_test_list_lgb = np.mean(pred_test_list_lgb, axis=0)
print('Length of avg test predictions:', len(avg_pred_test_list_lgb))

Length of test predictions: 20
Length of avg test predictions: 49342


In [62]:
# 20x oof train preds
with open(os.path.join(DATA_PATH, 'izmaylov_20folds_train_cv1323_std0021.pkl'), 'wb') as f:
    pickle.dump(y_oof_lgb, f)
    
#20x test preds
with open(os.path.join(DATA_PATH, 'izmaylov_20folds_test_cv1323_std0021.pkl'), 'wb') as f:
    pickle.dump(pred_test_list_lgb, f)