In [1]:
## import libraries
import numpy as np
np.random.seed(123)

import pandas as pd
import subprocess
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from keras.models import Sequential ,load_model
from keras.layers import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.callbacks import EarlyStopping, ModelCheckpoint
import keras.backend as K


def batch_generator(X, y, batch_size, shuffle):
    """Yield ``(X_batch, y_batch)`` training mini-batches forever.

    Parameters
    ----------
    X : sparse matrix supporting row fancy-indexing and ``.toarray()``
        Feature matrix; each batch is densified right before it is yielded.
    y : array indexable by an integer index array
        Targets, aligned row-for-row with ``X``.
    batch_size : int
        Maximum number of rows per batch. The final batch of a pass holds the
        remaining ``X.shape[0] % batch_size`` rows when that is non-zero.
    shuffle : bool
        When true, the row order is permuted with ``np.random.shuffle`` before
        the first pass and again after every complete pass.

    Yields
    ------
    (numpy.ndarray, array)
        Dense feature rows and the matching targets. The generator never
        terminates; the consumer decides how many batches to draw.
    """
    n_samples = X.shape[0]
    # Integer ceiling division. The previous ``np.ceil(n / batch_size)`` was a
    # floor under Python 2 integer division, so the trailing partial batch was
    # skipped and, when n < batch_size, the wrap-around below never triggered.
    number_of_batches = (n_samples + batch_size - 1) // batch_size
    sample_index = np.arange(n_samples)
    if shuffle:
        np.random.shuffle(sample_index)
    counter = 0
    while True:
        start = batch_size * counter
        batch_index = sample_index[start:start + batch_size]
        X_batch = X[batch_index, :].toarray()
        y_batch = y[batch_index]
        counter += 1
        yield X_batch, y_batch
        # ``>=`` (not ``==``) so the pass also restarts when there are zero rows.
        if counter >= number_of_batches:
            if shuffle:
                np.random.shuffle(sample_index)
            counter = 0

def batch_generatorp(X, batch_size, shuffle):
    """Yield dense feature batches of ``X`` in row order, forever (prediction).

    Parameters
    ----------
    X : sparse matrix supporting row fancy-indexing and ``.toarray()``
        Feature matrix to predict on.
    batch_size : int
        Maximum number of rows per batch; the last batch of a pass may be
        smaller.
    shuffle : bool
        Accepted so the call shape mirrors ``batch_generator`` but ignored:
        rows are always produced in their original order.

    Yields
    ------
    numpy.ndarray
        Dense block of consecutive rows. After the last row the generator
        starts over from row 0 instead of yielding empty arrays.
    """
    n_samples = X.shape[0]
    # Number of batches in one pass (integer ceiling division). The previous
    # expression ``n / ceil(n / batch_size)`` evaluated to roughly the batch
    # size, so the counter reset below was effectively never reached.
    number_of_batches = (n_samples + batch_size - 1) // batch_size
    sample_index = np.arange(n_samples)
    counter = 0
    while True:
        start = batch_size * counter
        batch_index = sample_index[start:start + batch_size]
        X_batch = X[batch_index, :].toarray()
        counter += 1
        yield X_batch
        if counter >= number_of_batches:
            counter = 0



## read data
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')

## shuffle the training rows; the first ten positions are printed before and
## after so the permutation can be eyeballed in the cell output
index = list(train.index)
print(index[0:10])
np.random.shuffle(index)
print(index[0:10])
train = train.iloc[index]

## set test loss to NaN (test rows have no target; the column is added so both
## frames share the same columns when stacked below)
test['loss'] = np.nan

## response and IDs
# The model is trained on log(loss + 200); predictions are mapped back with
# exp(.) - 200 further down.
y = np.log(train['loss'].values+200)
id_train = train['id'].values
id_test = test['id'].values

## stack train test (train rows first, so the first `ntrain` rows are train)
ntrain = train.shape[0]
tr_te = pd.concat((train, test), axis = 0)

## Preprocessing and transforming to sparse data
# Each feature group is turned into a CSR block; the blocks are glued together
# column-wise once all of them are ready.
sparse_data = []

# Categorical columns (name contains 'cat'): one-hot encode, one block each.
f_cat = [f for f in tr_te.columns if 'cat' in f]
for f in f_cat:
    dummy = pd.get_dummies(tr_te[f].astype('category'))
    sparse_data.append(csr_matrix(dummy))

# Continuous columns (name contains 'cont'): standardise them together and
# append as a single block. The scaler is fitted on the stacked train+test rows.
f_num = [f for f in tr_te.columns if 'cont' in f]
scaler = StandardScaler()
scaled_cont = csr_matrix(scaler.fit_transform(tr_te[f_num]))
sparse_data.append(scaled_cont)

# The raw frames are no longer needed once every block has been built.
del tr_te, train, test

## sparse train and test data
# Rows [0, ntrain) are the training rows, the remainder are the test rows.
xtr_te = hstack(sparse_data, format = 'csr')
xtrain, xtest = xtr_te[:ntrain, :], xtr_te[ntrain:, :]

print('Dim train', xtrain.shape)
print('Dim test', xtest.shape)

# Drop the intermediates; only xtrain / xtest are used from here on.
del xtr_te, sparse_data, scaled_cont

Using Theano backend.
Using gpu device 0: Tesla K80 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 5105)


[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
[62418, 170809, 45900, 64448, 81993, 43058, 128975, 181839, 7545, 3564]
('Dim train', (188318, 1190))
('Dim test', (125546, 1190))


In [2]:
## neural net

def mae(y_true, y_pred):
    """Mean absolute error measured after exponentiating both tensors.

    The network is fitted on ``log(loss + 200)`` targets; exponentiating both
    sides puts the error back on the shifted loss scale, and the constant
    shift cancels in the difference.
    """
    abs_error = K.abs(K.exp(y_true) - K.exp(y_pred))
    return K.mean(abs_error)

# def mae(y_true, y_pred):
#     return K.mean(K.abs((y_true) - (y_pred)))

def nn_model():
    """Build and compile the feed-forward regression network.

    Four hidden stages of Dense -> PReLU -> BatchNormalization -> Dropout
    (400/200/100/50 units) feed a single linear output unit. The input width
    is taken from the module-level ``xtrain``. The model is compiled with the
    'mae' loss, the 'adadelta' optimizer and the custom ``mae`` metric.

    Returns
    -------
    A compiled ``Sequential`` model with freshly initialised weights.
    """
    # (units, dropout rate) for each hidden stage, in network order.
    hidden_spec = [(400, 0.4), (200, 0.2), (100, 0.2), (50, 0.2)]

    net = Sequential()
    for depth, (units, drop_rate) in enumerate(hidden_spec):
        if depth == 0:
            # Only the first layer declares the input dimensionality.
            net.add(Dense(units, input_dim = xtrain.shape[1], init = 'he_normal'))
        else:
            net.add(Dense(units, init = 'he_normal'))
        net.add(PReLU())
        net.add(BatchNormalization())
        net.add(Dropout(drop_rate))

    net.add(Dense(1, init = 'he_normal'))
    net.compile(loss = 'mae', optimizer = 'adadelta', metrics = [mae])
    return net

In [36]:
## cv-folds

nfolds = 10
folds = KFold(len(y), n_folds = nfolds, shuffle = True, random_state = 111) #was 111

## train models
nbags = 10
nepochs = 80
train_batch_size = 128     # rows per mini-batch while fitting
predict_batch_size = 800   # rows per mini-batch while predicting
pred_oob = np.zeros(xtrain.shape[0])    # out-of-fold predictions, original loss scale
pred_test = np.zeros(xtest.shape[0])    # running SUM over all folds and bags

# The score log is opened in append mode inside a `with` block so the handle is
# closed even when the (long) run is interrupted part-way through.
with open('temp_scores9.txt', 'a') as partial_evalutaion:
    for fold_no, (inTr, inTe) in enumerate(folds, 1):
        xtr = xtrain[inTr]
        ytr = y[inTr]
        xte = xtrain[inTe]
        yte = y[inTe]
        # Validation inputs/targets do not change between bags: build them once
        # per fold instead of once per bag.
        xte_dense = xte.toarray()
        yte_loss = np.exp(yte) - 200    # undo the log(loss + 200) transform
        pred = np.zeros(xte.shape[0])
        for j in range(nbags):
            print(j)
            tag = str(fold_no) + '_' + str(j+1)
            checkpoint_path = 'keras-regressor-' + tag + '.check'
            model = nn_model()

            # Stop after 10 epochs without improvement and keep only the
            # weights with the best validation loss on disk.
            callsback_list = [EarlyStopping(patience=10),
                              ModelCheckpoint(checkpoint_path, monitor='val_loss',
                                              save_best_only=True, verbose=0)]

            model.fit_generator(generator = batch_generator(xtr, ytr, train_batch_size, True),
                                nb_epoch = nepochs,
                                samples_per_epoch = xtr.shape[0],
                                verbose = 0,
                                validation_data = (xte_dense, yte),
                                callbacks = callsback_list)

            # Predict with the checkpointed model, not with the last epoch.
            best_model = load_model(checkpoint_path)

            loss = np.exp(best_model.predict_generator(
                generator = batch_generatorp(xte, predict_batch_size, False),
                val_samples = xte.shape[0])[:,0]) - 200

            score_loss = mean_absolute_error(yte_loss, loss)

            pred += loss
            pred_test += np.exp(best_model.predict_generator(
                generator = batch_generatorp(xtest, predict_batch_size, False),
                val_samples = xtest.shape[0])[:,0]) - 200

            # Separator added: the bag number and the score used to be glued
            # together (e.g. 'Fold 1_11143.93'), which made the log ambiguous.
            bag_line = 'Fold ' + tag + ' - MAE: ' + str(score_loss)
            print(bag_line)
            partial_evalutaion.write(bag_line + '\n')
            partial_evalutaion.flush()

        # Average the bags to obtain this fold's out-of-fold prediction.
        pred /= nbags
        pred_oob[inTe] = pred
        score = mean_absolute_error(yte_loss, pred)

        print('Fold ', fold_no, '- MAE:', score)

        partial_evalutaion.write('Fold '+ str(fold_no) + '- MAE:'+ str(score)+'\n')
        partial_evalutaion.flush()

print('Total - MAE:', mean_absolute_error(np.exp(y)-200, pred_oob))

0
Fold 1_11143.932077
1
Fold 1_21141.58092922
2
Fold 1_31141.26053636
3
Fold 1_41140.46089725
4
Fold 1_51141.2743266
5
Fold 1_61142.78512442
6
Fold 1_71139.83133779
7
Fold 1_81142.30934976
8
Fold 1_91138.77190929
9
Fold 1_101161.05874151
('Fold ', 1, '- MAE:', 1135.3454871735294)
0
Fold 2_11158.56330909
1
Fold 2_21152.16611747
2
Fold 2_31199.44050397
3
Fold 2_41156.3822877
4
Fold 2_51156.18950129
5
Fold 2_61189.60851005
6
Fold 2_71167.52455878
7
Fold 2_81157.36642272
8
Fold 2_91158.75024189
9
Fold 2_101181.34234616
('Fold ', 2, '- MAE:', 1157.860703419675)
0
Fold 3_11145.55367288
1
Fold 3_21149.18721354
2
Fold 3_31147.08387396
3
Fold 3_41147.53908031
4
Fold 3_51146.48064462
5
Fold 3_61144.73082441
6
Fold 3_71152.92186021
7
Fold 3_81148.3623074
8
Fold 3_91146.01431125
9
Fold 3_101147.7026388
('Fold ', 3, '- MAE:', 1139.7899461290201)
0
Fold 4_11134.34362674
1
Fold 4_21129.6796282
2
Fold 4_31162.62033108
3
Fold 4_41128.41460146
4
Fold 4_51128.56961257
5
Fold 4_61127.43848664
6
Fold 4_711

In [37]:
# Overall out-of-fold MAE on the original loss scale; also embedded in the
# output file names below.
total_score = mean_absolute_error(np.exp(y)-200, pred_oob)

## train predictions
oob_df = pd.DataFrame({'id': id_train, 'loss': pred_oob})
oob_df.to_csv('preds_oob' +str(total_score) + '.csv', index = False)

## test predictions
# `pred_test` holds the sum over every fold/bag model. The average goes into a
# new array instead of dividing in place, so re-running this cell does not
# shrink the predictions a second time.
pred_test_mean = pred_test / float(nfolds*nbags)
submission_df = pd.DataFrame({'id': id_test, 'loss': pred_test_mean})
submission_df.to_csv('submission_keras_shift_perm'+str(total_score) + '.csv', index = False)