In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import theano
import theano.tensor as T
import lasagne as L

Load train sample from json file to pandas dataframe:

In [2]:
df_train = pd.read_json('data/processed/train.json')

Extract images from dataframe:

In [3]:
mean_angle = np.mean(df_train[df_train['inc_angle'] != 'na']['inc_angle'])

In [4]:
def get_preprocessed_angle(row):
    angle = row['inc_angle']
    float_angle = mean_angle if angle == 'na' else float(angle)
    return np.pi / 180 * float_angle

In [5]:
def get_scaled_imgs(df):
    imgs = []
    
    for i, row in df.iterrows():
        band_1 = np.array(row['band_1']).reshape(75, 75)
        band_2 = np.array(row['band_2']).reshape(75, 75)
        
        sum_band = band_1 + band_2
        
        
        #band_1 = (band_1 - band_1.min() - 0.5) / (band_1.max() - band_1.min())
        #band_2 = (band_2 - band_2.min() - 0.5) / (band_2.max() - band_2.min())

        imgs.append(np.array([band_1, band_2,
                              #composite, 
                              sum_band, 
                             # prod_band_1, prod_band_2
                             ]))

    return np.array(imgs)

In [6]:
X_train, y_train = get_scaled_imgs(df_train), np.array(df_train['is_iceberg'], dtype=np.int32)

In [7]:
CHANNELS = X_train.shape[1]

In [11]:
def make_more_data(X_train, y_train):
    def rot90(img, n):
        result = np.copy(img)    
        for _ in range(n):
            result = [np.rot90(result[channel]) for channel in range(len(result))]
        return np.array(result)
    
    def flip(img):
        return np.array([np.flip(img[channel], 0) for channel in range(len(img))])
    
    X_result = []
    y_result = []
    
    for X, y in zip(X_train, y_train):
        X_result.append(rot90(X, 0))
        X_result.append(rot90(X, 1))
        X_result.append(rot90(X, 2))
        X_result.append(rot90(X, 3))
        X_result.append(flip(rot90(X, 0)))
        X_result.append(flip(rot90(X, 0)))
        X_result.append(flip(rot90(X, 0)))
        X_result.append(flip(rot90(X, 0)))
        
        for _ in range(8):
            y_result.append(y)
        
        
    return np.array(X_result), np.array(y_result)

In [12]:
X_train, y_train = make_more_data(X_train, y_train)

In [13]:
X_train.shape

(12832, 3, 75, 75)

Split into train and validation parts: 

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

Network. We use CNN with the following architecture: input -> (conv -> conv -> pool) x 3 -> dense -> dense. For conv layers, filter size is 3, stride is 1 everywhere except the first conv layer; for pool layers, pool size is 2. After last pool layer, resulting shape is batch_size x 256 channels x 1 x 1, then we feed it to dense layers. We use LeakyRectify as an activation function everywhere except the last layer.

In [8]:
from lasagne.layers import batch_norm

input_X = T.tensor4("X")
input_y = T.vector("y", dtype='int32')
input_shape = [None, CHANNELS, 75, 75]
nl = L.nonlinearities.LeakyRectify()
n = 8

incoming = L.layers.InputLayer(shape=input_shape, input_var=input_X)
incoming = batch_norm(L.layers.Conv2DLayer(incoming, num_filters=8*n, filter_size=3, stride=2, nonlinearity=nl))
incoming = batch_norm(L.layers.Conv2DLayer(incoming, num_filters=8*n, filter_size=3, nonlinearity=nl))
incoming = L.layers.Pool2DLayer(incoming, pool_size=2)
#incoming = L.layers.DropoutLayer(incoming, p=0.3)
incoming = batch_norm(L.layers.Conv2DLayer(incoming, num_filters=8*n, filter_size=3, nonlinearity=nl))
incoming = batch_norm(L.layers.Conv2DLayer(incoming, num_filters=16*n, filter_size=3, nonlinearity=nl))
incoming = L.layers.Pool2DLayer(incoming, pool_size=2)
incoming = L.layers.DropoutLayer(incoming, p=0.3)
incoming = batch_norm(L.layers.Conv2DLayer(incoming, num_filters=16*n, filter_size=3, nonlinearity=nl))
incoming = batch_norm(L.layers.Conv2DLayer(incoming, num_filters=16*n, filter_size=3, nonlinearity=nl))
incoming = L.layers.Pool2DLayer(incoming, pool_size=2)
incoming = L.layers.DropoutLayer(incoming, p=0.3)
incoming = L.layers.DenseLayer(incoming, num_units=16, nonlinearity=nl)
#incoming = L.layers.DropoutLayer(incoming, p=0.3)
incoming = L.layers.DenseLayer(incoming, num_units=1, nonlinearity=L.nonlinearities.sigmoid)

y_predicted = L.layers.get_output(incoming)

loss = L.objectives.binary_crossentropy(y_predicted, input_y).mean()
accuracy = L.objectives.binary_accuracy(y_predicted, input_y).mean()

updates = L.updates.adam(loss, L.layers.get_all_params(incoming, trainable=True))
train_fn = theano.function([input_X, input_y], [loss, accuracy], updates=updates)
val_fn = theano.function([input_X, input_y], [loss, accuracy])
predict_fn = theano.function([input_X], y_predicted)

training loop:

In [16]:
def iterate_minibatches(X, y, batch_size):
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    for start_index in range(0, len(X) - batch_size + 1, batch_size):
        excerpt = indices[start_index:(start_index + batch_size)]

        yield X[excerpt], y[excerpt]

In [17]:
import time

num_epochs = 20 #amount of passes through the data
            
batch_size = 50 #number of samples processed at each function call

for epoch in range(num_epochs):
    #in each epoch we do a full pass over the training data:
    train_err = 0
    train_acc = 0
    train_batches = 0
    start_time = time.time()
    for batch in iterate_minibatches(X_train, y_train, batch_size):
        inputs, targets = batch
        train_err_batch, train_acc_batch = train_fn(inputs, targets)
        train_err += train_err_batch
        train_acc += train_acc_batch
        train_batches += 1

    val_acc = 0
    val_err = 0
    val_batches = 0
    for batch in iterate_minibatches(X_val, y_val, batch_size):
        inputs, targets = batch
        val_err_batch, val_acc_batch = val_fn(inputs, targets)
        val_acc += val_acc_batch
        val_err += val_err_batch
        val_batches += 1
    
    # Then we print the results for this epoch:
    print("Epoch {} of {} took {:.3f}s".format(
        epoch + 1, num_epochs, time.time() - start_time))

    print("  training loss (in-iteration):\t\t{:.6f}".format(train_err / train_batches))
    print("  train accuracy:\t\t{:.2f} %".format(
        train_acc / train_batches * 100))
    print("  validation accuracy:\t\t{:.2f} %".format(
        val_acc / val_batches * 100))
    print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))

Epoch 1 of 10 took 352.521s
  training loss (in-iteration):		0.336889
  train accuracy:		84.17 %
  validation accuracy:		86.16 %
  validation loss:		0.286313
Epoch 2 of 10 took 350.204s
  training loss (in-iteration):		0.239300
  train accuracy:		89.83 %
  validation accuracy:		91.44 %
  validation loss:		0.208778
Epoch 3 of 10 took 351.679s
  training loss (in-iteration):		0.195487
  train accuracy:		92.08 %
  validation accuracy:		90.16 %
  validation loss:		0.232575
Epoch 4 of 10 took 349.274s
  training loss (in-iteration):		0.156019
  train accuracy:		93.96 %
  validation accuracy:		93.12 %
  validation loss:		0.171717
Epoch 5 of 10 took 348.346s
  training loss (in-iteration):		0.129953
  train accuracy:		95.20 %
  validation accuracy:		93.52 %
  validation loss:		0.158794
Epoch 6 of 10 took 351.116s
  training loss (in-iteration):		0.104248
  train accuracy:		96.21 %
  validation accuracy:		93.84 %
  validation loss:		0.155944
Epoch 7 of 10 took 348.221s
  training loss (in-iter

Load the test dataframe, calculate results, save them in csv:

In [34]:
import gc
#del X_train, y_train, X_val, y_val
del df_test
gc.collect()

6986

In [18]:
np.savez('model.npz', *L.layers.get_all_param_values(incoming))

In [9]:
with np.load('model.npz') as f:
    param_values = [f['arr_%d' % i] for i in range(len(f.files))]
L.layers.set_all_param_values(incoming, param_values)

In [10]:
def iterate_minibatches_test(X, batch_size=500):
    indices = np.arange(len(X))

    for start_index in range(0, len(X), batch_size):
        excerpt = indices[start_index:(start_index + batch_size)]

        yield X[excerpt]

In [11]:
df_test = pd.read_json('data/processed/test.json')
X_test = get_scaled_imgs(df_test)

y_test = np.array([])
for batch in iterate_minibatches_test(X_test):
    y_test = np.concatenate([y_test, predict_fn(batch).reshape(-1)])

test_ans = df_test[['id']]
test_ans['is_iceberg'] = y_test

test_ans.to_csv('submission.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Possible improvements:
* add another third channel in the input layer (need to understand some physics behind the process here)
* scale data
* play with CNN architecture: try residual connections, maxouts, fire modules; batchnorms, dropout layers; modify amount and parameters of layers