# Ship or Iceberg
https://www.kaggle.com/c/statoil-iceberg-classifier-challenge

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim

import numpy as np
import pandas as pd
import sklearn.preprocessing
import sklearn.model_selection

import matplotlib.pyplot as plt

# Data

Load and preprocess data

In [3]:
datadir = "/home/christopher/Data/data/ml/ship-or-iceberg/"

with open(datadir + "train.json") as f:
    raw_train = pd.read_json(f).set_index("id")
raw_train.head()

Unnamed: 0_level_0,band_1,band_2,inc_angle,is_iceberg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dfd5f913,"[-27.878360999999998, -27.15416, -28.668615, -...","[-27.154118, -29.537888, -31.0306, -32.190483,...",43.9239,0
e25388fd,"[-12.242375, -14.920304999999999, -14.920363, ...","[-31.506321, -27.984554, -26.645678, -23.76760...",38.1562,0
58b2aaa0,"[-24.603676, -24.603714, -24.871029, -23.15277...","[-24.870956, -24.092632, -20.653963, -19.41104...",45.2859,1
4cfc3a18,"[-22.454607, -23.082819, -23.998013, -23.99805...","[-27.889421, -27.519794, -27.165262, -29.10350...",43.8306,0
271f93f4,"[-26.006956, -23.164886, -23.164886, -26.89116...","[-27.206915, -30.259186, -30.259186, -23.16495...",35.6256,0


In [4]:
band1 = np.array([np.array([i]) for i in raw_train["band_1"]], dtype=np.float32)
band2 = np.array([np.array([i]) for i in raw_train["band_2"]], dtype=np.float32)
train_data_all = np.concatenate((band1, band2), axis=1)

In [8]:
train_data_all = train_data_all.reshape(len(train_data_all), -1)
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(train_data_all)
train_data_all = scaler.transform(train_data_all)
train_data_all = train_data_all.reshape(len(train_data_all), 2, 75, 75)

In [9]:
train_labels_all = np.zeros((len(raw_train), 2), dtype=np.float32)
for i, is_iceberg in enumerate(raw_train["is_iceberg"]):
    train_labels_all[i][is_iceberg] = 1

In [17]:
train_data, valid_data, train_labels, valid_labels = sklearn.model_selection.train_test_split(train_data_all, train_labels_all, test_size=204)

# Data summary

In [18]:
train_data = autograd.Variable(torch.from_numpy(train_data))
train_labels = autograd.Variable(torch.from_numpy(train_labels))

valid_data = autograd.Variable(torch.from_numpy(valid_data))
valid_label = autograd.Variable(torch.from_numpy(valid_labels))

In [19]:
print(train_data.shape)
print(valid_data.shape)

torch.Size([1400, 2, 75, 75])
torch.Size([204, 2, 75, 75])


# Build Neural Network and Train

In [20]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(2, 3, 4) # in_chan, out_chan, kernel_size
        self.pool = nn.MaxPool2d(3, 3)
        self.lin1 = nn.Linear(3 * 24 * 24, 100) # in_feat, out_feat
        self.lin2 = nn.Linear(100, 2)
    
    def forward(self, x):
        # 2x75x75 -> 3x72x72 -> 3x24x24 -> 3*24*24 -> 100 -> 2
        x = self.conv1(x)
        x = self.pool(x)
        x = x.view(-1, 3*24*24)
        x = self.lin1(x) 
        x = self.lin2(x)
        return x

In [22]:
net = Net().cuda()
loss_fn = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

for epoch in range(10):
    batch_size = 10
    running_loss = 0.0
    for start in np.arange(0, len(train_data) - batch_size, batch_size):
        batch_input = train_data[start:start+batch_size].cuda()
        batch_labels = train_labels[start:start+batch_size].cuda()
        optimizer.zero_grad()
    
        batch_output = net(batch_input)
        batch_loss = loss_fn(batch_output, batch_labels)
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.data[0]
        
    print(running_loss)

35.96289709210396
29.86249701678753
27.097402542829514
24.42091115564108
21.925087705254555
19.78617623075843
17.866651087999344
16.1743786893785
14.695699429139495
13.384493980556726


# Validation

In [42]:
out = net(valid_data.cuda())
exp = np.argmax(valid_label.data.numpy(), axis=1)
res = np.argmax(out.data.cpu().numpy(), axis=1)

print("How do we perform on boats, icebergs, overall")
print(np.sum(res[exp == 0] == 0) / np.count_nonzero(exp == 0))
print(np.sum(res[exp == 1] == 1) / np.count_nonzero(exp == 1))
print(np.sum(res == exp) / len(res))

0.719626168224
0.752577319588
0.735294117647


# Test