# Ship or Iceberg
https://www.kaggle.com/c/statoil-iceberg-classifier-challenge

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [64]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import numpy as np
import pandas as pd
import sklearn.preprocessing
import sklearn.model_selection

import matplotlib.pyplot as plt

# Data

Load and preprocess data

In [3]:
# Directory that holds the competition files on the local machine.
datadir = "/home/christopher/Data/data/ml/ship-or-iceberg/"

# Parse the training JSON into a DataFrame indexed by the sample id.
train_path = datadir + "train.json"
with open(train_path) as train_file:
    raw_train = pd.read_json(train_file).set_index("id")

# Show the first rows as the cell output.
raw_train.head()

Unnamed: 0_level_0,band_1,band_2,inc_angle,is_iceberg
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dfd5f913,"[-27.878360999999998, -27.15416, -28.668615, -...","[-27.154118, -29.537888, -31.0306, -32.190483,...",43.9239,0
e25388fd,"[-12.242375, -14.920304999999999, -14.920363, ...","[-31.506321, -27.984554, -26.645678, -23.76760...",38.1562,0
58b2aaa0,"[-24.603676, -24.603714, -24.871029, -23.15277...","[-24.870956, -24.092632, -20.653963, -19.41104...",45.2859,1
4cfc3a18,"[-22.454607, -23.082819, -23.998013, -23.99805...","[-27.889421, -27.519794, -27.165262, -29.10350...",43.8306,0
271f93f4,"[-26.006956, -23.164886, -23.164886, -26.89116...","[-27.206915, -30.259186, -30.259186, -23.16495...",35.6256,0


In [4]:
# Every row of a band column is one flat list of pixel values. Convert each
# column to a float32 matrix and insert a singleton channel axis, giving
# arrays of shape (n_samples, 1, n_pixels).
band1 = np.asarray(raw_train["band_1"].tolist(), dtype=np.float32)[:, np.newaxis, :]
band2 = np.asarray(raw_train["band_2"].tolist(), dtype=np.float32)[:, np.newaxis, :]

# Join the two bands along the channel axis -> (n_samples, 2, n_pixels).
train_data_all = np.concatenate([band1, band2], axis=1)

In [8]:
# StandardScaler expects 2-D input, so flatten every sample to a single row,
# standardise each feature column, then restore the (2, 75, 75) layout.
n_samples = len(train_data_all)
flat_pixels = train_data_all.reshape(n_samples, -1)

scaler = sklearn.preprocessing.StandardScaler()
train_data_all = scaler.fit_transform(flat_pixels).reshape(n_samples, 2, 75, 75)

In [9]:
# One-hot encode the target: column 0 is set where is_iceberg == 0 and
# column 1 where is_iceberg == 1.
iceberg_flags = raw_train["is_iceberg"].values
train_labels_all = np.zeros((len(raw_train), 2), dtype=np.float32)
train_labels_all[np.arange(len(raw_train)), iceberg_flags] = 1

In [17]:
# Hold out 204 samples for validation. A fixed random_state makes the shuffle,
# and therefore every number reported further down, reproducible across
# kernel restarts.
train_data, valid_data, train_labels, valid_labels = sklearn.model_selection.train_test_split(
    train_data_all, train_labels_all, test_size=204, random_state=0
)

# Data summary

In [18]:
def _as_variable(array):
    """Wrap a NumPy array in an autograd Variable via torch.from_numpy."""
    return autograd.Variable(torch.from_numpy(array))


# Rebind the split arrays to Variables. Note the validation labels are stored
# under the singular name `valid_label`.
train_data, train_labels = _as_variable(train_data), _as_variable(train_labels)
valid_data, valid_label = _as_variable(valid_data), _as_variable(valid_labels)

In [130]:
# Dataset sizes after the train/validation split.
print(train_data.shape)
print(valid_data.shape)

# Print the share of training samples in each label column (column 0, then
# column 1), so the balance of both classes is shown explicitly instead of
# leaving the second share to be inferred from the first.
label_matrix = train_labels.data.numpy()
for class_index in range(label_matrix.shape[1]):
    print(np.sum(label_matrix[:, class_index]) / len(label_matrix)) # Classes are almost equally represented

torch.Size([1400, 2, 75, 75])
torch.Size([204, 2, 75, 75])
0.531428571429
0.468571428571


# Build Neural Network and Train

In [131]:
class Net(nn.Module):
    """Small CNN mapping a 2-channel 75x75 image to two output scores.

    Two convolutions, each followed by max pooling, feed three fully
    connected layers. No activation is applied to the final layer's output.
    """

    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(2, 3, 4) # in_chan, out_chan, kernel_size
        self.conv2 = nn.Conv2d(3, 6, 5)
        self.lin1 = nn.Linear(6 * 10 * 10, 200) # in_feat, out_feat
        self.lin2 = nn.Linear(200, 50)
        self.lin3 = nn.Linear(50, 2)

    def forward(self, x):
        """Return a (batch, 2) tensor of scores for a (batch, 2, 75, 75) input.

        Inputs with a different spatial size produce a feature count that does
        not match `lin1`, which raises instead of silently changing the batch
        size.
        """
        # 2x75x75 -> 3x72x72 -> 3x24x24 -> 6x20x20 -> 6x10x10 -> 600 -> 200 -> 50 -> 2
        x = F.max_pool2d(self.conv1(x), 3)
        x = F.max_pool2d(self.conv2(x), 2)
        # Flatten per sample. Keeping the batch dimension fixed (rather than
        # view(-1, 600)) prevents surplus features from being folded into
        # extra rows when the input size is wrong.
        x = x.view(x.size(0), -1)
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))
        x = self.lin3(x)
        return x

In [132]:
# Model, loss and optimiser. The network is moved to the GPU once; each batch
# is moved there inside the loop.
net = Net().cuda()
loss_fn = nn.MSELoss()  # squared error against the one-hot label rows
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

batch_size = 10

for epoch in range(30):
    running_loss = 0.0
    # Walk over the whole training set. Stopping at len(train_data) (not
    # len(train_data) - batch_size) keeps the final batch in training; if the
    # sample count is not a multiple of batch_size the last slice is shorter.
    for start in range(0, len(train_data), batch_size):
        batch_input = train_data[start:start+batch_size].cuda()
        batch_labels = train_labels[start:start+batch_size].cuda()
        optimizer.zero_grad()

        batch_output = net(batch_input)
        batch_loss = loss_fn(batch_output, batch_labels)
        batch_loss.backward()
        optimizer.step()
        # .item() reads the Python number out of the scalar loss; indexing a
        # 0-dim tensor with [0] raises on current PyTorch releases.
        running_loss += batch_loss.item()

    # Sum of the per-batch losses for this epoch.
    print(running_loss)

38.317485854029655
32.08613221347332
30.614415302872658
29.160364031791687
27.393827587366104
25.620278045535088
24.486654341220856
23.267941899597645
22.324278719723225
21.379121474921703
20.389741480350494
19.36722845584154
18.612627010792494
17.70719524100423
16.942528869956732
16.172869842499495
15.516541574150324
14.832075327634811
14.175205450505018
13.525931337848306
12.815844856202602
12.106014221906662
11.4764303304255
10.819300109520555
10.220376293174922
9.627480558119714
9.099950190633535
8.488928012549877
7.990637557581067
7.523677799385041


# Validation

In [133]:
# Score the held-out set, then reduce the one-hot labels and the network
# outputs to class indices (0 or 1).
out = net(valid_data.cuda())
exp = valid_label.data.numpy().argmax(axis=1)
res = out.data.cpu().numpy().argmax(axis=1)

# Per-class hit rate: among samples whose true class is k, the fraction
# predicted as k. The overall accuracy is the fraction of matching indices.
class0_mask = exp == 0
class1_mask = exp == 1

print("How do we perform on boats, icebergs, overall")
print(np.mean(res[class0_mask] == 0))
print(np.mean(res[class1_mask] == 1))
print(np.mean(res == exp))
print("This is much better than random!")

How do we perform on boats, icebergs, overall
0.878504672897
0.814432989691
0.848039215686
This is much better than random!


# Test