## Imports

In [1]:
import pandas as pd

from torch import optim
from torch import nn
import torch
from torch.utils.data import TensorDataset

import numpy as np

import skorch
from skorch.helper import predefined_split
from skorch import NeuralNetBinaryClassifier
from skorch.callbacks import EarlyStopping, Checkpoint

## Importing Data

In [2]:
df = pd.read_csv("data/labeled_snowy_5.csv")

display(df.head(5))

columns = df.columns[4:]

x_labels = columns[1:]
y_labels = columns[0]

print("Input features: ", end="")
print(*x_labels, sep=", ")
print("Class Labels ", y_labels)


data = df[columns].values.astype(np.float32)

noise_indices = np.where(data[:,0] == 1)[0]
real_indices = np.where(data[:,0] == 0)[0]

np.random.shuffle(noise_indices)
np.random.shuffle(real_indices)


x_test_indices = np.concatenate((noise_indices[:250], real_indices[:2000]))
x_val_indices = np.concatenate((noise_indices[250:500], real_indices[2000:4000]))
x_train_indices = np.concatenate((noise_indices[500:], real_indices[4000:]))


x_train = data[x_train_indices, 1:]
y_train = data[x_train_indices, 0]

x_val = data[x_val_indices, 1:]
y_val = data[x_val_indices, 0]

val_ds = TensorDataset(torch.tensor(x_val), torch.tensor(y_val))

x_test = data[x_test_indices, 1:]
y_test = data[x_test_indices, 0]

total_points = len(data)
positive_points = len(noise_indices)

print(f"Non-Noise Points: {total_points - positive_points}/{total_points} ({(total_points - positive_points)/total_points:.3f}%)")
print(f"Noise Points:{positive_points}/{total_points} ({positive_points/total_points:.3f}%)")


Unnamed: 0,X,Y,Z,__rangexy,Noise,Normal change rate,Number of neighbors,Surface density,Omnivariance,Eigenentropy,Anisotropy,Planarity,Linearity,Surface variation,Sphericity,Verticality,3rd eigenvalue
0,3.286,2.673,-0.996,4.236,1.0,0.010093,983.0,1251.594482,0.010354,0.259228,0.986132,0.34627,0.639862,0.010093,0.013868,0.056737,0.00084
1,3.419,2.662,-0.938,4.333,1.0,0.01158,1411.0,1796.540894,0.013134,0.292883,0.982632,0.465111,0.517521,0.01158,0.017368,0.064407,0.001123
2,3.592,2.68,-0.886,4.482,1.0,0.011959,1861.0,2369.498779,0.015455,0.322546,0.980218,0.614534,0.365685,0.011959,0.019782,0.070589,0.001316
3,3.105,0.585,-0.767,3.16,1.0,0.063361,526.0,669.723999,0.01113,0.19143,0.911196,0.223945,0.687251,0.063361,0.088804,0.003578,0.003264
4,3.253,-0.561,-0.802,3.301,1.0,0.04897,1266.0,1611.921265,0.013326,0.224003,0.926413,0.355537,0.570876,0.04897,0.073587,0.043278,0.003103


Input features: Normal change rate, Number of neighbors, Surface density, Omnivariance, Eigenentropy, Anisotropy, Planarity, Linearity, Surface variation, Sphericity, Verticality, 3rd eigenvalue
Class Labels  Noise
Non-Noise Points: 102577/105815 (0.969%)
Noise Points:3238/105815 (0.031%)


## Network

In [3]:
class Noise_Classifier(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, num_layers, dropout=0.03):
        super(Noise_Classifier, self).__init__()
        layers = []
        activation = nn.Tanh
        for i in range(num_layers):
            if i == 0:
                layers.append(nn.Linear(input_dim, hidden_dim))
                layers.append(activation())
            elif i == num_layers - 1:
                layers.append(nn.Linear(hidden_dim, output_dim))
            else:
                layers.append(nn.Linear(hidden_dim, hidden_dim))
                layers.append(activation())
            layers.append(nn.Dropout(dropout))
            
        self.layers = nn.Sequential(*layers)

    def forward(self, x):
        return self.layers(x)


## Training

In [5]:
checkpoint_dir = './Model_Checkpoints/'

callbacks = [EarlyStopping(patience=20, threshold=0.001, threshold_mode='abs', monitor='valid_acc', lower_is_better=False),
            #Checkpoint(monitor='valid_acc_best', f_params='NoiseClassifier.pt', dirname=checkpoint_dir)
]
optimizer = optim.Adam
net = NeuralNetBinaryClassifier(
    Noise_Classifier,
    train_split=predefined_split(val_ds),
    module__input_dim=x_train.shape[1],
    module__output_dim=1,
    module__hidden_dim=200,
    module__num_layers=8,
    module__dropout=0.000,
    optimizer=optimizer,
    optimizer__weight_decay=.00001,
    max_epochs=200,
    lr=5e-5,
    batch_size=64,
    device='cuda:0',
    iterator_train__shuffle=True,
    iterator_train__num_workers=2,
    iterator_train__pin_memory=True,
    iterator_valid__num_workers=2,
    iterator_valid__pin_memory=True,
    callbacks=callbacks,
)

net = net.fit(x_train, y_train)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.0820[0m       [32m0.9627[0m        [35m0.1825[0m  2.5301


## Testing Accuracy

In [6]:
net.load_params(f_params=checkpoint_dir+'NoiseClassifier.pt', f_optimizer=checkpoint_dir + 'optimizer.pt', f_history=checkpoint_dir+'history.json')
train_acc = net.score(x_train, y_train)
val_acc = net.score(x_val, y_val)
test_acc = net.score(x_test, y_test)

print(f"Train Accuracy: {train_acc:.3f}")
print(f"Validation Accuracy: {val_acc:.3f}")
print(f"Test Accuracy: {test_acc:.3f}")

Train Accuracy: 0.992
Validation Accuracy: 0.968
Test Accuracy: 0.970


## Creating New Data Set with Predictions

In [87]:
destination = "data/NN_predictions.csv"

prediction = net.predict(data[:, 1:])

df['NN_Predictions'] = prediction

new_df = df[['X', 'Y', 'Z', '__rangexy', 'Noise', 'NN_Predictions']]
new_df.to_csv(destination, index=False)