In [None]:
# Import packages
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [None]:
# Get preprocessed train and test data (plus several necessary alterations)
ais_train = pd.read_csv("xgboost_preprocessed_data/xgboost_data_new100.csv")
ais_test = pd.read_csv("xgboost_preprocessed_data/xgboost_data_newtest2.csv")

# Columns whose missing values are replaced by a column mean
cols_to_imp = ['speed_std', 'dist_std', 'x_std', 'y_std']
# The fill values are estimated on the training data only and reused for the
# test data, so that no test-set statistics leak into preprocessing and both
# sets go through exactly the same transformation.
impute_means = ais_train[cols_to_imp].mean()
ais_train[cols_to_imp] = ais_train[cols_to_imp].fillna(impute_means)
ais_test[cols_to_imp] = ais_test[cols_to_imp].fillna(impute_means)

In [None]:
# Add two boolean helper columns to both datasets:
#   'net'  - red_flags is at least 3 AND net_name is set
#            (assumes net_name is a boolean column - TODO confirm)
#   'ship' - red_flags is exactly 0
for _ais in (ais_train, ais_test):
    _ais['net'] = _ais['red_flags'].ge(3) & _ais['net_name']
    _ais['ship'] = _ais['red_flags'].eq(0)
del _ais

# Define the model class

In [None]:
class ann1(nn.Module):
    """Feed-forward binary classifier with one hidden layer and a built-in training loop.

    The network is ``Linear -> PReLU -> Linear -> Sigmoid``, so its output is a
    probability per row with shape ``(n_rows, 1)``.

    Attributes set by ``__init__``:
        train_x, train_y: float32 tensors holding the (possibly oversampled) training data.
        input_dim0, input_dim1: number of training rows / number of features.
        train_loader: shuffled mini-batch loader over the training data.
        epoch: number of epochs the currently loaded weights have been trained for.
        optimal_epochs: 1-based best epoch of the last ``train_net`` call (NaN before training).
    """

    def __init__(self, train_x, train_y, hidden_nodes, batch_size = 32, seed = 12345):
        """Store the training data and build the network.

        Args:
            train_x: 2-D array of features, one row per observation.
            train_y: 1-D array of 0/1 labels (must support boolean-mask indexing).
            hidden_nodes: width of the single hidden layer.
            batch_size: mini-batch size used for training.
            seed: value passed to ``torch.manual_seed`` (this seeds torch globally).
        """
        super().__init__()
        self.optimizer = None
        self.loss_fn = None
        # Seed before any parameters are created so the initial weights are reproducible
        torch.manual_seed(seed)
        self.epoch = 0
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0
        self.optimal_epochs = np.nan
        self.batch_size = batch_size

        # Upsample the data before modeling: when class 1 has at most half as many
        # rows as class 0, SMOTE synthesises class-1 rows until there are
        # round(n_class0 / 2) of them; class 0 is left untouched.
        n_pos = train_y[train_y == 1].shape[0]
        n_neg = train_y[train_y == 0].shape[0]
        if n_pos <= n_neg / 2:
            sampling_strategy = {0: n_neg, 1: round(n_neg / 2)}
            smote = SMOTE(sampling_strategy = sampling_strategy, random_state = 836320)
            train_x, train_y = smote.fit_resample(train_x, train_y)

        # Format data and dependencies
        train_x = self._to_float_tensor(train_x)
        train_y = self._to_float_tensor(train_y)
        self.train_x = train_x
        self.train_y = train_y
        self.input_dim0, self.input_dim1 = train_x.shape
        self.train_loader = DataLoader(TensorDataset(train_x, train_y), batch_size=batch_size, shuffle=True)
        # Net layers
        self.layers = nn.Sequential(
            nn.Linear(self.input_dim1, hidden_nodes),
            nn.PReLU(),
            nn.Linear(hidden_nodes, 1),
            nn.Sigmoid()
        )

    @staticmethod
    def _to_float_tensor(data):
        """Return ``data`` as a float32 tensor that does not share memory with the input."""
        if isinstance(data, torch.Tensor):
            # torch.tensor(tensor) emits a copy-construct warning; clone explicitly instead
            return data.detach().clone().to(torch.float32)
        return torch.tensor(np.asarray(data), dtype = torch.float32)

    def forward(self, x): # Makes the forward pass in the net
        '''
        A method to make neural net predictions.

        Accepts a float tensor or a NumPy array and returns probabilities of
        shape (n_rows, 1). The call does not modify any attribute of the model,
        so input_dim0/input_dim1 keep describing the training data.
        '''
        if isinstance(x, np.ndarray):
            x = torch.tensor(x, dtype = torch.float32)
        return self.layers(x)

    def train_net(self, val_x, val_y, num_epochs, filepath, loss_fn = nn.BCELoss(), learning_rate = .03, report_freq = 1, print_results = True, early_stopping_rounds = 5):
        '''
        A method to train the neural net on data.

        Trains with Adam for up to num_epochs epochs, checkpoints the weights with
        the lowest validation loss to filepath and reloads them when done.

        Args:
            val_x, val_y: validation features / 0-1 labels used for model selection.
            num_epochs: maximum number of epochs to run in this call (at least 1).
            filepath: where the best checkpoint is written (path or file-like object).
            loss_fn: loss applied to the (n, 1) probabilities and targets.
            learning_rate: Adam learning rate.
            report_freq: print every report_freq-th epoch (plus the first and last).
            print_results: whether to print progress at all.
            early_stopping_rounds: stop once this many epochs have passed since the
                best validation loss; 0 or None disables early stopping.

        Raises:
            ValueError: if num_epochs is smaller than 1.
        '''
        if num_epochs < 1:
            raise ValueError("num_epochs must be at least 1")
        self.loss_fn = loss_fn
        val_x = self._to_float_tensor(val_x)
        val_y = self._to_float_tensor(val_y)
        # Kept as an attribute for callers; validation order is irrelevant, so no shuffling
        self.val_loader = DataLoader(TensorDataset(val_x, val_y), batch_size=self.batch_size, shuffle=False)
        # Make sure the checkpoint directory exists so the first save cannot fail
        if isinstance(filepath, (str, os.PathLike)):
            checkpoint_dir = os.path.dirname(os.fspath(filepath))
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
        self.train()  # Set the model to training mode

        # Define optimizer, number of data points
        self.optimizer = optim.Adam(self.parameters(), lr = learning_rate)
        # Taken from the stored training data: forward() is also used on other
        # datasets, so the size must not depend on the most recent forward pass
        n_train = self.train_x.shape[0]
        n_val = val_x.shape[0]
        train_targets = self.train_y.unsqueeze(1)
        val_targets = val_y.unsqueeze(1)
        threshold = .5 # Set probability threshold for classification
        # Per-epoch history of this call
        train_loss_list = []
        train_acc_list = []
        val_loss_list = []
        val_acc_list = []
        best_val_loss = float("inf")
        best_step = 0
        first_epoch = self.epoch
        last_epoch = first_epoch + num_epochs
        # `step` counts epochs within this call (it indexes the history lists);
        # `epoch` is the absolute epoch number used for reporting and checkpoints
        for step, epoch in enumerate(range(first_epoch, last_epoch)):
            for x_batch, y_batch in self.train_loader:
                # Forward pass
                outputs = self(x_batch)
                # Compute loss
                loss = loss_fn(outputs, y_batch.unsqueeze(1))
                # Backpropagation
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
            # Get the epoch's train and validation results
            with torch.no_grad():
                train_outputs = self(self.train_x)
                train_loss = loss_fn(train_outputs, train_targets).item()
                train_accuracy = (torch.sum((train_outputs >= threshold) == train_targets)/n_train).item()
                val_preds = self(val_x)
                val_loss = loss_fn(val_preds, val_targets).item()
                val_accuracy = (torch.sum((val_preds >= threshold) == val_targets)/n_val).item()

            train_loss_list.append(train_loss)
            train_acc_list.append(train_accuracy)
            val_loss_list.append(val_loss)
            val_acc_list.append(val_accuracy)
            # Print epoch results?
            if print_results and (step == 0 or (epoch + 1) % report_freq == 0 or (epoch + 1) == last_epoch):
                print(
                    f"Epoch {epoch+1}/{last_epoch}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy: .4f}, "
                    f"Val Loss: {val_loss: .4f}, Val Accuracy: {val_accuracy: .4f}"
                )
            # Save model if it is the best so far (the first epoch of a call is always saved)
            if step == 0 or val_loss < best_val_loss:
                best_val_loss = val_loss
                best_step = step
                checkpoint = {
                    'model_state_dict': self.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'epoch': epoch,
                    'train_loss_list': train_loss_list,
                    'train_acc_list': train_acc_list,
                    'val_loss_list': val_loss_list,
                    'val_acc_list': val_acc_list
                }
                torch.save(checkpoint, filepath)
            # Early stopping, measured in epochs of this call since the best one
            elif early_stopping_rounds and step - best_step >= early_stopping_rounds:
                break
        # Report the best number of epochs (1-based, relative to this call)
        self.optimal_epochs = best_step + 1
        if print_results:
            print(f"The optimal number of epochs was: {self.optimal_epochs}")
        # Restores the best weights and sets self.epoch to match them
        self.load_best_model(filename = filepath)

    def evaluate(self, test_x, test_y, loss_fn = nn.BCELoss()):
        '''
        Print accuracy, loss, sensitivity and specificity on a labeled dataset.

        A row is predicted positive when its probability is >= 0.5 and negative
        otherwise, so every row is counted in exactly one confusion-matrix cell.
        Sensitivity / specificity are NaN when the data has no positive /
        negative rows.
        '''
        test_x = self._to_float_tensor(test_x)
        targets = self._to_float_tensor(test_y).unsqueeze(1)
        with torch.no_grad():
            test_preds = self(test_x)
            predicted_pos = test_preds >= .5
            actual_pos = targets == 1
            actual_neg = targets == 0
            true_pos = torch.sum(predicted_pos & actual_pos)
            false_neg = torch.sum(~predicted_pos & actual_pos)
            true_neg = torch.sum(~predicted_pos & actual_neg)
            false_pos = torch.sum(predicted_pos & actual_neg)
            test_accuracy = torch.sum(predicted_pos == targets)/test_x.shape[0]
            test_sensitivity = true_pos/(true_pos + false_neg)
            test_specificity = true_neg/(true_neg + false_pos)
            test_loss = loss_fn(test_preds, targets)
        print(f"The test accuracy is:{test_accuracy: .4f}, test loss is:{test_loss: .4f}, test sensitivity is:{test_sensitivity: .4f}, test specificity is:{test_specificity: .4f}")

    def predict(self, new_x):
        '''
        Predict probabilities of a selected dataset.

        Returns a torch tensor of shape (n_rows, 1), computed without gradients.
        '''
        new_x = self._to_float_tensor(new_x)
        with torch.no_grad():
            return self(new_x)

    def load_best_model(self, filename):
        '''
        Restore weights, optimizer state and training history from a checkpoint
        written by train_net.
        '''
        checkpoint = torch.load(filename)
        model_dict = self.state_dict()
        # Ignore checkpoint entries this model has no parameter for
        pretrained_dict = {k: v for k, v in checkpoint['model_state_dict'].items() if k in model_dict}
        self.load_state_dict(pretrained_dict)
        if self.optimizer is None:
            # The learning rate is a placeholder; it is replaced by the checkpointed state below
            self.optimizer = optim.Adam(self.parameters(), lr = .03)
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        # The checkpoint stores the 0-based index of the epoch that produced these
        # weights, whereas self.epoch counts completed epochs, hence the + 1
        self.epoch = checkpoint['epoch'] + 1
        self.train_loss_list = checkpoint['train_loss_list']
        self.train_acc_list = checkpoint['train_acc_list']
        self.val_loss_list = checkpoint['val_loss_list']
        self.val_acc_list = checkpoint['val_acc_list']

# Get data ready

In [None]:
# Build the labeled modeling set from the training rows that are treated as
# certain: label 1 for rows matching "red_flags >= 3 & net_name", label 0 for
# rows with no red flags at all.
max_red_flags = 4
prob_nets = ais_train.query("red_flags >= 3 & net_name").reset_index(drop=True).assign(net=1)
prob_ships = ais_train.query("red_flags == 0").reset_index(drop=True).assign(net=0)
# Stack the two groups and pull out the feature matrix and the label vector
model_data = pd.concat([prob_nets, prob_ships])
x_mat = model_data[
    ['speed_0', 'speed_med', 'speed_99', 'speed_std', 'dist_med', 'dist_99', 'dist_std',
     'x_0', 'x_med', 'x_99', 'x_std', 'y_0', 'y_med', 'y_99', 'y_std']
].to_numpy()
y_mat = model_data['net'].to_numpy()
# Hold out 15% of the labeled rows for validation
x_train, x_val, y_train, y_val = train_test_split(x_mat, y_mat, test_size = 0.15, random_state=369)
# The full matrices are not needed once the split exists
del x_mat, y_mat

In [None]:
# Same labeling rule applied to the test data: label 1 for rows matching
# "red_flags >= 3 & net_name", label 0 for rows without any red flag.
prob_nets = ais_test.query("red_flags >= 3 & net_name").reset_index(drop=True).assign(net=1)
prob_ships = ais_test.query("red_flags == 0").reset_index(drop=True).assign(net=0)
# Stack the two groups and pull out the feature matrix and the label vector
model_test = pd.concat([prob_nets, prob_ships])
x_test_model = model_test[
    ['speed_0', 'speed_med', 'speed_99', 'speed_std', 'dist_med', 'dist_99',
     'dist_std', 'x_0', 'x_med', 'x_99', 'x_std', 'y_0', 'y_med', 'y_99', 'y_std']
].to_numpy()
y_test_model = model_test['net'].to_numpy()

In [None]:
# Arrays covering every test row, labeled or not: the feature matrix plus the
# red-flag counts and net-name indicator as single-column 2-D arrays.
x_test = ais_test[
    ['speed_0', 'speed_med', 'speed_99', 'speed_std', 'dist_med', 'dist_99', 'dist_std', 'x_0',
     'x_med', 'x_99', 'x_std', 'y_0', 'y_med', 'y_99', 'y_std']
].to_numpy()
flags_test = ais_test[['red_flags']].to_numpy()
names_test = ais_test[['net_name']].to_numpy()

In [None]:
# Feature matrix of the training rows left without a label for the
# semi-supervised step: at least one red flag, and either no net name or a net
# name with fewer than three red flags.
x_unlabeled = (
    ais_train
    .query("(red_flags != 0) & ((net_name == False) | ((net_name == True) & (red_flags < 3)))")
    .reset_index(drop=True)
    [['speed_0', 'speed_med', 'speed_99', 'speed_std', 'dist_med', 'dist_99', 'dist_std',
      'x_0', 'x_med', 'x_99', 'x_std', 'y_0', 'y_med', 'y_99', 'y_std']]
    .to_numpy()
)

# Model

### Semi-supervised

In [None]:
# Self-training: fit the net on the labeled rows, then move every unlabeled row
# the net is confident about into the training set with its predicted label,
# and repeat until nothing is left or nothing moves.
num_rounds = 5
net_threshold = .985   # probability at or above which a row is pseudo-labeled as a net
ship_threshold = .015  # probability at or below which a row is pseudo-labeled as a ship
semi_x = x_train
semi_y = y_train
semi_unlabeled = x_unlabeled
unassigned = semi_unlabeled.shape[0]
print(f'''Pre-modeling: {sum(semi_y == 1)} probable nets and {sum(semi_y == 0)} probable ships with {semi_unlabeled.shape[0]} out of model.''')
for num_round in range(1, num_rounds + 1):
    # Train model
    model1 = ann1(semi_x, semi_y, hidden_nodes = 20, seed = 2345)
    model1.train_net(val_x = x_val, val_y = y_val, num_epochs = 100, learning_rate = .005, report_freq = 1, filepath = "models/ann1.pth", early_stopping_rounds = 4)
    # Reassign unlabeled data
    preds = model1.predict(semi_unlabeled)
    # reshape(-1) always gives a 1-D mask; squeeze() would collapse a single
    # remaining row to a 0-d mask and break the row indexing below
    is_net = (preds >= net_threshold).reshape(-1).numpy()
    is_ship = (preds <= ship_threshold).reshape(-1).numpy()
    new_nets = semi_unlabeled[is_net]
    new_ships = semi_unlabeled[is_ship]
    semi_x = np.concatenate((semi_x, new_nets, new_ships))
    semi_y = np.concatenate((semi_y, np.ones(new_nets.shape[0]), np.zeros(new_ships.shape[0])))
    # Keep only the rows that were NOT pseudo-labeled, so a probability exactly
    # on a threshold cannot end up both in the training set and in the pool
    semi_unlabeled = semi_unlabeled[~(is_net | is_ship)]
    print(f'''After round {num_round}: {sum(semi_y == 1)} probable nets and {sum(semi_y == 0)} probable ships with {semi_unlabeled.shape[0]} out of model.''')
    # Stop when the pool is empty or this round assigned nothing new
    if semi_unlabeled.shape[0] == 0 or semi_unlabeled.shape[0] == unassigned:
        break
    unassigned = semi_unlabeled.shape[0]

In [None]:
# Score every test row with the semi-supervised model. The (n, 1) torch output
# is converted to a flat NumPy vector first: handing a tensor straight to
# pd.DataFrame can produce an object column of 0-d tensors instead of floats.
test_preds = pd.DataFrame({
    "red_flags": flags_test.ravel(),
    "prob_net": model1.predict(x_test).numpy().ravel(),
    "net_name": names_test.ravel(),
})
# Percentage of rows predicted to be nets (probability >= 0.5) per red-flag count
print(
    (test_preds[test_preds['prob_net'] >= 0.5].groupby('red_flags').size() / test_preds.groupby('red_flags').size() * 100).fillna(0)
)
# Same percentage, split by whether the row has a net name
print(
    (test_preds[test_preds['prob_net'] >= 0.5].groupby('net_name').size() / test_preds.groupby('net_name').size() * 100).fillna(0)
)
# Metrics on the confidently labeled subset of the test data
model1.evaluate(x_test_model, y_test_model)

### Supervised baseline (no semi-supervised rounds)

In [None]:
# Baseline: the same network trained once on the labeled rows only
model0 = ann1(x_train, y_train, hidden_nodes = 20, seed = 2345)
model0.train_net(val_x = x_val, val_y = y_val, num_epochs = 200, learning_rate = .005, report_freq = 1, filepath = "ann0.pth", early_stopping_rounds = 5)
# Score every test row. The (n, 1) torch output is converted to a flat NumPy
# vector first: handing a tensor straight to pd.DataFrame can produce an object
# column of 0-d tensors instead of floats.
test_preds = pd.DataFrame({
    "red_flags": flags_test.ravel(),
    "prob_net": model0.predict(x_test).numpy().ravel(),
    "net_name": names_test.ravel(),
})
# Percentage of rows predicted to be nets (probability >= 0.5) per red-flag count
print(
    (test_preds[test_preds['prob_net'] >= 0.5].groupby('red_flags').size() / test_preds.groupby('red_flags').size() * 100).fillna(0)
)
# Same percentage, split by whether the row has a net name
print(
    (test_preds[test_preds['prob_net'] >= 0.5].groupby('net_name').size() / test_preds.groupby('net_name').size() * 100).fillna(0)
)
# Metrics on the confidently labeled subset of the test data
model0.evaluate(x_test_model, y_test_model)