In [1]:
# Basics
import numpy as np
import pandas as pd
import glob
# To check vessel names
import re
# To get exact lat/long
#from shapely.wkt import loads
# To map the ships or coastlines
#import geopandas as gpd
#from geopy.distance import geodesic
# To find coastline distance
from scipy.spatial import KDTree
# For XgBoosting
import xgboost as xgb
import random
# For CV with and weighted XGBoosting
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_sample_weight

In [2]:
# Get preprocessed train and test data
ais_train = pd.read_csv("xgboost_preprocessed_data/xgboost_data_new100.csv")
ais_test = pd.read_csv("xgboost_preprocessed_data/xgboost_data_newtest2.csv")

In [3]:
class semi_supervised_xgb:
    def __init__(self, class_1, class_0, unknown):
        '''
        Inputs:
            class_1: A dataframe of observations you think are part of class 1, with x and y variables included
            class_2: A dataframe of observations you think are part of class 0, with x and y variables included
            unknown: A dataframe of observations of unknown class, with x and y variables included

        Methods:
            xgboost: Takes in labeled and unlabeled data and returns an XGBoost model
        '''
        # Define important things
        self.class_1_og = class_1
        self.class_0_og = class_0
        self.unknown_og = unknown
        self.class_1 = class_1
        self.class_0 = class_0
        self.unknown = unknown
        self.boost = 0
        # Define to keep track of iterations done
        self.boost_rounds_completed = 0

    def xgboost(self, features, y_name, num_rounds = 11, k_cv = 7, boost_rounds = 4000,
                np_seed = 4200, xgb_seed = 98765, 
                boost_params = {
                    'objective': 'binary:logistic',  # Binary classification objective 
                    'eval_metric': 'logloss',  # Evaluation metric
                    },
                learning_rates = [.01], 
                verbose = True, 
                class_1_prob = .95, class_0_prob = .05): 
        '''
        Description: 
            Uses xgboost to self train a final xgboost model and outputs it
        
        Inputs:
            features: A list of the x variables in the given data
            y_name: The name of the y variable in the given data
            num_rounds: How many rounds of reclassification to use
            k_cv: How many cross-validation folds to use in the XGBoost
            boost_rounds: The number of rounds to run the XGBoost
            np_seed: Before setting the cross-validation folds, this sets the seed for replicability
            xgb_seed: The seed to use in the XGBoosts for replicability
            boost_params: Parameters to use in the XGBoost models. Defaults to logloss on a binary classification. 
            learning_rates: A list of learning rates to use in the XGBoost models
            verbose: If True, prints the model's progress
            class_1_prob: If this number is met or exceeded by a model's given probability, a given observation is reclassified into class 1
            class_0_prob: If this number is not met by a model's given probability, a given observation is reclassified into class 0

        Outputs:
            xgboost model of class xgboost.core.Booster
        '''
        # Print out starting numbers:
        if verbose:
            print(f"Pre-modeling: {len(self.class_1)} of class 1 and {len(self.class_0)} of class 0 with {len(self.unknown)} out of model")
        unassigned = len(self.unknown)
        for i in range(1, num_rounds + 1):
            self.boost_rounds_completed += 1
            model_data = pd.concat([self.class_1, self.class_0]).reset_index(drop = True)
            x_features = model_data[features]
            y_feature = model_data[y_name]
            # Get sample weights
            sample_weights = compute_sample_weight(
                class_weight = 'balanced',
                y = y_feature 
            )
            # Set random seed
            np.random.seed(np_seed) # Set seed
            # Make DMatrix with weights to avoid issues with imbalanced data
            x_mat = xgb.DMatrix(x_features, label = y_feature, weight = sample_weights)
            # Make folds for CV
            cv_folds = StratifiedKFold(n_splits=k_cv, shuffle=True, random_state=42)
            # Get that CV in
            best_rmse = float('inf')
            for lr in learning_rates:
                # Update learning rate in parameters
                boost_params['eta'] = lr
            
                # Perform cross-validation
                cv_results = xgb.cv(params = boost_params, 
                                    dtrain = x_mat,  
                                    num_boost_round = boost_rounds, 
                                    nfold = k_cv, 
                                    folds = cv_folds,
                                    metrics = 'logloss', 
                                    early_stopping_rounds = 12, 
                                    stratified = True,
                                    seed = xgb_seed)
                
                if cv_results['test-logloss-mean'].min() < best_rmse:
                    best_rmse = cv_results['test-logloss-mean'].min()
                    best_lr = lr
                    optimal_rounds = cv_results['test-logloss-mean'].idxmin() + 1

            # Perform the optimized boost 
            boost_params['eta'] = best_lr
            real_boost = xgb.train(params = boost_params, dtrain = x_mat, 
                                   num_boost_round = optimal_rounds)
            test_features = self.unknown[features]
            test_mat = xgb.DMatrix(test_features)
            # Make predictions 
            predictions = real_boost.predict(test_mat) 
            with_probs = pd.concat([self.unknown, pd.DataFrame(predictions)], axis = 1)\
                .rename(columns = {0: 'prob_1'})
            # Move out-of-model rows into correct DataFrames 
            self.class_1 = pd.concat([self.class_1,
                                      with_probs.query(f"prob_1 >= {class_1_prob}")\
                                        .drop(['prob_1'], axis = 1)])
            self.class_1[y_name] = 1
            self.class_0 = pd.concat([self.class_0,
                                    with_probs.query(f"prob_1 < {class_0_prob}")\
                                        .drop(['prob_1'], axis = 1)])
            self.class_0[y_name] = 0
            self.unknown = with_probs.query(f"prob_1 < {class_1_prob} & prob_1 >= {class_0_prob}")\
                .drop(['prob_1'], axis = 1)
            # Print statement: 
            if verbose:
                print(f'''Round {self.boost_rounds_completed}: {len(self.class_1)} probable nets and {len(self.class_0)} probable ships with {len(self.unknown)} out of model. 
LR of {best_lr} for {optimal_rounds} rounds''')
            if (len(self.unknown) == 0):
                if verbose:
                    print('Stopping early: Out of unassigned ships')
                break
            if (len(self.unknown) == unassigned):
                if verbose:
                    print('Stopping early: Progress finished')
                break
            unassigned = len(self.unknown) # To use next loop 
        self.boost = real_boost
        return real_boost # return best XGBoost model at the end
    
    def reset(self):
        ''' 
        Resets the labels to their original form
        '''
        self.class_0 = self.class_0_og
        self.class_1 = self.class_1_og
        self.unknown = self.unknown_og
        self.boost_rounds_completed = 0

In [4]:
# Find the ones we think are def nets and def not nets
max_red_flags = ais_train.red_flags.max()
prob_nets = ais_train.query("red_flags >= 3 & net_name").copy().reset_index(drop=True)
del max_red_flags
prob_nets['net'] = 1
prob_ships = ais_train.query("red_flags == 0").copy().reset_index(drop=True)
prob_ships['net'] = 0
out_of_model = ais_train.query("(red_flags > 0) & ((net_name == False) | ((net_name == True) & (red_flags < 3)))").copy()

In [5]:
train = semi_supervised_xgb(prob_nets, prob_ships, out_of_model)
train_boost = train.xgboost(learning_rates = [.03], num_rounds = 5, k_cv = 3, boost_rounds = 6000,
                            features = ['speed_0', 'speed_med', 'speed_99', 'speed_std', 
                                        'dist_med', 'dist_99', 'dist_std', 'x_0', 'x_med', 
                                        'x_99', 'x_std', 'y_0', 'y_med', 'y_99', 'y_std'], y_name = 'net', 
                            class_0_prob = .015, class_1_prob = .985)

Pre-modeling: 14260 of class 1 and 52015 of class 0 with 151281 out of model
Round 1: 63272 probable nets and 107866 probable ships with 46418 out of model. 
LR of 0.03 for 1127 rounds
Round 2: 63356 probable nets and 108052 probable ships with 46148 out of model. 
LR of 0.03 for 1972 rounds
Round 3: 63403 probable nets and 108066 probable ships with 46087 out of model. 
LR of 0.03 for 1770 rounds
Round 4: 63433 probable nets and 108077 probable ships with 46046 out of model. 
LR of 0.03 for 1454 rounds
Round 5: 63487 probable nets and 108093 probable ships with 45976 out of model. 
LR of 0.03 for 1877 rounds


In [9]:
# Evaluate model on different time's data
test_mat = xgb.DMatrix(ais_test[['speed_0', 'speed_med', 'speed_99', 'speed_std', 'dist_med', 'dist_99', 
                                 'dist_std', 'x_0', 'x_med', 'x_99', 'x_std', 'y_0', 'y_med', 'y_99', 'y_std']])
test_preds = train_boost.predict(test_mat)
# Make predictions 
if "prob_net" in ais_test.columns:
    ais_test = ais_test.drop(columns=["prob_net"])
ais_test = pd.concat([ais_test.reset_index(drop = True), 
                      pd.DataFrame(test_preds)], 
                     axis = 1)\
    .rename(columns = {0: 'prob_net'})

print(
    (ais_test[ais_test['prob_net'] >= 0.5].groupby('red_flags').size() / ais_test.groupby('red_flags').size() * 100).fillna(0)
)
print(
    (ais_test[ais_test['prob_net'] >= 0.5].groupby('net_name').size() / ais_test.groupby('net_name').size() * 100).fillna(0)
)

red_flags
0    15.180662
1    47.582545
2    79.010829
3    91.224192
4    79.979253
dtype: float64
net_name
False    48.439598
True     85.442818
dtype: float64


In [10]:
from sklearn.metrics import log_loss
ais_test['net'] = np.where((ais_test['net_name'] == True) & (ais_test['red_flags'] >= 3), 1, np.where(ais_test['red_flags'] == 0, 0, np.nan))
ais_test2 = ais_test.dropna(subset = ['net']).copy()
ais_test2['class_net'] = ais_test2['prob_net'] >= .5
test_acc = ((ais_test2['class_net']) == ais_test2['net']).sum()/len(ais_test2)
test_sensitivity = sum((ais_test2['class_net']) & (ais_test2['net'] == True))/(sum((ais_test2['class_net']) & (ais_test2['net'] == True)) + sum((ais_test2['class_net'] == False) & (ais_test2['net'] == True)))
test_specificity = sum((ais_test2['class_net'] == False) & (ais_test2['net'] == False))/(sum((ais_test2['class_net'] == False) & (ais_test2['net'] == False)) + sum((ais_test2['class_net']) & (ais_test2['net'] == False)))
test_loss = log_loss(ais_test2['net'], ais_test2['prob_net'])
print(f"The test accuracy is: {test_acc: .4f}")
print(f"The test sensitivity is: {test_sensitivity: .4f}")
print(f"The test specificity is: {test_specificity: .4f}")
print(f"The test loss is: {test_loss: .4f}")

The test accuracy is:  0.8910
The test sensitivity is:  0.9153
The test specificity is:  0.8482
The test loss is:  0.2921


In [8]:
train_boost.get_score(importance_type = 'gain')

{'speed_0': 8.498777389526367,
 'speed_med': 23.83993148803711,
 'speed_99': 16.889787673950195,
 'speed_std': 5.401068210601807,
 'dist_med': 6.849855422973633,
 'dist_99': 7.31329870223999,
 'dist_std': 5.056441307067871,
 'x_0': 30.66214370727539,
 'x_med': 13.663167953491211,
 'x_99': 30.82803726196289,
 'x_std': 6.634103775024414,
 'y_0': 48.76719284057617,
 'y_med': 12.533854484558105,
 'y_99': 16.095462799072266,
 'y_std': 6.234163284301758}

In [9]:
train_boost.save_model('models/xgboost_new.json')