A Random Forest model to predict weather a ship was fishing or not. The training data for the model was generated by Kristina Boerder at Dalhousie University. The data has AIS messages and labels for weather the ship was fishing or not and the type of fishing gear used.

### Generating the training dataset.
This repository by [Global Fishing Watch](https://github.com/GlobalFishingWatch/training-data) contains the dataset required to run the model. To generate the dataset:
1. Clone the repository.
2. Run `git lfs pull`. The dataset converted to numpy arrarys and pickled.
3. Run `./prepare.sh`.

In [55]:
import os
import numpy as np
import pandas as pd

import sys
sys.path.insert(0, '../')

import feature_generation
from feature_generation import ais_distance_to_shore

from matplotlib import pyplot as plt



#Suppressing scientific notation
pd.set_option('display.float_format', lambda x: '%.0f' % x)

In [56]:
df_purse_seiners = np.load("../../data/alex_crowd_sourced.npz")
data = df_purse_seiners['x']

data = pd.DataFrame(data, columns=data.dtype.names)
data = data[data.duplicated(keep='first')]
data.dropna(how='any', inplace=True)
data.replace({'is_fishing': {-1: 0}}, inplace=True) #Unknown labels (-1s) replaced by "Not fishing"

data.head()


Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing
1,16508054338972,1377727720,0,2236,0,31,-18,178,0
3,16508054338972,1377728011,0,2236,0,0,-18,178,0
5,16508054338972,1377733591,0,2236,0,210,-18,178,0
7,16508054338972,1377734251,0,2236,0,226,-18,178,0
9,16508054338972,1377740551,0,2236,0,144,-18,178,0


In [57]:
### Feature engineering
### Convert timestamps to Eastern Australian time-zone.

from datetime import datetime, timedelta
import pytz


def convert_epoch_to_datetime(timestamp):
    utc_dt = datetime.utcfromtimestamp(timestamp).replace(tzinfo=pytz.utc)
    au_tz = pytz.timezone('Australia/Sydney')
    au_dt = au_tz.normalize(utc_dt.astimezone(au_tz))
    return au_dt


data['timestamp'] = data['timestamp'].apply(lambda x: convert_epoch_to_datetime(x))
data['day'] = data['timestamp'].apply(lambda x: x.hour > 12)
data.head()

features = data[['distance_from_shore', 'speed', 'course', 'day']]

In [58]:
spire_data = pd.read_csv("../../../../shared/ais_data/ais_full_year/positions_2016_07000000000000")

df_spire = spire_data[spire_data.longitude.between(-180, 180, inclusive=True) & spire_data.latitude.between(-90, 90, inclusive=True)]

In [59]:
distance_from_shore = feature_generation.ais_distance_to_shore.calculate_distance_to_shore(df_spire['longitude'], df_spire['latitude'], country_name=True)

Loading coordinates (...)


In [60]:
#distance_from_shore = calculate_distance_to_shore(df_spire['longitude'], df_spire['latitude'], country_name=True)
df_spire['distance_from_shore'] = distance_from_shore['distance_km']
df_spire.dropna(how='any', inplace=True)
df_spire.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,msg_type,mmsi,timestamp,status,rot,speed,accuracy,longitude,latitude,course,heading,maneuver,distance_from_shore
0,1,371592000,2016-07-14 07:47:01 UTC,0,0,11,0,-106,17,294,293,0,0
1,1,432984000,2016-07-09 03:38:08 UTC,8,0,6,0,156,-3,179,180,0,0
2,1,510068000,2016-07-15 18:40:09 UTC,5,731,12,0,-174,4,100,511,0,0
3,3,657320000,2016-07-29 19:25:37 UTC,0,731,0,1,8,4,167,511,0,0
5,3,325304000,2016-07-17 18:42:34 UTC,1,731,0,1,73,4,276,511,0,0


In [63]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, data['is_fishing'], test_size=0.20, stratify=data.is_fishing)
print("Training and testing split was successful.")
X_train.head(), y_train.head()

Training and testing split was successful.


(          distance_from_shore  speed  course    day
 10053239              1621745      1     175   True
 14171873                87090      4       1   True
 26586708              1661612      5     195   True
 12200308              1031730      0       7  False
 25748268               584000      4     162  False, 10053239   0
 14171873   0
 26586708   0
 12200308   0
 25748268   0
 Name: is_fishing, dtype: float64)

In [64]:
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve, auc

def accuracy_score(truth, pred):
    """ Returns accuracy score for input truth and predictions. """
    
    # Ensure that the number of predictions matches number of outcomes
    if len(truth) == len(pred): 
        
        # Calculate and return the accuracy as a percent
        return (truth == pred).mean()*100
    
    else:
        return "Number of predictions does not match number of outcomes!"

    
def performance_metric(truth, pred):
    
    return precision_score(truth, pred)

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import ShuffleSplit


def fit_model(X, y):

    cv_sets = ShuffleSplit(test_size = 0.20, random_state = 0).get_n_splits(X)

    regressor = RandomForestClassifier()


    params = {'n_estimators': np.arange(50, 500, step=100)}

    scoring_fnc = make_scorer(accuracy_score)

    grid = GridSearchCV(regressor, params, scoring=scoring_fnc, cv=cv_sets)

    grid = grid.fit(X, y)

    return grid.best_estimator_



In [66]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

regressor = RandomForestClassifier(n_estimators=100)
regressor.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [67]:
predictions = regressor.predict(df_spire[['distance_from_shore', 'speed', 'course']])

ValueError: Number of features of the model must match the input. Model n_features is 4 and input n_features is 3 

In [None]:
df_spire['is_fishing'] = predictions
df_spire[df_spire['is_fishing'] == 1]

In [None]:
print regressor.feature_importances_

importances = pd.DataFrame({'feature':X_train.columns,'feature importance':np.round(regressor.feature_importances_,3)})
importances = importances.sort_values('feature importance',ascending=False).set_index('feature')
print importances
importances.plot.bar()
plt.show()

In [None]:
print accuracy_score(y_test, predictions)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def show_values(pc, fmt="%.2f", **kw):
    '''
    Heatmap with text in each cell with matplotlib's pyplot
    Source: https://stackoverflow.com/a/25074150/395857 
    '''
    from itertools import izip
    pc.update_scalarmappable()
    ax = pc.get_axes()
    for p, color, value in izip(pc.get_paths(), pc.get_facecolors(), pc.get_array()):
        x, y = p.vertices[:-2, :].mean(0)
        if np.all(color[:3] > 0.5):
            color = (0.0, 0.0, 0.0)
        else:
            color = (1.0, 1.0, 1.0)
        ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw)


def cm2inch(*tupl):
    '''
    Specify figure size in centimeter in matplotlib
    Source: https://stackoverflow.com/a/22787457/395857
    By gns-ank
    '''
    inch = 2.54
    if type(tupl[0]) == tuple:
        return tuple(i/inch for i in tupl[0])
    else:
        return tuple(i/inch for i in tupl)


def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels, figure_width=40, figure_height=20, correct_orientation=False, cmap='RdBu'):
    '''
    Inspired by:
    - https://stackoverflow.com/a/16124677/395857 
    - https://stackoverflow.com/a/25074150/395857
    '''

    # Plot it out
    fig, ax = plt.subplots()    
    #c = ax.pcolor(AUC, edgecolors='k', linestyle= 'dashed', linewidths=0.2, cmap='RdBu', vmin=0.0, vmax=1.0)
    c = ax.pcolor(AUC, edgecolors='k', linestyle= 'dashed', linewidths=0.2, cmap=cmap)

    # put the major ticks at the middle of each cell
    ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)

    # set tick labels
    #ax.set_xticklabels(np.arange(1,AUC.shape[1]+1), minor=False)
    ax.set_xticklabels(xticklabels, minor=False)
    ax.set_yticklabels(yticklabels, minor=False)

    # set title and x/y labels
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)      

    # Remove last blank column
    plt.xlim( (0, AUC.shape[1]) )

    # Turn off all the ticks
    ax = plt.gca()    
    for t in ax.xaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False
    for t in ax.yaxis.get_major_ticks():
        t.tick1On = False
        t.tick2On = False

    # Add color bar
    plt.colorbar(c)

    # Add text in each cell 
    show_values(c)

    # Proper orientation (origin at the top left instead of bottom left)
    if correct_orientation:
        ax.invert_yaxis()
        ax.xaxis.tick_top()       

    # resize 
    fig = plt.gcf()
    #fig.set_size_inches(cm2inch(40, 20))
    #fig.set_size_inches(cm2inch(40*4, 20*4))
    fig.set_size_inches(cm2inch(figure_width, figure_height))



def plot_classification_report(classification_report, title='Classification report ', cmap='RdBu'):
    '''
    Plot scikit-learn classification report.
    Extension based on https://stackoverflow.com/a/31689645/395857 
    '''
    lines = classification_report.split('\n')

    classes = []
    plotMat = []
    support = []
    class_names = []
    for line in lines[2 : (len(lines) - 2)]:
        t = line.strip().split()
        if len(t) < 2: continue
        classes.append(t[0])
        v = [float(x) for x in t[1: len(t) - 1]]
        support.append(int(t[-1]))
        class_names.append(t[0])
        print(v)
        plotMat.append(v)

    print('plotMat: {0}'.format(plotMat))
    print('support: {0}'.format(support))

    xlabel = 'Metrics'
    ylabel = 'Classes'
    xticklabels = ['Precision', 'Recall', 'F1-score']
    yticklabels = ['{0} ({1})'.format(class_names[idx], sup) for idx, sup  in enumerate(support)]
    figure_width = 25
    figure_height = len(class_names) + 7
    correct_orientation = False
    heatmap(np.array(plotMat), title, xlabel, ylabel, xticklabels, yticklabels, figure_width, figure_height, correct_orientation, cmap=cmap)


def main():

    plot_classification_report(classification_report(y_test, regressor.predict(X_test)))
    plt.savefig('test_plot_classif_report.png', dpi=200, format='png', bbox_inches='tight')
    plt.close()

if __name__ == "__main__":
    main()

In [None]:
from IPython.display import Image
Image(filename='test_plot_classif_report.png')