# Imports and modules

In [1]:
import os
import requests
import pandas as pd
import time 
import numpy as np
import pytz
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import descartes
import geopandas as gpd

from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, learning_curve, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_validate
from scipy.stats import randint
from sklearn import svm
from sklearn.feature_selection import r_regression
from sklearn.neighbors import KNeighborsClassifier
from shapely.geometry import Point, Polygon

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Extract from CSV and Preprocessing

## Function to extract data from CSV

In [2]:
def get_data(path, file_names):
    tables=[]
    for file_name in file_names:
        table = pd.read_csv(f'../{path}/{file_name}.csv')
        table['type']=f'{file_name}'
        tables.append(table)
    return pd.concat(tables)

path='data/raw_data'
file_names = ['trawlers', 'drifting_longlines', 'fixed_gear', 'pole_and_line', 'purse_seines', 'trollers', 'unknown']

df = get_data(path, file_names)

df.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,type
0,1252340000000.0,1325376000.0,0.0,0.0,0.0,153.0,52.458649,4.5812,-1.0,gfw,trawlers
1,1252340000000.0,1325378000.0,0.0,0.0,0.0,153.0,52.458668,4.581167,-1.0,gfw,trawlers
2,1252340000000.0,1325379000.0,0.0,0.0,0.0,153.0,52.458633,4.581183,-1.0,gfw,trawlers
3,1252340000000.0,1325380000.0,0.0,0.0,0.0,153.0,52.458649,4.581234,-1.0,gfw,trawlers
4,1252340000000.0,1325381000.0,0.0,0.0,0.0,153.0,52.458649,4.581183,-1.0,gfw,trawlers


In [3]:
df['mmsi'].nunique()

354

## Remove unknown (-1 in is_fishing column)

In [3]:
df = df.loc[df['is_fishing'] > -1]

In [4]:
df['is_fishing'].value_counts()

is_fishing
0.000000    295979
1.000000    247498
0.666667      4806
0.333333      4096
0.750000       752
0.250000       670
0.800000        33
0.166667        12
0.400000         9
Name: count, dtype: int64

## Converting is_fishing to Binary (0 or 1)

In [41]:
df_fishing = df

In [42]:
# round the decimals so that number becomes 0 or 1
df_fishing.loc[:, ('is_fishing')] = round(df_fishing.loc[:, ('is_fishing')])

In [43]:
# check the unique values
df_fishing['is_fishing'].value_counts()

is_fishing
0.0    300766
1.0    253089
Name: count, dtype: int64

In [44]:
df_fishing.head(3)

Unnamed: 0,mmsi,date,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,...,day_of_week_sin,day_of_week_cos,utc_offset,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
60646,1252340000000.0,2015-01-01 05:08:23,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
60647,1252340000000.0,2015-01-01 05:20:34,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
60648,1252340000000.0,2015-01-01 05:32:53,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# Date Engineering

## Converting timestamp to datetime format

In [45]:
df_fishing['timestamp'] = pd.to_datetime(df_fishing['timestamp'], unit='s')
df_fishing.head(2)

KeyError: 'timestamp'

In [None]:
df_fishing.rename(columns={"timestamp": "date"}, inplace=True)
df_fishing.head()

Unnamed: 0,mmsi,date,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,type
60646,1252340000000.0,2015-01-01 05:08:23,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,trawlers
60647,1252340000000.0,2015-01-01 05:20:34,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,trawlers
60648,1252340000000.0,2015-01-01 05:32:53,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,trawlers
60649,1252340000000.0,2015-01-01 05:45:23,0.0,0.0,0.0,128.0,52.458698,4.581234,0.0,gfw,trawlers
60650,1252340000000.0,2015-01-01 05:57:24,0.0,0.0,0.0,128.0,52.458683,4.581183,0.0,gfw,trawlers


In [46]:
df_fishing['year'] = df_fishing['date'].dt.year
# 12 columns for month
df_fishing['month'] = df_fishing['date'].dt.month
#df_fishing['day'] = df_fishing['date'].dt.day
# 7 columns for days
df_fishing['day_of_week'] = df_fishing['date'].dt.day_of_week
#df_fishing['day_of_year'] = df_fishing['date'].dt.day_of_year
df_fishing.head()

Unnamed: 0,mmsi,date,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,...,day_of_week_cos,utc_offset,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week
60646,1252340000000.0,2015-01-01 05:08:23,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60647,1252340000000.0,2015-01-01 05:20:34,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60648,1252340000000.0,2015-01-01 05:32:53,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60649,1252340000000.0,2015-01-01 05:45:23,0.0,0.0,0.0,128.0,52.458698,4.581234,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60650,1252340000000.0,2015-01-01 05:57:24,0.0,0.0,0.0,128.0,52.458683,4.581183,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3


## Using Angular distance for the days of the week

In [47]:
df_fishing['day_of_week_sin'] = np.sin(df_fishing['day_of_week'] * (2 * np.pi / 7))
df_fishing['day_of_week_cos'] = np.cos(df_fishing['day_of_week'] * (2 * np.pi / 7))
df_fishing.head()

Unnamed: 0,mmsi,date,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,...,day_of_week_cos,utc_offset,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week
60646,1252340000000.0,2015-01-01 05:08:23,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60647,1252340000000.0,2015-01-01 05:20:34,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60648,1252340000000.0,2015-01-01 05:32:53,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60649,1252340000000.0,2015-01-01 05:45:23,0.0,0.0,0.0,128.0,52.458698,4.581234,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60650,1252340000000.0,2015-01-01 05:57:24,0.0,0.0,0.0,128.0,52.458683,4.581183,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3


## UTC converter

In [48]:
def get_utc_offset_from_longitude(longitude):
    timezone = pytz.timezone(pytz.country_timezones("US")[0])  # You can replace "US" with the appropriate country code
    now = datetime.now(timezone)
    utc_offset = now.utcoffset().total_seconds() / 3600
    return utc_offset

df_fishing['utc_offset'] = df_fishing['lon'].apply(get_utc_offset_from_longitude)
df_fishing.head()

Unnamed: 0,mmsi,date,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,...,day_of_week_cos,utc_offset,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6,day_of_week
60646,1252340000000.0,2015-01-01 05:08:23,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60647,1252340000000.0,2015-01-01 05:20:34,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60648,1252340000000.0,2015-01-01 05:32:53,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60649,1252340000000.0,2015-01-01 05:45:23,0.0,0.0,0.0,128.0,52.458698,4.581234,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3
60650,1252340000000.0,2015-01-01 05:57:24,0.0,0.0,0.0,128.0,52.458683,4.581183,0.0,gfw,...,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,3


## Encoding dates

In [49]:
# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse_output =False)

# Fit encoder
ohe.fit(df_fishing[['day_of_week']])

# Transform the current "Alley" column
df_fishing[ohe.get_feature_names_out()] = ohe.transform(df_fishing[['day_of_week']])

# Drop the column "Alley" which has been encoded
df_fishing.drop(columns = ["day_of_week"], inplace = True)

In [50]:
df_fishing.head()

Unnamed: 0,mmsi,date,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,...,day_of_week_sin,day_of_week_cos,utc_offset,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
60646,1252340000000.0,2015-01-01 05:08:23,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
60647,1252340000000.0,2015-01-01 05:20:34,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
60648,1252340000000.0,2015-01-01 05:32:53,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
60649,1252340000000.0,2015-01-01 05:45:23,0.0,0.0,0.0,128.0,52.458698,4.581234,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
60650,1252340000000.0,2015-01-01 05:57:24,0.0,0.0,0.0,128.0,52.458683,4.581183,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [51]:
df_fishing.isna().sum()

mmsi                   0
date                   0
distance_from_shore    0
distance_from_port     0
speed                  2
course                 2
lat                    0
lon                    0
is_fishing             0
source                 0
type                   0
year                   0
month                  0
day_of_week_sin        0
day_of_week_cos        0
utc_offset             0
day_of_week_0          0
day_of_week_1          0
day_of_week_2          0
day_of_week_3          0
day_of_week_4          0
day_of_week_5          0
day_of_week_6          0
dtype: int64

In [52]:
# Calculate the value counts of 'mmsi'
mmsi_counts = df_fishing['mmsi'].value_counts()

# Create a boolean mask for filtering mmsi values with counts less than or equal to 20
mask = mmsi_counts > 20

# Get the mmsi values that meet the condition
selected_mmsi = mmsi_counts[mask].index

# Use the isin() method to filter the DataFrame based on selected_mmsi
filtered_fishing_df = df_fishing[df_fishing['mmsi'].isin(selected_mmsi)]
filtered_fishing_df.nunique()

mmsi                      276
date                   545599
distance_from_shore     79297
distance_from_port      95687
speed                     240
course                   3602
lat                    363029
lon                    372401
is_fishing                  2
source                      6
type                        7
year                        5
month                      12
day_of_week_sin             7
day_of_week_cos             7
utc_offset                  1
day_of_week_0               2
day_of_week_1               2
day_of_week_2               2
day_of_week_3               2
day_of_week_4               2
day_of_week_5               2
day_of_week_6               2
dtype: int64

## Defining X features and y target

In [53]:
df_fishing.isna().sum()

mmsi                   0
date                   0
distance_from_shore    0
distance_from_port     0
speed                  2
course                 2
lat                    0
lon                    0
is_fishing             0
source                 0
type                   0
year                   0
month                  0
day_of_week_sin        0
day_of_week_cos        0
utc_offset             0
day_of_week_0          0
day_of_week_1          0
day_of_week_2          0
day_of_week_3          0
day_of_week_4          0
day_of_week_5          0
day_of_week_6          0
dtype: int64

In [54]:
# Dropping rows with NAN values
df_fishing_clean = df_fishing.dropna()

In [75]:
df_fishing_clean.head(3)

Unnamed: 0,mmsi,date,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,...,day_of_week_sin,day_of_week_cos,utc_offset,day_of_week_0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
60646,1252340000000.0,2015-01-01 05:08:23,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
60647,1252340000000.0,2015-01-01 05:20:34,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
60648,1252340000000.0,2015-01-01 05:32:53,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,...,0.433884,-0.900969,-4.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [56]:
df_fishing_clean.columns

Index(['mmsi', 'date', 'distance_from_shore', 'distance_from_port', 'speed',
       'course', 'lat', 'lon', 'is_fishing', 'source', 'type', 'year', 'month',
       'day_of_week_sin', 'day_of_week_cos', 'utc_offset', 'day_of_week_0',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6'],
      dtype='object')

# Defining X - Features and y - Target

In [58]:
# Defining X - the features and Y - the target
X = df_fishing_clean.drop(columns=['mmsi','date', 'source', 'type', 'utc_offset'])
y = df_fishing_clean['type']

In [59]:
X.shape

(553853, 18)

In [60]:
y.shape

(553853,)

In [61]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the LabelEncoder
label_encoder = LabelEncoder()

# Fit it to the target
label_encoder.fit(y) # If in dataframe df['target']

# Transform the targets
encoded_target = label_encoder.transform(y) # If in dataframe df['target']

In [62]:
encoded_target

array([4, 4, 4, ..., 6, 6, 6])

# Split between train set and test set

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, encoded_target, test_size=0.3, random_state=88)

# Standard scaler

In [65]:
# Step 0 - Instantiate and fit Standard Scaler
scaler = StandardScaler()
scaler.fit(X_train)

# Step 1 - Scale/Transform
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Boat classification

In [66]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train_scaled, y_train)

In [67]:
y_pred = classifier.predict(X_test)
classifier.score(X_train_scaled,y_train)

0.999891667977828

In [72]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.3544018873829413

In [73]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.49      0.63     65738
           1       0.19      0.00      0.00     12599
           2       0.00      0.00      0.00      1252
           3       0.00      0.00      0.00      6811
           4       0.29      0.20      0.23     53259
           5       0.00      0.00      0.00      2392
           6       0.17      0.66      0.28     24105

    accuracy                           0.35    166156
   macro avg       0.22      0.19      0.16    166156
weighted avg       0.47      0.35      0.36    166156

