# Imports and modules

In [1]:
import os
import requests
import pandas as pd
import time 
import numpy as np
import pytz
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import descartes
import geopandas as gpd

from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, learning_curve, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import RandomizedSearchCV, train_test_split, cross_validate
from scipy.stats import randint
from sklearn import svm
from sklearn.feature_selection import r_regression
from sklearn.neighbors import KNeighborsClassifier
from shapely.geometry import Point, Polygon

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Extract from CSV and Preprocessing

## Function to extract data from CSV

In [2]:
def get_data(path, file_names):
    tables=[]
    for file_name in file_names:
        table = pd.read_csv(f'../{path}/{file_name}.csv')
        table['type']=f'{file_name}'
        tables.append(table)
    return pd.concat(tables)

path='data/raw_data'
file_names = ['trawlers', 'drifting_longlines', 'fixed_gear', 'pole_and_line', 'purse_seines', 'trollers', 'unknown']

df = get_data(path, file_names)

df.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,type
0,1252340000000.0,1325376000.0,0.0,0.0,0.0,153.0,52.458649,4.5812,-1.0,gfw,trawlers
1,1252340000000.0,1325378000.0,0.0,0.0,0.0,153.0,52.458668,4.581167,-1.0,gfw,trawlers
2,1252340000000.0,1325379000.0,0.0,0.0,0.0,153.0,52.458633,4.581183,-1.0,gfw,trawlers
3,1252340000000.0,1325380000.0,0.0,0.0,0.0,153.0,52.458649,4.581234,-1.0,gfw,trawlers
4,1252340000000.0,1325381000.0,0.0,0.0,0.0,153.0,52.458649,4.581183,-1.0,gfw,trawlers


In [3]:
df['mmsi'].nunique()

354

## Remove unknown (-1 in is_fishing column)

In [4]:
df = df.loc[df['is_fishing'] > -1]

In [5]:
df['is_fishing'].value_counts()

is_fishing
0.000000    295979
1.000000    247498
0.666667      4806
0.333333      4096
0.750000       752
0.250000       670
0.800000        33
0.166667        12
0.400000         9
Name: count, dtype: int64

## Converting is_fishing to Binary (0 or 1)

In [6]:
df_fishing = df

# round the decimals so that number becomes 0 or 1
df_fishing.loc[:, ('is_fishing')] = round(df_fishing.loc[:, ('is_fishing')])

# check the unique values
df_fishing['is_fishing'].value_counts()

is_fishing
0.0    300766
1.0    253089
Name: count, dtype: int64

In [7]:
df_fishing.head()

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,type
60646,1252340000000.0,1420089000.0,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,trawlers
60647,1252340000000.0,1420090000.0,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,trawlers
60648,1252340000000.0,1420090000.0,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,trawlers
60649,1252340000000.0,1420091000.0,0.0,0.0,0.0,128.0,52.458698,4.581234,0.0,gfw,trawlers
60650,1252340000000.0,1420092000.0,0.0,0.0,0.0,128.0,52.458683,4.581183,0.0,gfw,trawlers


# Date Engineering

## Converting timestamp to datetime format

In [8]:
df_fishing['date'] = pd.to_datetime(df_fishing['timestamp'], unit='s')

In [9]:
df_fishing.drop(columns =['timestamp'], inplace=True)

In [10]:
# Convert date to numerical values
df_fishing['month'] = df_fishing['date'].dt.month
df_fishing['day_of_week'] = df_fishing['date'].dt.day_of_week
df_fishing.head()

Unnamed: 0,mmsi,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,type,date,month,day_of_week
60646,1252340000000.0,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,trawlers,2015-01-01 05:08:23,1,3
60647,1252340000000.0,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,trawlers,2015-01-01 05:20:34,1,3
60648,1252340000000.0,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,trawlers,2015-01-01 05:32:53,1,3
60649,1252340000000.0,0.0,0.0,0.0,128.0,52.458698,4.581234,0.0,gfw,trawlers,2015-01-01 05:45:23,1,3
60650,1252340000000.0,0.0,0.0,0.0,128.0,52.458683,4.581183,0.0,gfw,trawlers,2015-01-01 05:57:24,1,3


## Encoding dates

In [11]:
# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse_output =False)

# Fit encoder
ohe.fit(df_fishing[['month']])

# Transform the current "Alley" column
df_fishing[ohe.get_feature_names_out()] = ohe.transform(df_fishing[['month']])

# Drop the column "Alley" which has been encoded
df_fishing.drop(columns = ["month"], inplace = True)

## Using Angular distance for the days of the week

In [12]:
df_fishing['day_of_week_sin'] = np.sin(df_fishing['day_of_week'] * (2 * np.pi / 7))
df_fishing['day_of_week_cos'] = np.cos(df_fishing['day_of_week'] * (2 * np.pi / 7))
df_fishing.head()

Unnamed: 0,mmsi,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,type,...,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,day_of_week_sin,day_of_week_cos
60646,1252340000000.0,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,trawlers,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.433884,-0.900969
60647,1252340000000.0,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,trawlers,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.433884,-0.900969
60648,1252340000000.0,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,trawlers,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.433884,-0.900969
60649,1252340000000.0,0.0,0.0,0.0,128.0,52.458698,4.581234,0.0,gfw,trawlers,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.433884,-0.900969
60650,1252340000000.0,0.0,0.0,0.0,128.0,52.458683,4.581183,0.0,gfw,trawlers,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.433884,-0.900969


## Removing boats with less than a number of occurences

In [13]:
df_fishing.dropna(inplace=True)
df_fishing.isna().sum()

mmsi                   0
distance_from_shore    0
distance_from_port     0
speed                  0
course                 0
lat                    0
lon                    0
is_fishing             0
source                 0
type                   0
date                   0
day_of_week            0
month_1                0
month_2                0
month_3                0
month_4                0
month_5                0
month_6                0
month_7                0
month_8                0
month_9                0
month_10               0
month_11               0
month_12               0
day_of_week_sin        0
day_of_week_cos        0
dtype: int64

In [14]:
# Calculate the value counts of 'mmsi'
mmsi_counts = df_fishing['mmsi'].value_counts()

# Create a boolean mask for filtering mmsi values with counts less than or equal to 10
mask = mmsi_counts > 10

# Get the mmsi values that meet the condition
selected_mmsi = mmsi_counts[mask].index

# Use the isin() method to filter the DataFrame based on selected_mmsi
filtered_fishing_df = df_fishing[df_fishing['mmsi'].isin(selected_mmsi)]
filtered_fishing_df.nunique()

mmsi                      299
distance_from_shore     79306
distance_from_port      95719
speed                     240
course                   3602
lat                    363344
lon                    372708
is_fishing                  2
source                      6
type                        7
date                   545940
day_of_week                 7
month_1                     2
month_2                     2
month_3                     2
month_4                     2
month_5                     2
month_6                     2
month_7                     2
month_8                     2
month_9                     2
month_10                    2
month_11                    2
month_12                    2
day_of_week_sin             7
day_of_week_cos             7
dtype: int64

# Defining X - Features and y - Target

In [15]:
# Defining X - the features and Y - the target
fishing = filtered_fishing_df[filtered_fishing_df['is_fishing']==1]
X = fishing .drop(columns=['mmsi','date', 'source', 'type'])
y = fishing['type']

In [16]:
X.shape, y.shape

((253083, 22), (253083,))

In [17]:
corr = r_regression(X, y)
col_names = list(X.columns)
df_corr = pd.DataFrame()
df_corr ['feature'] = col_names
df_corr ['corr'] = corr
df_corr ['abs_corr'] = np.abs(df_corr ['corr'])
len(df_corr[df_corr['abs_corr']>0]), len(df_corr[df_corr['abs_corr']>0.02])

TypeError: ufunc 'divide' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
from sklearn.preprocessing import LabelEncoder

# Instantiate the LabelEncoder
label_encoder = LabelEncoder()

# Fit it to the target
label_encoder.fit(y) # If in dataframe df['target']

# Transform the targets
encoded_target = label_encoder.transform(y) # If in dataframe df['target']
encoded_target

# Split between train set and test set

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, encoded_target, test_size=0.3, random_state=88)

# Standard scaler

In [None]:
# Step 0 - Instantiate and fit Standard Scaler
scaler = StandardScaler()
scaler.fit(X_train)

# Step 1 - Scale/Transform
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Boat classification

In [None]:
# Fitting Random Forest Classification to the Training set
classifier = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 42)
classifier.fit(X_train_scaled, y_train)

In [None]:
y_pred = classifier.predict(X_test)
classifier.score(X_train_scaled,y_train)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
accuracy

In [None]:
print(classification_report(y_test, y_pred))