# Imports and modules

In [1]:
import os
import requests
import pandas as pd
import time 
import numpy as np
import pytz
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import geopandas as gpd

from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_selection import r_regression
from sklearn.neighbors import KNeighborsClassifier
from shapely.geometry import Point
from sklearn.pipeline import make_pipeline

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Extract from CSV and Preprocessing

## Function to extract data from CSV

In [2]:
def get_data(path, file_names):
    tables=[]
    for file_name in file_names:
        table = pd.read_csv(f'../{path}/{file_name}.csv')
        table['type']=f'{file_name}'
        tables.append(table)
    return pd.concat(tables)

path='data/raw_data'
file_names = ['trawlers', 'drifting_longlines', 'fixed_gear', 'pole_and_line', 'purse_seines', 'trollers', 'unknown']
df = get_data(path, file_names)
df

Unnamed: 0,mmsi,timestamp,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,type
0,1.252340e+12,1.325376e+09,0.000000,0.000000,0.0,153.000000,52.458649,4.581200,-1.0,gfw,trawlers
1,1.252340e+12,1.325378e+09,0.000000,0.000000,0.0,153.000000,52.458668,4.581167,-1.0,gfw,trawlers
2,1.252340e+12,1.325379e+09,0.000000,0.000000,0.0,153.000000,52.458633,4.581183,-1.0,gfw,trawlers
3,1.252340e+12,1.325380e+09,0.000000,0.000000,0.0,153.000000,52.458649,4.581234,-1.0,gfw,trawlers
4,1.252340e+12,1.325381e+09,0.000000,0.000000,0.0,153.000000,52.458649,4.581183,-1.0,gfw,trawlers
...,...,...,...,...,...,...,...,...,...,...,...
6811547,2.698965e+14,1.479901e+09,16999.582031,33837.019531,5.5,142.600006,43.548107,13.784665,-1.0,gfw,unknown
6811548,2.698965e+14,1.479903e+09,19234.912109,38482.820312,5.7,157.899994,43.511894,13.812785,-1.0,gfw,unknown
6811549,2.698965e+14,1.479907e+09,18681.083984,47433.003906,5.4,284.899994,43.442558,13.845977,-1.0,gfw,unknown
6811550,2.698965e+14,1.479909e+09,17463.820312,40310.296875,5.3,319.000000,43.493008,13.807993,-1.0,gfw,unknown


In [3]:
df['mmsi'].nunique()

354

## Remove unknown (-1 in is_fishing column)

In [4]:
df = df.loc[df['is_fishing'] > -1]
df['is_fishing'].value_counts()

is_fishing
0.000000    295979
1.000000    247498
0.666667      4806
0.333333      4096
0.750000       752
0.250000       670
0.800000        33
0.166667        12
0.400000         9
Name: count, dtype: int64

## Converting is_fishing to Binary (0 or 1)

In [5]:
df_fishing = df.copy()

# round the decimals so that number becomes 0 or 1
df_fishing.loc[:, ('is_fishing')] = round(df_fishing.loc[:, ('is_fishing')])

# check the unique values
df_fishing['is_fishing'].value_counts()

is_fishing
0.0    300766
1.0    253089
Name: count, dtype: int64

## OHE 'type'

In [6]:
# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse_output =False)

# Fit encoder
ohe.fit(df[['type']])

# Transform the current "Alley" column
df[ohe.get_feature_names_out()] = ohe.transform(df[['type']])

# Drop the column "Alley" which has been encoded
df.drop(columns = ["type", "source"], inplace = True)

# Date Engineering

## Converting timestamp to datetime format

In [7]:
df_fishing['timestamp'] = pd.to_datetime(df_fishing['timestamp'], unit='s')
df_fishing.rename(columns={"timestamp": "date"}, inplace=True)
df_fishing['month'] = df_fishing['date'].dt.month
df_fishing['day_of_week'] = df_fishing['date'].dt.day_of_week
df_fishing.head()

Unnamed: 0,mmsi,date,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,type,month,day_of_week
60646,1252340000000.0,2015-01-01 05:08:23,0.0,0.0,0.0,128.0,52.458717,4.581316,0.0,gfw,trawlers,1,3
60647,1252340000000.0,2015-01-01 05:20:34,0.0,0.0,0.0,128.0,52.458733,4.581316,0.0,gfw,trawlers,1,3
60648,1252340000000.0,2015-01-01 05:32:53,0.0,0.0,0.0,128.0,52.458698,4.581267,0.0,gfw,trawlers,1,3
60649,1252340000000.0,2015-01-01 05:45:23,0.0,0.0,0.0,128.0,52.458698,4.581234,0.0,gfw,trawlers,1,3
60650,1252340000000.0,2015-01-01 05:57:24,0.0,0.0,0.0,128.0,52.458683,4.581183,0.0,gfw,trawlers,1,3


## Using Angular distance for the days of the week

In [8]:
df_fishing['day_of_week_sin'] = np.sin(df_fishing['day_of_week'] * (2 * np.pi / 7))
df_fishing['day_of_week_cos'] = np.cos(df_fishing['day_of_week'] * (2 * np.pi / 7))

## Encoding dates

In [9]:
# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse_output =False)

# Fit encoder
ohe.fit(df_fishing[['day_of_week']])

# Transform the current "day_of_week" column
df_fishing[ohe.get_feature_names_out()] = ohe.transform(df_fishing[['day_of_week']])

# Drop the column "day_of_week" which has been encoded
df_fishing.drop(columns = ["day_of_week"], inplace = True)

In [10]:
# Instantiate the OneHotEncoder
ohe = OneHotEncoder(sparse_output =False)

# Fit encoder
ohe.fit(df_fishing[['month']])

# Transform the current "month" column
df_fishing[ohe.get_feature_names_out()] = ohe.transform(df_fishing[['month']])

# Drop the column "month" which has been encoded
df_fishing.drop(columns = ["month"], inplace = True)

In [11]:
# Calculate the value counts of 'mmsi'
mmsi_counts = df_fishing['mmsi'].value_counts()

# Create a boolean mask for filtering mmsi values with counts less than or equal to 20
mask = mmsi_counts > 10

# Get the mmsi values that meet the condition
selected_mmsi = mmsi_counts[mask].index

# Use the isin() method to filter the DataFrame based on selected_mmsi
filtered_fishing_df = df_fishing[df_fishing['mmsi'].isin(selected_mmsi)]
filtered_fishing_df.nunique()

mmsi                      299
date                   545942
distance_from_shore     79307
distance_from_port      95720
speed                     240
course                   3602
lat                    363346
lon                    372710
is_fishing                  2
source                      6
type                        7
day_of_week_sin             7
day_of_week_cos             7
day_of_week_0               2
day_of_week_1               2
day_of_week_2               2
day_of_week_3               2
day_of_week_4               2
day_of_week_5               2
day_of_week_6               2
month_1                     2
month_2                     2
month_3                     2
month_4                     2
month_5                     2
month_6                     2
month_7                     2
month_8                     2
month_9                     2
month_10                    2
month_11                    2
month_12                    2
dtype: int64

## Defining X features and y target

In [12]:
# Dropping rows with NAN values
df_fishing_clean = filtered_fishing_df.dropna()
df_fishing_clean.isna().sum()

mmsi                   0
date                   0
distance_from_shore    0
distance_from_port     0
speed                  0
course                 0
lat                    0
lon                    0
is_fishing             0
source                 0
type                   0
day_of_week_sin        0
day_of_week_cos        0
day_of_week_0          0
day_of_week_1          0
day_of_week_2          0
day_of_week_3          0
day_of_week_4          0
day_of_week_5          0
day_of_week_6          0
month_1                0
month_2                0
month_3                0
month_4                0
month_5                0
month_6                0
month_7                0
month_8                0
month_9                0
month_10               0
month_11               0
month_12               0
dtype: int64

# Defining X - Features and y - Target

In [13]:
# Defining X - the features and Y - the target
X = df_fishing_clean.drop(columns=['mmsi','date', 'is_fishing','source', 'type'])
y = df_fishing_clean['is_fishing']
X.shape, y.shape

((553702, 27), (553702,))

In [15]:
test = df_fishing_clean[df_fishing_clean['is_fishing']==1]
test.head()

Unnamed: 0,mmsi,date,distance_from_shore,distance_from_port,speed,course,lat,lon,is_fishing,source,...,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
61241,1252340000000.0,2015-01-10 04:22:47,3162.200195,24758.228516,2.1,358.0,58.266666,-6.2119,1.0,gfw,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61242,1252340000000.0,2015-01-10 04:36:06,2236.013184,24041.039062,1.6,156.0,58.261967,-6.21245,1.0,gfw,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61243,1252340000000.0,2015-01-10 04:48:48,999.975464,22671.011719,4.1,222.0,58.25552,-6.211983,1.0,gfw,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61244,1252340000000.0,2015-01-10 05:01:51,2999.92627,21212.683594,3.0,329.0,58.252148,-6.23255,1.0,gfw,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
61245,1252340000000.0,2015-01-10 05:14:37,3605.462891,22671.011719,1.2,35.0,58.257801,-6.2282,1.0,gfw,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
boat1 = test._get_value(61241, 'mmsi')
one_boat1 = df_fishing_clean.loc[df_fishing_clean['mmsi']== boat1]
one_boat1.shape

(3482, 32)

In [None]:
output_folder = '../data/'
output_file = 'boat_test.csv'

# Construct the full path
output_path = f'{output_folder}/{output_file}'

# Save the DataFrame to the specified path
one_boat1.to_csv(output_path, index=False)

# Correlation

## Pearson correlation

In [None]:
corr = r_regression(X, y)
col_names = list(X.columns)
df_corr = pd.DataFrame()
df_corr ['feature'] = col_names
df_corr ['corr'] = corr
df_corr ['abs_corr'] = np.abs(df_corr ['corr'])
numbers = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.10, 0.20]
no_len = []

for num in numbers:
    no_len.append(len(df_corr[df_corr['abs_corr']>num]))

print(no_len)

In [None]:
df_clean = df_corr[df_corr['abs_corr']>0.02]
names = df_clean['feature'].values.tolist()

In [None]:
X18 = df_fishing_clean[names]
y18 = df_fishing_clean['is_fishing']
X18.shape, y18.shape

## Heatmap

In [None]:
data = df_fishing_clean.drop(columns=['mmsi','date', 'is_fishing','source', 'type'])

In [None]:
# plotting correlation heatmap
dataplot=sns.heatmap(data.corr())
  
# displaying heatmap
plt.show()

# Split between train set and test set

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=88)

In [None]:
X_train.shape

In [None]:
X_test_with_MMSI = X_test.merge(df['mmsi'], left_index=True, right_index=True)

In [None]:
X_test_with_MMSI.head()

In [None]:
output_folder = '../data/'
output_file = 'X_test.csv'

# Construct the full path
output_path = f'{output_folder}/{output_file}'

# Save the DataFrame to the specified path
X_test_with_MMSI.to_csv(output_path, index=False)

In [None]:
X18_train, X18_test, y_train, y_test = train_test_split(X18, y, test_size=0.3, random_state=88)

## Standard Scaler

In [None]:
scale = StandardScaler()

# Logistic Regresssion

In [None]:
logistic_pipeline = make_pipeline(scale, LogisticRegression(max_iter=1000))
logistic_pipeline

In [None]:
# Train Pipeline
logistic_pipeline.fit(X_train,y_train)

# Make predictions
y_pred = logistic_pipeline.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f} Precision:{precision:.2f}")

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
cv_lr_results = cross_validate(logistic_pipeline,
                            X_test, 
                            y_test, 
                            cv=5,
                            scoring =['accuracy'])
cv_lr_results['test_accuracy'].mean()

# KNN Classifier

In [None]:
knn_pipeline = make_pipeline(scale, KNeighborsClassifier(n_neighbors=3))
knn_pipeline

In [None]:
# Train Pipeline
knn_pipeline.fit(X_train,y_train)
y_pred = knn_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f} Precision:{precision:.2f}")

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
cv_knn_results = cross_validate(knn_pipeline,
                            X_test, 
                            y_test, 
                            cv=5,
                            scoring =['accuracy'])
cv_knn_results['test_accuracy'].mean()

# Random forest

In [None]:
rff_pipeline = make_pipeline(scale, RandomForestClassifier())
rff_pipeline

In [None]:
# Train Pipeline
rff_pipeline.fit(X_train,y_train)
y_pred = rff_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f} Precision:{precision:.2f}")

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
cv_results = cross_validate(rff_pipeline,
                            X_test, 
                            y_test, 
                            cv=5,
                            scoring =['accuracy'])
cv_results['test_accuracy'].mean()

In [None]:
# Train Pipeline
rff_pipeline.fit(X18_train,y_train)
y18_pred = rff_pipeline.predict(X18_test)
accuracy = accuracy_score(y_test, y18_pred)
precision = precision_score(y_test, y18_pred)
print(f"Accuracy: {accuracy:.2f} Precision:{precision:.2f}")

In [None]:
print(classification_report(y_test, y18_pred))

In [None]:
cv_results25 = cross_validate(rff_pipeline,
                            X18_test, 
                            y_test, 
                            cv=5,
                            scoring =['accuracy'])
cv_results['test_accuracy'].mean()

# Export to CSV for quick access

In [None]:
output_folder = '../data/preprocessed'
output_file = 'preproc.csv'

# Construct the full path
output_path = f'{output_folder}/{output_file}'

# Save the DataFrame to the specified path
df_fishing.to_csv(output_path, index=False)

# Map data on world map with geopandas

In [None]:
geometry = [Point(xy) for xy in zip(df_fishing['lon'], df_fishing['lat'])]
crs = {'init':'epsg:4326'}
geo_df = gpd.GeoDataFrame(df_fishing, #specify our data
                          crs=crs, #specify our coordinate reference system
                          geometry=geometry) #specify the geometry list we created
geo_df.head()

## Mapping one boat

In [None]:
boat = geo_df._get_value(60649, 'mmsi')
one_boat = geo_df.loc[geo_df['mmsi']== boat]

In [None]:
# Getting world map data from geo pandas
worldmap = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

# Creating axes and plotting world map
fig, ax = plt.subplots(figsize=(16, 10))
worldmap.plot(color="lightgrey", ax=ax)

# Plotting Longitudes and Latitudes of one boat
x = one_boat['lon']
y = one_boat['lat']
plt.scatter(x, y, cmap='autumn')
plt.line(x, y)

# Creating axis limits and title
plt.xlim([-180, 180])
plt.ylim([-90, 90])

plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

## Mapping all boats

In [None]:
# Creating axes and plotting world map
fig, ax = plt.subplots(figsize=(16, 10))
worldmap.plot(color="lightgrey", ax=ax)

# Plotting Longitudes and Latitudes of one boat
x = geo_df['lon']
y = geo_df['lat']
boats = geo_df['mmsi']
plt.scatter(x, y, c=boats, cmap='autumn')

# Creating axis limits and title
plt.xlim([-180, 180])
plt.ylim([-90, 90])

plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.show()

In [None]:
df_fishing_clean.shape

In [None]:
df_fishing_clean['is_fishing'].value_counts()

In [None]:
grouped = df_fishing_clean.groupby('mmsi')

In [None]:
grouped['is_fishing'].value_counts()