# Imports and modules

In [41]:
import os
import requests
import pandas as pd
import time 
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

# Extract data from CSV

## Function to extract data from CSV

In [42]:
def get_data(path, file_names):
    """Load several CSV files and stack them into a single DataFrame.

    Every file is read from ``../<path>/<file_name>.csv`` (relative to the
    current working directory) and receives a ``type`` column holding the
    name of the file it came from.

    Parameters
    ----------
    path : str
        Folder containing the CSV files, relative to the parent of the
        working directory.
    file_names : iterable of str
        File names without the ``.csv`` extension.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files. Each file keeps its own row
        labels, so index labels can repeat in the result.
    """
    def _load_one(file_name):
        # Read one file and tag every row with the file it was read from.
        frame = pd.read_csv(f'../{path}/{file_name}.csv')
        frame['type'] = f'{file_name}'
        return frame

    return pd.concat([_load_one(file_name) for file_name in file_names])

# Folder that holds the raw CSV files (get_data resolves it one level above the working directory).
path = 'data/raw_data'

# One CSV per entry; get_data also writes each name into the 'type' column.
file_names = [
    'trawlers',
    'drifting_longlines',
    'fixed_gear',
    'pole_and_line',
    'purse_seines',
    'trollers',
    'unknown',
]

# Read every file and stack the rows into a single frame.
df = get_data(path, file_names)

# Preview the first rows.
df.head()

In [None]:
# Number of distinct values in the 'mmsi' column.
df['mmsi'].nunique()

## Remove unknown (-1 in is_fishing column)

In [None]:
# Discard the rows labelled -1 ("unknown" per the section title); every value above -1 stays.
is_labelled = df['is_fishing'] > -1
df = df.loc[is_labelled]

In [None]:
# Row count for each remaining 'is_fishing' value.
df['is_fishing'].value_counts()

## OHE 'type'

In [None]:
# One-hot encode the 'type' column; dense output so the result can be assigned straight into the frame.
ohe = OneHotEncoder(sparse_output=False)

# Fit and transform in a single pass over the column.
type_encoded = ohe.fit_transform(df[['type']])

# Add one indicator column per distinct 'type' value, named by the encoder.
df[ohe.get_feature_names_out()] = type_encoded

# Drop 'type' now that it has been encoded, together with the 'source' column.
df.drop(columns=["type", "source"], inplace=True)

## Converting is_fishing to Binary (0 or 1)

In [None]:
# Work on an independent copy: a plain `df_fishing = df` only creates a second
# name for the same object, so the in-place edits made to df_fishing in later
# cells would silently rewrite `df` as well.
df_fishing = df.copy()

# Distribution of the label values before rounding.
df_fishing['is_fishing'].value_counts()

In [None]:
# Round each label to a whole number so that values between 0 and 1 collapse to 0 or 1.
rounded_labels = round(df_fishing['is_fishing'])
df_fishing.loc[:, 'is_fishing'] = rounded_labels

In [None]:
# Tally how many rows carry each 'is_fishing' value after rounding
df_fishing['is_fishing'].value_counts()

In [None]:
# Preview the first three rows.
df_fishing.head(3)

# Date Engineering

## Converting timestamp to datetime format

In [None]:
# Interpret the 'timestamp' values as seconds since the Unix epoch and store them as datetimes.
epoch_seconds = df_fishing['timestamp']
df_fishing['timestamp'] = pd.to_datetime(epoch_seconds, unit='s')

# Preview the converted column on the first two rows.
df_fishing.head(2)

In [None]:
# The column now holds datetimes, so rename it from 'timestamp' to 'date' (in place).
column_renames = {"timestamp": "date"}
df_fishing.rename(columns=column_renames, inplace=True)
df_fishing.head()

In [None]:
# Break the 'date' column into calendar parts through the .dt accessor.
date_parts = df_fishing['date'].dt

df_fishing['year'] = date_parts.year
df_fishing['month'] = date_parts.month              # 1-12
df_fishing['day_of_week'] = date_parts.day_of_week  # 0-6, Monday is 0

df_fishing.head()

## Encoding the day of the week as an angle (sine and cosine)

In [None]:
# Place the weekday number on a circle with a 7-day period, so day 6 and day 0 end up next to each other.
weekday_angle = df_fishing['day_of_week'] * (2 * np.pi / 7)

# Store the angle as its sine and cosine components.
df_fishing['day_of_week_sin'] = np.sin(weekday_angle)
df_fishing['day_of_week_cos'] = np.cos(weekday_angle)

df_fishing.head()

## UTC converter

In [None]:
import pytz
from datetime import datetime

def get_utc_offset_from_longitude(longitude):
    """Approximate the UTC offset, in hours, from a longitude.

    Uses 15-degree-wide zones centred on multiples of 15 degrees (zone 0
    spans -7.5 up to, but not including, +7.5). Political time-zone borders
    and daylight saving time are deliberately ignored, so the result depends
    only on ``longitude`` and not on the date the notebook is run.

    The previous implementation ignored ``longitude`` entirely and returned
    the current offset of the first US timezone for every row.

    Parameters
    ----------
    longitude : float
        Longitude in decimal degrees, east positive. Expected to lie in
        [-180, 180]; values outside that range are clamped to the +/-12
        hour zones.

    Returns
    -------
    float
        Whole-hour offset between -12.0 and 12.0, or NaN when ``longitude``
        is NaN.
    """
    longitude = float(longitude)

    # NaN is the only float that is not equal to itself; pass it through
    # because min()/max() below would otherwise turn it into a real offset.
    if longitude != longitude:
        return float('nan')

    # Shift by half a zone so that floor division picks the nearest zone centre.
    zone = (longitude + 7.5) // 15.0

    return max(-12.0, min(12.0, zone))

# Run the converter on every value of the 'lon' column and keep the result as a new column.
longitudes = df_fishing['lon']
df_fishing['utc_offset'] = longitudes.apply(get_utc_offset_from_longitude)

df_fishing.head()

## Encoding dates

In [None]:
# One-hot encode 'day_of_week' with a fresh encoder (dense output).
ohe = OneHotEncoder(sparse_output=False)

# Fit and transform in a single pass over the column.
day_of_week_encoded = ohe.fit_transform(df_fishing[['day_of_week']])

# Add one indicator column per distinct weekday value, named by the encoder.
df_fishing[ohe.get_feature_names_out()] = day_of_week_encoded

# The numeric 'day_of_week' column is dropped once it has been encoded.
df_fishing.drop(columns=["day_of_week"], inplace=True)

In [None]:
# Preview the first rows of the frame at this point.
df_fishing.head()

In [None]:
# Missing-value count per column.
df_fishing.isna().sum()

In [None]:
# Number of rows recorded for each 'mmsi' value.
mmsi_counts = df_fishing['mmsi'].value_counts()

# True for the mmsi values that appear in more than 20 rows.
mask = mmsi_counts > 20

# The mmsi values that pass that threshold.
selected_mmsi = mmsi_counts.loc[mask].index

# Keep only the rows belonging to those mmsi values.
# NOTE(review): the cells visible after this one keep using df_fishing rather than
# filtered_fishing_df, so confirm whether this filter is meant to feed the model.
keep_rows = df_fishing['mmsi'].isin(selected_mmsi)
filtered_fishing_df = df_fishing[keep_rows]

# Distinct-value count per column of the filtered frame.
filtered_fishing_df.nunique()

## Defining X features and y target

In [None]:
# Missing-value count per column, checked before rows with NaN are dropped.
df_fishing.isna().sum()

In [None]:
# Build the cleaned frame: remove every row that has at least one missing value.
df_fishing_clean = df_fishing.dropna(axis=0, how='any')

In [None]:
# (rows, columns) of the cleaned frame.
df_fishing_clean.shape

In [None]:
# Target: the 'is_fishing' label.
y = df_fishing_clean['is_fishing']

# Features: every column except the label, the datetime column and the UTC offset.
non_feature_columns = ['date', 'is_fishing', 'utc_offset']
X = df_fishing_clean.drop(columns=non_feature_columns)

In [None]:
# (rows, columns) of the feature matrix.
X.shape

In [None]:
# Shape of the target; its length should match the row count of X.
y.shape

# Correlation

## Pearson correlation

In [None]:
from sklearn.feature_selection import r_regression
# Feature names, in the same order as the columns passed to r_regression.
col_names = list(X.columns)

# Pearson's r between each feature column and the target.
corr = r_regression(X, y)

# One row per feature, indexed by feature name.
df_corr = pd.DataFrame(data=corr, index=col_names)
df_corr

## Heatmap

In [None]:
import matplotlib.pyplot as mp
import pandas as pd
import seaborn as sb

In [None]:
# Heatmap input: the cleaned frame without the datetime and UTC-offset columns.
heatmap_excluded = ['date', 'utc_offset']
data = df_fishing_clean.drop(columns=heatmap_excluded)

In [None]:
# Pairwise correlation between the columns of `data`.
correlation_matrix = data.corr()

# Draw the matrix as a heatmap.
dataplot = sb.heatmap(correlation_matrix)

# Render the figure.
mp.show()

# Split between train set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=88,
)

# Standard scale

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only and scale them in the same step.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Reuse the statistics learned on the training rows for the test rows.
X_test_scaled = scaler.transform(X_test)

# Logistic Regression

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate, learning_curve, train_test_split
from sklearn.preprocessing import PolynomialFeatures

# Train on the standardised training split only. Fitting on the full, unscaled
# X / y would leave the train/test split and the scaler unused and would let the
# model see the rows it is later scored on.
model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)

# Score the model on the held-out test split.
# LogisticRegression will default scoring to accuracy.
model.score(X_test_scaled, y_test)

# Export to CSV for quick access

In [None]:
output_folder = '../data/preprocessed'
output_file = 'preproc.csv'

# Create the destination folder when it is missing; to_csv does not create
# directories and would otherwise fail on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)

# Construct the full path
output_path = f'{output_folder}/{output_file}'

# Save the DataFrame to the specified path (without the row index)
df_fishing.to_csv(output_path, index=False)

# Map data on world map with geopandas

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon

%matplotlib inline

In [None]:
# One shapely Point per row, built from the longitude (x) and latitude (y) columns.
geometry = [Point(xy) for xy in zip(df_fishing['lon'], df_fishing['lat'])]

# Coordinate reference system given as an authority string. The older
# {'init': 'epsg:4326'} dict form is deprecated and emits a FutureWarning.
crs = 'EPSG:4326'

geo_df = gpd.GeoDataFrame(
    df_fishing,         # attribute data
    crs=crs,            # coordinate reference system
    geometry=geometry,  # the list of points created above
)
geo_df.head()

## Mapping one boat

In [None]:
# Row label of the record whose boat should be mapped.
# NOTE(review): 60649 is a hard-coded label; confirm it still exists after the filtering steps.
boat_row_label = 60649

# Look the label up through the public indexing API. get_data concatenates the
# CSV files without resetting the index, so a label can occur more than once;
# taking the first match always yields a single scalar mmsi.
boat_matches = geo_df.loc[geo_df.index == boat_row_label, 'mmsi']
if boat_matches.empty:
    raise KeyError(boat_row_label)
boat = boat_matches.iloc[0]

# All rows recorded for that boat.
one_boat = geo_df.loc[geo_df['mmsi'] == boat]

In [None]:
# Getting world map data from geo pandas
# NOTE(review): gpd.datasets was deprecated in GeoPandas 0.14 and removed in 1.0;
# on newer versions this needs a locally stored Natural Earth file instead.
worldmap = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))

# Creating axes and plotting world map
fig, ax = plt.subplots(figsize=(16, 10))
worldmap.plot(color="lightgrey", ax=ax)

# Plotting longitudes and latitudes of one boat. Dedicated names are used
# instead of x / y so the model target `y` defined earlier is not overwritten.
# No cmap is passed: without a `c` argument matplotlib ignores it.
boat_lon = one_boat['lon']
boat_lat = one_boat['lat']
ax.scatter(boat_lon, boat_lat)

# Creating axis limits, labels and title
ax.set_xlim(-180, 180)
ax.set_ylim(-90, 90)
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title("Recorded positions of one boat")
plt.show()

## Mapping all boats

In [None]:
# Creating axes and plotting world map
fig, ax = plt.subplots(figsize=(16, 10))
worldmap.plot(color="lightgrey", ax=ax)

# Plotting longitudes and latitudes of all boats, coloured by the 'mmsi' value
# through the 'autumn' colormap. Dedicated names are used instead of x / y so
# the model target `y` defined earlier is not overwritten.
all_lon = geo_df['lon']
all_lat = geo_df['lat']
boats = geo_df['mmsi']
ax.scatter(all_lon, all_lat, c=boats, cmap='autumn')

# Creating axis limits, labels and title
ax.set_xlim(-180, 180)
ax.set_ylim(-90, 90)
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_title("Recorded positions of all boats")
plt.show()