In [32]:
import warnings
warnings.filterwarnings('ignore')

In [33]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [34]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [59]:
# Column names expected in every dataset, one per vessel category plus a total.
# NOTE(review): the ml_data.csv preview in this notebook shows no "Total"
# column -- confirm whether it is still expected.
columns = [
    "Fishing",
    "TugTow",
    "Recreational",
    "Passenger",
    "Cargo",
    "Tanker",
    "Other",
    "Unavailable",
    "Total",
]

# Name of the target column for each per-category model, taken in the same
# order as `columns` so the two definitions cannot drift apart.
(
    target_fishing,
    target_tugtow,
    target_rec,
    target_passenger,
    target_cargo,
    target_tanker,
    target_other,
    target_unavailable,
    target_total,
) = columns

In [60]:
# Load the cleaned modelling data from CSV; the first CSV column becomes the index.
# TODO (per the original note): read this from the SQLite database instead --
# the engine set-up is meant to happen here first.
file_path = Path('ml_data.csv')

# Read the file and discard any row that contains a null value.
df = pd.read_csv(file_path, index_col=0).dropna()

# Preview the first 15 rows.
df.head(15)

Unnamed: 0,Fishing,TugTow,Recreational,Passenger,Cargo,Tanker,Other,Unavailable
100,11,42,291,40,44,10,27,7
101,12,41,285,37,42,10,26,9
102,12,45,212,41,51,10,29,5
103,9,45,197,40,52,12,26,4
104,14,39,212,40,52,12,29,9
105,16,43,220,42,50,12,23,12
106,16,39,244,47,44,15,24,6
107,13,42,300,42,54,12,25,7
108,11,40,292,34,56,14,23,6
109,12,48,216,50,52,14,30,4


In [61]:
# Show the dtype pandas assigned to each column of the loaded frame.
df.dtypes

Fishing         int64
TugTow          int64
Recreational    int64
Passenger       int64
Cargo           int64
Tanker          int64
Other           int64
Unavailable     int64
dtype: object

In [62]:
# Create our target for fishing model
y = df[target_fishing]
print(y)
# Create our features
X =  df.drop('Fishing', axis=1)

100    11
101    12
102    12
103     9
104    14
       ..
96     13
97     15
98     13
99     12
9      13
Name: Fishing, Length: 365, dtype: int64


In [63]:
# Summary statistics (count, mean, std, quartiles, min/max) for each feature column.
X.describe()

Unnamed: 0,TugTow,Recreational,Passenger,Cargo,Tanker,Other,Unavailable
count,365.0,365.0,365.0,365.0,365.0,365.0,365.0
mean,47.030137,227.29589,42.547945,34.769863,11.063014,26.071233,14.709589
std,8.768492,46.659806,11.266308,9.277568,3.510405,4.631518,12.468115
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,40.0,196.0,35.0,27.0,9.0,23.0,6.0
50%,47.0,222.0,45.0,33.0,12.0,26.0,9.0
75%,54.0,256.0,51.0,43.0,13.0,29.0,16.0
max,69.0,411.0,69.0,56.0,21.0,42.0,53.0


In [64]:
# Partition features and target into training and test subsets.
# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Display the (rows, columns) shape of the training features.
X_train.shape

(273, 7)

In [65]:
# Build and fit a logistic-regression classifier on the training split.
# `fit` returns the estimator itself, so `classifier` is the fitted model.
# NOTE(review): y_train appears to hold integer counts, so every distinct value
# is treated as a separate class -- confirm a classifier (not a regressor) is
# what is intended here.
classifier = LogisticRegression(
    solver='lbfgs',
    max_iter=500,
    random_state=1,
).fit(X_train, y_train)

# Display the fitted estimator.
classifier

LogisticRegression(max_iter=500, random_state=1)

In [67]:
# Predict the target for the held-out test features.
y_pred = classifier.predict(X_test)

# Put predictions next to the true values; the original row labels carried by
# y_test are discarded in favour of a fresh 0..n-1 index.
fishing_results = (
    pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
    .reset_index(drop=True)
)

# Preview the first 20 prediction/actual pairs.
fishing_results.head(20)

Unnamed: 0,Prediction,Actual
0,12,16
1,14,13
2,12,7
3,12,9
4,11,12
5,11,10
6,10,12
7,13,14
8,16,16
9,11,16
