In [10]:
# import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

Adapting datasets to be interpreted

In [11]:
X_sample = pd.read_csv("data/ais_train.csv", delimiter='|')
X_sample.to_csv('data/ais_train_modified.csv', index=False)
extra_vessels = pd.read_csv("data/vessels.csv", on_bad_lines='skip', delimiter='|')
extra_vessels.to_csv('data/vessels_modified.csv', index=False)
extra_ports = pd.read_csv("data/ports.csv", on_bad_lines='skip', delimiter='|')
extra_ports.to_csv('data/ports_modified.csv', index=False)
extra_schedules = pd.read_csv("data/schedules_to_may_2024.csv", on_bad_lines='skip', delimiter='|')
extra_schedules.to_csv('data/schedules_to_may_2024_modified.csv', index=False)

In [22]:
X_evaluation = pd.read_csv("data/ais_test.csv",)
extra_ports = pd.read_csv("data/ports_modified.csv")
extra_vessels = pd.read_csv("data/vessels_modified.csv")
extra_schedules = pd.read_csv("data/schedules_to_may_2024_modified.csv")
X_original = pd.read_csv("data/ais_train_modified.csv")

Changing the dates into a different columns

In [13]:
X_original['time'] = pd.to_datetime(X_original['time'])
X_original['year'] = X_original['time'].dt.year
X_original['month'] = X_original['time'].dt.month
X_original['day'] = X_original['time'].dt.day
X_original['hour'] = X_original['time'].dt.hour
X_original['minute'] = X_original['time'].dt.minute
X_original['second'] = X_original['time'].dt.second

X_original['etaRaw'] = pd.to_datetime(X_original['etaRaw'], format='%m-%d %H:%M', errors='coerce')
X_original['eta_month'] = X_original['etaRaw'].dt.month
X_original['eta_day'] = X_original['etaRaw'].dt.day
X_original['eta_hour'] = X_original['etaRaw'].dt.hour
X_original['eta_minute'] = X_original['etaRaw'].dt.minute

X_original = X_original.drop(['time','etaRaw'], axis=1)


In [82]:
X_original['time'] = pd.to_datetime(X_original['time'])
unique = X_original['vesselId'].unique()
print(X_original['latitude'].dtype)

float64


In [85]:
def past_course(original):
    original=original.reset_index()

    original['prev_lat'] = original['latitude'].shift(1).fillna(original['latitude'].iloc[0])
    original['prev_lon'] = original['longitude'].shift(1).fillna(original['longitude'].iloc[0])
    original['cog'] = original['cog'].shift(1)
    original['sog'] = original['sog'].shift(1)
    original['rot'] = original['rot'].shift(1)
    original['heading'] = original['heading'].shift(1)
    original['navstat'] = original['navstat'].shift(1)
    original.loc[0,['cog','sog','rot','heading','navstat']]=[0,0,0,0,0]

    return original

In [76]:

y.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
1270965,2024-04-17 09:16:31,180.3,10.6,0,181,0,04-18 07:10,34.81396,136.7634,6326e9a9c46d6a20d22ca317,61d37a0f1366c3998241d902
1504929,2024-05-06 14:54:43,124.8,0.1,0,117,5,05-06 03:00,37.95911,23.60948,61e9f440b937134a3c4c017b,61d3763e93c6feb83e5eb468
963761,2024-03-23 10:19:38,359.8,4.7,12,0,0,03-23 10:00,18.45563,-69.89051,61e9f444b937134a3c4c01a1,61d373443aeaecc07011a58f
220045,2024-01-20 11:17:03,178.0,3.5,-127,186,0,01-30 21:00,38.19092,-76.25695,61e9f3abb937134a3c4bfe1f,61d375e793c6feb83e5eb3e2
850958,2024-03-14 22:17:04,75.4,0.0,0,82,5,03-14 22:00,55.62337,12.99615,61e9f42ab937134a3c4c00eb,61d37ffc29b60f6113c89ef0


In [78]:
filtered = y[y['vesselId'] == '6326e9a9c46d6a20d22ca317'].copy()
filtered.head()


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
1270965,2024-04-17 09:16:31,180.3,10.6,0,181,0,04-18 07:10,34.81396,136.7634,6326e9a9c46d6a20d22ca317,61d37a0f1366c3998241d902
1410642,2024-04-28 18:58:01,260.5,17.4,0,261,0,04-29 07:30,34.48601,137.59084,6326e9a9c46d6a20d22ca317,61d37a361366c3998241d950
1128271,2024-04-06 05:14:37,196.8,19.0,0,197,0,04-07 06:10,34.15957,134.94198,6326e9a9c46d6a20d22ca317,61d37a3a1366c3998241d957
880538,2024-03-17 04:49:20,306.3,0.0,0,0,5,03-17 13:30,34.72777,137.30101,6326e9a9c46d6a20d22ca317,61d37a361366c3998241d950
630178,2024-02-24 08:52:03,179.9,12.8,0,178,0,02-25 09:00,35.23634,139.77113,6326e9a9c46d6a20d22ca317,61d37a3e1366c3998241d95f


In [108]:
def new_dataframe(original):
    unique = original['vesselId'].unique()
    new = original.copy()
    new['prev_lat']=original['latitude']
    new['prev_lon']=original['longitude']
    new = new.reset_index()
    final = pd.DataFrame(columns=new.columns)
    for c in unique:
        filtered = new[new['vesselId'] == c].copy()
        new_filtered = past_course(filtered)
        final = pd.concat([final, new_filtered], ignore_index=True)
    final = final.sort_values(by='index')
    final = final.drop(['index'],axis=1)
    final = final.reset_index(drop=True)
    return(final)

In [109]:
X_original['latitude'] = X_original['latitude'].astype('float32')
X_original['longitude'] = X_original['longitude'].astype('float32')
y = X_original.sample(frac=0.5, random_state=42)
testeo = new_dataframe(X_original)
testeo.head()

  final = pd.concat([final, new_filtered], ignore_index=True)


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,prev_lat,prev_lon,level_0
0,2024-01-01 00:00:25,0.0,0.0,0.0,0.0,0.0,01-09 23:00,-34.743698,-57.851299,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-34.743698,-57.851299,0.0
1,2024-01-01 00:00:36,0.0,0.0,0.0,0.0,0.0,12-29 20:00,8.8944,-79.479393,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,8.8944,-79.479393,1.0
2,2024-01-01 00:01:45,0.0,0.0,0.0,0.0,0.0,01-02 09:00,39.190651,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,39.190651,-76.47567,2.0
3,2024-01-01 00:03:11,0.0,0.0,0.0,0.0,0.0,12-31 20:00,-34.411888,151.020676,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,-34.411888,151.020676,3.0
4,2024-01-01 00:03:51,0.0,0.0,0.0,0.0,0.0,01-25 12:00,35.883789,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,35.883789,-5.91636,4.0


In [105]:
filtered = testeo[testeo['vesselId'] == '6326e9a9c46d6a20d22ca317'].copy()
X_original.head()


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
0,2024-01-01 00:00:25,284.0,0.7,0,88,0,01-09 23:00,-34.743698,-57.851299,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f
1,2024-01-01 00:00:36,109.6,0.0,-6,347,1,12-29 20:00,8.8944,-79.479393,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689
2,2024-01-01 00:01:45,111.0,11.0,0,112,0,01-02 09:00,39.190651,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19
3,2024-01-01 00:03:11,96.4,0.0,0,142,1,12-31 20:00,-34.411888,151.020676,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126
4,2024-01-01 00:03:51,214.0,19.7,0,215,0,01-25 12:00,35.883789,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3


In [102]:
filtered = X_original[X_original['vesselId'] == '6326e9a9c46d6a20d22ca317'].copy()
filtered.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId
7671,2024-01-01 16:37:28,22.3,0.0,0,115,5,12-30 10:00,35.389961,139.647842,6326e9a9c46d6a20d22ca317,61d379f61366c3998241d8d2
8503,2024-01-01 18:31:28,22.3,0.0,0,115,5,12-30 10:00,35.389938,139.647827,6326e9a9c46d6a20d22ca317,61d379f61366c3998241d8d2
12168,2024-01-02 01:43:28,22.3,0.0,0,115,5,12-30 10:00,35.389999,139.647812,6326e9a9c46d6a20d22ca317,61d379f61366c3998241d8d2
13323,2024-01-02 04:13:28,22.3,0.0,0,115,5,12-30 10:00,35.389969,139.647812,6326e9a9c46d6a20d22ca317,61d379f61366c3998241d8d2
13590,2024-01-02 04:37:28,22.3,0.0,0,115,5,12-30 10:00,35.389992,139.647827,6326e9a9c46d6a20d22ca317,61d379f61366c3998241d8d2


Code to generate a merged dataset with the vessels (non functional)

In [None]:
prueba = pd.merge(X_train,extra_vessels, on='vesselId',how='left')
prueba.head()

Splitting the dataset into a train/test one and x-(all the info)  y-(the results longitud/latitud)

In [14]:
#Generating one test file
X_original = X_original.drop(['vesselId','portId'], axis=1)

x_test=X_original.sample(frac=0.2, random_state=42)
x_train=X_original.drop(x_test.index)

y_test_lon=x_test.loc[:,['longitude']]
y_test_lat=x_test.loc[:,['latitude']]
x_test=x_test.drop(['longitude','latitude'],axis=1)

y_train_lon=x_train.loc[:,['longitude']]
y_train_lat=x_train.loc[:,['latitude']]
x_train = x_train.drop(['longitude','latitude'],axis=1)

In [None]:
#verification (length must match)
print(f"x_test shape: {x_test.shape}")
print(f"y_test_lon shape: {y_test_lon.shape}")
print(f"y_test_lat shape: {y_test_lat.shape}")

In [15]:
#Generating three sets of training data to train 3 different models
#Split of the data in 3 equal slices
x_original_1 = X_original.sample(frac=1/3, random_state=42)
X_original_trans = X_original.drop(x_original_1.index)
x_original_2 = X_original_trans.sample(frac=0.5, random_state=42)
x_original_3 = X_original_trans.drop(x_original_2.index)

#Adapting the slices into x and y
#slice 1
y_test_1_lon = x_original_1.loc[:,['longitude']]
y_test_1_lat = x_original_1.loc[:,['latitude']]
x_test_1 = x_original_1.drop(['longitude','latitude'],axis=1)
x_train_1 = X_original.drop(x_original_1.index)
y_train_1_lon = x_train_1.loc[:,['longitude']]
y_train_1_lat = x_train_1.loc[:,['latitude']]
x_train_1 = x_train_1.drop(['longitude','latitude'],axis=1)
#slice 2
y_test_2_lon=x_original_2.loc[:,['longitude']]
y_test_2_lat=x_original_2.loc[:,['latitude']]
x_test_2 = x_original_2.drop(['longitude','latitude'],axis=1)
x_train_2 = X_original.drop(x_original_2.index)
y_train_2_lon = x_train_2.loc[:,['longitude']]
y_train_2_lat = x_train_2.loc[:,['latitude']]
x_train_2 = x_train_2.drop(['longitude','latitude'],axis=1)
#slice 3
y_test_3_lon=x_original_3.loc[:,['longitude']]
y_test_3_lat=x_original_3.loc[:,['latitude']]
x_test_3 = x_original_3.drop(['longitude','latitude'],axis=1)
x_train_3 = X_original.drop(x_original_3.index)
y_train_3_lon = x_train_3.loc[:,['longitude']]
y_train_3_lat = x_train_3.loc[:,['latitude']]
x_train_3 = x_train_3.drop(['longitude','latitude'],axis=1)

In [18]:
#verification (length must match)
print(f"X_original shape: {X_original.shape}")
print(f"x_train_1 shape: {x_train_1.shape}")
print(f"y_train_1_lon shape: {y_train_1_lon.shape}")
print(f"y_train_1_lat shape: {y_train_1_lat.shape}")
print(f"x_test_1 shape: {x_test_1.shape}")
print(f"y_test_1_lon shape: {y_test_1_lon.shape}")
print(f"y_test_1_lat shape: {y_test_1_lat.shape}")



X_original shape: (1522065, 17)
x_train_1 shape: (1014710, 15)
y_train_1_lon shape: (1014710, 1)
y_train_1_lat shape: (1014710, 1)
x_test_1 shape: (507355, 15)
y_test_1_lon shape: (507355, 1)
y_test_1_lat shape: (507355, 1)


Function to evaluate results

In [41]:
# define a utility function to print out the prediction performance
def evaluate_result(y_test, y_pred):
    print(mean_absolute_error(y_test, y_pred))
    print(r2_score(y_test, y_pred))

Model generation of random forest

In [None]:
clf = RandomForestRegressor(n_estimators=100, random_state=42)
clf.fit(x_train, y_train_lon.values.ravel())
y_pred = clf.predict(x_test)

In [68]:
evaluate_result(y_test_lon,y_pred,clf)

15.217318774346422
0.7901448046453818


Hyperparameter tuning for random forest regressor

In [None]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# model definition
rf_regressor = RandomForestRegressor()

# Search settings
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, 
                           cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# adjust the model
grid_search.fit(x_train, y_train_lon)

# see best parameters
print(grid_search.best_params_)

Function to build the submission file

In [1]:
def result_merger(longitude,latitude):
    submission = pd.merge(longitude, latitude, left_index=True, right_index=True, how='inner')
    submission.insert(0, 'ID', range(len(df)))
    submission.to_csv('submission.csv', index=False)
    return submission