In [1]:
# import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [3]:
X_sample = pd.read_csv("data/ais_train.csv", delimiter='|')
X_sample.to_csv('data/ais_train_modified.csv', index=False)
extra_vessels = pd.read_csv("data/vessels.csv", on_bad_lines='skip', delimiter='|')
extra_vessels.to_csv('data/vessels_modified.csv', index=False)
extra_ports = pd.read_csv("data/ports.csv", on_bad_lines='skip', delimiter='|')
extra_ports.to_csv('data/ports_modified.csv', index=False)
extra_schedules = pd.read_csv("data/schedules_to_may_2024.csv", on_bad_lines='skip', delimiter='|')
extra_schedules.to_csv('data/schedules_to_may_2024_modified.csv', index=False)

In [4]:
X_sample = pd.read_csv("data/ais_sample_submission.csv", header=None)
X_evaluation = pd.read_csv("data/ais_test.csv",)
extra_ports = pd.read_csv("data/ports_modified.csv")
extra_vessels = pd.read_csv("data/vessels_modified.csv")
extra_schedules = pd.read_csv("data/schedules_to_may_2024_modified.csv")
X_original = pd.read_csv("data/ais_train_modified.csv")

In [6]:
X_original['time'] = pd.to_datetime(X_original['time'])
X_original['year'] = X_original['time'].dt.year
X_original['month'] = X_original['time'].dt.month
X_original['day'] = X_original['time'].dt.day
X_original['hour'] = X_original['time'].dt.hour
X_original['minute'] = X_original['time'].dt.minute
X_original['second'] = X_original['time'].dt.second

X_original['etaRaw'] = pd.to_datetime(X_original['etaRaw'], format='%m-%d %H:%M', errors='coerce')
X_original['eta_month'] = X_original['etaRaw'].dt.month
X_original['eta_day'] = X_original['etaRaw'].dt.day
X_original['eta_hour'] = X_original['etaRaw'].dt.hour
X_original['eta_minute'] = X_original['etaRaw'].dt.minute

X_original = X_original.drop(['time','etaRaw'], axis=1)


KeyError: 'time'

In [7]:
X_original.head()

Unnamed: 0,cog,sog,rot,heading,navstat,latitude,longitude,vesselId,portId,year,month,day,hour,minute,second,eta_month,eta_day,eta_hour,eta_minute
0,284.0,0.7,0,88,0,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,2024,1,1,0,0,25,1.0,9.0,23.0,0.0
1,109.6,0.0,-6,347,1,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,2024,1,1,0,0,36,12.0,29.0,20.0,0.0
2,111.0,11.0,0,112,0,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,2024,1,1,0,1,45,1.0,2.0,9.0,0.0
3,96.4,0.0,0,142,1,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,2024,1,1,0,3,11,12.0,31.0,20.0,0.0
4,214.0,19.7,0,215,0,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,2024,1,1,0,3,51,1.0,25.0,12.0,0.0


In [55]:
X_original = X_original.drop(['vesselId','portId'], axis=1)

x_test=X_original.sample(frac=0.2, random_state=42)
x_train=X_original.drop(x_test.index)
y_train_lon=x_train.loc[:,['longitude']]
y_train_lat=x_train.loc[:,['latitude']]
y_test_lon=x_test.loc[:,['longitude']]
y_test_lat=x_test.loc[:,['latitude']]
x_test=x_test.drop(['longitude','latitude'],axis=1)
x_train=x_train.drop(['longitude','latitude'],axis=1)

In [56]:
print(f"x_train shape: {x_train.shape}")
print(f"y_train_lon shape: {y_train_lon.shape}")
print(f"y_train_lat shape: {y_train_lat.shape}")
print(f"x_test shape: {x_test.shape}")
print(f"y_test_lon shape: {y_test_lon.shape}")
print(f"y_test_lat shape: {y_test_lat.shape}")

x_train shape: (1217652, 15)
y_train_lon shape: (1217652, 1)
y_train_lat shape: (1217652, 1)
x_test shape: (304413, 15)
y_test_lon shape: (304413, 1)
y_test_lat shape: (304413, 1)


In [None]:
prueba = pd.merge(X_train,extra_vessels, on='vesselId',how='left')
prueba.head()

In [65]:
# define a utility function to print out the prediction performance
def evaluate_result(y_test, y_pred):
   # print(f'mean_squared: {mean_squared_error(y_test, y_pred):.4f}')
    print(mean_absolute_error(y_test, y_pred))
    print(r2_score(y_test, y_pred))

In [None]:
clf = RandomForestRegressor(n_estimators=100, random_state=42)
clf.fit(x_train, y_train_lon.values.ravel())
y_pred = clf.predict(x_test)

In [68]:
evaluate_result(y_test_lon,y_pred,clf)

15.217318774346422
0.7901448046453818


In [70]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# Definir el modelo
rf_regressor = RandomForestRegressor()

# Configurar la búsqueda en cuadrícula con validación cruzada
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, 
                           cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# Ajustar el modelo
grid_search.fit(x_train, y_train_lon)

# Ver los mejores parámetros
print(grid_search.best_params_)

KeyboardInterrupt: 