In [54]:
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

In [2]:
import pandas as pd
import numpy as np

# Datasets

In [3]:
maersk      = pd.read_csv("preprocess-20190425/Maersk-train.csv", index_col = "container")
hapag_lloyd = pd.read_csv("preprocess-20190425/Hapag-Lloyd-train.csv", index_col = "container")
evergreen   = pd.read_csv("preprocess-20190425/Evergreen-train.csv", index_col = "container")

**Usamos las coordenadas geográficas para poder predecir el tiempo de llegada de los contenedores.**

Conservamos la columna del contenedor para poder apreciar la data de forma legible.<br>
Al momento de entrenar con las técnicas de aprendizaje de máquina a usar, se limpiará esta columna.

In [4]:
columns = ["carrier", "first_latitude", "first_longitude", "last_latitude", "last_longitude", "timedelta"]

maersk      = maersk[columns]
hapag_lloyd = hapag_lloyd[columns]
evergreen   = evergreen[columns]

### Maersk

In [5]:
maersk.head()

Unnamed: 0_level_0,carrier,first_latitude,first_longitude,last_latitude,last_longitude,timedelta
container,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MRKU2577274,1,13.097055,100.915227,-33.045846,-71.619675,50.211111
MNBU3341806,1,31.225344,121.488892,-12.066667,-77.15,43.877083
MSKU3220679,1,31.225344,121.488892,-12.066667,-77.15,40.819444
MSKU6382164,1,31.225344,121.488892,-12.066667,-77.15,40.902083
MNBU3258585,1,22.350627,114.184916,-12.066667,-77.15,45.9125


In [6]:
maersk.tail()

Unnamed: 0_level_0,carrier,first_latitude,first_longitude,last_latitude,last_longitude,timedelta
container,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MRKU5849530,1,49.453872,11.077298,-33.045846,-71.619675,39.093056
MRKU7584912,1,49.453872,11.077298,-33.045846,-71.619675,43.445833
MSKU0099666,1,49.453872,11.077298,-33.045846,-71.619675,39.134028
MRSU4150258,1,48.395497,10.00589,-33.045846,-71.619675,42.896528
MRKU4464913,1,48.395497,10.00589,-33.045846,-71.619675,42.550694


### Hapag-Lloyd

In [7]:
hapag_lloyd.head()

Unnamed: 0_level_0,carrier,first_latitude,first_longitude,last_latitude,last_longitude,timedelta
container,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TGHU0538924,2,25.798845,-100.372833,-12.066667,-77.15,38.034028
UACU5989812,2,19.432601,-99.133342,-12.066667,-77.15,20.341667
TGHU6111924,2,22.3308,91.841286,-12.066667,-77.15,58.904167
HLBU1218814,2,22.3308,91.841286,-12.066667,-77.15,58.649306
HLXU8347810,2,22.3308,91.841286,-12.066667,-77.15,56.311806


In [8]:
hapag_lloyd.tail()

Unnamed: 0_level_0,carrier,first_latitude,first_longitude,last_latitude,last_longitude,timedelta
container,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HLXU5085471,2,27.211944,119.571389,-12.066667,-77.15,59.839583
HLXU3255650,2,-6.175394,106.827183,-12.066667,-77.15,55.025694
HLXU5266968,2,23.182451,113.476086,-12.066667,-77.15,57.204861
UACU8282470,2,19.432601,-99.133342,-12.066667,-77.15,25.288194
FSCU5670046,2,51.22111,4.399708,36.846292,-76.292925,28.386111


### Evergreen

In [9]:
evergreen.head()

Unnamed: 0_level_0,carrier,first_latitude,first_longitude,last_latitude,last_longitude,timedelta
container,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
EISU5704208,3,22.620335,120.312038,-33.580861,-71.613238,61.0
TEMU3454933,3,-12.066667,-77.15,8.347996,-78.897173,11.0
EGHU3277563,3,-12.066667,-77.15,19.127657,-104.284126,12.0
EGHU3275196,3,-12.066667,-77.15,19.127657,-104.284126,12.0
EGHU9142586,3,-12.066667,-77.15,8.347996,-78.897173,11.0


In [10]:
evergreen.tail()

Unnamed: 0_level_0,carrier,first_latitude,first_longitude,last_latitude,last_longitude,timedelta
container,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HMCU9167806,3,-12.066667,-77.15,3.889933,-77.078605,4.0
EGSU3036148,3,-12.066667,-77.15,8.347996,-78.897173,7.0
EISU2086127,3,-12.066667,-77.15,8.347996,-78.897173,7.0
EMCU6081938,3,-12.066667,-77.15,8.347996,-78.897173,7.0
EGSU9089973,3,-12.066667,-77.15,23.182451,113.476086,35.0


### Juntar datasets

In [11]:
dataset = pd.concat([maersk, hapag_lloyd, evergreen])
print(len(dataset), "containers")

4986 containers


In [12]:
X = dataset.drop("timedelta", axis = 1, inplace = False).values
Y = dataset["timedelta"].values

print(X.shape)
print(Y.shape)

(4986, 5)
(4986,)


In [13]:
X

array([[  1.        ,  13.0970555 , 100.9152266 , -33.0458456 ,
        -71.6196749 ],
       [  1.        ,  31.2253441 , 121.4888922 , -12.066667  ,
        -77.15      ],
       [  1.        ,  31.2253441 , 121.4888922 , -12.066667  ,
        -77.15      ],
       ...,
       [  3.        , -12.066667  , -77.15      ,   8.3479957 ,
        -78.89717298],
       [  3.        , -12.066667  , -77.15      ,   8.3479957 ,
        -78.89717298],
       [  3.        , -12.066667  , -77.15      ,  23.1824507 ,
        113.4760861 ]])

In [14]:
Y

array([50.21111111, 43.87708333, 40.81944444, ...,  7.        ,
        7.        , 35.        ])

# Regresión/Predicción usando ML

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [16]:
print(len(X_train), "to train,", len(X_test), "to test")

3739 to train, 1247 to test


### Random forests

In [77]:
regr = RandomForestRegressor(n_estimators = 1000, random_state = 0)
regr.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [78]:
print(regr.feature_importances_)

[0.04123115 0.21304011 0.53074729 0.08760478 0.12737667]


In [79]:
regr.score(X_test, Y_test)

0.7782161092446388

### Decision trees

In [120]:
regr = DecisionTreeRegressor(random_state = 0)
regr.fit(X_train, Y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [121]:
print(regr.feature_importances_)

[0.02747649 0.20596234 0.55978234 0.08461522 0.12216361]


In [122]:
regr.score(X_test, Y_test)

0.7572128530822195

### Support Vector Machines para regresión

In [36]:
svr  = SVR(gamma = "auto")
regr = GridSearchCV(svr, )

In [38]:
regr.fit(X_train, Y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [39]:
regr.score(X_test, Y_test)

0.38074622527983587

### Perceptrón multicapa

In [52]:
# hidden_layer_sizes = (100, 100)
regr = MLPRegressor(random_state = 0, max_iter = 5000, activation = "logistic", hidden_layer_sizes = (100, 100))
regr.fit(X_train, Y_train)

MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=5000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=0, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [53]:
regr.score(X_test, Y_test)

0.6902932457581137