In [1]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

  from numpy.core.umath_tests import inner1d


In [2]:
import pandas as pd
import numpy as np

# Datasets

In [3]:
directory = "preprocess-detailed/"

maersk_train = pd.read_csv(directory + "Maersk-train.csv")
# maersk_loc   = pd.read_csv(directory + "Maersk-repeated-locations.csv")
# maersk_stat  = pd.read_csv(directory + "Maersk-repeated-statuses.csv")

hapag_lloyd_train = pd.read_csv(directory + "Hapag-Lloyd-train.csv")
# hapag_lloyd_loc   = pd.read_csv(directory + "Hapag-Lloyd-repeated-locations.csv")
# hapag_lloyd_stat  = pd.read_csv(directory + "Hapag-Lloyd-repeated-statuses.csv")

evergreen_train = pd.read_csv(directory + "Evergreen-train.csv")
# evergreen_loc   = pd.read_csv(directory + "Evergreen-repeated-locations.csv")
# evergreen_stat  = pd.read_csv(directory + "Evergreen-repeated-statuses.csv")

**Usamos las coordenadas geográficas para poder predecir el tiempo de llegada de los contenedores.**

Conservamos la columna del contenedor para poder apreciar la data de forma legible.<br>
Al momento de entrenar con las técnicas de aprendizaje de máquina a usar, se limpiará esta columna.

In [4]:
columns = ["container", "carrier", "first_latitude", "first_longitude", "second_latitude", "second_longitude", "elapsed_days"]

maersk      = maersk_train[columns]
hapag_lloyd = hapag_lloyd_train[columns]
evergreen   = evergreen_train[columns]

### Maersk

In [5]:
maersk.head()

Unnamed: 0,container,carrier,first_latitude,first_longitude,second_latitude,second_longitude,elapsed_days
0,MRKU2577274,1,13.097055,100.915227,13.097055,100.915227,3.635417
1,MRKU2577274,1,13.097055,100.915227,22.350627,114.184916,4.088889
2,MRKU2577274,1,22.350627,114.184916,-33.045846,-71.619675,28.361806
3,MNBU3341806,1,31.225344,121.488892,31.225344,121.488892,2.747222
4,MNBU3341806,1,31.225344,121.488892,-12.066667,-77.15,33.025694


In [6]:
maersk.tail()

Unnamed: 0,container,carrier,first_latitude,first_longitude,second_latitude,second_longitude,elapsed_days
2589,MNBU3273487,1,-12.066667,-77.15,-12.066667,-77.15,2.625
2590,MNBU3273487,1,-12.066667,-77.15,8.347996,-78.897173,4.979861
2591,MNBU0086836,1,-12.066667,-77.15,-12.066667,-77.15,1.149306
2592,MNBU0120822,1,-12.066667,-77.15,-12.066667,-77.15,1.102083
2593,MRSU3685980,1,25.774266,-80.193659,25.774266,-80.193659,1.030556


### Hapag-Lloyd

In [7]:
hapag_lloyd.head()

Unnamed: 0,container,carrier,first_latitude,first_longitude,second_latitude,second_longitude,elapsed_days
0,TGHU0538924,2,25.798845,-100.372833,19.127657,-104.284126,6.068056
1,TGHU0538924,2,19.127657,-104.284126,-12.066667,-77.15,8.020833
2,UACU5989812,2,19.432601,-99.133342,19.127657,-104.284126,3.047917
3,TGHU6111924,2,22.3308,91.841286,1.340853,103.878447,4.490278
4,TGHU6111924,2,1.340853,103.878447,22.279328,114.162813,6.620833


In [8]:
hapag_lloyd.tail()

Unnamed: 0,container,carrier,first_latitude,first_longitude,second_latitude,second_longitude,elapsed_days
3211,AMFU8543420,2,22.279328,114.162813,40.341186,-78.400292,28.1
3212,HLBU2130224,2,19.127657,-104.284126,34.053683,-118.242767,3.5125
3213,HLXU5266968,2,23.182451,113.476086,22.279328,114.162813,1.410417
3214,UACU8282470,2,19.432601,-99.133342,19.127657,-104.284126,1.694444
3215,FSCU5670046,2,51.22111,4.399708,36.846292,-76.292925,11.233333


### Evergreen

In [9]:
evergreen.head()

Unnamed: 0,container,carrier,first_latitude,first_longitude,second_latitude,second_longitude,elapsed_days
0,EISU5704208,3,22.620335,120.312038,-33.580861,-71.613238,61.0
1,TEMU3454933,3,-12.066667,-77.15,8.347996,-78.897173,9.0
2,EGHU3277563,3,-12.066667,-77.15,19.127657,-104.284126,12.0
3,EGHU9277088,3,-12.066667,-77.15,3.889933,-77.078605,4.0
4,HMCU3057340,3,-12.066667,-77.15,-2.189877,-79.887715,9.0


In [10]:
evergreen.tail()

Unnamed: 0,container,carrier,first_latitude,first_longitude,second_latitude,second_longitude,elapsed_days
42,EGHU9390258,3,-33.437797,-70.650445,-33.580861,-71.613238,3.0
43,EGHU9395476,3,-33.437797,-70.650445,-33.580861,-71.613238,5.0
44,EGSU3036148,3,-12.066667,-77.15,8.347996,-78.897173,7.0
45,EGSU9089973,3,-12.066667,-77.15,22.350627,114.184916,34.0
46,EGSU9089973,3,22.350627,114.184916,23.182451,113.476086,1.0


### Juntar datasets

In [11]:
dataset = pd.concat([maersk, hapag_lloyd, evergreen])
print(len(dataset), "containers")

5857 containers


In [12]:
X = dataset.drop(["container", "elapsed_days"], axis = 1, inplace = False).values
Y = dataset["elapsed_days"].values

print(X.shape)
print(Y.shape)

(5857, 5)
(5857,)


In [13]:
X

array([[  1.        ,  13.0970555 , 100.9152266 ,  13.0970555 ,
        100.9152266 ],
       [  1.        ,  13.0970555 , 100.9152266 ,  22.350627  ,
        114.1849161 ],
       [  1.        ,  22.350627  , 114.1849161 , -33.0458456 ,
        -71.6196749 ],
       ...,
       [  3.        , -12.066667  , -77.15      ,   8.3479957 ,
        -78.89717298],
       [  3.        , -12.066667  , -77.15      ,  22.350627  ,
        114.1849161 ],
       [  3.        ,  22.350627  , 114.1849161 ,  23.1824507 ,
        113.4760861 ]])

In [14]:
Y

array([ 3.63541667,  4.08888889, 28.36180556, ...,  7.        ,
       34.        ,  1.        ])

# Regresión/Predicción usando ML

In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y)

In [16]:
print(len(X_train), "to train,", len(X_test), "to test")

4392 to train, 1465 to test


### Random forests

In [17]:
regr = RandomForestRegressor(n_estimators = 1000, random_state = 0)
regr.fit(X_train, Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [18]:
print(regr.feature_importances_)

[0.00613882 0.07228222 0.31739495 0.09283169 0.51135233]


In [19]:
regr.score(X_test, Y_test)

0.9033823056554233

### Decision trees

In [20]:
regr = DecisionTreeRegressor(random_state = 0)
regr.fit(X_train, Y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

In [21]:
print(regr.feature_importances_)

[0.0044861  0.05153293 0.31554148 0.06976772 0.55867178]


In [22]:
regr.score(X_test, Y_test)

0.8957655105916839

### Support Vector Machines para regresión

In [23]:
regr = SVR(gamma = "auto")
regr.fit(X_train, Y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [24]:
regr.score(X_test, Y_test)

0.6453364790386377

### Perceptrón multicapa

In [25]:
regr = MLPRegressor(random_state = 0, max_iter = 5000, activation = "logistic", hidden_layer_sizes = (100,))
regr.fit(X_train, Y_train)

MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=5000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [26]:
regr.score(X_test, Y_test)

0.8311756535440245

In [27]:
regr = MLPRegressor(random_state = 0, max_iter = 5000, activation = "logistic", hidden_layer_sizes = (100, 100))
regr.fit(X_train, Y_train)

MLPRegressor(activation='logistic', alpha=0.0001, batch_size='auto',
       beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='constant',
       learning_rate_init=0.001, max_iter=5000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=0, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [28]:
regr.score(X_test, Y_test)

0.8601893120441767

# Cross validation

### Random forests

In [29]:
regr = RandomForestRegressor(n_estimators = 1000, random_state = 0)
scores = cross_val_score(regr, X, Y, cv = 10)
scores

array([0.95330038, 0.9137018 , 0.95991584, 0.96180577, 0.81443068,
       0.67775031, 0.77883546, 0.73277217, 0.82193633, 0.78231173])

In [30]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.84 (+/- 0.19)


### Decision trees

In [31]:
regr = DecisionTreeRegressor(random_state = 0)
scores = cross_val_score(regr, X, Y, cv = 10)
scores

array([0.93775968, 0.85119226, 0.95550912, 0.96235455, 0.78783172,
       0.64018313, 0.67751611, 0.68191839, 0.79426666, 0.76156548])

In [32]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.81 (+/- 0.23)


### Support Vector Machines para regresión

In [33]:
regr = SVR(gamma = "auto")
scores = cross_val_score(regr, X, Y, cv = 10)
scores

array([0.54854421, 0.18652248, 0.93102619, 0.91464305, 0.3309296 ,
       0.34962511, 0.26497948, 0.3097291 , 0.36847952, 0.12286069])

In [34]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.43 (+/- 0.53)


### Perceptrón multicapa

In [35]:
regr = MLPRegressor(random_state = 0, max_iter = 5000, activation = "logistic", hidden_layer_sizes = (100,))
scores = cross_val_score(regr, X, Y, cv = 10)
scores

array([0.91639315, 0.91528589, 0.94378822, 0.95669331, 0.72236725,
       0.58806295, 0.65078273, 0.65536435, 0.767087  , 0.70603068])

In [36]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.78 (+/- 0.26)


In [37]:
regr = MLPRegressor(random_state = 0, max_iter = 5000, activation = "logistic", hidden_layer_sizes = (100, 100))
scores = cross_val_score(regr, X, Y, cv = 10)
scores

array([0.93419798, 0.9024906 , 0.95112069, 0.9458457 , 0.75722163,
       0.62861822, 0.69742116, 0.71422751, 0.77897711, 0.71072367])

In [38]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy: 0.80 (+/- 0.23)
