# Predicción del precio del taxi

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [77]:
train = pd.read_csv("../datasets/taxi_fare/train.csv", nrows=6000000)
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [78]:
test = pd.read_csv("../datasets/taxi_fare/test.csv")
test.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24 UTC,-73.97332,40.763805,-73.98143,40.743835,1
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24 UTC,-73.986862,40.719383,-73.998886,40.739201,1
2,2011-10-08 11:53:44.0000002,2011-10-08 11:53:44 UTC,-73.982524,40.75126,-73.979654,40.746139,1
3,2012-12-01 21:12:12.0000002,2012-12-01 21:12:12 UTC,-73.98116,40.767807,-73.990448,40.751635,1
4,2012-12-01 21:12:12.0000003,2012-12-01 21:12:12 UTC,-73.966046,40.789775,-73.988565,40.744427,1


In [79]:
train.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

### Data preprocessing

In [80]:
train.isna().sum()

key                   0
fare_amount           0
pickup_datetime       0
pickup_longitude      0
pickup_latitude       0
dropoff_longitude    39
dropoff_latitude     39
passenger_count       0
dtype: int64

In [81]:
train["dropoff_latitude"] = train["dropoff_latitude"].fillna(train.dropoff_latitude.mean())
train["dropoff_longitude"] = train["dropoff_longitude"].fillna(train.dropoff_longitude.mean())

In [82]:
train.isna().sum()

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [83]:
train["year"] = [int(x[:4]) for x in train.pickup_datetime]
train["month"] = [int(x[5:7]) for x in train.pickup_datetime]
train["day"] = [int(x[8:10]) for x in train.pickup_datetime]
train["hour"] = [int(x[11:13]) for x in train.pickup_datetime]

train = train.drop(columns=["pickup_datetime"])
train.head()

Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,hour
0,2009-06-15 17:26:21.0000001,4.5,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,17
1,2010-01-05 16:52:16.0000002,16.9,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,16
2,2011-08-18 00:35:00.00000049,5.7,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,0
3,2012-04-21 04:30:42.0000001,7.7,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,4
4,2010-03-09 07:51:00.000000135,5.3,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,7


In [84]:
train.dtypes

key                   object
fare_amount          float64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
year                   int64
month                  int64
day                    int64
hour                   int64
dtype: object

In [85]:
test["year"] = [int(x[:4]) for x in test.pickup_datetime]
test["month"] = [int(x[5:7]) for x in test.pickup_datetime]
test["day"] = [int(x[8:10]) for x in test.pickup_datetime]
test["hour"] = [int(x[11:13]) for x in test.pickup_datetime]

test = test.drop(columns=["pickup_datetime"])

In [86]:
train.drop("key", axis=1, inplace=True)
test.drop("key", axis=1, inplace=True)

In [87]:
train_input = train.drop("fare_amount", axis=1)
train_target = train["fare_amount"]

train_input[["pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude"]] = MinMaxScaler().fit_transform(train_input[["pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude"]])

In [88]:
train_input.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,hour
0,0.488312,0.51647,0.485979,0.516358,1,2009,6,15,17
1,0.488287,0.516468,0.485959,0.516369,1,2010,1,5,16
2,0.488291,0.516476,0.485957,0.516364,2,2011,8,18,0
3,0.488291,0.516472,0.485957,0.516365,1,2012,4,21,4
4,0.488294,0.516477,0.485963,0.516369,1,2010,3,9,7


In [89]:
test[["pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude"]] = MinMaxScaler().fit_transform(test[["pickup_longitude","pickup_latitude","dropoff_longitude","dropoff_latitude"]])
test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,hour
0,0.220338,0.167776,0.221502,0.15506,1,2015,1,27,13
1,0.209638,0.128686,0.207781,0.15095,1,2015,1,27,13
2,0.213066,0.156736,0.222898,0.157102,1,2011,10,8,11
3,0.214143,0.171297,0.214414,0.161976,1,2012,12,1,21
4,0.226085,0.190628,0.215894,0.155584,1,2012,12,1,21


In [102]:
x_data = np.array(train_input)
target = np.array(train_target)
x_test = np.array(test)

In [103]:
x_data.shape

(6000000, 9)

In [104]:
x_train, x_val, y_train, y_val = train_test_split(x_data, target, test_size=0.1, random_state=101)

In [105]:
x_train.shape

(5400000, 9)

### Creación del modelo

In [106]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization
from keras.optimizers import Adam

In [107]:
model = Sequential()

model.add(Dense(units=256, input_dim=(x_data.shape[1]), activation="relu"))
model.add(BatchNormalization())

model.add(Dense(units=128, activation="relu"))
model.add(BatchNormalization())

model.add(Dense(units=64, activation="relu"))
model.add(BatchNormalization())

model.add(Dense(units=32, activation="relu"))
model.add(BatchNormalization())

model.add(Dense(units=16, activation="relu"))
model.add(BatchNormalization())

model.add(Dense(units=8, activation="relu"))
model.add(BatchNormalization())

# Última capa (predicción)
model.add(Dense(units=1))

In [111]:
model.compile(loss="mse", optimizer=Adam(0.001), metrics=["mae"])

In [112]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 256)               2560      
_________________________________________________________________
batch_normalization_13 (Batc (None, 256)               1024      
_________________________________________________________________
dense_16 (Dense)             (None, 128)               32896     
_________________________________________________________________
batch_normalization_14 (Batc (None, 128)               512       
_________________________________________________________________
dense_17 (Dense)             (None, 64)                8256      
_________________________________________________________________
batch_normalization_15 (Batc (None, 64)                256       
_________________________________________________________________
dense_18 (Dense)             (None, 32)                2080      
__________

### Entrenamiento

In [113]:
training = model.fit(x=x_train, 
                     y=y_train, 
                     batch_size=300, 
                     epochs=20, 
                     validation_data=(x_val, y_val), 
                     verbose=1, 
                     shuffle=True)

Train on 5400000 samples, validate on 600000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

KeyboardInterrupt: 