# NY CAB DATA
## Price and Trip Duration Prediction

### IMPORTANT: The data is stored in a Postgres image

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
import psycopg2 as pg
import numpy as np 
import warnings
import os
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')
# Make pandas' .plot() use plotly as its plotting backend.
pd.options.plotting.backend = "plotly"

In [2]:
from pyspark.sql.functions import *

### Setting Up Spark 

In [3]:
# The PostgreSQL JDBC driver jar is expected in the `docker_sql` folder that sits
# under the parent of the current working directory.
parent_dir = os.path.dirname(os.path.abspath(''))
jar_path = os.path.join(parent_dir, 'docker_sql', 'postgresql-42.5.0.jar')
# Show the resolved path together with a check that the file really exists.
jar_path, os.path.isfile(jar_path)

('c:\\Users\\Olist\\OneDrive\\Ambiente de Trabalho\\Projects\\ny_cab_app\\docker_sql\\postgresql-42.5.0.jar',
 True)

In [4]:
# JDBC connection URL for the `ny_taxi` database on localhost, port 5432.
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
postgres_url = "jdbc:postgresql://localhost:5432/ny_taxi"

In [5]:
# Create (or reuse) the Spark session, registering the PostgreSQL JDBC jar
# and setting the driver memory to 15g.
spark = (
    SparkSession.builder
    .appName("ML_model")
    .config("spark.jars", jar_path)
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

### Loading Data

In [6]:
# Read the whole `ny_taxi` table from Postgres through the JDBC driver.
# Credentials are taken from the environment when available
# (POSTGRES_USER / POSTGRES_PASSWORD); the previous hard-coded values are kept
# as fallbacks so the notebook still runs unchanged on the local setup.
df = spark.read.format("jdbc").options(
    url=postgres_url,
    driver="org.postgresql.Driver",
    dbtable='ny_taxi',
    user=os.environ.get('POSTGRES_USER', 'root'),
    password=os.environ.get('POSTGRES_PASSWORD', 'root'),
).load()
# Preview the first five rows.
df.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2022-06-01 00:25:41|  2022-06-01 00:48:22|            1.0|         11.0|       1.0|                 N|          70|          48|           1|       32.0|  3.0|    0.5|       2.

In [7]:
# Print each column's name, Spark type and nullability as read from the JDBC source.
df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



## Data Cleaning And Validation

In [8]:
# Count the NULL entries of every column in one aggregation.
# `when` without `otherwise` yields NULL for non-matching rows and `count` skips
# NULLs, so each aggregate counts exactly the rows where that column is NULL.
null_counters = [
    count(when(col(name).isNull(), name)).alias(name)
    for name in df.columns
]
df.select(null_counters).show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       0|                   0|                    0|         132448|            0|    132448|            132448|           0|           0|           0|          0|    0|      0|         

In [9]:
# Derive a coarse time-of-day feature: pickup hour divided by 5, rounded to 0 decimals.
# `round` here is the Spark SQL function from the wildcard import, which shadows
# Python's built-in `round`.
# NOTE(review): rounding (rather than flooring) produces buckets 0-5 of unequal
# width — confirm this is the intended definition of a "shift".
pickup_hour = hour(col('tpep_pickup_datetime'))
df = df.withColumn('pickup_day_shift', round(pickup_hour / 5, 0))

In [10]:
# Trip duration in hours: both timestamps are cast to epoch seconds,
# subtracted, and the difference is divided by 3600.
pickup_seconds = col('tpep_pickup_datetime').cast("long")
dropoff_seconds = col("tpep_dropoff_datetime").cast("long")
df = df.withColumn('trip_duration', (dropoff_seconds - pickup_seconds) / 3600)

In [11]:
# Summary statistics (count, mean, stddev, min, max) per column, collected to pandas for display.
df.describe().toPandas()

Unnamed: 0,summary,VendorID,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_day_shift,trip_duration
0,count,3558124.0,3425676.0,3558124.0,3425676.0,3425676,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3425676.0,3425676.0,3558124.0,3558124.0
1,mean,1.71421625553241,1.3990418241538312,5.968216000903217,1.4181633055782277,,164.60850830381403,162.41222846646153,1.1818587547820143,15.249263010518249,1.0210053809254511,0.4884199370229933,2.7958426406719723,0.562215619806497,0.2960561239765024,22.118415234565592,2.2823746320434277,0.0957336595755115,2.8289208020855936,7.021806643751993
2,stddev,0.4877038929139574,0.960712571772369,594.1291221670459,5.702573931569726,,65.54112126953186,70.16921804762987,0.5104356496092229,212.1835543481579,1.2546159272315158,0.0937885957220193,3.581684820270223,2.121172505401807,0.0481092526932312,212.46228920765853,0.7490436767387179,0.3357336222458381,1.212871821309035,1075.2233002350804
3,min,1.0,0.0,0.0,1.0,N,1.0,1.0,0.0,-907.0,-7.0,-0.5,-80.08,-63.2,-0.3,-911.55,-2.5,-1.25,0.0,-11.306666666666668
4,max,6.0,9.0,307007.11,99.0,Y,265.0,265.0,4.0,395844.94,8.25,3.3,1400.16,800.09,0.3,395848.24,2.75,1.25,5.0,172034.25305555554


In [12]:
# Approximate 80th percentile of the trip duration, with a relative error of 0.05.
df.approxQuantile(
    col="trip_duration",
    probabilities=[0.8],
    relativeError=0.05,
)

[0.3877777777777778]

#### Drop Non-Numeric Columns

In [13]:
# Remove the two timestamp columns and the string flag column;
# the features derived from the timestamps were already added above.
df = df.drop(
    'tpep_pickup_datetime',
    'tpep_dropoff_datetime',
    'store_and_fwd_flag',
)

#### Filtering All Negative Values

In [14]:
# Preview only (`df` is not reassigned): replace every negative value with NULL
# and recompute the summary statistics on what remains.
non_negative_cols = [
    when(col(name) >= 0, col(name)).alias(name)
    for name in df.columns
]
df.select(non_negative_cols).describe().toPandas()

Unnamed: 0,summary,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_day_shift,trip_duration
0,count,3558124.0,3425676.0,3558124.0,3425676.0,3558124.0,3558124.0,3558124.0,3535708.0,3547064.0,3536255.0,3557709.0,3557040.0,3535453.0,3535411.0,3408036.0,3423248.0,3558124.0,3556903.0
1,mean,1.71421625553241,1.3990418241538312,5.968216000903217,1.4181633055782277,164.60850830381403,162.41222846646153,1.1818587547820143,15.437022924415029,1.026672763727971,0.4945324276671227,2.796418324263827,0.5647434468008284,0.2998783183000379,22.37457250883357,2.307128211086972,0.0966881452935925,2.8289208020855936,7.0242221347066325
2,stddev,0.4877038929139574,0.960712571772369,594.1291221670459,5.702573931569726,65.54112126953186,70.16921804762987,0.5104356496092229,212.8345808604909,1.2518560199878157,0.0526468901083348,3.5804086537432727,2.115188681057374,0.0060406721239371,213.11168712546333,0.6670658499919979,0.3339335514012864,1.212871821309035,1075.4078257923095
3,min,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,max,6.0,9.0,307007.11,99.0,265.0,265.0,4.0,395844.94,8.25,3.3,1400.16,800.09,0.3,395848.24,2.75,1.25,5.0,172034.25305555554


In [19]:
# Preview only (`df` is not reassigned): mask negative values as NULL, then drop
# every row that contains any NULL — whether it was masked here or already missing.
masked_preview = df.select(
    [
        when(col(name) >= 0, col(name)).otherwise(None).alias(name)
        for name in df.columns
    ]
)
masked_preview.dropna().describe().toPandas()

Unnamed: 0,summary,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_day_shift,trip_duration
0,count,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0,3402996.0
1,mean,1.7021098467350535,1.3995026735265044,3.638474059328231,1.419442456000536,164.69242338221966,162.74214662609066,1.2137557610999248,15.197886374245227,1.0670598643078015,0.4945802963036104,2.7678593715652853,0.5610226958855934,0.2998786951447074,22.09809153993338,2.310544443778365,0.0972635583468214,2.834960428986693,6.8215638575850175
2,stddev,0.4573310303421309,0.9618302087930902,116.120020328652,5.721189634212782,65.13671764539471,70.07469085739774,0.4273746420296052,216.9212335596505,1.260960334271295,0.0523652674660543,3.581400112337852,2.1051586831417928,0.0060313147761382,217.1834310896354,0.6616210236680115,0.3348421734167634,1.2069815123200245,1059.1713120893264
3,min,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,max,2.0,9.0,184340.8,99.0,265.0,265.0,4.0,395844.94,8.25,3.3,1400.16,800.09,0.3,395848.24,2.75,1.25,5.0,172034.25305555554


In [20]:
# Apply the filter for real: negative values become NULL, and any row holding a
# NULL (masked here or already missing in the source) is removed from `df`.
keep_non_negative = [
    when(col(name) >= 0, col(name)).otherwise(None).alias(name)
    for name in df.columns
]
df = df.select(keep_non_negative).dropna()

##### Filtering Outliers and Bad Data with Quantiles

In [17]:
# Approximate 90th percentile (relative error 0.05) of every column, keyed by column name.
# All columns go through a single `approxQuantile` call, so Spark computes them in one
# pass instead of launching a separate job per column. The result keeps the original
# `{column: [quantile]}` shape, so `feature_quantiles[c][0]` still works.
quantile_columns = df.columns
feature_quantiles = dict(
    zip(quantile_columns, df.approxQuantile(quantile_columns, [0.9], 0.05))
)

In [22]:
# Preview only (`df` is not reassigned): keep values at or below each column's
# stored quantile, NULL out the rest, drop rows with any NULL, then summarise.
# NOTE(review): the cap is applied to every column, including identifier-like ones
# such as VendorID or PULocationID — confirm that is intended.
capped_cols = [
    when(col(name) <= feature_quantiles[name][0], col(name)).otherwise(None).alias(name)
    for name in df.columns
]
df.select(capped_cols).dropna().describe().toPandas()

Unnamed: 0,summary,VendorID,passenger_count,trip_distance,RatecodeID,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_day_shift,trip_duration
0,count,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0,1234753.0
1,mean,1.7979247671396628,1.1364208064284922,1.5772419018217674,1.0,154.83719496935825,149.62970569822463,1.2276358105629224,8.726900068272746,0.7473944019573149,0.4998295448563396,1.6900097752358436,0.0,0.2999241953585965,13.899566504545634,2.404175167017209,0.0,2.678932547643132,0.174221216263037
2,stddev,0.4015482084512156,0.3865025447603541,0.8537409736543244,0.0,60.98369166031255,60.44249811031583,0.419306440092952,3.0746384300921736,0.922946433392858,0.0092114902499139,1.1778792292018736,0.0,0.0047681927869831,3.700485216296376,0.4799790312300309,0.0,1.1003958199343669,0.0836867796508464
3,min,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,max,2.0,2.0,4.61,1.0,237.0,236.0,2.0,18.0,2.5,0.5,3.76,0.0,0.3,27.31,2.5,0.0,4.0,0.3877777777777778


In [None]:
# Apply the quantile cap for real: values above a column's stored quantile become
# NULL and every row containing a NULL is removed from `df`.
# NOTE(review): the cap is applied to every column, including identifier-like ones
# such as VendorID or PULocationID — confirm that is intended.
within_quantile = [
    when(col(name) <= feature_quantiles[name][0], col(name)).otherwise(None).alias(name)
    for name in df.columns
]
df = df.select(within_quantile).dropna()

## Data Analysis

## Model Training

## Model Evaluation


## Model Export