# NY CAB DATA
## Price and Trip duration Prediction

### IMPORTANT: Data is stored into Postgres Image

In [1]:
import pandas as pd
from pyspark.sql import SparkSession
import psycopg2 as pg
import numpy as np 
import warnings
import os
warnings.filterwarnings('ignore')
pd.options.plotting.backend = "plotly"

In [13]:
from pyspark.sql.functions import *

### Setting Up Spark 

In [2]:
jar_path = os.path.join(os.path.dirname(os.path.abspath('')),'docker_sql','postgresql-42.5.0.jar')
jar_path, os.path.isfile(jar_path)

('c:\\Users\\Olist\\OneDrive\\Ambiente de Trabalho\\Projects\\ny_cab_app\\docker_sql\\postgresql-42.5.0.jar',
 True)

In [3]:
postgres_url = f"jdbc:postgresql://localhost:5432/ny_taxi"

In [4]:
spark = SparkSession.builder.appName("ML_model").config("spark.jars", jar_path).config("spark.driver.memory", "15g").getOrCreate()

### Loading Data

In [5]:
df = spark.read.format("jdbc").options(
                url=postgres_url,
                driver="org.postgresql.Driver",
                dbtable='ny_taxi',
                user='root',
                password='root'
                ).load()
df.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2022-06-30 17:03:42|  2022-06-30 17:20:15|            2.0|          1.1|       1.0|                 N|         142|          48|           1|       11.0|  1.0|    0.5|      3.8

In [6]:
df.printSchema()

root
 |-- VendorID: long (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)



## Data Cleaning And Validation

In [9]:
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       0|                   0|                    0|         132448|            0|    132448|            132448|           0|           0|           0|          0|    0|      0|         

In [17]:
df = df.withColumn('pickup_day_shift', round(hour(df.tpep_pickup_datetime)/5,0))

In [18]:
df = df.withColumn('trip_duration', (col("tpep_dropoff_datetime").cast("long") - col('tpep_pickup_datetime').cast("long"))/3600)

In [19]:
df.describe().toPandas()

Unnamed: 0,summary,VendorID,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_day_shift,trip_duration
0,count,3558124.0,3425676.0,3558124.0,3425676.0,3425676,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3558124.0,3425676.0,3425676.0,3558124.0,3558124.0
1,mean,1.71421625553241,1.3990418241538312,5.968216000903992,1.4181633055782277,,164.60850830381403,162.41222846646153,1.1818587547820143,15.249263010511488,1.0210053809254511,0.4884199370229933,2.79584264067268,0.5622156198065205,0.2960561239765024,22.11841523449738,2.2823746320434277,0.0957336595755115,2.8289208020855936,7.021806643752303
2,stddev,0.4877038929139458,0.9607125717723144,594.1291221670822,5.702573931569924,,65.54112126952948,70.16921804762775,0.5104356496092155,212.1835543481664,1.2546159272315107,0.0937885957220222,3.5816848202701226,2.1211725054018005,0.048109252693229,212.4622892076511,0.7490436767386998,0.3357336222458647,1.212871821309086,1075.2233002350897
3,min,1.0,0.0,0.0,1.0,N,1.0,1.0,0.0,-907.0,-7.0,-0.5,-80.08,-63.2,-0.3,-911.55,-2.5,-1.25,0.0,-11.306666666666668
4,max,6.0,9.0,307007.11,99.0,Y,265.0,265.0,4.0,395844.94,8.25,3.3,1400.16,800.09,0.3,395848.24,2.75,1.25,5.0,172034.25305555554


In [25]:
df.approxQuantile("trip_duration", [0.8],0.05)

[0.38333333333333336]

#### Drop Non Numeric columns

In [29]:
df = df.drop(*('tpep_pickup_datetime','tpep_dropoff_datetime','store_and_fwd_flag'))

#### Filtering All Negative Values

In [34]:
df.select([(when(col(c)>=0, col(c))).alias(c) for c in df.columns]).describe().toPandas()

Unnamed: 0,summary,VendorID,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,pickup_day_shift,trip_duration
0,count,3558124.0,3425676.0,3558124.0,3425676.0,0.0,3558124.0,3558124.0,3558124.0,3535708.0,3547064.0,3536255.0,3557709.0,3557040.0,3535453.0,3535411.0,3408036.0,3423248.0,3558124.0,3556903.0
1,mean,1.71421625553241,1.3990418241538312,5.968216000903992,1.4181633055782277,,164.60850830381403,162.41222846646153,1.1818587547820143,15.43702292440813,1.026672763727971,0.4945324276671227,2.7964183242645344,0.5647434468008516,0.2998783183000379,22.37457250876367,2.307128211086972,0.0966881452935925,2.8289208020855936,7.024222134706972
2,stddev,0.4877038929139458,0.9607125717723144,594.1291221670822,5.702573931569924,,65.54112126952948,70.16921804762775,0.5104356496092155,212.8345808604923,1.251856019987749,0.0526468901083366,3.580408653743414,2.1151886810572997,0.0060406721239368,213.1116871254695,0.6670658499920066,0.3339335514012846,1.212871821309086,1075.4078257922831
3,min,1.0,0.0,0.0,1.0,,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,max,6.0,9.0,307007.11,99.0,,265.0,265.0,4.0,395844.94,8.25,3.3,1400.16,800.09,0.3,395848.24,2.75,1.25,5.0,172034.25305555554


In [35]:
df  = df.select([(when(col(c)>=0, col(c))).alias(c) for c in df.columns])

In [37]:
dict([(c,df.approxQuantile(c, [0.8],0.05)) for c in df.columns])

IllegalArgumentException: requirement failed: Quantile calculation for column store_and_fwd_flag with data type StringType is not supported.

## Data Analisys

## Model Training

## Model Evaluation


## Model Export