# Clean Taxi Datasets

In [None]:
import os
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [None]:
# Maven coordinates Spark has to download at launch: the Delta Lake core
# library and the hadoop-aws module (which provides the S3A filesystem).
packages = "io.delta:delta-core_2.12:1.0.0,org.apache.hadoop:hadoop-aws:3.2.0"

# Build the submit arguments from `packages` so the coordinates are written in
# exactly one place. This must be set before the SparkSession is created.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages {0} pyspark-shell".format(packages)

In [None]:
import socket

# Look up the IP address behind the Hive metastore's hostname once, up front.
# The thrift URI handed to Spark is built from this address instead of the
# hostname (a workaround: the original note only says the connection is
# "weird" otherwise; the root cause is not documented here).
metastore_host = 'hive_metastore'
hive_metastore_address = socket.gethostbyname(metastore_host)

In [None]:
# Object-store credentials. They may be supplied through the MINIO_ACCESS_KEY /
# MINIO_SECRET_KEY environment variables so that secrets do not have to live in
# the notebook. When a variable is unset or empty the previous literal is used,
# so existing setups behave exactly as before.
s3_access_key = os.environ.get("MINIO_ACCESS_KEY") or "AKIAIOSFODNN7EXAMPLE"
s3_secret_key = os.environ.get("MINIO_SECRET_KEY") or "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"

# Create (or reuse) the Spark session used by the rest of the notebook.
spark = (
    SparkSession
    .builder
    # Executor sizing.
    .config("spark.executor.cores", 4)
    .config("spark.executor.memory", "4g")
    # Delta Lake SQL extension and catalog.
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .config("spark.master", "spark://spark-master:7077")
    # S3A filesystem pointed at the minio:9000 endpoint, without SSL.
    .config("spark.hadoop.fs.s3a.access.key", s3_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", s3_secret_key)
    .config("spark.hadoop.fs.s3a.endpoint", "minio:9000")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.metastore.catalog.default", "hive")
    .config("spark.sql.warehouse.dir", "s3a://storage/warehouse")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.connection.maximum", "50")
    # Hive metastore over thrift, addressed by the IP resolved earlier.
    .config("spark.hive.metastore.uris", "thrift://{0}:9083".format(hive_metastore_address))
    .appName("Jupyter Testing Clean Data")
    .enableHiveSupport()
    .getOrCreate()
)

In [None]:
# Select every column of the pre-2015 yellow taxi table in the `default`
# database.
raw_table_query = "select * from default.yellow_taxi_pre2015"
yellow_raw = spark.sql(raw_table_query)

In [None]:
# Print the column names and types of the raw table as loaded.
yellow_raw.printSchema()

In [None]:
# List the distinct passenger_count values present in the raw data.
distinct_passenger_counts = yellow_raw.select("passenger_count").distinct()
distinct_passenger_counts.collect()

In [None]:
# Normalise the raw columns: parse the two trip timestamps, cast the numeric
# fields, and move `rate_code` / `surcharge` to their new names.
yellow_processed = (
    yellow_raw
    .withColumn('pickup_datetime', F.to_timestamp('pickup_datetime'))
    .withColumn('dropoff_datetime', F.to_timestamp('dropoff_datetime'))
    .withColumn('passenger_count', F.col('passenger_count').cast('integer'))
    .withColumn('trip_distance', F.col('trip_distance').cast('float'))
    .withColumn('pickup_longitude', F.col('pickup_longitude').cast('float'))
    .withColumn('pickup_latitude', F.col('pickup_latitude').cast('float'))
    # Renamed column: withColumn adds the new name and leaves the source in
    # place, so `rate_code` is dropped at the end of the chain.
    # NOTE(review): rate_code_id is copied without a cast and therefore keeps
    # the raw column's type; confirm whether an integer cast is intended.
    .withColumn('rate_code_id', F.col('rate_code'))
    .withColumn('dropoff_longitude', F.col('dropoff_longitude').cast('float'))
    .withColumn('dropoff_latitude', F.col('dropoff_latitude').cast('float'))
    # NOTE(review): the integer cast assumes payment_type holds numeric codes
    # in the raw table; verify against the raw schema/values.
    .withColumn('payment_type', F.col('payment_type').cast('integer'))
    .withColumn('fare_amount', F.col('fare_amount').cast('float'))
    # NOTE(review): confirm that the pre-2015 `surcharge` field is the same
    # quantity as `improvement_surcharge` in the target schema.
    .withColumn('improvement_surcharge', F.col('surcharge').cast('float'))
    .withColumn('mta_tax', F.col('mta_tax').cast('float'))
    .withColumn('tip_amount', F.col('tip_amount').cast('float'))
    .withColumn('tolls_amount', F.col('tolls_amount').cast('float'))
    .withColumn('total_amount', F.col('total_amount').cast('float'))
    # Remove the source columns of both renames. Previously only `surcharge`
    # was dropped, which left `rate_code` duplicated next to `rate_code_id`.
    .drop('surcharge', 'rate_code')
)

In [None]:
# Print the column names and types after the casts and renames.
yellow_processed.printSchema()

In [None]:
# Show ten pickup_datetime values from the processed frame.
pickup_times = yellow_processed.select('pickup_datetime')
pickup_times.take(10)

In [None]:
# Stop the Spark session created earlier in the notebook.
spark.stop()