pip install pyspark
pip install shapely

In [2]:
#import spark functions
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, col#, to_timestamp, lit, concat, udf, expr, round
from pyspark.sql.column import Column#, _to_java_column, _to_seq
from pyspark.sql.types import StringType, StructType, StructField, FloatType, IntegerType, BooleanType, TimestampType, DoubleType
from pyspark import SparkContext
import pandas as pd
# Import the storage module
from google.cloud import storage
import requests

In [3]:
# Get the active SparkContext, or create one if none exists yet
sc = SparkContext.getOrCreate()

# Set the log level to ERROR to suppress INFO (and WARN) messages in cell output
sc.setLogLevel("ERROR")

In [4]:
# Fix the formatting of the DataFrame shows, so wide rows don't wrap and overlap.
def hscroll(activate=True):
  """Activate/deactivate horizontal scrolling for wide output cells.

  Displays a ``<style>`` element that overrides the ``white-space`` CSS
  property of every ``<pre>`` element in the notebook page.

  Args:
    activate: any truthy value selects ``white-space: pre`` (long lines are
      not wrapped); any falsy value selects ``white-space: pre-wrap``.
  """
  from IPython.display import display, HTML
  # A conditional expression instead of indexing a 2-tuple with `activate`:
  # indexing only works for 0/1/bool and fails for other truthy/falsy values
  # such as 2, None or "yes".
  style = 'pre' if activate else 'pre-wrap'
  display(HTML("<style>pre {white-space: %s !important}</style>" % style))
hscroll()

In [5]:
def store_and_fwd_flag_filling(df): #cleans store_and_fwd_flag data
    """Return ``df`` with ``store_and_fwd_flag`` normalised to a boolean column.

    "1" and "Y" map to True; "0" and "N" map to False; every other value
    (including null, which matches neither condition) also maps to False.
    """
    flag = df["store_and_fwd_flag"]

    # The two spellings used for each state of the flag
    means_no = (flag == "0") | (flag == "N")
    means_yes = (flag == "1") | (flag == "Y")

    # Anything that is neither an explicit yes nor an explicit no falls
    # through to otherwise(False)
    normalised = (
        when(means_no, False)
        .when(means_yes, True)
        .otherwise(False)
        .cast("boolean")
    )
    return df.withColumn("store_and_fwd_flag", normalised)

In [6]:
def month_year(blob):
    """Extract the year and the month number from a blob's object name.

    The name is expected to look like ``<prefix>/<folder>/<year>_<Month>.<ext>``
    (for example ``landing/2011/2011_April.parquet``). Only the third
    ``/``-separated component of the name is inspected.

    Args:
        blob: any object exposing the object name as a ``name`` string attribute.

    Returns:
        A ``(year, month_number)`` tuple: ``year`` is an int and
        ``month_number`` is a two-digit string ("01" .. "12"), or None when
        the month part is not a full English month name.

    Raises:
        ValueError: if the name has fewer than three path components, or the
            third component is not of the form ``<integer>_<something>``.
    """
    # Create a dictionary mapping month names to numbers
    month_mapping = {
        "January": "01",
        "February": "02",
        "March": "03",
        "April": "04",
        "May": "05",
        "June": "06",
        "July": "07",
        "August": "08",
        "September": "09",
        "October": "10",
        "November": "11",
        "December": "12"
    }

    # Folder placeholders such as 'landing/' have no third component; report
    # that explicitly instead of failing with a bare IndexError.
    path_parts = blob.name.split('/')
    if len(path_parts) < 3:
        raise ValueError(
            f"Blob name {blob.name!r} does not have the form "
            "<prefix>/<folder>/<year>_<Month>.<ext>"
        )

    # Extract year and month name from the file-name component
    year_month = path_parts[2].split('_')
    if len(year_month) < 2:
        raise ValueError(
            f"File name {path_parts[2]!r} does not have the form <year>_<Month>.<ext>"
        )
    try:
        year = int(year_month[0])
    except ValueError:
        raise ValueError(
            f"File name {path_parts[2]!r} does not start with a numeric year"
        ) from None
    month_name = year_month[1].split('.')[0]

    # Get the numeric value of the month from the dictionary (None if unknown)
    month_number = month_mapping.get(month_name)

    return year, month_number

In [7]:
def transform_schema(df):
    """Cast the trip columns of ``df`` to their target types and return the result.

    Each listed column is replaced by itself cast to the Spark type given in
    the table below; the casts are applied in table order.
    """
    # Target Spark type for every column whose type is enforced
    target_types = {
        "store_and_fwd_flag": BooleanType(),
        "tpep_pickup_datetime": TimestampType(),
        "tpep_dropoff_datetime": TimestampType(),
        "VendorID": IntegerType(),
        "PULocationID": IntegerType(),
        "DOLocationID": IntegerType(),
        "payment_type": IntegerType(),
        "passenger_count": IntegerType(),
        "RatecodeID": IntegerType(),
        "fare_amount": DoubleType(),
        "extra": DoubleType(),
        "mta_tax": DoubleType(),
        "tip_amount": DoubleType(),
        "tolls_amount": DoubleType(),
        "improvement_surcharge": DoubleType(),
        "total_amount": DoubleType(),
        "congestion_surcharge": DoubleType(),
        "airport_fee": DoubleType(),
    }

    df_transformed = df
    for column_name, spark_type in target_types.items():
        df_transformed = df_transformed.withColumn(
            column_name, col(column_name).cast(spark_type)
        )

    return df_transformed

In [None]:
# Create SparkSession
spark = SparkSession.builder.getOrCreate()

# Bucket layout used by this job, named once instead of repeated in every path
BUCKET_NAME = 'my-bigdata-project-cm'
LANDING_PREFIX = 'landing/'
CLEANED_FOLDER = 'cleaned'

# Create a client object that points to GCS
storage_client = storage.Client()

# Get a list of the 'blobs' (objects or files) in the bucket
blobs = storage_client.list_blobs(BUCKET_NAME, prefix=LANDING_PREFIX)

# Run through every object under the landing prefix of the bucket.
for blob in blobs:
    # Filter on the extension BEFORE parsing the name: the folder placeholder
    # ('landing/') and other non-Parquet objects do not follow the
    # <year>_<Month>.parquet naming scheme, so month_year() cannot parse them.
    if not blob.name.endswith('.parquet'):
        print(f"Skipping file {blob.name}")
        continue

    # Year (int) and two-digit month string parsed from the file name
    try:
        year, month = month_year(blob)
    except (IndexError, ValueError) as e:
        print(f"Skipping file {blob.name}: could not parse year/month from its name:", str(e))
        continue

    # An unknown month name would otherwise end up as "None" in the output path
    if month is None:
        print(f"Skipping file {blob.name}: unrecognized month name")
        continue

    if year <= 2010:# skip 2009 and 2010
        print(f"Skipping file {blob.name} from the year {year}")
        continue

    file_path = f'gs://{BUCKET_NAME}/{blob.name}'

    try:
        # Read Parquet file from Google Cloud Storage
        df = spark.read.parquet(file_path)

        print(f"Processing {blob.name}:")

    except Exception as e:
        print(f"An error occurred on {blob.name}:", str(e))
        continue

    # Drop duplicate rows
    df = df.dropDuplicates()

    # Clean store_and_fwd_flag
    df = store_and_fwd_flag_filling(df)

    # Change passenger_count to int
    df = df.withColumn("passenger_count", col("passenger_count").cast("int"))

    # Change RatecodeID to int
    df = df.withColumn("RatecodeID", col("RatecodeID").cast("int"))

    # Replace null values in the "congestion_surcharge" and "Airport_fee" columns with 0
    # (done before dropna() so a null in these two columns does not discard the row)
    df = df.fillna({'congestion_surcharge': 0.0, 'Airport_fee': 0.0})

    # Drop rows where trip_distance is equal to 0.0
    #df = df.filter(df["trip_distance"] != 0.0) #leaving in case I want to look at cancelled trips.

    # Drop every row that still contains a null in any column.
    df = df.dropna()

    df = transform_schema(df)# enforce schema

    # Save the cleaned dataframe as Parquet.
    # mode("overwrite") lets the cell be re-run: the default save mode raises
    # an error when the output path already exists.
    output_file_path = f"gs://{BUCKET_NAME}/{CLEANED_FOLDER}/yellow_tripdata_{year}-{month}.parquet"
    df.write.mode("overwrite").parquet(output_file_path)
    print("File successfully processed and uploaded.\n")
print("All files finished processing.")
spark.stop()

Skipping file landing/
Skipping file landing/2009/2009_April.parquet from the year 2009
Skipping file landing/2009/2009_August.parquet from the year 2009
Skipping file landing/2009/2009_December.parquet from the year 2009
Skipping file landing/2009/2009_February.parquet from the year 2009
Skipping file landing/2009/2009_January.parquet from the year 2009
Skipping file landing/2009/2009_July.parquet from the year 2009
Skipping file landing/2009/2009_June.parquet from the year 2009
Skipping file landing/2009/2009_March.parquet from the year 2009
Skipping file landing/2009/2009_May.parquet from the year 2009
Skipping file landing/2009/2009_November.parquet from the year 2009
Skipping file landing/2009/2009_October.parquet from the year 2009
Skipping file landing/2009/2009_September.parquet from the year 2009
Skipping file landing/2010/2010_April.parquet from the year 2010
Skipping file landing/2010/2010_August.parquet from the year 2010
Skipping file landing/2010/2010_December.parquet fro

                                                                                

Processing landing/2011/2011_April.parquet:


                                                                                

Number of rows pre-cleaning: 14718973


                                                                                

Number of rows post-cleaning: 14718854


                                                                                

File sucesfully processed and uploaded.



[Stage 13:>                                                         (0 + 1) / 1]                                                                                

Processing landing/2011/2011_August.parquet:




Number of rows pre-cleaning: 13262441


                                                                                

Number of rows post-cleaning: 13262350


                                                                                

File sucesfully processed and uploaded.



                                                                                

Processing landing/2011/2011_December.parquet:


                                                                                

Number of rows pre-cleaning: 14925983


                                                                                

Number of rows post-cleaning: 14925842


                                                                                

File sucesfully processed and uploaded.

Processing landing/2011/2011_February.parquet:


                                                                                

Number of rows pre-cleaning: 14202809


                                                                                

Number of rows post-cleaning: 14202689


                                                                                

File sucesfully processed and uploaded.

Processing landing/2011/2011_January.parquet:


                                                                                

Number of rows pre-cleaning: 13464997


                                                                                

Number of rows post-cleaning: 13464885


                                                                                

File sucesfully processed and uploaded.



[Stage 65:>                                                         (0 + 1) / 1]                                                                                

Processing landing/2011/2011_July.parquet:


                                                                                

Number of rows pre-cleaning: 14742561


                                                                                

Number of rows post-cleaning: 14742455


                                                                                

File sucesfully processed and uploaded.



                                                                                

Processing landing/2011/2011_June.parquet:


                                                                                

Number of rows pre-cleaning: 15097861


24/04/01 22:09:21 ERROR TransportResponseHandler: Still have 1 requests outstanding when connection from /10.128.0.26:54824 is closed
24/04/01 22:09:21 WARN BlockManagerMasterEndpoint: Error trying to remove shuffle 21 from block manager BlockManagerId(70, cluster-f375-w-2.us-central1-b.c.cis-4130-project-413301.internal, 37461, None)
java.io.IOException: Connection from /10.128.0.26:54824 closed
	at org.apache.spark.network.client.TransportResponseHandler.channelInactive(TransportResponseHandler.java:147) ~[spark-network-common_2.12-3.5.0.jar:3.5.0]
	at org.apache.spark.network.server.TransportChannelHandler.channelInactive(TransportChannelHandler.java:117) ~[spark-network-common_2.12-3.5.0.jar:3.5.0]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:305) ~[netty-transport-4.1.100.Final.jar:4.1.100.Final]
	at io.netty.channel.AbstractChannelHandlerContext.invokeChannelInactive(AbstractChannelHandlerContext.java:281) ~[netty-tra

Number of rows post-cleaning: 15097741


                                                                                

File sucesfully processed and uploaded.



[Stage 91:>                                                         (0 + 1) / 1]                                                                                

Processing landing/2011/2011_March.parquet:


                                                                                

Number of rows pre-cleaning: 16066351


                                                                                

Number of rows post-cleaning: 16066191


                                                                                

File sucesfully processed and uploaded.

Processing landing/2011/2011_May.parquet:


                                                                                

Number of rows pre-cleaning: 15554868


                                                                                

Number of rows post-cleaning: 15554750


                                                                                

File sucesfully processed and uploaded.



[Stage 117:>                                                        (0 + 1) / 1]                                                                                

Processing landing/2011/2011_November.parquet:


                                                                                

Number of rows pre-cleaning: 14525863


                                                                                

Number of rows post-cleaning: 14525737


                                                                                

File sucesfully processed and uploaded.



                                                                                

Processing landing/2011/2011_October.parquet:


                                                                                

Number of rows pre-cleaning: 15697804


                                                                                

Number of rows post-cleaning: 15697666


                                                                                

File sucesfully processed and uploaded.



                                                                                

Processing landing/2011/2011_September.parquet:


                                                                                

Number of rows pre-cleaning: 14626748


                                                                                

Number of rows post-cleaning: 14626616


                                                                                

File sucesfully processed and uploaded.



[Stage 156:>                                                        (0 + 1) / 1]                                                                                

Processing landing/2012/2012_April.parquet:


                                                                                

Number of rows pre-cleaning: 13427802


                                                                                

Number of rows post-cleaning: 13427720


                                                                                

File sucesfully processed and uploaded.



                                                                                

Processing landing/2012/2012_August.parquet:


                                                                                

Number of rows pre-cleaning: 14381752




Number of rows post-cleaning: 14381514


                                                                                

File sucesfully processed and uploaded.



[Stage 182:>                                                        (0 + 1) / 1]                                                                                

Processing landing/2012/2012_December.parquet:


                                                                                

Number of rows pre-cleaning: 14696610


                                                                                

Number of rows post-cleaning: 14696490


                                                                                

File sucesfully processed and uploaded.



[Stage 195:>                                                        (0 + 1) / 1]                                                                                

Processing landing/2012/2012_February.parquet:


                                                                                

Number of rows pre-cleaning: 13361345


                                                                                

Number of rows post-cleaning: 13361253


                                                                                

File sucesfully processed and uploaded.



                                                                                

Processing landing/2012/2012_January.parquet:


                                                                                

Number of rows pre-cleaning: 13058348


                                                                                

Number of rows post-cleaning: 13058279


                                                                                

File sucesfully processed and uploaded.

Processing landing/2012/2012_July.parquet:


                                                                                

Number of rows pre-cleaning: 14379307


                                                                                

Number of rows post-cleaning: 14379174


                                                                                

File sucesfully processed and uploaded.



[Stage 247:>                                                        (0 + 1) / 1]                                                                                

Processing landing/2012/2012_March.parquet:


                                                                                

Number of rows pre-cleaning: 16146923


                                                                                

Number of rows post-cleaning: 16146743


                                                                                

File sucesfully processed and uploaded.



[Stage 260:>                                                        (0 + 1) / 1]                                                                                

Processing landing/2012/2012_May.parquet:


                                                                                

Number of rows pre-cleaning: 13965253


                                                                                

Number of rows post-cleaning: 13965172


                                                                                

File sucesfully processed and uploaded.



[Stage 273:>                                                        (0 + 1) / 1]                                                                                

Processing landing/2012/2012_November.parquet:


                                                                                

Number of rows pre-cleaning: 13776030


                                                                                

Number of rows post-cleaning: 13775828


                                                                                

File sucesfully processed and uploaded.



[Stage 286:>                                                        (0 + 1) / 1]                                                                                

Processing landing/2012/2012_October.parquet:


                                                                                

Number of rows pre-cleaning: 14522315


                                                                                

Number of rows post-cleaning: 14522180


                                                                                

File sucesfully processed and uploaded.

Processing landing/2012/2012_September.parquet:


                                                                                

Number of rows pre-cleaning: 14546854


                                                                                

Number of rows post-cleaning: 14546701


                                                                                

File sucesfully processed and uploaded.



                                                                                

Processing landing/2013/2013_April.parquet:


                                                                                

Number of rows pre-cleaning: 14652628


                                                                                

Number of rows post-cleaning: 14652459


                                                                                

File sucesfully processed and uploaded.



[Stage 325:>                                                        (0 + 1) / 1]                                                                                

Processing landing/2013/2013_August.parquet:


                                                                                

Number of rows pre-cleaning: 11681525


                                                                                

Number of rows post-cleaning: 11681426


