spark

In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from google.cloud import storage

from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, MinMaxScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.regression import LinearRegression, GeneralizedLinearRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
import numpy as np
import seaborn as sns
import io
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Create SparkContext
sc = SparkContext.getOrCreate()

# Set the log level to ERROR to suppress INFO messages
sc.setLogLevel("ERROR")

In [3]:
#fix the formating of the shows, so they don't overlap.
def hscroll(activate=True):
  """activate/deactivate horizontal scrolling for wide output cells"""
  from IPython.display import display, HTML
  style = ('pre-wrap','pre')[activate] # select white-space style
  display(HTML("<style>pre {white-space: %s !important}</style>" % style))
hscroll()

In [4]:
# Create SparkSession
spark = SparkSession.builder.getOrCreate()

file_path = f'gs://my-bigdata-project-cm/cleaned/yellow_tripdata_2011-01.parquet'
try:
        # Read Parquet file from Google Cloud Storage
        sdf = spark.read.parquet(file_path)
        sdf = sdf.drop('store_and_fwd_flag')
        
        # Sum up the numerical values of the trip fee columns
        sdf = sdf.withColumn("combined_fee",
            sum(sdf[col] for col in ["total_amount", "congestion_surcharge", "airport_fee"]).cast("double"))
        # Drop the columns related to the trip fee, as they are now redundant
        columns_to_drop = ["fare_amount","extra","mta_tax","tolls_amount","improvement_surcharge","total_amount","congestion_surcharge","airport_fee"]
        sdf = sdf.drop(*columns_to_drop)
        
        # Show the first row of the DataFrame
        print(sdf.show(5))
        
except Exception as e:
    print(f"An error occurred on {file_path}:", str(e))

                                                                                

+--------+--------------------+---------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       1| 2011-01-01 15:44:50|  2011-01-01 16:13:58|              2|         17.2|         3|         170|           1|           1|       53.5|  0.0|    0.0|      9.52|        10.0|                  0.0|       73.02|                 0.0|     

In [2]:
sdf.describe()

NameError: name 'sdf' is not defined

## Handle outliers - TO DO

# Print schema to see data types of all columns
print(sdf.printSchema())

# Predict the pasanger count

## Indexing

# Creates a hour column as a double
def hour_index(sdf, input_column):
    name = input_column.split("_")[1]
    
    # Extract hour from datetime column
    sdf = sdf.withColumn(f"{name}_hour_index", hour(input_column).cast("double"))
    
    return sdf

# Returns a DataFrame with a new column "pickup_hour_index" containing the indexed column of hours for the "tpep_pickup_datetime" column
sdf = hour_index(sdf, "tpep_pickup_datetime")
# Returns a DataFrame with a new column "dropoff_hour_index" containing the indexed column of hours for the "tpep_dropoff_datetime" column
sdf = hour_index(sdf, "tpep_dropoff_datetime")

In [None]:
# Creates a hour column as a double
def time_index_creator(sdf, input_column):
    name = input_column.split("_")[1]
    
    # Extract hour from datetime column
    sdf = sdf.withColumn(f"{name}_hour_index", hour(input_column).cast("double"))
    
    # Extract day of the week from datetime column
    sdf = sdf.withColumn(f"{name}_day_index", day(input_column).cast("double"))
    
    # Extract month from datetime column
    sdf = sdf.withColumn(f"{name}_month_index", month(input_column).cast("double"))
    
    return sdf

# Returns a DataFrame with a new column "pickup_hour_index" containing the indexed column of hours for the "tpep_pickup_datetime" column
sdf = time_index_creator(sdf, "tpep_pickup_datetime")
# Returns a DataFrame with a new column "dropoff_hour_index" containing the indexed column of hours for the "tpep_dropoff_datetime" column
sdf = time_index_creator(sdf, "tpep_dropoff_datetime")

In [6]:
# Split the data into training and test sets
trainingData, testData = sdf.randomSplit([0.70, 0.3], seed=42)

## Encoding

In [7]:
# Create an encoder for the three indexes and the age integer column.
encoder = OneHotEncoder(inputCols=["pickup_hour_index", "dropoff_hour_index",'pickup_day_index','dropoff_day_index','pickup_month_index','dropoff_month_index', "VendorID", "RatecodeID", "PULocationID", "DOLocationID", "payment_type", "passenger_count"],
                        outputCols=["pickupHourVector", "dropoffHourVector",'pickupDayVector','dropoffDayVector','pickupMonthVector','dropoffMonthVector', "VendorIDVector", "RatecodeIDVector", "PULocationIDVector", "DOLocationIDVector", "paymentTypeVector", "passengerCountVector"], dropLast=True, handleInvalid="keep")

## Scaling

altered_names = []
for i in columns_to_minmax:
    words = i.split("_")#remove the underscores
    # Capitalize the first letter of the second word
    if len(words) > 1:
        words[1] = words[1].capitalize()
    altered_names.append("".join(words))

assemblers = {}
scalers = {}
#vector assembele than scale the columns_to_minmax.
for col_index in range(len(columns_to_minmax)):
    assemblers[altered_names[col_index]] = VectorAssembler(inputCols=[columns_to_minmax[col_index]], outputCol=f"{altered_names[col_index]}Vector")
    scalers[altered_names[col_index]] = MinMaxScaler(inputCol={assemblers[col_index]}, outputCol=f"{altered_names[col_index]}Scaled")

columns_to_minmax = ['trip_distance','fare_amount','extra','mta_tax','tolls_amount','improvement_surcharge','total_amount','congestion_surcharge','airport_fee']

# Scale the columns_to_minmax
assembler_columns = VectorAssembler(inputCols=columns_to_minmax, outputCol='columnsVector')
columns_scaler = MinMaxScaler(inputCol="columnsVector", outputCol="columnsScaled")

In [None]:
#assemble and scale trip_distance
assembler_columns = VectorAssembler(inputCols=['trip_distance'], outputCol='tripDistanceVector')
columns_scaler = MinMaxScaler(inputCol="tripDistanceVector", outputCol="tripDistanceScaled")

#assemble and scale combined_fee
assembler_columns = VectorAssembler(inputCols=['combined_fee'], outputCol='combinedFeeVector')
columns_scaler = MinMaxScaler(inputCol="combinedFeeVector", outputCol="combinedFeeScaled")

## Aassembling 

In [9]:
# Create an assembler for the individual feature vectors and the float/double columns
assembler = VectorAssembler(inputCols=['pickupHourVector','dropoffHourVector','VendorIDVector','RatecodeIDVector', 'PULocationIDVector', 'DOLocationIDVector', 'paymentTypeVector', 'passengerCountVector', 'tripDistanceScaled','combinedFeeScaled'], outputCol="features")

In [10]:
# Create a Ridge Regression Estimator
ridge_reg = LinearRegression(labelCol='tip_amount',  elasticNetParam=0, regParam=0.1)

# Create a regression evaluator (to get RMSE, R2, RME, etc.)
evaluator = RegressionEvaluator(labelCol='tip_amount')

# Create the pipeline Indexer is stage 0 and Ridge Regression (ridge_reg)  is stage 3
regression_pipe = Pipeline(stages=[encoder, assembler_columns, columns_scaler, assembler, ridge_reg])

# Create a grid to hold hyperparameters 
grid = ParamGridBuilder()

# Two ways to try .fitIntercept
params = ParamGridBuilder() \
.addGrid(ridge_reg.fitIntercept, [True, False]) \
.addGrid(ridge_reg.regParam, [0.001, 0.01, 0.1, 1, 10]) \
.addGrid(ridge_reg.elasticNetParam, [0, 0.25, 0.5, 0.75, 1]) \
.build()

# Build the parameter grid
grid = grid.build()

print('Number of models to be tested: ', len(params))

# Create the CrossValidator using the hyperparameter grid
cv = CrossValidator(estimator=regression_pipe, 
                    estimatorParamMaps=grid, 
                    evaluator=evaluator, 
                    numFolds=5,seed=42)

# Train the models
all_models  = cv.fit(trainingData)

# Show the average performance over the three folds for each grid combination
print(f"Average metric {all_models.avgMetrics}")

# Get the best model from all of the models trained
bestModel = all_models.bestModel

# Use the model 'bestModel' to predict the test set
test_results = bestModel.transform(testData)

# Show the predicted tip
test_results.select('tip_amount', 'prediction').show(truncate=False)

Number of models to be tested:  50


                                                                                

Average metric [0.9067747474975025]


[Stage 43:>                                                         (0 + 1) / 1]

+------------+----------+--------------------+
|total_amount|tip_amount|prediction          |
+------------+----------+--------------------+
|5.9         |0.0       |-0.25321305015598705|
|8.3         |0.0       |-0.13185146219096977|
|10.46       |1.36      |1.711747975113023   |
|15.1        |0.0       |0.2728012422181101  |
|12.7        |0.0       |0.1208213161713998  |
|17.9        |0.0       |0.49510423627183653 |
|21.5        |0.0       |0.6476153939390767  |
|21.37       |4.27      |2.5594969527895763  |
|18.7        |0.0       |0.5029547479272758  |
|9.1         |0.0       |-0.29006132089966086|
|11.1        |0.0       |0.05617593030326207 |
|3.9         |0.0       |-0.41887240601661535|
|15.1        |0.0       |0.3793553028241545  |
|18.7        |0.0       |0.4953396631852629  |
|3.5         |0.0       |-0.8257632315042596 |
|13.5        |0.0       |0.20562018370896062 |
|4.3         |0.0       |-0.4420384605630854 |
|12.7        |0.0       |0.08599473560534565 |
|9.1         

                                                                                

# Save the best model
model_path = "gs://my-bigdata-project-cm/models/taxi_tip_linear_regression_model_v2"
bestModel.write().overwrite().save(model_path)

In [13]:
# RMSE measures the differences between what the model predicted ('prediction') and the actual values ('tip').
rmse = evaluator.evaluate(test_results, {evaluator.metricName:'rmse'})
# R-Squared measures how much of the variability in the target variable (tip) can be explained by the model
r2 =evaluator.evaluate(test_results,{evaluator.metricName:'r2'})
print(f"RMSE: {rmse}  R-squared:{r2}")



RMSE: 0.9258666059008394  R-squared:0.6711496845510172


                                                                                

print(bestModel.stages)

coefficients = bestModel.stages[3].coefficients
print("bestModel coefficients", coefficients)
intercept = bestModel.stages[3].intercept
print("bestModel intercept", intercept)

# Visualize regression results

# The Spark dataframe test_results holds the original 'tip' as well as the 'prediction'
# Select and convert to a Pandas dataframe
df = test_results.select('tip_amount','prediction').toPandas()

# Set the style for Seaborn plots
sns.set_style("white")
 
# Create a relationship plot between tip and prediction
sns.lmplot(x='tip_amount', y='prediction', data=df)

# residuals = bestModel.stages[3].residuals
# test_results.residuals

df = test_results.select('tip_amount','prediction').toPandas()
df['residuals'] = df['tip_amount'] - df['prediction']

# Set the style for Seaborn plots
sns.set_style("white")
 
# Create a relationship plot between tip and prediction
sns.regplot(x = 'prediction', y = 'residuals', data = df, scatter = True, color = 'red') 

# TODO: Add more visualizations for Regression performance metrics
# Loop through the features to extract the original column names. Store in the var_index dictionary
var_index = dict()
for variable_type in ['numeric', 'binary']:
    for variable in test_results.schema["features"].metadata["ml_attr"]["attrs"][variable_type]:
         print("Found variable:", variable)
         idx = variable['idx']
         name = variable['name']
         var_index[idx] = name      # Add the name of the column to the dictionary

# Loop through all of the variables found and print out the associated coefficients
for i in range(len(var_index)):
    print(i, var_index[i], coeff[i])

# Close connection to Spark
spark.stop()