In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import when
from pyspark.sql.functions import corr
from pyspark.sql.functions import explode, col, lit
from pyspark.sql.functions import month, year, sum as spark_sum
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import RegressionEvaluator

In [2]:
spark = SparkSession.builder \
    .appName("DecisionTreeRegression") \
    .master("local[*]") \
    .getOrCreate()

file_paths = [
    "C:/Users/Tarun/Documents/BDP_Project/fhvhv_tripdata_2021-01.parquet"
]

dfs = []
for file_path in file_paths:
    df = spark.read.parquet(file_path)
    dfs.append(df)

stacked_df = dfs[0]
for df in dfs[1:]:
    stacked_df = stacked_df.union(df)

stacked_df = stacked_df.withColumn("airport_fee", when(stacked_df["airport_fee"].isNull(), 0).otherwise(stacked_df["airport_fee"]))
stacked_df.show(5)

main_df = stacked_df.select(
    "hvfhs_license_num",
    "pickup_datetime",
    "dropoff_datetime",
    "PULocationID",
    "DOLocationID",
    "trip_miles",
    "trip_time",
    "base_passenger_fare",
    "tolls",
    "bcf",
    "sales_tax",
    "congestion_surcharge",
    "airport_fee",
    "tips",
    "driver_pay"
)
main_df.show(5)

main_df = main_df.na.fill(0, subset=[
    'base_passenger_fare',
    'tolls',
    'bcf',
    'sales_tax',
    'congestion_surcharge',
    'airport_fee',
    'tips'
])

comp_code = {
    "HV0003": "Uber",
    "HV0004": "Via",
    "HV0005": "Lyft"
}

df_uber = main_df.filter(main_df['hvfhs_license_num'] == 'HV0003')
df_via = main_df.filter(main_df['hvfhs_license_num'] == 'HV0004')
df_lyft = main_df.filter(main_df['hvfhs_license_num'] == 'HV0005')

df_uber.show(5)
df_via.show(5)
df_lyft.show(5)


#spark.stop()

+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------------+-----------------+------------------+----------------+--------------+
|hvfhs_license_num|dispatching_base_num|originating_base_num|   request_datetime|  on_scene_datetime|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|shared_request_flag|shared_match_flag|access_a_ride_flag|wav_request_flag|wav_match_flag|
+-----------------+--------------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+--

In [3]:
# Start Spark session
"""spark = SparkSession.builder \
    .appName("WeatherDataProcessing") \
    .getOrCreate()"""

# Read the CSV file into a PySpark DataFrame
weather_df = spark.read.csv("C:/Users/Tarun/Documents/BDP_Project/nyc 2021-01-01 to 2021-12-31.csv", header=True, inferSchema=True)

# Show the first few rows of the weather DataFrame
weather_df.show(5)

# Remove specified columns from the DataFrame
columns_to_drop = ["name", "address", "resolvedAddress", "severerisk"]
weather_df = weather_df.drop(*columns_to_drop)

# Replace null values in the windgust column with 0
weather_df = weather_df.fillna({'windgust': 0})

# Show the modified DataFrame
weather_df.show(5)

# Stop Spark session
#spark.stop()


+----+-------+--------------------+----------+----+---------+----+--------+------+----------+----------+----+---------+--------+---------+-------+----------------+----------+----------+-------+----------+
|name|address|     resolvedAddress|  datetime|temp|feelslike| dew|humidity|precip|precipprob|preciptype|snow|snowdepth|windgust|windspeed|winddir|sealevelpressure|cloudcover|visibility|uvindex|severerisk|
+----+-------+--------------------+----------+----+---------+----+--------+------+----------+----------+----+---------+--------+---------+-------+----------------+----------+----------+-------+----------+
| nyc|    nyc|New York, NY, Uni...|2021-01-01| 2.5|     -0.2|-3.0|    67.8| 15.33|       100|      rain| 0.0|      0.0|    NULL|     15.5|   69.8|          1028.9|      50.6|      14.0|      3|      NULL|
| nyc|    nyc|New York, NY, Uni...|2021-01-02| 5.8|      3.6| 1.2|    74.0|  2.38|       100|      rain| 1.9|      0.6|    54.6|     25.5|  246.9|          1012.4|      63.9|      

In [4]:
data = df_uber

In [5]:
type(data)

pyspark.sql.dataframe.DataFrame

In [6]:
from pyspark.sql.functions import col, unix_timestamp

data = data.withColumn('pickup_timestamp', unix_timestamp('pickup_datetime'))
data = data.withColumn('dropoff_timestamp', unix_timestamp('dropoff_datetime'))
data = data.withColumn('trip_duration', (col('dropoff_timestamp') - col('pickup_timestamp')) / 60)
data = data.withColumn('driver_pay_per_mile', col('driver_pay') / col('trip_miles'))
data = data.withColumn('driver_pay_per_minute', col('driver_pay') / col('trip_duration'))

data.show()

+-----------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+----------------+-----------------+------------------+-------------------+---------------------+
|hvfhs_license_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|pickup_timestamp|dropoff_timestamp|     trip_duration|driver_pay_per_mile|driver_pay_per_minute|
+-----------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+----------------+-----------------+------------------+-------------------+---------------------+
|           HV0003|2021-01-01 00:33:44|2021-01-01 00:49:07|         230|         166|      5.26|      923|              2

In [7]:
data.columns

['hvfhs_license_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'trip_miles',
 'trip_time',
 'base_passenger_fare',
 'tolls',
 'bcf',
 'sales_tax',
 'congestion_surcharge',
 'airport_fee',
 'tips',
 'driver_pay',
 'pickup_timestamp',
 'dropoff_timestamp',
 'trip_duration',
 'driver_pay_per_mile',
 'driver_pay_per_minute']

In [8]:
weather_df.columns

['datetime',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windgust',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'uvindex']

In [9]:
from pyspark.sql.functions import to_date
data = data.withColumn('pickup_date', to_date(col('pickup_datetime')))
merged_df = data.join(weather_df, data.pickup_date == weather_df.datetime, 'inner')
merged_df = merged_df.drop('pickup_date')
merged_df.show()


+-----------------+-------------------+-------------------+------------+------------+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+----------------+-----------------+------------------+-------------------+---------------------+----------+----+---------+----+--------+------+----------+----------+----+---------+--------+---------+-------+----------------+----------+----------+-------+
|hvfhs_license_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|pickup_timestamp|dropoff_timestamp|     trip_duration|driver_pay_per_mile|driver_pay_per_minute|  datetime|temp|feelslike| dew|humidity|precip|precipprob|preciptype|snow|snowdepth|windgust|windspeed|winddir|sealevelpressure|cloudcover|visibility|uvindex|
+-----------------+-------------------+-------------------+------------+------------+----------+--

In [10]:
merged_df.columns

['hvfhs_license_num',
 'pickup_datetime',
 'dropoff_datetime',
 'PULocationID',
 'DOLocationID',
 'trip_miles',
 'trip_time',
 'base_passenger_fare',
 'tolls',
 'bcf',
 'sales_tax',
 'congestion_surcharge',
 'airport_fee',
 'tips',
 'driver_pay',
 'pickup_timestamp',
 'dropoff_timestamp',
 'trip_duration',
 'driver_pay_per_mile',
 'driver_pay_per_minute',
 'datetime',
 'temp',
 'feelslike',
 'dew',
 'humidity',
 'precip',
 'precipprob',
 'preciptype',
 'snow',
 'snowdepth',
 'windgust',
 'windspeed',
 'winddir',
 'sealevelpressure',
 'cloudcover',
 'visibility',
 'uvindex']

In [11]:
merged_df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- pickup_datetime: timestamp_ntz (nullable = true)
 |-- dropoff_datetime: timestamp_ntz (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = false)
 |-- tolls: double (nullable = false)
 |-- bcf: double (nullable = false)
 |-- sales_tax: double (nullable = false)
 |-- congestion_surcharge: double (nullable = false)
 |-- airport_fee: double (nullable = false)
 |-- tips: double (nullable = false)
 |-- driver_pay: double (nullable = true)
 |-- pickup_timestamp: long (nullable = true)
 |-- dropoff_timestamp: long (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- driver_pay_per_mile: double (nullable = true)
 |-- driver_pay_per_minute: double (nullable = true)
 |-- datetime: date (nullable = true)
 |-- temp: double (nullable = true)
 |-- feelsli

In [12]:
# List of columns to drop
columns_to_drop = ['preciptype','pickup_datetime', 'hvfhs_license_num', 'dropoff_datetime', 'PULocationID', 'DOLocationID','datetime','pickup_timestamp','dropoff_timestamp','windgust']

# Drop the specified columns
merged_df = merged_df.drop(*columns_to_drop)

# Display the schema of the updated DataFrame
merged_df.printSchema()

root
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = false)
 |-- tolls: double (nullable = false)
 |-- bcf: double (nullable = false)
 |-- sales_tax: double (nullable = false)
 |-- congestion_surcharge: double (nullable = false)
 |-- airport_fee: double (nullable = false)
 |-- tips: double (nullable = false)
 |-- driver_pay: double (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- driver_pay_per_mile: double (nullable = true)
 |-- driver_pay_per_minute: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- feelslike: double (nullable = true)
 |-- dew: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- precip: double (nullable = true)
 |-- precipprob: integer (nullable = true)
 |-- snow: double (nullable = true)
 |-- snowdepth: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- winddir: double (nullable = true)
 |-- sealevelpressure: double (

In [13]:
# Drop rows with null values
merged_df = merged_df.na.drop()

# Check column data types
merged_df.printSchema()


root
 |-- trip_miles: double (nullable = true)
 |-- trip_time: long (nullable = true)
 |-- base_passenger_fare: double (nullable = false)
 |-- tolls: double (nullable = false)
 |-- bcf: double (nullable = false)
 |-- sales_tax: double (nullable = false)
 |-- congestion_surcharge: double (nullable = false)
 |-- airport_fee: double (nullable = false)
 |-- tips: double (nullable = false)
 |-- driver_pay: double (nullable = true)
 |-- trip_duration: double (nullable = true)
 |-- driver_pay_per_mile: double (nullable = true)
 |-- driver_pay_per_minute: double (nullable = true)
 |-- temp: double (nullable = true)
 |-- feelslike: double (nullable = true)
 |-- dew: double (nullable = true)
 |-- humidity: double (nullable = true)
 |-- precip: double (nullable = true)
 |-- precipprob: integer (nullable = true)
 |-- snow: double (nullable = true)
 |-- snowdepth: double (nullable = true)
 |-- windspeed: double (nullable = true)
 |-- winddir: double (nullable = true)
 |-- sealevelpressure: double (

In [14]:
for column in merged_df.columns:
    merged_df = merged_df.withColumn(column, col(column).cast("decimal(20,2)"))
merged_df.show()

+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------+-------------------+---------------------+----+---------+-----+--------+------+----------+----+---------+---------+-------+----------------+----------+----------+-------+
|trip_miles|trip_time|base_passenger_fare|tolls| bcf|sales_tax|congestion_surcharge|airport_fee|tips|driver_pay|trip_duration|driver_pay_per_mile|driver_pay_per_minute|temp|feelslike|  dew|humidity|precip|precipprob|snow|snowdepth|windspeed|winddir|sealevelpressure|cloudcover|visibility|uvindex|
+----------+---------+-------------------+-----+----+---------+--------------------+-----------+----+----------+-------------+-------------------+---------------------+----+---------+-----+--------+------+----------+----+---------+---------+-------+----------------+----------+----------+-------+
|      5.26|   923.00|              22.28| 0.00|0.67|     1.98|                2.75|       0.00|0.00|     14.

In [15]:
features = merged_df.columns 
features.remove('driver_pay')

In [16]:
assembler = VectorAssembler(inputCols=features, outputCol='features')

In [17]:
data = assembler.transform(merged_df)

In [18]:
dt = DecisionTreeRegressor(featuresCol='features', labelCol='driver_pay')

In [19]:
dt_model = dt.fit(data)

In [20]:
predictions = dt_model.transform(data)

# Evaluate the model's performance
evaluator = RegressionEvaluator(labelCol='driver_pay', predictionCol='prediction', metricName='rmse')
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) =", rmse)

Root Mean Squared Error (RMSE) = 3.7544254946589124


## LR

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

features = [ 'trip_miles','trip_miles','bcf','trip_time','sales_tax','tolls','tips','congestion_surcharge','humidity','precipprob','temp']
target_variable = 'driver_pay'
# Select features and target variable
X = merged_df[features].values
y = merged_df[target_variable].values

# Splitting the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardizing the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Assuming X_train and y_train are already defined

# Reshape X_train if necessary
X_train_reshaped = X_train.reshape(X_train.shape[0], -1)

# Create and train the Linear Regression model
lr_model = LinearRegression()
lr_model.fit(X_train_reshaped, y_train)
# Predict on training data
lr_predictions_train = lr_model.predict(X_train.reshape(X_train.shape[0], -1))

# Predict on test data
lr_predictions_test = lr_model.predict(X_test.reshape(X_test.shape[0], -1))
# Assuming y_train, y_test, lr_predictions_train, and lr_predictions_test are defined

# Calculate Root Mean Squared Error for training data
lr_train_rmse = mean_squared_error(y_train, lr_predictions_train, squared=False)

# Calculate Root Mean Squared Error for test data
lr_test_rmse = mean_squared_error(y_test, lr_predictions_test, squared=False)

# Calculate R2 Score
lr_r2_score = r2_score(y_test, lr_predictions_test)

# Print the evaluation metrics
print("Linear Regression Train RMSE:", lr_train_rmse)
print("Linear Regression Test RMSE:", lr_test_rmse)
print("Linear Regression R2 Score:", lr_r2_score)