In [55]:
from pyspark.sql import SparkSession

In [79]:
spark = SparkSession.builder.appName("taxi-fare-prediciton").getOrCreate()

In [65]:
import os
cwd = os.getcwd()
trip_data_path = os.path.join(cwd, '../learning_spark_data', 'trips', '*.csv')
trip_data_path

'/home/jovyan/work/linuxPython/../learning_spark_data/trips/*.csv'

In [66]:
file_path = f"file:///{trip_data_path.replace(os.sep,'/') }"
file_path

'file:////home/jovyan/work/linuxPython/../learning_spark_data/trips/*.csv'

In [80]:
trip_df = spark.read.csv(file_path, inferSchema=True, header=True)
# trip_df.printSchema()

In [81]:
trip_df.show(5)

+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+
|       2| 2021-03-01 00:22:02|  2021-03-01 00:23:22|              1|          0.0|         1|                 N|         264|         264|           2|        3.0|  0.5|    0.5|       0.0|         0.0|                  0.3

In [90]:
trip_df.createOrReplaceTempView('trips')

In [91]:
query = """
SELECT
    trip_distance,
    total_amount
FROM trips
WHERE total_amount < 5000
  AND total_amount > 0
  AND trip_distance > 0
  AND trip_distance < 500
  AND passenger_count < 4
  AND TO_DATE(tpep_pickup_datetime) >= "2021-01-01"
  AND TO_DATE(tpep_pickup_datetime) < "2021-08-01"
"""

In [96]:
trips = spark.sql(query)

In [127]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [133]:
p = {"inputCols":['trip_distance'], "outputCol":'features'}

In [149]:
def learning(machine, data, param, isVassembler=False, Vparam=p):
    train_data, test_data = data.randomSplit([0.7,0.3], seed=12)

    if isVassembler:
        vassembler = VectorAssembler(**Vparam)
        train_data = vassembler.transform(train_data)
        test_data = vassembler.transform(test_data)
    
    learning = machine(**param)
    model = learning.fit(train_data)
    
    predic = model.transform(test_data)
    
    print(f'rootMeanSquaredError: {model.summary.rootMeanSquaredError}')
    print(f"r2: {model.summary.r2}")
    predic.show()

In [150]:
para= {"maxIter":50, "labelCol":'total_amount', "featuresCol":'features'}

In [151]:
learning(LinearRegression, trips, para, True, p)

rootMeanSquaredError: 6.283870631154954
r2: 0.7662583373680235
+-------------+------------+--------+-----------------+
|trip_distance|total_amount|features|       prediction|
+-------------+------------+--------+-----------------+
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.404887108114131|
|         0.01|         3.3|  [0.01]|9.40

In [152]:
spark.stop()