# NYC Taxi Trip Analysis with PySpark

## Load, clean, and aggregate the trip data

In [5]:
!pip install pyspark
#Initialize spark
from pyspark.sql import SparkSession
spark = SparkSession.builder \
        .appName("NYC Taxi Trip Analysis") \
        .getOrCreate()
 #load the data
file_path ="/content/sample_taxi_nyc.csv - Sheet1.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)
print("Schema:")
df.printSchema()
print("Sample Data:")
#preview data
df.show(5)
# data cleaning
from pyspark.sql.functions import col
# filter out trip with negative or invalid values
df_clean = df.filter(
    (col("trip_distance")>0) &
    (col("fare_amount")>0) &
    (col("passenger_count")>0)
)
# feature engineering
from pyspark.sql.functions import hour, to_timestamp
df_features = df_clean.withColumn("pickup_hour", hour(to_timestamp(col("tpep_pickup_datetime"))))
# aggregation - total trips with hour
trips_by_hour = df_features.groupBy("pickup_hour").count().orderBy("pickup_hour")
#aggregation - average of fare per file
from pyspark.sql.functions import avg
fare_per_mile = df_features.withColumn("fare_per_mile",col("fare_amount")/col("trip_distance"))
avg_fare = fare_per_mile.select(avg("fare_per_mile").alias("avg_fare_per_mile"))
avg_fare.show()
# top 10 longest trip
longest_trips = df_features.orderBy(col("trip_distance").desc()).select("tpep_pickup_datetime","tpep_dropoff_datetime","trip_distance","fare_amount").limit(10)
# save if needed
# longest_trip.write.csv("output/longest_trips.csv",header = true)
# stop spark session
spark.stop()

Schema:
root
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)

Sample Data:
+--------------------+---------------------+---------------+-------------+-----------+
|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|fare_amount|
+--------------------+---------------------+---------------+-------------+-----------+
| 2022-01-03 16:33:41|  2022-01-03 17:25:41|              4|         0.88|       3.32|
| 2022-01-08 09:01:55|  2022-01-08 09:35:55|              3|         0.84|       3.02|
| 2022-01-15 11:49:25|  2022-01-15 12:06:25|              2|         2.46|       6.58|
| 2022-01-20 09:41:33|  2022-01-20 10:00:33|              2|         0.01|       3.27|
| 2022-01-09 06:02:02|  2022-01-09 06:35:02|              5|         2.95|       10.9|
+--------------------+--------