# Internship Task - 01: Big Data Analysis with PySpark

In [2]:
pip install pyspark

Note: you may need to restart the kernel to use updated packages.


In [3]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Data Analysis")
    .getOrCreate()
)


# Load Large Dataset

In [4]:
# Read the trip CSV: the first line supplies column names and Spark infers
# each column's type from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("yellow_tripdata_2015-01.csv")
)

In [5]:
# Preview the first five rows, then print the column names and inferred types.
df.show(n=5)
df.printSchema()

+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|  pickup_longitude|   pickup_latitude|RateCodeID|store_and_fwd_flag| dropoff_longitude|  dropoff_latitude|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|
+--------+--------------------+---------------------+---------------+-------------+------------------+------------------+----------+------------------+------------------+------------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+
|       2| 2015-01-15 19:05:39|  2015-01-15 19:23:42|              1|         1.59|  -73.993896484375|  40.7501106262207|         1|    

# Perform Scalable Analysis

## 1. Total Number of Trips

In [6]:
# Count every row in the loaded DataFrame (one row per trip) and report it.
total_trips = df.count()
print(f"Total Trips: {total_trips}")

Total Trips: 12748986


## 2. Average Trip Duration

In [7]:
from pyspark.sql.functions import unix_timestamp, col

# Trip duration in minutes: the gap between drop-off and pickup, each taken as
# epoch seconds, divided by 60.
pickup_secs = unix_timestamp(col("tpep_pickup_datetime"))
dropoff_secs = unix_timestamp(col("tpep_dropoff_datetime"))
df = df.withColumn("trip_duration", (dropoff_secs - pickup_secs) / 60)

# Aggregate to a single row holding the mean duration and pull out its value.
avg_duration = df.agg({"trip_duration": "avg"}).first()[0]
print(f"Average Trip Duration: {avg_duration:.2f} minutes")

Average Trip Duration: 14.18 minutes


## 3. Average Fare Amount per Passenger Count

In [8]:
# Mean fare for each passenger count, listed in ascending passenger-count order.
fare_by_passengers = (
    df.groupBy("passenger_count")
    .avg("fare_amount")
    .orderBy("passenger_count")
)
fare_by_passengers.show()


+---------------+------------------+
|passenger_count|  avg(fare_amount)|
+---------------+------------------+
|              0|11.205294744859103|
|              1| 11.78195472471847|
|              2|12.420621907710354|
|              3|12.124618192345713|
|              4|12.202182618035936|
|              5|11.963545757512774|
|              6|11.797696494253884|
|              7|11.255555555555555|
|              8|29.580000000000002|
|              9|52.900000000000006|
+---------------+------------------+



## 4. Busiest Hour of the Day

In [11]:
from pyspark.sql.functions import hour

# Tag each trip with the hour of day (0-23) taken from its pickup timestamp.
df = df.withColumn("pickup_hour", hour("tpep_pickup_datetime"))

# Count trips per pickup hour and show the five hours with the most trips.
trips_per_hour = (
    df.groupBy("pickup_hour")
    .count()
    .orderBy("count", ascending=False)
)
trips_per_hour.show(5)


+-----------+------+
|pickup_hour| count|
+-----------+------+
|         19|805230|
|         18|799587|
|         20|733952|
|         21|711579|
|         22|686959|
+-----------+------+
only showing top 5 rows



## Stop Spark Session

In [12]:
# Stop the Spark session created at the top of the notebook now that all
# analysis cells have run.
spark.stop()

# Step 5: Key Insights

## The dataset contains about 12.7 million taxi trips (12,748,986) in a single month, January 2015.

## Average trip duration is about 14.18 minutes.

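## Most popular pickup zones (e.g. IDs 236, 142) were not analyzed in this notebook: this dataset records pickup longitude/latitude rather than zone IDs, so no zone ranking is supported by the results above.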

## The busiest pickup hour is 7 PM (hour 19, 805,230 trips), closely followed by 6 PM (hour 18, 799,587 trips).

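## Among the common passenger counts (1–6), trips with 2 passengers have the highest average fare (about 12.42); the much higher averages for 8 and 9 passengers (29.58 and 52.90) are likely driven by very few trips and should be checked against their trip counts.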