In [0]:
import requests
from urllib.parse import urlparse
import os
from io import BytesIO
import pyarrow.parquet as pq
import pandas as pd
from pyspark.sql import SparkSession


spark = SparkSession.builder.getOrCreate()


url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet"


file_name = os.path.basename(urlparse(url).path)
print("File will be called:", file_name)


print("⬇️ Downloading file into memory...")
response = requests.get(url)
response.raise_for_status()  
data_bytes = BytesIO(response.content)


print(" Reading into Pandas DataFrame...")
pandas_df = pq.read_table(data_bytes).to_pandas()
print(" Pandas DataFrame loaded! Shape:", pandas_df.shape)


df = spark.createDataFrame(pandas_df)
print(" Spark DataFrame created!")

display(df.limit(5))  # preview first 5 rows


File will be called: yellow_tripdata_2024-01.parquet
⬇️ Downloading file into memory...
📥 Reading into Pandas DataFrame...
✅ Pandas DataFrame loaded! Shape: (2964624, 19)
✅ Spark DataFrame created!


VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
2,2024-01-01T00:57:55.000Z,2024-01-01T01:17:43.000Z,1.0,1.72,1.0,N,186,79,2,17.7,1.0,0.5,0.0,0.0,1.0,22.7,2.5,0.0
1,2024-01-01T00:03:00.000Z,2024-01-01T00:09:36.000Z,1.0,1.8,1.0,N,140,236,1,10.0,3.5,0.5,3.75,0.0,1.0,18.75,2.5,0.0
1,2024-01-01T00:17:06.000Z,2024-01-01T00:35:01.000Z,1.0,4.7,1.0,N,236,79,1,23.3,3.5,0.5,3.0,0.0,1.0,31.3,2.5,0.0
1,2024-01-01T00:36:38.000Z,2024-01-01T00:44:56.000Z,1.0,1.4,1.0,N,79,211,1,10.0,3.5,0.5,2.0,0.0,1.0,17.0,2.5,0.0
1,2024-01-01T00:46:51.000Z,2024-01-01T00:52:57.000Z,1.0,0.8,1.0,N,211,148,1,7.9,3.5,0.5,3.2,0.0,1.0,16.1,2.5,0.0


In [0]:
df.columns


['VendorID',
 'tpep_pickup_datetime',
 'tpep_dropoff_datetime',
 'passenger_count',
 'trip_distance',
 'RatecodeID',
 'store_and_fwd_flag',
 'PULocationID',
 'DOLocationID',
 'payment_type',
 'fare_amount',
 'extra',
 'mta_tax',
 'tip_amount',
 'tolls_amount',
 'improvement_surcharge',
 'total_amount',
 'congestion_surcharge',
 'Airport_fee']

In [0]:
pandas_df.write.format("delta").mode("overwrite").saveAsTable(
    "trip_data.trip_source.raw"
)


[0;31m---------------------------------------------------------------------------[0m
[0;31mAttributeError[0m                            Traceback (most recent call last)
[0;32m~/.ipykernel/4291/command-8934655427727657-337580409[0m in [0;36m?[0;34m()[0m
[0;32m----> 1[0;31m pandas_df.write.format("delta").mode("overwrite").saveAsTable(
[0m[1;32m      2[0m     [0;34m"trip_data.trip_source.raw"[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;32m/databricks/python/lib/python3.12/site-packages/pandas/core/generic.py[0m in [0;36m?[0;34m(self, name)[0m
[1;32m   6295[0m             [0;32mand[0m [0mname[0m [0;32mnot[0m [0;32min[0m [0mself[0m[0;34m.[0m[0m_accessors[0m[0;34m[0m[0;34m[0m[0m
[1;32m   6296[0m             [0;32mand[0m [0mself[0m[0;34m.[0m[0m_info_axis[0m[0;34m.[0m[0m_can_hold_identifiers_and_holds_name[0m[0;34m([0m[0mname[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m   6297[0m  

In [0]:
df.write.format("delta").mode("overwrite").saveAsTable(
    "trip_data.trip_source.raw"
)



In [0]:
%sql
SELECT * FROM trip_data.trip_source.raw

VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
2,2024-01-13T19:35:58.000Z,2024-01-13T19:43:42.000Z,2.0,1.05,1.0,N,144,113,1,9.3,0.0,0.5,2.0,0.0,1.0,15.3,2.5,0.0
2,2024-01-13T19:50:51.000Z,2024-01-13T19:57:41.000Z,1.0,0.88,1.0,N,234,170,1,7.9,0.0,0.5,2.38,0.0,1.0,14.28,2.5,0.0
1,2024-01-13T19:15:22.000Z,2024-01-13T19:22:32.000Z,2.0,0.8,1.0,N,114,264,1,7.9,2.5,0.5,2.35,0.0,1.0,14.25,2.5,0.0
1,2024-01-13T19:34:11.000Z,2024-01-13T19:52:22.000Z,2.0,2.6,1.0,N,264,170,1,17.7,2.5,0.5,2.0,0.0,1.0,23.7,2.5,0.0
2,2024-01-13T19:16:13.000Z,2024-01-13T19:33:56.000Z,2.0,3.25,1.0,N,211,33,1,19.1,0.0,0.5,4.62,0.0,1.0,27.72,2.5,0.0
2,2024-01-13T19:43:24.000Z,2024-01-13T20:02:18.000Z,1.0,2.08,1.0,N,137,148,1,17.0,0.0,0.5,4.2,0.0,1.0,25.2,2.5,0.0
2,2024-01-13T19:26:59.000Z,2024-01-13T19:40:51.000Z,1.0,1.1,1.0,N,79,234,1,12.8,0.0,0.5,3.36,0.0,1.0,20.16,2.5,0.0
2,2024-01-13T19:46:45.000Z,2024-01-13T19:56:07.000Z,1.0,0.87,1.0,N,90,107,1,9.3,0.0,0.5,2.66,0.0,1.0,15.96,2.5,0.0
2,2024-01-13T19:12:05.000Z,2024-01-13T19:21:10.000Z,1.0,1.4,1.0,N,140,237,1,10.7,0.0,0.5,2.94,0.0,1.0,17.64,2.5,0.0
2,2024-01-13T19:38:20.000Z,2024-01-13T19:47:32.000Z,1.0,1.94,1.0,N,239,75,1,12.1,0.0,0.5,0.9,0.0,1.0,17.0,2.5,0.0
