In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
# import os

# NOTE: if re-enabled as written, the second assignment replaces the first, so
# only the driver-memory flag would remain; put both flags in one string instead.
# os.environ["PYSPARK_SUBMIT_ARGS"] = "--executor-memory 2g"
# os.environ["PYSPARK_SUBMIT_ARGS"] = "--driver-memory 2g"

In [33]:
# Create (or reuse) a local Spark session named 'test' with master "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

# Limit Spark's log output to the ERROR level.
spark.sparkContext.setLogLevel('ERROR')

In [5]:
# !wget https://github.com/DataTalksClub/nyc-tlc-data/releases/download/fhvhv/fhvhv_tripdata_2021-01.csv.gz

In [6]:
# NOTE: wc -l on the .gz file counts newline bytes in the compressed stream,
# so the number printed below is not the CSV row count.
!wc -l fhvhv_tripdata_2021-01.csv.gz

508066 fhvhv_tripdata_2021-01.csv.gz


In [7]:
# First pass: read the gzipped CSV with a header row and no explicit schema.
df = spark.read.csv('fhvhv_tripdata_2021-01.csv.gz', header=True)

In [8]:
df.schema

StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropoff_datetime', StringType(), True), StructField('PULocationID', StringType(), True), StructField('DOLocationID', StringType(), True), StructField('SR_Flag', StringType(), True)])

In [9]:
!head -n 1001 fhvhv_tripdata_2021-01.csv > head.csv

In [10]:
import pandas as pd


`df.write.parquet('fhvhv/2021/01/')` produces the warning below:

```bash
24/02/27 02:18:17 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for ## writers
```

The suggested [resolution](https://stackoverflow.com/questions/53407442/total-allocation-exceeds-95-00-960-285-889-bytes-of-heap-memory-pyspark-erro) for the warning above is to increase the memory available to Spark.

A possible consequence of the warning above: the RAM appears to be eaten up and not released after writing, as VS Code hangs and cannot save new edits.

In [11]:
df_pandas = pd.read_csv('head.csv')

In [12]:
df_pandas.dtypes

hvfhs_license_num        object
dispatching_base_num     object
pickup_datetime          object
dropoff_datetime         object
PULocationID              int64
DOLocationID              int64
SR_Flag                 float64
dtype: object

Use the pandas DataFrame with the dtypes above to create a Spark DataFrame and inspect the schema Spark infers.

In [13]:
spark.createDataFrame(df_pandas).schema

StructType([StructField('hvfhs_license_num', StringType(), True), StructField('dispatching_base_num', StringType(), True), StructField('pickup_datetime', StringType(), True), StructField('dropoff_datetime', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('SR_Flag', DoubleType(), True)])

Integer - 4 bytes

Long - 8 bytes

In [14]:
from pyspark.sql import types

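Change `LongType` to `IntegerType` and read the two datetime columns as `TimestampType`. The `StructType` comes from Spark's Scala side; rewrite it as a PySpark `types.StructType` definition
so that it is more readable than the single long string in the output of cell 13.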

In [15]:
# Explicit schema for the trip data: (column name, Spark type) pairs, in file
# column order; every field is declared nullable.
schema = types.StructType([
    types.StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('hvfhs_license_num', types.StringType()),
        ('dispatching_base_num', types.StringType()),
        ('pickup_datetime', types.TimestampType()),
        ('dropoff_datetime', types.TimestampType()),
        ('PULocationID', types.IntegerType()),
        ('DOLocationID', types.IntegerType()),
        ('SR_Flag', types.StringType()),
    )
])

In [16]:
# Re-read the uncompressed CSV, this time applying the explicit schema.
df = spark.read.csv('fhvhv_tripdata_2021-01.csv', schema=schema, header=True)

In [17]:
df = df.repartition(24) # makes 24 partitions 0-23

In [18]:
df.write.parquet('fhvhv/2021/01/', mode="overwrite")

                                                                                

In [19]:
df = spark.read.parquet('fhvhv/2021/01/')

In [20]:
df.printSchema()

root
 |-- hvfhs_license_num: string (nullable = true)
 |-- dispatching_base_num: string (nullable = true)
 |-- pickup_datetime: timestamp (nullable = true)
 |-- dropoff_datetime: timestamp (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- SR_Flag: string (nullable = true)



SELECT * FROM df WHERE hvfhs_license_num = 'HV0003'

In [21]:
from pyspark.sql import functions as F

In [22]:
df.show(5)

+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|hvfhs_license_num|dispatching_base_num|    pickup_datetime|   dropoff_datetime|PULocationID|DOLocationID|SR_Flag|
+-----------------+--------------------+-------------------+-------------------+------------+------------+-------+
|           HV0005|              B02510|2021-01-01 05:59:23|2021-01-01 06:11:32|          87|         170|   NULL|
|           HV0003|              B02765|2021-01-02 00:12:06|2021-01-02 00:24:21|         180|          10|   NULL|
|           HV0005|              B02510|2021-01-01 13:46:38|2021-01-01 13:57:51|         151|          41|   NULL|
|           HV0003|              B02864|2021-01-02 04:22:08|2021-01-02 04:30:20|          42|          75|   NULL|
|           HV0003|              B02395|2021-01-01 12:02:28|2021-01-01 12:30:09|         230|           1|   NULL|
+-----------------+--------------------+-------------------+-------------------+

In [23]:
def crazy_stuff(base_num):
    """Turn a dispatching base number into a prefixed, hex-encoded id.

    The first character of ``base_num`` is dropped and the remainder is parsed
    as a decimal integer. The result is ``'<prefix>/<hex>'`` where ``<hex>`` is
    that integer in lowercase hexadecimal, zero-padded to at least 3 digits,
    and ``<prefix>`` is:

    * ``'s'`` if the number is divisible by 7,
    * ``'a'`` if it is (otherwise) divisible by 3,
    * ``'e'`` in every other case.

    Args:
        base_num: String such as ``'B02884'``, or ``None``.

    Returns:
        The encoded id string, or ``None`` when ``base_num`` is ``None``.

    Raises:
        ValueError: If the text after the first character is not an integer.
    """
    # A null column value reaches a Python UDF as None; pass it through
    # instead of failing the whole job with a TypeError on slicing.
    if base_num is None:
        return None

    num = int(base_num[1:])

    # The divisibility-by-7 check takes precedence over divisibility by 3.
    if num % 7 == 0:
        prefix = 's'
    elif num % 3 == 0:
        prefix = 'a'
    else:
        prefix = 'e'

    return f'{prefix}/{num:03x}'

In [24]:
crazy_stuff('B02884')

's/b44'

In [25]:
crazy_stuff_udf = F.udf(crazy_stuff, returnType=types.StringType())

In [26]:
# Derive date-only columns and the UDF-encoded base id, then preview the result.
(
    df
    .withColumn('pickup_date', F.to_date('pickup_datetime'))
    .withColumn('dropoff_date', F.to_date('dropoff_datetime'))
    .withColumn('base_id', crazy_stuff_udf('dispatching_base_num'))
    .select('base_id', 'pickup_date', 'dropoff_date', 'PULocationID', 'DOLocationID')
    .show()
)

[Stage 6:>                                                          (0 + 1) / 1]

+-------+-----------+------------+------------+------------+
|base_id|pickup_date|dropoff_date|PULocationID|DOLocationID|
+-------+-----------+------------+------------+------------+
|  e/9ce| 2021-01-01|  2021-01-01|          87|         170|
|  s/acd| 2021-01-02|  2021-01-02|         180|          10|
|  e/9ce| 2021-01-01|  2021-01-01|         151|          41|
|  e/b30| 2021-01-02|  2021-01-02|          42|          75|
|  e/95b| 2021-01-01|  2021-01-01|         230|           1|
|  e/9ce| 2021-01-02|  2021-01-02|          18|         169|
|  e/9ce| 2021-01-01|  2021-01-01|          41|         168|
|  e/9ce| 2021-01-01|  2021-01-01|         162|         263|
|  s/acd| 2021-01-01|  2021-01-01|          76|          35|
|  s/af0| 2021-01-02|  2021-01-02|         195|          76|
|  e/b3b| 2021-01-02|  2021-01-02|         129|          56|
|  e/b42| 2021-01-01|  2021-01-01|          75|         262|
|  e/9ce| 2021-01-01|  2021-01-01|          71|          61|
|  e/9ce| 2021-01-02|  2

                                                                                

In [27]:
# Trip times and location ids for rows whose license number is 'HV0003'.
(
    df
    .select('pickup_datetime', 'dropoff_datetime', 'PULocationID', 'DOLocationID')
    .filter(df.hvfhs_license_num == 'HV0003')
)


DataFrame[pickup_datetime: timestamp, dropoff_datetime: timestamp, PULocationID: int, DOLocationID: int]

In [28]:
!head -n 10 head.csv

hvfhs_license_num,dispatching_base_num,pickup_datetime,dropoff_datetime,PULocationID,DOLocationID,SR_Flag
HV0003,B02682,2021-01-01 00:33:44,2021-01-01 00:49:07,230,166,
HV0003,B02682,2021-01-01 00:55:19,2021-01-01 01:18:21,152,167,
HV0003,B02764,2021-01-01 00:23:56,2021-01-01 00:38:05,233,142,
HV0003,B02764,2021-01-01 00:42:51,2021-01-01 00:45:50,142,143,
HV0003,B02764,2021-01-01 00:48:14,2021-01-01 01:08:42,143,78,
HV0005,B02510,2021-01-01 00:06:59,2021-01-01 00:43:01,88,42,
HV0005,B02510,2021-01-01 00:50:00,2021-01-01 01:04:57,42,151,
HV0003,B02764,2021-01-01 00:14:30,2021-01-01 00:50:27,71,226,
HV0003,B02875,2021-01-01 00:22:54,2021-01-01 00:30:20,112,255,


In [29]:
spark.sparkContext.stop()