In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_date, trim

# Get (or create) the Spark session used by the rest of this notebook cell.
spark = (
    SparkSession.builder
    .appName('KT4_Yellow_Taxi')
    .getOrCreate()
)

# Local path inside the container
csv_path = "file:///home/jovyan/kt4/yellow_tripdata_small_2015-01.csv"

# Read the local CSV: the first row is the header and column types are inferred
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(csv_path)
)

# Print the inferred schema and preview the first five rows
df.printSchema()
df.show(5)

# Normalization and categories:
#   - keep only rows whose passenger_count is greater than 0
#   - category: 1 when passenger_count is greater than 2, otherwise 0
#   - pickup_date: the date part of tpep_pickup_datetime
df = (
    df.where(col('passenger_count') > 0)
    .withColumn('category', (col('passenger_count') > 2).cast('int'))
    .withColumn('pickup_date', to_date(col('tpep_pickup_datetime')))
)
df.show(5)

# HDFS path to write to
hdfs_path = "hdfs://namenode:9000/jupyter/yellow_tripdata"

# Write Parquet partitioned by category and pickup_date, replacing any
# existing output at the destination.
df.write.parquet(
    hdfs_path,
    mode="overwrite",
    partitionBy=["category", "pickup_date"],
)


root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- pickup_longitude: double (nullable = true)
 |-- pickup_latitude: double (nullable = true)
 |-- RateCodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- dropoff_longitude: double (nullable = true)
 |-- dropoff_latitude: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)

+--------+--------------------+---------------------+---------------+-------------+------------------+---------------