In [1]:
pip install prophet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from prophet import Prophet

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression

Importing plotly failed. Interactive plots will not work.


In [3]:
# Address of the standalone Spark master this notebook connects to.
SPARK_MASTER_IP = '172.18.0.2'

# Build the one Spark session used by the whole notebook.
# A second `SparkSession.builder...getOrCreate()` call used to follow this one;
# getOrCreate() hands back the already-running session, so that call could not
# change the master or executor settings and only obscured the application name.
spark = (
    SparkSession.builder
    .appName("pyspark-taxi-forecasting")
    .master(f"spark://{SPARK_MASTER_IP}:7077")
    .config("spark.executor.cores", 2)
    # Scratch directory for shuffle/spill files, relative to the working directory.
    .config('spark.local.dir', 'spark_tmp/')
    # NOTE(review): task.cpus equals executor.cores, so each executor presumably
    # runs one task at a time — confirm this is intended.
    .config("spark.task.cpus", 2)
    .getOrCreate()
)

In [4]:
# Bare expression: the notebook renders the SparkSession object as this cell's output.
spark

In [5]:
# Raw trip exports, one CSV per calendar year; column types are inferred by Spark.
dfsp22 = spark.read.csv("Taxi_Trips_-_2022.csv", header=True, inferSchema=True)
dfsp23 = spark.read.csv("Taxi_Trips_-_2023.csv", header=True, inferSchema=True)

# Exclusive upper bound on the trip start time; defined once and reused below.
lim23 = '2023-07-31 23:00'

# Trip start text parsed with a 12-hour clock and AM/PM marker, compared to the cutoff.
trip_start_ts = F.to_timestamp(F.col('Trip Start Timestamp'), 'MM/dd/yyyy hh:mm:ss a')
starts_before_cutoff = trip_start_ts < lim23

dfsp23 = dfsp23.filter(starts_before_cutoff)

# The same predicate is applied to the union as well, so the 2022 rows go through
# it too (presumably this also removes rows whose start timestamp parses to NULL —
# verify if that matters).
dfsp = dfsp22.union(dfsp23).filter(starts_before_cutoff)

# Rename every column to lower-case with underscores instead of spaces.
# Done in a single projection; the comprehension variable is deliberately not
# called `col`, which would overwrite pyspark.sql.functions.col brought in by
# the star import at the top of the notebook.
dfsp = dfsp.toDF(*[name.lower().replace(' ', '_') for name in dfsp.columns])

In [6]:
# Ground-truth file for the evaluation horizon; types are inferred from the CSV.
test = spark.read.csv(
    "y_true_2023-07-31_23-00_UTC0.csv",
    header=True,
    inferSchema=True,
)

# Add copies of three source columns under new names (new name -> source column).
# The original columns are kept alongside the copies.
test_column_copies = {
    'ds': 'hours',
    'pickup_community_area': 'Pickup Community Area',
    'y': 'trips_count',
}
for new_name, source_name in test_column_copies.items():
    test = test.withColumn(new_name, F.col(source_name))

In [7]:
# Data preparation
trip_ts_format = "MM/dd/yyyy hh:mm:ss a"

# Columns that are dropped from the trip data in this step.
unused_columns = [
    'pickup_census_tract', 'pickup_centroid_location',
    'tips', 'tolls', 'extras',
    "fare", 'company', 'payment_type',
    'dropoff_census_tract', 'dropoff_centroid_latitude',
    'dropoff_centroid_longitude', 'dropoff_centroid__location',
]

# 99th-percentile caps (relative error 0.005) for the three outlier-sensitive
# measures. A single multi-column approxQuantile call returns one list per column,
# replacing three separate calls that each also computed an unused 0.025 quantile.
outlier_columns = ["trip_seconds", "trip_miles", "trip_total"]
seconds_p99, miles_p99, total_p99 = (
    per_column[0]
    for per_column in dfsp.approxQuantile(outlier_columns, [0.99], 0.005)
)

# NOTE(review): the conditions are OR-ed, so a row is kept when ANY of the three
# measures is within its cap; only rows extreme in all three are removed.
# If the intent was to drop a row that is extreme in any measure, use `&` instead.
within_caps = (
    (F.col("trip_seconds") <= seconds_p99)
    | (F.col("trip_miles") <= miles_p99)
    | (F.col("trip_total") <= total_p99)
)

dfsp = (
    dfsp
    # Missing pickup area is encoded as area 0.
    .fillna(0, subset=['pickup_community_area'])
    .drop(*unused_columns)
    # Parse the timestamp text and truncate both timestamps to the hour.
    .withColumn("trip_end_timestamp",
                F.date_trunc("hour", F.to_timestamp("trip_end_timestamp", trip_ts_format)))
    .withColumn("trip_start_timestamp",
                F.date_trunc("hour", F.to_timestamp("trip_start_timestamp", trip_ts_format)))
    .filter(within_caps)
)

In [8]:
# Grouping: one row per (pickup community area, trip start hour).
group_keys = ['pickup_community_area', "trip_start_timestamp"]

# Per-group statistics: trip count, medians of cost / distance / duration /
# pickup coordinates, and the number of distinct taxis.
hourly_stats = [
    F.count('trip_id').alias('trips_count'),
    F.median("trip_total").alias('cost_median'),
    F.median('trip_miles').alias('miles_median'),
    F.median("trip_seconds").alias('seconds_median'),
    F.median("pickup_centroid_latitude").alias('centroid_lat_median'),
    F.median("pickup_centroid_longitude").alias('centroid_long_median'),
    F.countDistinct('taxi_id').alias('taxi_countdist'),
]

# Cached because the result is reused by later cells.
dfsp_grouped = dfsp.groupby(*group_keys).agg(*hourly_stats).cache()

In [9]:
# Build the complete hourly timeline for every community area and attach the
# aggregated trip data to it.
# NOTE(review): the loading cell keeps only trips that start strictly before
# '2023-07-31 23:00', while the grid below includes that hour, so the last grid
# hour has no trips for any area — confirm this is intended.
max_hour = '2023-07-31 23:00:00'
min_hour = '2022-01-01 00:00:00'

# One row per hour between min_hour and max_hour (both inclusive).
hours = spark.createDataFrame([(min_hour, max_hour)], ["min_hour", "max_hour"]) \
    .selectExpr("explode(sequence(to_timestamp(min_hour), to_timestamp(max_hour), interval 1 hour)) as time")
hours = hours.select(F.date_trunc('hour', hours.time).alias("hour_cons"))

# Layout of the hour x area grid; the area id type is taken from here below.
allhours_schema = StructType([
    StructField("hour_cons", TimestampType(), True),
    StructField("pickup_community_area", IntegerType(), True),
])

# Area ids 0..77 (78 values). spark.range yields bigint, so the id is cast to
# the grid's declared area type.
community_areas = spark.range(78).select(
    F.col('id')
    .cast(allhours_schema["pickup_community_area"].dataType)
    .alias('pickup_community_area')
)

# Every (hour, area) combination in one cross join instead of 78 chained unions,
# which produced the same rows through a much deeper query plan.
hour_area_grid = hours.crossJoin(community_areas)

same_hour = hour_area_grid.hour_cons == dfsp_grouped.trip_start_timestamp
same_area = hour_area_grid.pickup_community_area == dfsp_grouped.pickup_community_area

all_hours = (
    hour_area_grid
    # Left join keeps grid cells that have no matching aggregated row.
    .join(dfsp_grouped, same_hour & same_area, 'left_outer')
    .select(
        hour_area_grid.pickup_community_area,
        hour_area_grid.hour_cons,
        dfsp_grouped.taxi_countdist,
        dfsp_grouped.trips_count,
        dfsp_grouped.cost_median,
        dfsp_grouped.miles_median,
        dfsp_grouped.seconds_median,
    )
    # Grid cells without trips get 0 in the numeric columns instead of NULL.
    .na.fill(value=0)
    .cache()
)

In [None]:
# Collect the whole grid to the driver as a pandas DataFrame, then save it as CSV.
# to_csv is called with its defaults, so the pandas index is written as well.
all_hours_pdf = all_hours.toPandas()
all_hours_pdf.to_csv('all_hours_short_v2.csv')