In [1]:
pip install prophet

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pyspark.sql.functions as F
from prophet import Prophet

from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.regression import LinearRegression, RandomForestRegressor

Importing plotly failed. Interactive plots will not work.


In [3]:
SPARK_MASTER_IP = '172.18.0.2' 
spark = SparkSession.builder.appName("pyspark-taxi-forecasting_stage2") \
    .master(f"spark://{SPARK_MASTER_IP}:7077") \
    .config('spark.local.dir', 'spark_tmp/') \
    .config("spark.executor.cores", 1) \
    .config("spark.task.cpus", 1) \
    .getOrCreate()

spark = SparkSession.builder.appName("Introduction to Spark").getOrCreate()

In [4]:
spark

# Создадим признаки задерки и скользящего среднего

In [5]:
all_hours = spark.read.csv("all_hours_onlycode.csv", header = True, inferSchema = True)

In [6]:
all_hours.show(5, False)
all_hours.printSchema()
all_hours.count()

+---+---------------------+-------------------+--------------+-----------+-----------+------------+--------------+
|_c0|pickup_community_area|hour_cons          |taxi_countdist|trips_count|cost_median|miles_median|seconds_median|
+---+---------------------+-------------------+--------------+-----------+-----------+------------+--------------+
|0  |0                    |2022-01-01 05:00:00|0             |0          |0.0        |0.0         |0.0           |
|1  |0                    |2022-01-10 11:00:00|0             |0          |0.0        |0.0         |0.0           |
|2  |0                    |2022-01-16 14:00:00|0             |0          |0.0        |0.0         |0.0           |
|3  |0                    |2022-01-19 06:00:00|0             |0          |0.0        |0.0         |0.0           |
|4  |0                    |2022-02-16 06:00:00|0             |0          |0.0        |0.0         |0.0           |
+---+---------------------+-------------------+--------------+-----------+------

1080222

In [7]:
column_list = ["pickup_community_area"]
  

Windowspec = Window.partitionBy(["pickup_community_area"]).orderBy(all_hours.hour_cons.desc())
  
all_hours_lagged = all_hours.filter(all_hours.hour_cons < '2023-07-31 23:00:00').withColumn(
    'med_cost_lagged1', lag(all_hours['cost_median'], -1).over(Windowspec)).withColumn(
    'med_miles_lagged1', lag(all_hours['miles_median'], -1).over(Windowspec)).withColumn(
    'med_seconds_lagged1', lag(all_hours['seconds_median'], -1).over(Windowspec)).withColumn(
    'trips_count_lagged1', lag(all_hours['trips_count'], -1).over(Windowspec)).withColumn(
    'trips_count_lagged2', lag(all_hours['trips_count'], -2).over(Windowspec)).withColumn(
    'trips_count_lagged3', lag(all_hours['trips_count'], -3).over(Windowspec)).withColumn(
    'trips_count_lagged12', lag(all_hours['trips_count'], -12).over(Windowspec)).withColumn(
    'trips_count_lagged24', lag(all_hours['trips_count'], -24).over(Windowspec)).withColumn(
    'trips_count_lagged_week', lag(all_hours['trips_count'], -24*7).over(Windowspec))

all_hours_lagged = all_hours_lagged.withColumn(
    'rolling_average_on3hours', avg(all_hours_lagged['trips_count_lagged1'],).over(Windowspec.rowsBetween(-3, -0))) \
    .withColumn('rolling_average_on24hours', avg(all_hours_lagged['trips_count_lagged1'],).over(Windowspec.rowsBetween(-24, -0)))

all_hours_lagged.filter('rolling_average_on3hours is NULL').show()

+---+---------------------+---------+--------------+-----------+-----------+------------+--------------+----------------+-----------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-----------------------+------------------------+-------------------------+
|_c0|pickup_community_area|hour_cons|taxi_countdist|trips_count|cost_median|miles_median|seconds_median|med_cost_lagged1|med_miles_lagged1|med_seconds_lagged1|trips_count_lagged1|trips_count_lagged2|trips_count_lagged3|trips_count_lagged12|trips_count_lagged24|trips_count_lagged_week|rolling_average_on3hours|rolling_average_on24hours|
+---+---------------------+---------+--------------+-----------+-----------+------------+--------------+----------------+-----------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-----------------------+------------------------+----------------

In [8]:
all_hours_lagged = all_hours_lagged.na.drop('any').cache()

In [9]:
all_hours_lagged.count()

1066962

# Выделим тестовую выборку

In [10]:
train_df = all_hours_lagged.filter((all_hours_lagged.hour_cons < '2023-06-01 00:00:00')) 
train_df.show(5)
train_df.count()

+-------+---------------------+-------------------+--------------+-----------+-----------+------------+--------------+----------------+-----------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-----------------------+------------------------+-------------------------+
|    _c0|pickup_community_area|          hour_cons|taxi_countdist|trips_count|cost_median|miles_median|seconds_median|med_cost_lagged1|med_miles_lagged1|med_seconds_lagged1|trips_count_lagged1|trips_count_lagged2|trips_count_lagged3|trips_count_lagged12|trips_count_lagged24|trips_count_lagged_week|rolling_average_on3hours|rolling_average_on24hours|
+-------+---------------------+-------------------+--------------+-----------+-----------+------------+--------------+----------------+-----------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-----------------------

952848

In [11]:
valid_df = all_hours_lagged.filter(all_hours_lagged.hour_cons >= '2023-06-01 00:00:00')

# Проведем предсказания с помощью модели Prophet

## Разделим данные для Prophet

In [12]:
train_proph = train_df.selectExpr( 'pickup_community_area',
    'hour_cons as ds', 'trips_count as y')
# Partition the data dfsp_partitionned
train_proph.createOrReplaceTempView("pickup_community_area")
sql = "select * from pickup_community_area"
train_proph = (spark.sql(sql)\
   .repartition(spark.sparkContext.defaultParallelism, 
   ['pickup_community_area'])).cache()
train_proph.explain()

== Physical Plan ==
InMemoryTableScan [pickup_community_area#18, ds#1917, y#1918]
   +- InMemoryRelation [pickup_community_area#18, ds#1917, y#1918], StorageLevel(disk, memory, deserialized, 1 replicas)
         +- Exchange hashpartitioning(pickup_community_area#18, 4), REPARTITION_BY_NUM, [plan_id=385]
            +- *(1) Project [pickup_community_area#18, hour_cons#19 AS ds#1917, trips_count#21 AS y#1918]
               +- *(1) Filter (isnotnull(hour_cons#19) AND (hour_cons#19 < 2023-06-01 00:00:00))
                  +- InMemoryTableScan [hour_cons#19, pickup_community_area#18, trips_count#21], [isnotnull(hour_cons#19), (hour_cons#19 < 2023-06-01 00:00:00)]
                        +- InMemoryRelation [_c0#17, pickup_community_area#18, hour_cons#19, taxi_countdist#20, trips_count#21, cost_median#22, miles_median#23, seconds_median#24, med_cost_lagged1#88, med_miles_lagged1#98, med_seconds_lagged1#109, trips_count_lagged1#121, trips_count_lagged2#134, trips_count_lagged3#148, trips_co

In [13]:
valid_proph = all_hours_lagged.filter(all_hours_lagged.hour_cons >= '2023-06-01 00:00:00')
valid_proph = valid_proph.selectExpr( 'pickup_community_area',
    'hour_cons as ds', 'trips_count as y')
valid_proph = valid_proph.withColumn("y",col("y").cast(DoubleType()))
valid_proph.sort('ds', 'pickup_community_area').show(5)

valid_proph.count()

+---------------------+-------------------+---+
|pickup_community_area|                 ds|  y|
+---------------------+-------------------+---+
|                    0|2023-06-01 00:00:00|0.0|
|                    1|2023-06-01 00:00:00|0.0|
|                    2|2023-06-01 00:00:00|0.0|
|                    3|2023-06-01 00:00:00|5.0|
|                    4|2023-06-01 00:00:00|1.0|
+---------------------+-------------------+---+
only showing top 5 rows



114114

## Проведем обучение и предсказание модели через функцию Pandas_udf

In [14]:
# Define a schema
schema = StructType([ \
                     StructField('pickup_community_area', IntegerType()), 
                     StructField('ds', TimestampType()),
                     StructField('y', FloatType()),
                     StructField('yhat', DoubleType()),
                     StructField('daily', DoubleType()),
                     StructField('weekly', DoubleType())
                    ])

In [15]:
# define the Pandas UDF
@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def apply_model(store_pd):  # instantiate the model and set parameters
  model = Prophet(
      interval_width=0.95,
      growth='linear',
      n_changepoints = 150,
      daily_seasonality=True,
      weekly_seasonality=True,
      yearly_seasonality=True,
      seasonality_mode='additive'
  )  # fit the model to historical data
  model.fit(store_pd)  # Create a data frame that lists 90 dates starting from Jan 1 2018
  future = model.make_future_dataframe(
      periods=1465, #valid_proph.filter(valid_proph.pickup_community_area==community_num).count(),
      freq='h',
      include_history=True
   )  # Out of sample prediction
  prediction = model.predict(future)  # Create a data frame that contains store, item, y, and yhat
  f_pd = prediction[['ds', 'yhat', 'daily', 'weekly']]
  st_pd = store_pd[['ds', 'pickup_community_area', 'y']]
  result_pd = f_pd.join(st_pd.set_index('ds'), on='ds', how='left')  # fill store and item
  result_pd['pickup_community_area'] = store_pd['pickup_community_area'].iloc[0]
  return result_pd[['pickup_community_area', 'ds', 'y', 'yhat',
                    'daily', 'weekly']]# Apply the function to all store-items
# Print the results - calculate the time to run
results = train_proph.groupby(['pickup_community_area']).apply(apply_model).cache()

results.show()



+---------------------+-------------------+---+-------------------+--------------------+--------------------+
|pickup_community_area|                 ds|  y|               yhat|               daily|              weekly|
+---------------------+-------------------+---+-------------------+--------------------+--------------------+
|                   12|2022-01-08 00:00:00|0.0|0.20231954443989342| -0.1784954999970698|0.021810553109612267|
|                   12|2022-01-08 01:00:00|0.0|  0.151072439417259| -0.2472482910063795| 0.03707963538849078|
|                   12|2022-01-08 02:00:00|0.0|0.20872559994415502| -0.2060371224482073| 0.05128641702274067|
|                   12|2022-01-08 03:00:00|0.0|0.28963271068141927|-0.14027894934349808| 0.06420160776229275|
|                   12|2022-01-08 04:00:00|0.0|0.35326562049559396|-0.09029063828459362| 0.07561399314801055|
|                   12|2022-01-08 05:00:00|0.0|  0.408986484687335|-0.04651993686699541| 0.08533354616960559|
|         

In [16]:
results[['yhat']].count()

1067118

## Оценим полученный прогноз

In [17]:
predictions = results.filter(results['ds'] >= '2023-06-01 00:00:00').select(
    'pickup_community_area', 'ds', 'yhat')
predictions = predictions.withColumn("yhat_int",F.round(predictions["yhat"],0))
predictions = predictions.withColumn('yhat_int', F.when((F.col("yhat_int") <= 0), 0)\
    .otherwise(predictions.yhat_int)).cache()
    

predictions.filter('pickup_community_area == 76').show(5)
predictions.count()

+---------------------+-------------------+------------------+--------+
|pickup_community_area|                 ds|              yhat|yhat_int|
+---------------------+-------------------+------------------+--------+
|                   76|2023-06-01 00:00:00|-4.593405868776557|     0.0|
|                   76|2023-06-01 01:00:00| 31.00775394244262|    31.0|
|                   76|2023-06-01 02:00:00| 83.14185643054287|    83.0|
|                   76|2023-06-01 03:00:00| 107.8617906222579|   108.0|
|                   76|2023-06-01 04:00:00|113.59148657154336|   114.0|
+---------------------+-------------------+------------------+--------+
only showing top 5 rows



114270

In [18]:
valid_df_withyhat = valid_proph.join(predictions, on=['ds','pickup_community_area'] , how='inner')  # fill store and item

In [19]:
valid_df_withyhat.show(5)
valid_df_withyhat.count()

+-------------------+---------------------+---+------------------+--------+
|                 ds|pickup_community_area|  y|              yhat|yhat_int|
+-------------------+---------------------+---+------------------+--------+
|2023-07-31 22:00:00|                   31|0.0|0.7175533403456423|     1.0|
|2023-07-31 21:00:00|                   31|0.0|0.9173915116418299|     1.0|
|2023-07-31 20:00:00|                   31|0.0|1.3922614451832205|     1.0|
|2023-07-31 19:00:00|                   31|0.0| 1.790951213205458|     2.0|
|2023-07-31 18:00:00|                   31|1.0|1.7873218122449517|     2.0|
+-------------------+---------------------+---+------------------+--------+
only showing top 5 rows



114114

In [20]:
evaluator = RegressionEvaluator(predictionCol="yhat_int", labelCol='y', metricName='mae')
print("Prphet MAE: {0}".format(evaluator.evaluate(valid_df_withyhat)))

Prphet MAE: 7.962528699370805


In [21]:
print('MAPE:',
    valid_df_withyhat.select(avg((100*abs((valid_df_withyhat.y - valid_df_withyhat.yhat_int) / valid_df_withyhat.y)))).collect())

MAPE: [Row(avg((abs(((y - yhat_int) / y)) * 100))=93.11438089016634)]


    мае - 7,9. Предсказания модели Prоphet показали низкое качество.
    Мы будем обучать другие модели, из Prophet можем взять данные о сезонности.

## Выделим из прогноза данные о сезонности

In [22]:
results.columns

['pickup_community_area', 'ds', 'y', 'yhat', 'daily', 'weekly']

# Проведем стандартизацию и соберем все признаки в 1 вектор

In [23]:
train_lr = all_hours_lagged.filter((all_hours_lagged.hour_cons < '2023-06-01 00:00:00')) \
    .selectExpr( 'pickup_community_area',
    'med_cost_lagged1','med_miles_lagged1','med_seconds_lagged1',
    'trips_count_lagged1', 'trips_count_lagged2', 'trips_count_lagged3', 
    'trips_count_lagged12', 'trips_count_lagged24','trips_count_lagged_week',
    'rolling_average_on3hours',
    'rolling_average_on24hours',
    'hour_cons as ds', 'trips_count as y') \
        .join(results['pickup_community_area', 'ds', 'daily', 'weekly'], on=['ds','pickup_community_area'] , how='inner').cache()

In [24]:
train_lr.printSchema()
train_lr.show(5)
train_lr.count()

root
 |-- ds: timestamp (nullable = true)
 |-- pickup_community_area: integer (nullable = true)
 |-- med_cost_lagged1: double (nullable = true)
 |-- med_miles_lagged1: double (nullable = true)
 |-- med_seconds_lagged1: double (nullable = true)
 |-- trips_count_lagged1: integer (nullable = true)
 |-- trips_count_lagged2: integer (nullable = true)
 |-- trips_count_lagged3: integer (nullable = true)
 |-- trips_count_lagged12: integer (nullable = true)
 |-- trips_count_lagged24: integer (nullable = true)
 |-- trips_count_lagged_week: integer (nullable = true)
 |-- rolling_average_on3hours: double (nullable = true)
 |-- rolling_average_on24hours: double (nullable = true)
 |-- y: integer (nullable = true)
 |-- daily: double (nullable = true)
 |-- weekly: double (nullable = true)

+-------------------+---------------------+----------------+-----------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+---------------

952848

In [25]:
featureCols = [
    'med_cost_lagged1','med_miles_lagged1','med_seconds_lagged1',
    'trips_count_lagged1', 'trips_count_lagged2', 'trips_count_lagged3', 
    'trips_count_lagged12', 'trips_count_lagged24','trips_count_lagged_week',
    'rolling_average_on3hours',
    'rolling_average_on24hours',
     'daily', 'weekly'
]

In [26]:
# положить фичи в вектор
assembler = VectorAssembler(inputCols=featureCols, outputCol="features") 
assembled_df = assembler.transform(train_lr)

In [27]:
assembled_df.show(20, False)

+-------------------+---------------------+------------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-----------------------+------------------------+-------------------------+---+----------------------+---------------------+--------------------------------------------------------------------------------------------------------------+
|ds                 |pickup_community_area|med_cost_lagged1  |med_miles_lagged1 |med_seconds_lagged1|trips_count_lagged1|trips_count_lagged2|trips_count_lagged3|trips_count_lagged12|trips_count_lagged24|trips_count_lagged_week|rolling_average_on3hours|rolling_average_on24hours|y  |daily                 |weekly               |features                                                                                                      |
+-------------------+---------------------+------------------+------------------+-------------------+-------------------+-

In [28]:
standardScaler = StandardScaler(inputCol="features", outputCol="features_scaled", withStd = True)

In [29]:
train_lr_scaled = standardScaler.fit(assembled_df).transform(assembled_df).cache()

In [30]:
train_lr_scaled.show(5)
train_lr_scaled.count()

+-------------------+---------------------+----------------+-----------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-----------------------+------------------------+-------------------------+---+--------------------+--------------------+--------------------+--------------------+
|                 ds|pickup_community_area|med_cost_lagged1|med_miles_lagged1|med_seconds_lagged1|trips_count_lagged1|trips_count_lagged2|trips_count_lagged3|trips_count_lagged12|trips_count_lagged24|trips_count_lagged_week|rolling_average_on3hours|rolling_average_on24hours|  y|               daily|              weekly|            features|     features_scaled|
+-------------------+---------------------+----------------+-----------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-----------------------+------------------------+-----------------------

952848

## Проведем стандартизацию тестовых признаков

In [31]:
valid_lr = all_hours_lagged.filter(all_hours_lagged.hour_cons >= '2023-06-01 00:00:00') \
    .selectExpr('pickup_community_area',
    'med_cost_lagged1','med_miles_lagged1','med_seconds_lagged1',
    'trips_count_lagged1', 'trips_count_lagged2', 'trips_count_lagged3', 
    'trips_count_lagged12', 'trips_count_lagged24','trips_count_lagged_week',
    'rolling_average_on3hours',
    'rolling_average_on24hours',
    'hour_cons as ds', 'trips_count as y') \
        .join(results['pickup_community_area', 'ds', 'daily', 'weekly'], on=['ds','pickup_community_area'] , how='inner').cache()
valid_lr.count()

114114

In [32]:
valid_lr_scaled = assembler.transform(valid_lr)

In [33]:
valid_lr_scaled = standardScaler.fit(train_lr_scaled[['features']]).transform(valid_lr_scaled)

In [34]:
valid_lr_scaled.cache()
valid_lr_scaled.show(5, False)
valid_lr_scaled.count()

+-------------------+---------------------+----------------+------------------+-------------------+-------------------+-------------------+-------------------+--------------------+--------------------+-----------------------+------------------------+-------------------------+---+-------------------+--------------------+-----------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|ds                 |pickup_community_area|med_cost_lagged1|med_miles_lagged1 |med_seconds_lagged1|trips_count_lagged1|trips_count_lagged2|trips_count_lagged3|trips_count_lagged12|trips_count_lagged24|trips_count_lagged_week|rolling_average_on3hours|rolling_average_on24hours|y  |daily              |weekly  

114114

# Проведем обучение линейной регрессии

In [35]:
train_data = train_lr_scaled.selectExpr('y',
    'features_scaled')

In [36]:
lr = (LinearRegression(featuresCol='features_scaled', labelCol="y", predictionCol='y_pred')) 
#                               maxIter=10, regParam=0.3, elasticNetParam=0.8, standardization=False))

In [37]:
linearModel = lr.fit(train_data)

In [38]:
coeff_df = pd.DataFrame({"Feature": ["Intercept"] + featureCols, "Co-efficients": np.insert(linearModel.coefficients.toArray(), 0, linearModel.intercept)})
coeff_df = coeff_df[["Feature", "Co-efficients"]]
display(coeff_df)

Unnamed: 0,Feature,Co-efficients
0,Intercept,0.099435
1,med_cost_lagged1,-0.07068
2,med_miles_lagged1,-0.124996
3,med_seconds_lagged1,-0.016574
4,trips_count_lagged1,9.636389
5,trips_count_lagged2,-3.873082
6,trips_count_lagged3,1.959165
7,trips_count_lagged12,-0.090755
8,trips_count_lagged24,1.709754
9,trips_count_lagged_week,2.371991


In [39]:
valid_data = valid_lr_scaled.selectExpr('y',
    'features_scaled')

In [40]:
# Прогнозы
predictions = linearModel.transform(valid_data)

In [41]:
# Вытаскиваем предсказания и истинные ответы)
predandlabels = predictions.select("y_pred", "y").withColumn("y_pred",F.round(predictions["y_pred"],0)) \
                                                        #.withColumn("y", F.when((F.col("y") <= 0), 0.0001))
predandlabels = predandlabels.withColumn('y_pred', F.when((F.col("y_pred") <= 0), 0)\
    .otherwise(predandlabels.y_pred)).cache()
predandlabels.show()
predandlabels.count()

+------+---+
|y_pred|  y|
+------+---+
|  30.0| 29|
|   1.0|  0|
|   3.0|  2|
|   2.0|  2|
|   1.0|  1|
|   9.0|  6|
| 275.0|261|
|  52.0| 62|
|   0.0|  0|
|   0.0|  0|
|   0.0|  0|
|   2.0|  3|
|   1.0|  1|
|   9.0|  9|
|   1.0|  1|
|   4.0|  1|
| 135.0|159|
|  52.0| 48|
|   0.0|  0|
|   0.0|  0|
+------+---+
only showing top 20 rows



114114

In [42]:
evaluator = RegressionEvaluator(predictionCol="y_pred", labelCol='y', metricName='mae')
print("LinearRegression MAE: {0}".format(evaluator.evaluate(predandlabels)))

LinearRegression MAE: 1.6327444485339222


In [43]:
print('MAPE:',
    predandlabels.select(avg((100*abs((predandlabels.y - predandlabels.y_pred) / predandlabels.y)))).collect())

MAPE: [Row(avg((abs(((y - y_pred) / y)) * 100))=44.100177629748266)]


    На просто сгенерированных фичах
    МАЕ: 1.632650739476678 
    MAPE: 44.18075074035098%

    на дополнительных сезонностях
    МАЕ: 1.632589481053645
    МАРЕ: 44.08505969798334%

In [44]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[assembler, standardScaler, lr])
piped_lr = pipeline.fit(train_lr)
piped_prediction = []
piped_prediction = piped_lr.transform(valid_lr)
piped_prediction = piped_prediction.select("y_pred", "y").withColumn("y_pred",F.round(piped_prediction["y_pred"],0)) 
                                                        #.withColumn("y", F.when((F.col("y") <= 0), 0.0001))
piped_prediction = piped_prediction.withColumn('y_pred', F.when((F.col("y_pred") <= 0), 0)\
    .otherwise(piped_prediction.y_pred)).cache()
print("LinearRegression MAE: {0}".format(evaluator.evaluate(piped_prediction)))
print('MAPE:',
    piped_prediction.select(avg((100*abs((
        piped_prediction.y - piped_prediction.y_pred) / piped_prediction.y)))).collect())
piped_prediction.show(10)
print(piped_prediction.count())

LinearRegression MAE: 1.6327444485339222
MAPE: [Row(avg((abs(((y - y_pred) / y)) * 100))=44.10017762974828)]
+------+---+
|y_pred|  y|
+------+---+
|  30.0| 29|
|   1.0|  0|
|   3.0|  2|
|   2.0|  2|
|   1.0|  1|
|   9.0|  6|
| 275.0|261|
|  52.0| 62|
|   0.0|  0|
|   0.0|  0|
+------+---+
only showing top 10 rows

114114


# Обучим лес случайных деревьев

In [45]:
%%time

rf = RandomForestRegressor(labelCol="y", featuresCol="features_scaled", predictionCol='y_pred',
                                  numTrees = 10, maxDepth = 10) \

rfmodel = rf.fit(train_data)
rfpredictions = rfmodel.transform(valid_data)
rfpredictions.show()
print("RandomForest MAE: {0}".format(evaluator.evaluate(rfpredictions)))

+---+--------------------+--------------------+
|  y|     features_scaled|              y_pred|
+---+--------------------+--------------------+
| 29|[2.04444672924244...|  27.104458488964866|
|  0|[1.03710354593242...|  0.5824481568718803|
|  2|[0.58055252842956...|  3.2171571503797822|
|  2|[1.32568888045275...|  1.6015143745430964|
|  1|[1.36401662019373...|   0.691571415729218|
|  6|[0.73183437176014...|     7.7213847135417|
|261|[0.52396274798847...|  242.44751672336685|
| 62|[0.72146333630081...|   65.25447601725183|
|  0|(13,[10,11,12],[0...|0.025250131309037582|
|  0|(13,[5,6,9,10,11,...| 0.24168031146172156|
|  0|(13,[7,8,10,11,12...| 0.07958598084131227|
|  3|(13,[5,6,8,9,10,1...|   2.051064791733465|
|  1|(13,[7,8,9,10,11,...|  0.6215974019653786|
|  9|[0.99201208741362...|  11.259924643173985|
|  1|[0.0,0.0,0.0,0.0,...|  1.9284229417318983|
|  1|[0.90182917037602...|    2.18429795933183|
|159|[0.55237036685531...|  117.44775690025092|
| 48|[2.14635342549493...|  36.929372238

    Лес деревьев потребляет много ресурсов и не дает высоких результатов. Кросс-валидация не проходит из-за высокого потребления ресурсов.
    МАЕ случайного леса 2.6353573762910427
    В дальнейшем будем использовать модели линейной регрессии

    Из трех моделей лучшие результаты показала линейная регрессия.
    попробуем обучить разные модели для разных районов.

# Реализуем регрессию для каждого региона через функцию

In [46]:
def modelsandpredictions(func_community):
        
    temp_train = train_lr.filter(train_lr.pickup_community_area == func_community)
    temp_valid = valid_lr.filter(valid_lr.pickup_community_area == func_community)
    
    temp_lr = pipeline.fit(temp_train)
    temp_pred = temp_lr.transform(temp_valid)
    temp_predandtrue = temp_pred.select(
                                        'pickup_community_area',
                                        'ds',
                                        "y_pred", 
                                        "y"
                                        ).withColumn("y_pred",F.round(temp_pred["y_pred"],0)) 
    temp_predandtrue = temp_predandtrue.withColumn('y_pred', F.when((F.col("y_pred") <= 0), 0)\
        .otherwise(temp_predandtrue.y_pred)).cache()
    #temp_lr = lr.fit(temp_train.union(temp_valid))
    #temp_lr.write().overwrite().save("/models/lr{0}".format(i))
    return temp_predandtrue

In [47]:
def modelsandpredictions(func_community):#,x_df, y_df, ):
        
    temp_train = train_lr_scaled.filter(train_lr_scaled.pickup_community_area == func_community).selectExpr(
        'pickup_community_area',
        'ds',
        'y',
        'features_scaled')
    temp_valid = valid_lr_scaled.filter(valid_lr_scaled.pickup_community_area == func_community).selectExpr(
        'pickup_community_area',
        'ds',
        'y',
        'features_scaled')
    
    temp_lr = lr.fit(temp_train)
    temp_pred = temp_lr.transform(temp_valid)
    temp_predandtrue = temp_pred.select(
                                        'pickup_community_area',
                                        'ds',
                                        "y_pred", 
                                        "y"
                                        ).withColumn("y_pred",F.round(temp_pred["y_pred"],0)) 
    temp_predandtrue = temp_predandtrue.withColumn('y_pred', F.when((F.col("y_pred") <= 0), 0)\
        .otherwise(temp_predandtrue.y_pred)).cache()
    #temp_lr = lr.fit(temp_train.union(temp_valid))
    #temp_lr.write().overwrite().save("/models/lr{0}".format(i))
    return temp_predandtrue

In [48]:
preds_schema = StructType([ \
    StructField("pickup_community_area",IntegerType (),True), \
    StructField("ds",TimestampType(),True), \
    StructField("y_pred",FloatType(),True), \
    StructField("y", FloatType(), True), \

  ])

In [49]:
%%time
valid_preds = spark.createDataFrame([],preds_schema)
for i in range(78):
    valid_preds = valid_preds.union(modelsandpredictions(i))
valid_preds = valid_preds.cache()

CPU times: user 2.99 s, sys: 1.68 s, total: 4.67 s
Wall time: 3min 32s


In [50]:
valid_preds.show(5)
valid_preds.count()

+---------------------+-------------------+------+---+
|pickup_community_area|                 ds|y_pred|  y|
+---------------------+-------------------+------+---+
|                    0|2023-06-02 01:00:00|   0.0|0.0|
|                    0|2023-06-04 06:00:00|   0.0|0.0|
|                    0|2023-06-05 18:00:00|   0.0|0.0|
|                    0|2023-06-13 12:00:00|   0.0|0.0|
|                    0|2023-06-16 21:00:00|   0.0|0.0|
+---------------------+-------------------+------+---+
only showing top 5 rows



114114

In [51]:
%%time
print("LinearRegression MAE: {0}".format(evaluator.evaluate(valid_preds)))

LinearRegression MAE: 1.5161329898172002
CPU times: user 1.29 s, sys: 493 ms, total: 1.78 s
Wall time: 4min 32s


In [52]:
%%time
print('MAPE:',
    valid_preds.select(avg((100*abs((valid_preds.y - valid_preds.y_pred) / valid_preds.y)))).collect())

MAPE: [Row(avg((abs(((y - y_pred) / y)) * 100))=39.00934602125235)]
CPU times: user 1.15 s, sys: 622 ms, total: 1.77 s
Wall time: 4min 26s


    Окончательные ошибки отдельных Линейных регрессий для каждого района составили
    МАЕ: 1.5162772381202416
    МАРЕ: 39.00271960895627

In [53]:
import tracemalloc
tracemalloc.start()
def mem_use():
    mem_size, mem_peak = tracemalloc.get_traced_memory()
    print(f'Количество используемой памяти {mem_size:_} байт, '
          f'пиковое {mem_peak:_} байт')

mem_use()

Количество используемой памяти 2_016 байт, пиковое 14_960 байт


linearModel.write().overwrite().save('/work/lr')

linearModel.write().overwrite().save('/lr_models/lr47')#.save('/models/sparklr47')

import os
os.getcwd()