In [20]:
%%bash
# Print the current working directory.
pwd

/home/dilanveracruz


In [21]:
# Kafka Consumer (Separate Code)

import pyspark.sql.functions as F
from kafka import KafkaConsumer
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, DateType, StringType, IntegerType, DoubleType, LongType

In [22]:
# Create a Spark session (Python 3 / ipykernel kernel).
# Connecting PySpark to Apache Kafka needs four packages, given here as Maven coordinates.
# `spark.jars.packages` holds ONE comma-separated string: calling
# `.config('spark.jars.packages', ...)` once per package overwrites the previous value,
# so only the last coordinate would be kept.
# NOTE(review): `_2.13` is the Scala build of the artifacts — confirm it matches the Scala
# version of the installed Spark distribution.
kafka_packages = ','.join([
    'org.apache.spark:spark-sql-kafka-0-10_2.13:3.4.0',
    'org.apache.spark:spark-token-provider-kafka-0-10_2.13:3.4.0',
    'org.apache.spark:spark-streaming-kafka-0-10_2.13:3.4.0',
    'org.apache.kafka:kafka-clients:3.4.0',
])

spark = (
    SparkSession.builder
    .appName('streaming')
    .config('spark.jars.packages', kafka_packages)
    .getOrCreate()
)

##### Set up the KafkaConsumer

In [23]:
# Create a Kafka consumer
from kafka import KafkaConsumer

# All connection and timeout settings in one place; they are unpacked into the
# constructor below. Options the author left disabled (library defaults apply):
#   max_poll_records=60000
#   enable_auto_commit=True
#   value_deserializer=lambda x: json.loads(x.decode('utf-8'))
consumer_settings = dict(
    bootstrap_servers=['localhost:9092'],
    group_id='2.13',
    api_version=(0, 10),
    consumer_timeout_ms=10000,
    session_timeout_ms=600000,
    request_timeout_ms=800000,
    connections_max_idle_ms=900000,
)

# Subscribe to the 'my-topic' topic with the settings above.
consumer = KafkaConsumer('my-topic', **consumer_settings)

In [24]:
def process_batch(batch_df, batch_id):
    """Round-trip one batch through pandas and return it as a Spark DataFrame.

    Args:
        batch_df: Spark DataFrame holding the records of one batch.
        batch_id: Identifier of the batch. Accepted for interface compatibility;
            it is not used by the function.

    Returns:
        A Spark DataFrame rebuilt from the pandas copy of ``batch_df``. If the
        batch has no rows, ``batch_df`` itself is returned unchanged.
    """
    pandas_df = batch_df.toPandas()
    if pandas_df.empty:
        # With no rows there is nothing to infer a schema from, so hand back the
        # original (already typed) frame instead of rebuilding it.
        return batch_df
    # Uses the notebook-level `spark` session. The result is returned so the
    # conversion is no longer computed and then thrown away.
    return spark.createDataFrame(pandas_df)

In [25]:
# Define the schema for the data
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType

# (column name, Spark type) pairs, in column order. Every column is nullable.
# NOTE(review): the batch preview recorded in this notebook shows station_name,
# area_cleaned, mean, press, latitude, longitude and end_station_id as null —
# verify these names and types against the payload the producer actually sends.
schema_columns = [
    ("date", IntegerType),
    ("station_name", IntegerType),
    ("area_cleaned", IntegerType),
    ("Hour", IntegerType),
    ("Day", IntegerType),
    ("DayofWeek", IntegerType),
    ("Month", IntegerType),
    ("mean", DoubleType),
    ("press", DoubleType),
    ("num_rentals", LongType),
    ("station_id", IntegerType),
    ("sunshine", DoubleType),
    ("cloud", DoubleType),
    ("prec", DoubleType),
    ("latitude", DoubleType),
    ("longitude", DoubleType),
    ("end_station_id", IntegerType),
]

schema = StructType([
    StructField(column_name, spark_type(), nullable=True)
    for column_name, spark_type in schema_columns
])

# Get the data sent by the producer.
# Each non-empty poll becomes ONE Spark DataFrame: all payloads of the poll are parsed
# together instead of building a one-row DataFrame per message and chaining unions,
# which makes the query plan grow with every message.
messages_processed = 0
max_messages = 1 #108652  (alternative value left by the author)

# Start from an empty, correctly typed frame so `batch_df` always exists after the loop.
batch_df = spark.createDataFrame([], schema)

try:
    while messages_processed < max_messages:
        messages = consumer.poll(timeout_ms=1800)
        if not messages:
            # Nothing arrived within the poll timeout; keep waiting.
            continue

        # poll() groups records per partition; flatten them into one list.
        records = [msg for partition_records in messages.values() for msg in partition_records]
        # Records without a payload cannot be decoded, so they are left out of the batch.
        payloads = [msg.value.decode('utf-8') for msg in records if msg.value is not None]
        messages_processed += len(records)

        if payloads:
            batch_df = spark.read.json(spark.sparkContext.parallelize(payloads), schema=schema)
            process_batch(batch_df, batch_id=None)
        consumer.commit()
finally:
    # Close the consumer even if parsing or processing fails, so the connection
    # is not left open.
    consumer.close()


23/05/28 03:25:57 WARN DAGScheduler: Broadcasting large task binary with size 4.9 MiB
                                                                                

In [27]:
# Preview the batch assembled by the polling loop.
batch_df.show()

23/05/28 03:32:58 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/05/28 03:32:58 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/05/28 03:32:58 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/05/28 03:33:00 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
23/05/28 03:33:06 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB

+--------+------------+------------+----+---+---------+-----+----+-----+-----------+----------+--------+-----+----+--------+---------+--------------+
|    date|station_name|area_cleaned|Hour|Day|DayofWeek|Month|mean|press|num_rentals|station_id|sunshine|cloud|prec|latitude|longitude|end_station_id|
+--------+------------+------------+----+---+---------+-----+----+-----+-----------+----------+--------+-----+----+--------+---------+--------------+
|20200125|        null|        null|  10| 25|        7|    1|null| null|          2|       115|     0.0|  8.0| 0.2|    null|     null|          null|
|20200125|        null|        null|  10| 25|        7|    1|null| null|          3|       259|     0.0|  8.0| 0.2|    null|     null|          null|
|20200125|        null|        null|  10| 25|        7|    1|null| null|          1|       269|     0.0|  8.0| 0.2|    null|     null|          null|
|20200125|        null|        null|  10| 25|        7|    1|null| null|          2|       677|     

                                                                                

In [14]:
# Print the batch's column names and types to compare them with the declared schema.
batch_df.printSchema()

root
 |-- date: integer (nullable = true)
 |-- station_name: integer (nullable = true)
 |-- area_cleaned: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- DayofWeek: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- mean: double (nullable = true)
 |-- press: double (nullable = true)
 |-- num_rentals: long (nullable = true)
 |-- station_id: integer (nullable = true)
 |-- sunshine: double (nullable = true)
 |-- cloud: double (nullable = true)
 |-- prec: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- end_station_id: integer (nullable = true)



##### Import the model

In [15]:
from pyspark.ml import PipelineModel

# Load the saved pipeline from the relative path below.
# NOTE(review): presumably a gradient-boosted-trees pipeline, judging by the name — confirm.
model_path = 'model-GBT'
GBT_model = PipelineModel.load(model_path)



In [16]:
# Apply the loaded pipeline to the batch and preview five rows, one field per line.
# NOTE(review): the recorded run printed "(0 rows)" while the batch preview in this
# notebook shows rows — presumably rows with null feature columns are dropped inside
# the pipeline; verify.
pred_stream = GBT_model.transform(batch_df)
pred_stream.show(5, vertical=True)

23/05/28 03:08:38 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/05/28 03:08:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB
23/05/28 03:08:47 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB
23/05/28 03:08:49 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB
23/05/28 03:08:53 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB
23/05/28 03:09:09 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB
23/05/28 03:10:20 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB
23/05/28 03:16:23 WARN DAGScheduler: Broadcasting large task binary with size 6.7 MiB

(0 rows)



                                                                                

In [None]:
# Total predicted value per area.
# `F.sum` is Spark's column aggregate. A bare `sum` only resolves to it if the cell that
# imports it from pyspark.sql.functions has already been run; on a fresh kernel it is
# Python's built-in and this cell fails.
grouped_plot = (
    pred_stream
    .groupBy('area_cleaned')
    .agg(F.sum('prediction').alias('Estimated Value'))
)

In [None]:
# Rank the areas from the highest to the lowest estimated value.
grouped_plot = grouped_plot.orderBy('Estimated Value', ascending=False)

In [None]:
# Print the first five rows of the per-area totals.
grouped_plot.show(5)

##### Rentals prediction in each area

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# First ten rows of the per-area totals, brought to pandas for plotting.
df_plot = grouped_plot.limit(10).toPandas()

# Thickness of each horizontal bar
bar_width = 0.35

# Create the plot, with one y position per area
fig, ax = plt.subplots()
index = np.arange(len(df_plot['area_cleaned']))

# Plot the estimated values
rects1 = ax.barh(index, df_plot['Estimated Value'], bar_width, label='Previsão')

# Set y-axis labels. barh centres each bar on its y position, so the ticks go on
# `index` itself; shifting them by bar_width / 2 puts the labels on the bars' edges.
ax.set_yticks(index)
ax.set_yticklabels(df_plot['area_cleaned'])

# Add labels and title
ax.set_xlabel('Valores')
ax.set_ylabel('Áreas')
ax.set_title('Número de alugueres para cada área')

# Add legend
ax.legend()

# Display the plot
plt.show()

##### Rental prediction for the next business day

In [None]:
# Keep only the predictions dated 2 January 2020 (dates are yyyyMMdd integers),
# then report how many rows remain.
is_target_day = F.col('date') == 20200102
first_filter = pred_stream.where(is_target_day)
first_filter.count()

In [None]:
# Rows per hour for the selected day, in ascending hour order, as a pandas frame.
# NOTE(review): `plotBar` is not defined in the visible part of this notebook — confirm
# it is provided elsewhere. The title speaks of predicted rentals, while the plotted
# value is a row count rather than a sum of `prediction` — verify this is intended.
hourly_counts = first_filter.groupBy('Hour').count().orderBy('Hour')
df_plot = hourly_counts.toPandas()

plotBar(df_plot, 'Hour', 'count')
plt.title('Previsão de alugueres por hora durante o dia 2 de janeiro de 2020')
plt.show()

##### Area with most demand

In [None]:
# Total predicted value per area for the selected day.
# `F.sum` is Spark's column aggregate. A bare `sum` only resolves to it if the cell that
# imports it from pyspark.sql.functions has already been run; on a fresh kernel it is
# Python's built-in and this cell fails.
grouped_plot = (
    first_filter
    .groupBy('area_cleaned')
    .agg(F.sum('prediction').alias('Estimated Value'))
)

In [None]:
# Rank the areas by estimated value, highest first, and print the top five.
grouped_plot = grouped_plot.sort(F.desc('Estimated Value'))
grouped_plot.show(5)

In [None]:
# First ten rows of the per-area totals, brought to pandas for plotting.
df_plot = grouped_plot.limit(10).toPandas()
# NOTE(review): `plotBar` is not defined in the visible part of this notebook —
# confirm it is provided elsewhere.
plotBar(df_plot, 'area_cleaned', 'Estimated Value')

# Tilt the x tick labels, then add axis labels and a title before rendering.
plt.xticks(rotation=45)
plt.ylabel('Valores Estimados')
plt.xlabel('Áreas')
plt.title('Áreas previstas com mais alugueres durante o dia 2 de janeiro de 2020')
plt.show()

##### Overview of the rentals during the first two weeks of January

In [None]:
import pyspark.sql.functions as F
# NOTE: this import rebinds the names `sum` and `max` to Spark's column functions,
# hiding Python's built-ins of the same name in the notebook namespace.
from pyspark.sql.functions import sum,avg,max,count

# Total estimated value per date.
daily_total = F.sum('prediction').alias('Estimated Value')
grouped_plot = pred_stream.groupBy('date').agg(daily_total)

In [None]:
# Put the per-date totals in ascending date order.
grouped_plot = grouped_plot.orderBy('date')

In [None]:
# Turn the integer yyyyMMdd `date` column into a real date: cast it to text first,
# then parse that text with the matching pattern.
# NOTE(review): the cell overwrites `date` in place, so running it a second time feeds
# the already-converted column through the same cast and parse — verify it runs once.
date_as_text = F.col("date").cast("string")
grouped_plot = grouped_plot.withColumn("date", F.to_date(date_as_text, "yyyyMMdd"))

In [None]:
# Print the first five rows of the per-date totals.
grouped_plot.show(5)

In [None]:
import matplotlib.pyplot as plt

# The figure is built and finished in a single cell: with Jupyter's inline backend a
# figure is rendered and closed when its cell ends, so labels and a legend set in a
# later cell would be applied to a new, empty figure.
df_plot = grouped_plot.toPandas()

# Create the plot
plt.plot(df_plot['date'], df_plot['Estimated Value'], label='Valores previstos')

# Set labels and title
plt.xticks(rotation=45)
plt.xlabel('Tempo')
plt.ylabel('Valores')
plt.title('Número estimado de alugueres por data')

# Add legend
plt.legend()
# Display the plot
plt.show()