<a href="https://colab.research.google.com/github/GabrielDan92/PySpark/blob/main/pySparkTC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org//dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q pyspark
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, lit, col, array, round

[K     |████████████████████████████████| 212.4 MB 69 kB/s 
[K     |████████████████████████████████| 198 kB 46.8 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
findspark.init()
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Learning_Spark") \
    .getOrCreate()

***
**(!) optional:** import the tables from local files instead of creating them in pyspark

In [None]:
# from google.colab import files
# files.upload()
# data = spark.read.csv('trips.csv',inferSchema=True, header=True)
# data2 = spark.read.csv('stations.csv',inferSchema=True, header=True)

# # add the row number column and move it to the left of the imported tables
# w = Window().orderBy(lit('A'))
# data = data.withColumn("row_num", row_number().over(w))
# data2 = data2.withColumn("row_num", row_number().over(w))
# data.select('row_num', 'origin', 'destination', 'internal_bus_station_ids', 'triptimes').show()
# data2.select('row_num', 'internal_bus_station_id', 'public_bus_station_id').show()

***

In [3]:
stations = [(1, 0, "BAutogara"), \
            (2, 1, "BVAutogara"), \
            (3, 2, "SBAutogara"), \
            (4, 3, "CJAutogara"), \
            (5, 4, "MMAutogara"), \
            (6, 5, "ISAutogara"), \
            (7, 6, "CTAutogara"), \
            (8, 7, "TMAutogara"), \
            (9, 8, "BCAutogara"), \
            (10, 9, "MSAutogara")]

stationsColumns = ["row_num", "internal_bus_station_id", "public_bus_station"]
stationsDF = spark.createDataFrame(data=stations, schema=stationsColumns)

In [60]:
trips = [(1, "B", "MM", [0,2,4], [datetime(2020, 3, 1, 10, 10, 00), datetime(2020, 3, 1, 12, 20, 10), datetime(2020, 3, 1, 14, 10, 10)]), \
        (2, "BV", "IS", [1,8,5], [datetime(2020, 3, 1, 8, 10, 00), datetime(2020, 3, 1, 12, 20, 10), datetime(2020, 3, 1, 15, 10, 10)]), \
        (3, "TM", "CT", [7,2,6], [datetime(2020, 4, 1, 10, 45, 00), datetime(2020, 4, 1, 12, 20, 10), datetime(2020, 4, 1, 19, 30, 10)]), \
        (4, "CJ", "BC", [3,9,8], [datetime(2020, 5, 1, 7, 10, 00), datetime(2020, 5, 1, 12, 20, 10), datetime(2020, 5, 2, 22, 10, 10)])]

tripsColumns = ["row_num", "origin", "destination", "internal_bus_stations_ids", "triptimes"]
tripsDF = spark.createDataFrame(data=trips, schema=tripsColumns)

In [61]:
stationsDF.show()
# stationsDF.printSchema()
tripsDF.show(truncate=False)
# tripsDF.printSchema()
stations = stationsDF.alias("stations")
trips = tripsDF.alias("trips")

+-------+-----------------------+------------------+
|row_num|internal_bus_station_id|public_bus_station|
+-------+-----------------------+------------------+
|      1|                      0|         BAutogara|
|      2|                      1|        BVAutogara|
|      3|                      2|        SBAutogara|
|      4|                      3|        CJAutogara|
|      5|                      4|        MMAutogara|
|      6|                      5|        ISAutogara|
|      7|                      6|        CTAutogara|
|      8|                      7|        TMAutogara|
|      9|                      8|        BCAutogara|
|     10|                      9|        MSAutogara|
+-------+-----------------------+------------------+

+-------+------+-----------+-------------------------+---------------------------------------------------------------+
|row_num|origin|destination|internal_bus_stations_ids|triptimes                                                      |
+-------+------+---

Split the internal_bus_stations_ids arrays into individual columns in a temporary table "tripsNameDF".

In [67]:
tripsNameDF = trips.select(trips.internal_bus_stations_ids[0].alias("first_dest_id"),
                          trips.internal_bus_stations_ids[1].alias("second_dest_id"),
                          trips.internal_bus_stations_ids[2].alias("third_dest_id"))
tripsNameDF.show()

+-------------+--------------+-------------+
|first_dest_id|second_dest_id|third_dest_id|
+-------------+--------------+-------------+
|            0|             2|            4|
|            1|             5|         null|
|            7|             2|            6|
|            3|             9|            8|
+-------------+--------------+-------------+



Use the individual IDs as maching terms for the left joins against the original "stations" table.

Left join used for capturing complete trips and also trips with less than 3 stops

In [68]:
tripsNameDF = tripsNameDF.join(stations, tripsNameDF.first_dest_id == stations.internal_bus_station_id, how="left") \
            .select(tripsNameDF["*"], stations["public_bus_station"].alias("first_dest"))
tripsNameDF = tripsNameDF.join(stations, tripsNameDF.second_dest_id == stations.internal_bus_station_id, how="left") \
            .select(tripsNameDF["*"], stations["public_bus_station"].alias("second_dest"))
tripsNameDF = tripsNameDF.join(stations, tripsNameDF.third_dest_id == stations.internal_bus_station_id, how="left") \
            .select(tripsNameDF["*"], stations["public_bus_station"].alias("third_dest"))

tripsNameDF.show(truncate=False)

+-------------+--------------+-------------+----------+-----------+----------+
|first_dest_id|second_dest_id|third_dest_id|first_dest|second_dest|third_dest|
+-------------+--------------+-------------+----------+-----------+----------+
|1            |5             |null         |BVAutogara|ISAutogara |null      |
|7            |2             |6            |TMAutogara|SBAutogara |CTAutogara|
|3            |9             |8            |CJAutogara|MSAutogara |BCAutogara|
|0            |2             |4            |BAutogara |SBAutogara |MMAutogara|
+-------------+--------------+-------------+----------+-----------+----------+



Add the matching values in an array, saved in a new column called "public_bus_stops"

create arrays with null values

In [69]:
tripsNameDF = tripsNameDF.select(array("first_dest_id", "second_dest_id", "third_dest_id").alias("internal_bus_stations_ids"), \
                               array("first_dest", "second_dest", "third_dest").alias("public_bus_stops"))
tripsNameDF.show(truncate=False)

+-------------------------+------------------------------------+
|internal_bus_stations_ids|public_bus_stops                    |
+-------------------------+------------------------------------+
|[1, 5, null]             |[BVAutogara, ISAutogara, null]      |
|[7, 2, 6]                |[TMAutogara, SBAutogara, CTAutogara]|
|[3, 9, 8]                |[CJAutogara, MSAutogara, BCAutogara]|
|[0, 2, 4]                |[BAutogara, SBAutogara, MMAutogara] |
+-------------------------+------------------------------------+



In [70]:
tripsDurationDF = trips.select(trips.internal_bus_stations_ids,
                                trips.triptimes [0].alias("first_dest_time"),
                                trips.triptimes [1].alias("second_dest_time"),
                                trips.triptimes [2].alias("third_dest_time"),)

tripsDurationDF.show(truncate=False)

+-------------------------+-------------------+-------------------+-------------------+
|internal_bus_stations_ids|first_dest_time    |second_dest_time   |third_dest_time    |
+-------------------------+-------------------+-------------------+-------------------+
|[0, 2, 4]                |2020-03-01 10:10:00|2020-03-01 12:20:10|2020-03-01 14:10:10|
|[1, 5]                   |2020-03-01 08:10:00|2020-03-01 12:20:10|2020-03-01 15:10:10|
|[7, 2, 6]                |2020-04-01 10:45:00|2020-04-01 12:20:10|2020-04-01 19:30:10|
|[3, 9, 8]                |2020-05-01 07:10:00|2020-05-01 12:20:10|2020-05-02 22:10:10|
+-------------------------+-------------------+-------------------+-------------------+



In [71]:
tripsDurationDF = tripsDurationDF.withColumn("duration", \
                                             tripsDurationDF["third_dest_time"].cast("int") - \
                                             tripsDurationDF["first_dest_time"].cast("int")
                                             )

tripsDurationDF = tripsDurationDF.withColumn("duration", round(tripsDurationDF["duration"])/60)
tripsDurationDF = tripsDurationDF.withColumn("duration", round(tripsDurationDF["duration"],2))
tripsDurationDF = tripsDurationDF.withColumn("duration", concat(col("duration"), lit(" min")))
tripsDurationDF.show(truncate=False)

+-------------------------+-------------------+-------------------+-------------------+-----------+
|internal_bus_stations_ids|first_dest_time    |second_dest_time   |third_dest_time    |duration   |
+-------------------------+-------------------+-------------------+-------------------+-----------+
|[0, 2, 4]                |2020-03-01 10:10:00|2020-03-01 12:20:10|2020-03-01 14:10:10|240.17 min |
|[1, 5]                   |2020-03-01 08:10:00|2020-03-01 12:20:10|2020-03-01 15:10:10|420.17 min |
|[7, 2, 6]                |2020-04-01 10:45:00|2020-04-01 12:20:10|2020-04-01 19:30:10|525.17 min |
|[3, 9, 8]                |2020-05-01 07:10:00|2020-05-01 12:20:10|2020-05-02 22:10:10|2340.17 min|
+-------------------------+-------------------+-------------------+-------------------+-----------+



In [72]:
trips = trips.join(tripsNameDF, trips.internal_bus_stations_ids == tripsNameDF.internal_bus_stations_ids) \
        .select(trips["row_num"], trips["origin"], trips["destination"], trips["internal_bus_stations_ids"], tripsNameDF["public_bus_stops"], trips["triptimes"])
trips.orderBy(["row_num"]).show(truncate=False)

+-------+------+-----------+-------------------------+------------------------------------+---------------------------------------------------------------+
|row_num|origin|destination|internal_bus_stations_ids|public_bus_stops                    |triptimes                                                      |
+-------+------+-----------+-------------------------+------------------------------------+---------------------------------------------------------------+
|1      |B     |MM         |[0, 2, 4]                |[BAutogara, SBAutogara, MMAutogara] |[2020-03-01 10:10:00, 2020-03-01 12:20:10, 2020-03-01 14:10:10]|
|3      |TM    |CT         |[7, 2, 6]                |[TMAutogara, SBAutogara, CTAutogara]|[2020-04-01 10:45:00, 2020-04-01 12:20:10, 2020-04-01 19:30:10]|
|4      |CJ    |BC         |[3, 9, 8]                |[CJAutogara, MSAutogara, BCAutogara]|[2020-05-01 07:10:00, 2020-05-01 12:20:10, 2020-05-02 22:10:10]|
+-------+------+-----------+-------------------------+----------

In [73]:
trips = trips.join(tripsDurationDF, trips.internal_bus_stations_ids == tripsDurationDF.internal_bus_stations_ids) \
        .select(trips["row_num"], trips["origin"], trips["destination"], trips["public_bus_stops"], tripsDurationDF["duration"])
trips.orderBy(["row_num"]).show(truncate=False)

+-------+------+-----------+------------------------------------+-----------+
|row_num|origin|destination|public_bus_stops                    |duration   |
+-------+------+-----------+------------------------------------+-----------+
|1      |B     |MM         |[BAutogara, SBAutogara, MMAutogara] |240.17 min |
|3      |TM    |CT         |[TMAutogara, SBAutogara, CTAutogara]|525.17 min |
|4      |CJ    |BC         |[CJAutogara, MSAutogara, BCAutogara]|2340.17 min|
+-------+------+-----------+------------------------------------+-----------+

