<a href="https://colab.research.google.com/github/GabrielDan92/PySpark/blob/main/pySparkTC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org//dist/spark/spark-3.1.2/spark-3.1.2-bin-hadoop3.2.tgz
!tar xf spark-3.1.2-bin-hadoop3.2.tgz
!pip install -q pyspark
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.2-bin-hadoop3.2"

import findspark
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, lit, col, array, round, size, when, concat

[K     |████████████████████████████████| 212.4 MB 84 kB/s 
[K     |████████████████████████████████| 198 kB 82.5 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [2]:
# Initialise findspark, then create (or reuse) the notebook's SparkSession
# running against the local master.
findspark.init()
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Learning_Spark")
    .getOrCreate()
)

***
**No longer in use.** Import local files and add them to DataFrames instead of creating them in PySpark.

In [3]:
# from google.colab import files
# files.upload()
# data = spark.read.csv('trips.csv',inferSchema=True, header=True)
# data2 = spark.read.csv('stations.csv',inferSchema=True, header=True)

# # add the row number column and move it to the left of the imported tables
# w = Window().orderBy(lit('A'))
# data = data.withColumn("row_num", row_number().over(w))
# data2 = data2.withColumn("row_num", row_number().over(w))
# data.select('row_num', 'origin', 'destination', 'internal_bus_station_ids', 'triptimes').show()
# data2.select('row_num', 'internal_bus_station_id', 'public_bus_station_id').show()

***

In [4]:
# Public station names in internal-id order: the internal id is the position in this
# list (starting at 0) and row_num is that position plus one.
stationNames = [
    "BAutogara",
    "BVAutogara",
    "SBAutogara",
    "CJAutogara",
    "MMAutogara",
    "ISAutogara",
    "CTAutogara",
    "TMAutogara",
    "BCAutogara",
    "MSAutogara",
]
stations = [
    (internalId + 1, internalId, stationName)
    for internalId, stationName in enumerate(stationNames)
]

stationsColumns = ["row_num", "internal_bus_station_id", "public_bus_station"]
stationsDF = spark.createDataFrame(data=stations, schema=stationsColumns)

In [5]:
# One tuple per trip: (row_num, origin, destination, station ids visited, timestamp per stop).
trips = [
    (1, "B", "MM", [0, 2, 4], [
        datetime(2020, 3, 1, 10, 10, 0),
        datetime(2020, 3, 1, 12, 20, 10),
        datetime(2020, 3, 1, 14, 10, 10),
    ]),
    (2, "BV", "IS", [1, 8, 5], [
        datetime(2020, 3, 1, 8, 10, 0),
        datetime(2020, 3, 1, 12, 20, 10),
        datetime(2020, 3, 1, 15, 10, 10),
    ]),
    (3, "TM", "CT", [7, 2, 6], [
        datetime(2020, 4, 1, 10, 45, 0),
        datetime(2020, 4, 1, 12, 20, 10),
        datetime(2020, 4, 1, 19, 30, 10),
    ]),
    # Alternative five-stop version of trip 4, kept for experimenting with uneven array lengths:
    #  (4, "CJ", "BC", [3,9, 5, 6, 7], [datetime(2020, 5, 1, 7, 10, 00), datetime(2020, 5, 1, 12, 20, 10), datetime(2020, 5, 1, 13, 20, 10), datetime(2020, 5, 1, 14, 20, 10), datetime(2020, 5, 1, 15, 20, 10)])]
    (4, "CJ", "BC", [3, 9, 8], [
        datetime(2020, 5, 1, 7, 10, 0),
        datetime(2020, 5, 1, 12, 20, 10),
        datetime(2020, 5, 2, 22, 10, 10),
    ]),
]

tripsColumns = ["row_num", "origin", "destination", "internal_bus_stations_ids", "triptimes"]
tripsDF = spark.createDataFrame(data=trips, schema=tripsColumns)

In [6]:
# Aliased handles on both DataFrames; from here on `stations` and `trips` refer to
# DataFrames rather than the raw Python lists they were built from.
stations = stationsDF.alias("stations")
trips = tripsDF.alias("trips")

# Display both source tables (uncomment the printSchema calls to inspect the inferred types).
stationsDF.show()
# stationsDF.printSchema()
tripsDF.show(truncate=False)
# tripsDF.printSchema()

+-------+-----------------------+------------------+
|row_num|internal_bus_station_id|public_bus_station|
+-------+-----------------------+------------------+
|      1|                      0|         BAutogara|
|      2|                      1|        BVAutogara|
|      3|                      2|        SBAutogara|
|      4|                      3|        CJAutogara|
|      5|                      4|        MMAutogara|
|      6|                      5|        ISAutogara|
|      7|                      6|        CTAutogara|
|      8|                      7|        TMAutogara|
|      9|                      8|        BCAutogara|
|     10|                      9|        MSAutogara|
+-------+-----------------------+------------------+

+-------+------+-----------+-------------------------+---------------------------------------------------------------+
|row_num|origin|destination|internal_bus_stations_ids|triptimes                                                      |
+-------+------+---

Identify the length of each array in the `internal_bus_stations_ids` column, and the maximum length across all rows, so that a dynamic number of bus stops is supported without hardcoding which array elements are selected.

In [7]:
# Pair every station-id array with its length so the lengths can be inspected.
columns = trips.select(trips.internal_bus_stations_ids, size("internal_bus_stations_ids").alias("size"))
columns.show()
# The aggregate yields a single row; it is bound to a dedicated name instead of `max`
# so that Python's built-in max() is not shadowed for the rest of the notebook.
maxSizeRow = columns.agg({"size": "max"}).collect()[0]
# Longest array in the column = number of per-stop columns the dynamic query must produce.
maxArrLength = maxSizeRow["max(size)"]

+-------------------------+----+
|internal_bus_stations_ids|size|
+-------------------------+----+
|                [0, 2, 4]|   3|
|                [1, 8, 5]|   3|
|                [7, 2, 6]|   3|
|                [3, 9, 8]|   3|
+-------------------------+----+



Create a custom string to be passed to `spark.sql()` for retrieving and splitting all array elements into individual columns, regardless of the bus stations count.

In [8]:
tempViewName = "trips"
# createOrReplaceTempView keeps this cell re-runnable: createTempView raises once a view
# with this name already exists in the session.
trips.createOrReplaceTempView(tempViewName)

# One "<view>.internal_bus_stations_ids[i] as column_<i+1>" projection per array position,
# up to the longest array found earlier; join() places the separators between them.
queryString = ", ".join(
    f"{tempViewName}.internal_bus_stations_ids[{i}] as column_{i + 1}"
    for i in range(maxArrLength)
)

print(f"Dynamic SQL query: \n <SELECT {queryString} FROM {tempViewName}>")

Dynamic SQL query: 
 <SELECT trips.internal_bus_stations_ids[0] as column_1, trips.internal_bus_stations_ids[1] as column_2, trips.internal_bus_stations_ids[2] as column_3 FROM trips>


Split the internal_bus_stations_ids arrays into individual columns in a temporary table `tripsNameDF`.

In [9]:
# Run the generated projection: one output column per position in the station-id array.
stationIdQuery = "SELECT " + queryString + " FROM " + tempViewName
tripsNameDF = spark.sql(stationIdQuery)
tripsNameDF.show()

+--------+--------+--------+
|column_1|column_2|column_3|
+--------+--------+--------+
|       0|       2|       4|
|       1|       8|       5|
|       7|       2|       6|
|       3|       9|       8|
+--------+--------+--------+



**No longer in use.** Hardcoded array indexes PySpark SELECT and JOIN queries.

In [10]:
# tripsNameDF = trips.select(trips.internal_bus_stations_ids[0].alias("first_dest_id"),
#                           trips.internal_bus_stations_ids[1].alias("second_dest_id"),
#                           trips.internal_bus_stations_ids[2].alias("third_dest_id"))
# tripsNameDF.show()

# tripsNameDF = tripsNameDF.join(stations, tripsNameDF.first_dest_id == stations.internal_bus_station_id, how="left") \
#             .select(tripsNameDF["*"], stations["public_bus_station"].alias("first_dest"))
# tripsNameDF = tripsNameDF.join(stations, tripsNameDF.second_dest_id == stations.internal_bus_station_id, how="left") \
#             .select(tripsNameDF["*"], stations["public_bus_station"].alias("second_dest"))
# tripsNameDF = tripsNameDF.join(stations, tripsNameDF.third_dest_id == stations.internal_bus_station_id, how="left") \
#             .select(tripsNameDF["*"], stations["public_bus_station"].alias("third_dest"))
# tripsNameDF.show()          

Use the individual IDs as matching terms for the left joins against the original `stations` data set.

In [11]:
# Snapshot of the id columns before the loop starts adding the "<column>_public" columns.
columnNames = tripsNameDF.schema.names
joinedName = ""

# For every id column, left-join the stations table on that id and keep the matching
# public station name under "<column>_public".
for idColumn in columnNames:
    joinedName = f"{idColumn}_public"
    idMatchesStation = tripsNameDF[idColumn] == stations.internal_bus_station_id
    tripsNameDF = (
        tripsNameDF
        .join(stations, idMatchesStation, how="left")
        .select(tripsNameDF["*"], stations["public_bus_station"].alias(joinedName))
    )

# tripsNameDF = tripsNameDF.na.fill("")

# for name in columnNames:
#     tripsNameDF = tripsNameDF.withColumn(name, tripsNameDF[name].cast("long"))

tripsNameDF.show()

+--------+--------+--------+---------------+---------------+---------------+
|column_1|column_2|column_3|column_1_public|column_2_public|column_3_public|
+--------+--------+--------+---------------+---------------+---------------+
|       7|       2|       6|     TMAutogara|     SBAutogara|     CTAutogara|
|       1|       8|       5|     BVAutogara|     BCAutogara|     ISAutogara|
|       3|       9|       8|     CJAutogara|     MSAutogara|     BCAutogara|
|       0|       2|       4|      BAutogara|     SBAutogara|     MMAutogara|
+--------+--------+--------+---------------+---------------+---------------+



In [12]:
# tripsNameDF.printSchema()
# columnNames = tripsNameDF.schema.names
# for name in columnNames:
#     tripsNameDF = tripsNameDF.withColumn(name, tripsNameDF[name].cast("string"))

# tripsNameDF = tripsNameDF.na.fill("")

# for name in columnNames:
#     tripsNameDF = tripsNameDF.withColumn(name, tripsNameDF[name].cast("long"))

# tripsNameDF.printSchema()
# tripsNameDF.show()

Add the matching values in an array, saved in a new column called "public_bus_stops"

create arrays with null values

In [13]:
# Split the column names into the "<column>_public" name columns and the remaining id columns.
columnNames = tripsNameDF.schema.names
public = [name for name in columnNames if "public" in name]
internal = [name for name in columnNames if "public" not in name]

# Fold each group back into a single array column.
tripsNameDF = tripsNameDF.select(
    array(internal).alias("internal_bus_stations"),
    array(public).alias("public_bus_stops"),
)

tripsNameDF.show(truncate=False)

+---------------------+------------------------------------+
|internal_bus_stations|public_bus_stops                    |
+---------------------+------------------------------------+
|[7, 2, 6]            |[TMAutogara, SBAutogara, CTAutogara]|
|[1, 8, 5]            |[BVAutogara, BCAutogara, ISAutogara]|
|[3, 9, 8]            |[CJAutogara, MSAutogara, BCAutogara]|
|[0, 2, 4]            |[BAutogara, SBAutogara, MMAutogara] |
+---------------------+------------------------------------+



In [15]:
# Pair every trip's station ids and timestamps with the length of its triptimes array.
columns = trips.select(trips.internal_bus_stations_ids.alias("internal_bus_stations"), trips.triptimes, size("triptimes").alias("size"))
columns.show(truncate=False)
# The aggregate yields a single row; it is bound to a dedicated name instead of `max`
# so that Python's built-in max() is not shadowed for the rest of the notebook.
maxSizeRow = columns.agg({"size": "max"}).collect()[0]
# Longest triptimes array = number of timestamp columns the dynamic query must produce.
maxArrLength = maxSizeRow["max(size)"]

+---------------------+---------------------------------------------------------------+----+
|internal_bus_stations|triptimes                                                      |size|
+---------------------+---------------------------------------------------------------+----+
|[0, 2, 4]            |[2020-03-01 10:10:00, 2020-03-01 12:20:10, 2020-03-01 14:10:10]|3   |
|[1, 8, 5]            |[2020-03-01 08:10:00, 2020-03-01 12:20:10, 2020-03-01 15:10:10]|3   |
|[7, 2, 6]            |[2020-04-01 10:45:00, 2020-04-01 12:20:10, 2020-04-01 19:30:10]|3   |
|[3, 9, 8]            |[2020-05-01 07:10:00, 2020-05-01 12:20:10, 2020-05-02 22:10:10]|3   |
+---------------------+---------------------------------------------------------------+----+



In [16]:
tempViewName = "triptimes"
# createOrReplaceTempView keeps this cell re-runnable: createTempView raises once a view
# with this name already exists in the session.
trips.createOrReplaceTempView(tempViewName)

# The station-id array is carried through unchanged (it is used later as the join key),
# followed by one "<view>.triptimes[i] as column_<i+1>" projection per array position.
selectExpressions = [f"{tempViewName}.internal_bus_stations_ids as internal_bus_stations"]
selectExpressions += [
    f"{tempViewName}.triptimes[{i}] as column_{i + 1}"
    for i in range(maxArrLength)
]
queryString = ", ".join(selectExpressions)

print(f"Dynamic SQL query: \n <SELECT {queryString} FROM {tempViewName}>")

Dynamic SQL query: 
 <SELECT triptimes.internal_bus_stations_ids as internal_bus_stations, triptimes.triptimes[0] as column_1, triptimes.triptimes[1] as column_2, triptimes.triptimes[2] as column_3 FROM triptimes>


In [17]:
# Run the generated projection: the station-id array plus one column per timestamp position.
triptimesQuery = "SELECT " + queryString + " FROM " + tempViewName
tripsDurationDF = spark.sql(triptimesQuery)
tripsDurationDF.show()

+---------------------+-------------------+-------------------+-------------------+
|internal_bus_stations|           column_1|           column_2|           column_3|
+---------------------+-------------------+-------------------+-------------------+
|            [0, 2, 4]|2020-03-01 10:10:00|2020-03-01 12:20:10|2020-03-01 14:10:10|
|            [1, 8, 5]|2020-03-01 08:10:00|2020-03-01 12:20:10|2020-03-01 15:10:10|
|            [7, 2, 6]|2020-04-01 10:45:00|2020-04-01 12:20:10|2020-04-01 19:30:10|
|            [3, 9, 8]|2020-05-01 07:10:00|2020-05-01 12:20:10|2020-05-02 22:10:10|
+---------------------+-------------------+-------------------+-------------------+



In [18]:
# Compute each trip's total duration: last available timestamp minus the first one,
# shown first in seconds and then formatted as "<minutes> min".
# (`concat`, `col`, `when`, `lit` and Spark's `round` come from the notebook's import cell,
# so the import is not repeated here.)
columnNames = tripsDurationDF.schema.names
# Select the timestamp columns by their generated "column_<n>" names rather than by
# position: if this cell is re-run, the existing "duration" column is then ignored
# instead of being mistaken for the last timestamp.
timestampColumns = [name for name in columnNames if name.startswith("column_")]
# Position of the last timestamp column within columnNames (index 0 is the station array).
maxIndex = len(timestampColumns)

# Casting a timestamp to int gives whole seconds, so the subtraction is a duration in seconds.
startSeconds = col(timestampColumns[0]).cast("int")
tripsDurationDF = tripsDurationDF.withColumn(
    "duration", col(timestampColumns[-1]).cast("int") - startSeconds
)
tripsDurationDF.show()

# Shorter trips have nulls in the trailing timestamp columns, which makes the difference
# above null; walk backwards and use the latest timestamp that yields a non-null value.
for stopColumn in reversed(timestampColumns):
    tripsDurationDF = tripsDurationDF.withColumn(
        "duration",
        when(col("duration").isNull(), col(stopColumn).cast("int") - startSeconds)
        .otherwise(col("duration")),
    )

# Seconds -> minutes rounded to two decimals, then append the unit for display.
tripsDurationDF = tripsDurationDF.withColumn("duration", round(col("duration") / 60, 2))
tripsDurationDF = tripsDurationDF.withColumn("duration", concat(col("duration"), lit(" min")))
tripsDurationDF.show(truncate=False)

+---------------------+-------------------+-------------------+-------------------+--------+
|internal_bus_stations|           column_1|           column_2|           column_3|duration|
+---------------------+-------------------+-------------------+-------------------+--------+
|            [0, 2, 4]|2020-03-01 10:10:00|2020-03-01 12:20:10|2020-03-01 14:10:10|   14410|
|            [1, 8, 5]|2020-03-01 08:10:00|2020-03-01 12:20:10|2020-03-01 15:10:10|   25210|
|            [7, 2, 6]|2020-04-01 10:45:00|2020-04-01 12:20:10|2020-04-01 19:30:10|   31510|
|            [3, 9, 8]|2020-05-01 07:10:00|2020-05-01 12:20:10|2020-05-02 22:10:10|  140410|
+---------------------+-------------------+-------------------+-------------------+--------+

+---------------------+-------------------+-------------------+-------------------+-----------+
|internal_bus_stations|column_1           |column_2           |column_3           |duration   |
+---------------------+-------------------+-------------------+

In [19]:
# Everything between the first column (station ids) and the last one (duration)
# is a per-stop timestamp column.
columnNames = tripsDurationDF.schema.names
timestamps = columnNames[1:-1]

# Fold the timestamp columns back into a single "triptimes" array column.
tripsDurationDF = tripsDurationDF.select(
    tripsDurationDF.internal_bus_stations,
    array(timestamps).alias("triptimes"),
    tripsDurationDF.duration,
)

tripsDurationDF.show(truncate=False)

+---------------------+---------------------------------------------------------------+-----------+
|internal_bus_stations|triptimes                                                      |duration   |
+---------------------+---------------------------------------------------------------+-----------+
|[0, 2, 4]            |[2020-03-01 10:10:00, 2020-03-01 12:20:10, 2020-03-01 14:10:10]|240.17 min |
|[1, 8, 5]            |[2020-03-01 08:10:00, 2020-03-01 12:20:10, 2020-03-01 15:10:10]|420.17 min |
|[7, 2, 6]            |[2020-04-01 10:45:00, 2020-04-01 12:20:10, 2020-04-01 19:30:10]|525.17 min |
|[3, 9, 8]            |[2020-05-01 07:10:00, 2020-05-01 12:20:10, 2020-05-02 22:10:10]|2340.17 min|
+---------------------+---------------------------------------------------------------+-----------+



In [21]:
# Attach the computed duration to each trip, matching rows on the station-id array.
sameStations = trips.internal_bus_stations_ids == tripsDurationDF.internal_bus_stations
tripsWithDurationColumns = [
    trips["internal_bus_stations_ids"],
    trips["row_num"],
    trips["origin"],
    trips["destination"],
    tripsDurationDF["duration"],
]
trips = trips.join(tripsDurationDF, sameStations).select(*tripsWithDurationColumns)
trips.orderBy(["row_num"]).show(truncate=False)

+-------------------------+-------+------+-----------+-----------+
|internal_bus_stations_ids|row_num|origin|destination|duration   |
+-------------------------+-------+------+-----------+-----------+
|[0, 2, 4]                |1      |B     |MM         |240.17 min |
|[1, 8, 5]                |2      |BV    |IS         |420.17 min |
|[7, 2, 6]                |3      |TM    |CT         |525.17 min |
|[3, 9, 8]                |4      |CJ    |BC         |2340.17 min|
+-------------------------+-------+------+-----------+-----------+



In [22]:
# Attach the public stop names to each trip, matching rows on the station-id array,
# and put the columns in their final display order.
sameStations = trips.internal_bus_stations_ids == tripsNameDF.internal_bus_stations
finalColumns = [
    trips["row_num"],
    trips["origin"],
    trips["destination"],
    trips["internal_bus_stations_ids"],
    tripsNameDF["public_bus_stops"],
    trips["duration"],
]
trips = trips.join(tripsNameDF, sameStations).select(*finalColumns)
trips.orderBy(["row_num"]).show(truncate=False)

+-------+------+-----------+-------------------------+------------------------------------+-----------+
|row_num|origin|destination|internal_bus_stations_ids|public_bus_stops                    |duration   |
+-------+------+-----------+-------------------------+------------------------------------+-----------+
|1      |B     |MM         |[0, 2, 4]                |[BAutogara, SBAutogara, MMAutogara] |240.17 min |
|2      |BV    |IS         |[1, 8, 5]                |[BVAutogara, BCAutogara, ISAutogara]|420.17 min |
|3      |TM    |CT         |[7, 2, 6]                |[TMAutogara, SBAutogara, CTAutogara]|525.17 min |
|4      |CJ    |BC         |[3, 9, 8]                |[CJAutogara, MSAutogara, BCAutogara]|2340.17 min|
+-------+------+-----------+-------------------------+------------------------------------+-----------+

