Documentation Available at https://spark.apache.org/docs/latest/api/python/reference/index.html

### Entry point of Spark - SparkSession

In [1]:
from pyspark.sql import SparkSession

### Show Spark version

In [3]:
print(spark.version)

3.5.0


### Initialize SparkSession

In [2]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName("MySparkApp") \
    .getOrCreate()

### Reading CSV file using inbuilt inferSchema function - Approach 1

In [41]:
circuits_df = spark.read.csv("./data/circuits.csv", header = True, inferSchema = True)

In [42]:
circuits_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

In [43]:
circuits_df.schema

StructType([StructField('circuitId', IntegerType(), True), StructField('circuitRef', StringType(), True), StructField('name', StringType(), True), StructField('location', StringType(), True), StructField('country', StringType(), True), StructField('lat', DoubleType(), True), StructField('lng', DoubleType(), True), StructField('alt', IntegerType(), True), StructField('url', StringType(), True)])

### Reading CSV file using inbuilt inferSchema function - Approach 2

In [44]:
circuits_df = spark.read.option("header", "true") \
    .option("inferSchema", "true") \
    .csv("data/circuits.csv")

In [45]:
circuits_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

### Reading CSV file using custom schema - Approach 2

In [26]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

In [27]:
circuits_schema = StructType(
    fields=[StructField("circuitId", IntegerType(), False),
                                     StructField("circuitRef", StringType(), True),
                                     StructField("name", StringType(), True),
                                     StructField("location", StringType(), True),
                                     StructField("country", StringType(), True),
                                     StructField("lat", DoubleType(), True),
                                     StructField("lng", DoubleType(), True),
                                     StructField("alt", IntegerType(), True),
                                     StructField("url", StringType(), True)
        ]
    )

In [29]:
circuits_df = spark.read \
                    .option("header", True) \
                    .schema(circuits_schema) \
                    .csv("./data/circuits.csv")

In [30]:
circuits_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

### Reading different type of data - JSON

In [32]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DateType

In [33]:
races_schema = StructType(
    fields=[StructField("raceId", IntegerType(), False),
                                  StructField("year", IntegerType(), True),
                                  StructField("round", IntegerType(), True),
                                  StructField("circuitId", IntegerType(), True),
                                  StructField("name", StringType(), True),
                                  StructField("date", DateType(), True),
                                  StructField("time", StringType(), True),
                                  StructField("url", StringType(), True) 
])

races_df = spark.read \
                .option("header", True) \
                .schema(races_schema) \
                .csv("./data/races.csv")

In [34]:
races_df.show()

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|
|     6|2009|    6|        6|   Monaco Grand Prix|2009-05-24|12:00:00|http://en.wikiped...|
|     7|2009|    7|        5|  Turkish Grand Prix|2009-06-07|12:00:00|http://en.wikiped...|
|     8|2009|    8|        9|  British Grand Prix|2009-06-21|12:00:00|http://en.

#### Reading JSON

In [35]:
constructors_schema = "constructorId INT, constructorRef STRING, name STRING, nationality STRING, url STRING"

constructor_df = spark.read \
                    .schema(constructors_schema) \
                    .json("./data/constructors.json")

In [36]:
constructor_df.show()

+-------------+--------------+-----------+-----------+--------------------+
|constructorId|constructorRef|       name|nationality|                 url|
+-------------+--------------+-----------+-----------+--------------------+
|            1|       mclaren|    McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber| BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|   Williams|    British|http://en.wikiped...|
|            4|       renault|    Renault|     French|http://en.wikiped...|
|            5|    toro_rosso| Toro Rosso|    Italian|http://en.wikiped...|
|            6|       ferrari|    Ferrari|    Italian|http://en.wikiped...|
|            7|        toyota|     Toyota|   Japanese|http://en.wikiped...|
|            8|   super_aguri|Super Aguri|   Japanese|http://en.wikiped...|
|            9|      red_bull|   Red Bull|   Austrian|http://en.wikiped...|
|           10|   force_india|Force India|     Indian|http://en.wikiped...|
|           

In [37]:
constructor_df = spark.read.json("./data/constructors.json")

In [38]:
constructor_df.show()

+-------------+--------------+-----------+-----------+--------------------+
|constructorId|constructorRef|       name|nationality|                 url|
+-------------+--------------+-----------+-----------+--------------------+
|            1|       mclaren|    McLaren|    British|http://en.wikiped...|
|            2|    bmw_sauber| BMW Sauber|     German|http://en.wikiped...|
|            3|      williams|   Williams|    British|http://en.wikiped...|
|            4|       renault|    Renault|     French|http://en.wikiped...|
|            5|    toro_rosso| Toro Rosso|    Italian|http://en.wikiped...|
|            6|       ferrari|    Ferrari|    Italian|http://en.wikiped...|
|            7|        toyota|     Toyota|   Japanese|http://en.wikiped...|
|            8|   super_aguri|Super Aguri|   Japanese|http://en.wikiped...|
|            9|      red_bull|   Red Bull|   Austrian|http://en.wikiped...|
|           10|   force_india|Force India|     Indian|http://en.wikiped...|
|           

In [40]:
constructor_df.schema

StructType([StructField('constructorId', LongType(), True), StructField('constructorRef', StringType(), True), StructField('name', StringType(), True), StructField('nationality', StringType(), True), StructField('url', StringType(), True)])

## Selection of Column

In [49]:
circuits_df.show(n=5)

+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|circuitId| circuitRef|                name|    location|  country|     lat|    lng|alt|                 url|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
|        1|albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|144.968| 10|http://en.wikiped...|
|        2|     sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|101.738| 18|http://en.wikiped...|
|        3|    bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|50.5106|  7|http://en.wikiped...|
|        4|  catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|2.26111|109|http://en.wikiped...|
|        5|   istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517| 29.405|130|http://en.wikiped...|
+---------+-----------+--------------------+------------+---------+--------+-------+---+--------------------+
only showi

In [50]:
circuits_df_selected = df.select("circuitId", "circuitRef")

In [51]:
circuits_df_selected.show()

+---------+--------------+
|circuitId|    circuitRef|
+---------+--------------+
|        1|   albert_park|
|        2|        sepang|
|        3|       bahrain|
|        4|     catalunya|
|        5|      istanbul|
|        6|        monaco|
|        7|    villeneuve|
|        8|   magny_cours|
|        9|   silverstone|
|       10|hockenheimring|
|       11|   hungaroring|
|       12|      valencia|
|       13|           spa|
|       14|         monza|
|       15|    marina_bay|
|       16|          fuji|
|       17|      shanghai|
|       18|    interlagos|
|       19|  indianapolis|
|       20|   nurburgring|
+---------+--------------+
only showing top 20 rows



## Renaming the columns of the Dataframe

In [52]:
circuits_renamed_df = circuits_df.withColumnRenamed("circuitId", "circuit_id") \
.withColumnRenamed("circuitRef", "circuit_ref") \
.withColumnRenamed("lat", "latitude") \
.withColumnRenamed("lng", "longitude") \
.withColumnRenamed("alt", "altitude") 

In [53]:
circuits_renamed_df.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|                 url|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|http://en.wikiped...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|http://en.wikiped...|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|http://en.wikiped...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109|http://en.wikiped...|
|         5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|     130|http://en.wikiped...|
|         6|        monaco|   Ci

### Adding new Column - withColumn

In [59]:
from pyspark.sql.functions import current_timestamp, col

In [60]:
circuits_time_df = circuits_renamed_df.withColumn("ingestion_date", current_timestamp()) 

In [61]:
circuits_time_df.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+--------------------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|                 url|      ingestion_date|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+--------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|http://en.wikiped...|2024-07-26 11:10:...|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|http://en.wikiped...|2024-07-26 11:10:...|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|http://en.wikiped...|2024-07-26 11:10:...|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|     109|http://en.wikiped...|2024-07-26 11:10:...|
|     

In [62]:
df_with_new_col = circuits_time_df.withColumn("new_column", col("altitude") * 2)

In [63]:
df_with_new_col.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+--------------------+----------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|                 url|      ingestion_date|new_column|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+--------------------+----------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|http://en.wikiped...|2024-07-26 11:10:...|        20|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|http://en.wikiped...|2024-07-26 11:10:...|        36|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|       7|http://en.wikiped...|2024-07-26 11:10:...|        14|
|         4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  

In [68]:
df_with_expr = df_with_new_col.select(
    "*",
    (col("latitude") * 1.1).alias("latitude_adjusted"),
    (col("longitude") + 10).alias("longitude_adjusted")
)

In [69]:
df_with_expr.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+--------------------+-------------------+-------------------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|                 url|      ingestion_date|  latitude_adjusted| longitude_adjusted|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+--------------------+-------------------+-------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|http://en.wikiped...|2024-07-26 11:15:...|          -41.63467|            154.968|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|http://en.wikiped...|2024-07-26 11:15:...|           3.036913|            111.738|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|

### Handling Null Data

#### Checking the Null data

##### Checking for Null Values in All Columns

In [73]:
from pyspark.sql.functions import col, sum

# Count nulls in each column
null_counts = df_with_expr.select([sum(col(c).isNull().cast("int")).alias(c) for c in df_with_expr.columns])
null_counts.show()

+----------+-----------+----+--------+-------+--------+---------+--------+---+--------------+-----------------+------------------+
|circuit_id|circuit_ref|name|location|country|latitude|longitude|altitude|url|ingestion_date|latitude_adjusted|longitude_adjusted|
+----------+-----------+----+--------+-------+--------+---------+--------+---+--------------+-----------------+------------------+
|         0|          0|   0|       0|      0|       0|        0|       0|  0|             0|                0|                 0|
+----------+-----------+----+--------+-------+--------+---------+--------+---+--------------+-----------------+------------------+



Checking for Null Values in Specific Columns

In [74]:
# Filter rows where specific columns are null
df_nulls_in_columns = df_with_expr.filter(col("latitude").isNull() | col("longitude").isNull())
df_nulls_in_columns.show()

+----------+-----------+----+--------+-------+--------+---------+--------+---+--------------+-----------------+------------------+
|circuit_id|circuit_ref|name|location|country|latitude|longitude|altitude|url|ingestion_date|latitude_adjusted|longitude_adjusted|
+----------+-----------+----+--------+-------+--------+---------+--------+---+--------------+-----------------+------------------+
+----------+-----------+----+--------+-------+--------+---------+--------+---+--------------+-----------------+------------------+



### Filling the Null Values

In [75]:
# Replace null values with a specific value in all columns
df_filled = df_with_expr.fillna("unknown")

#### Droping the Rows with the null values

In [76]:
# Drop rows where any column is null
df_dropped_any = df_filled.dropna()

# Drop rows where specific columns are null
df_dropped_some = df_filled.dropna(subset=["name", "location"])
df_dropped_some.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+--------------------+-------------------+-------------------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|                 url|      ingestion_date|  latitude_adjusted| longitude_adjusted|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+--------------------+-------------------+-------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|http://en.wikiped...|2024-07-26 13:39:...|          -41.63467|            154.968|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|http://en.wikiped...|2024-07-26 13:39:...|           3.036913|            111.738|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|

#### Filtering Out Bad Data

In [77]:
df_filtered = df_filled.filter(
    (col("latitude").isNotNull()) &
    (col("longitude").isNotNull()) &
    (col("altitude") >= 0)
)
df_filtered.show()

+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+--------------------+-------------------+-------------------+
|circuit_id|   circuit_ref|                name|    location|  country|latitude|longitude|altitude|                 url|      ingestion_date|  latitude_adjusted| longitude_adjusted|
+----------+--------------+--------------------+------------+---------+--------+---------+--------+--------------------+--------------------+-------------------+-------------------+
|         1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968|      10|http://en.wikiped...|2024-07-26 13:40:...|          -41.63467|            154.968|
|         2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738|      18|http://en.wikiped...|2024-07-26 13:40:...|           3.036913|            111.738|
|         3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|

## filter()

In [79]:
circuits_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

In [80]:
filtered_df = circuits_df.filter("country = 'Australia'")
filtered_df.show()

+---------+-----------+--------------------+---------+---------+--------+-------+---+--------------------+
|circuitId| circuitRef|                name| location|  country|     lat|    lng|alt|                 url|
+---------+-----------+--------------------+---------+---------+--------+-------+---+--------------------+
|        1|albert_park|Albert Park Grand...|Melbourne|Australia|-37.8497|144.968| 10|http://en.wikiped...|
|       29|   adelaide|Adelaide Street C...| Adelaide|Australia|-34.9272|138.617| 58|http://en.wikiped...|
+---------+-----------+--------------------+---------+---------+--------+-------+---+--------------------+



In [81]:
filtered_df = circuits_df.where("lat > 30")
filtered_df.show()

+---------+--------------+--------------------+--------------------+--------+-------+---------+---+--------------------+
|circuitId|    circuitRef|                name|            location| country|    lat|      lng|alt|                 url|
+---------+--------------+--------------------+--------------------+--------+-------+---------+---+--------------------+
|        4|     catalunya|Circuit de Barcel...|            Montmeló|   Spain|  41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|            Istanbul|  Turkey|40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco|         Monte-Carlo|  Monaco|43.7347|  7.42056|  7|http://en.wikiped...|
|        7|    villeneuve|Circuit Gilles Vi...|            Montreal|  Canada|   45.5| -73.5228| 13|http://en.wikiped...|
|        8|   magny_cours|Circuit de Nevers...|         Magny Cours|  France|46.8642|  3.16361|228|http://en.wikiped...|
|        9|   silverstone| Silve

In [82]:
filtered_df = circuits_df.filter(col("country").isin(["Turkey", "Malaysia"]))
filtered_df.show()

+---------+----------+--------------------+------------+--------+-------+-------+---+--------------------+
|circuitId|circuitRef|                name|    location| country|    lat|    lng|alt|                 url|
+---------+----------+--------------------+------------+--------+-------+-------+---+--------------------+
|        2|    sepang|Sepang Internatio...|Kuala Lumpur|Malaysia|2.76083|101.738| 18|http://en.wikiped...|
|        5|  istanbul|       Istanbul Park|    Istanbul|  Turkey|40.9517| 29.405|130|http://en.wikiped...|
+---------+----------+--------------------+------------+--------+-------+-------+---+--------------------+



#### Multiple filter()

In [83]:
filtered_df = circuits_df.filter(
    (col("country").isin(["Australia", "Turkey"])) & 
    (col("alt").between(10, 100))
)
filtered_df.show()

+---------+-----------+--------------------+---------+---------+--------+-------+---+--------------------+
|circuitId| circuitRef|                name| location|  country|     lat|    lng|alt|                 url|
+---------+-----------+--------------------+---------+---------+--------+-------+---+--------------------+
|        1|albert_park|Albert Park Grand...|Melbourne|Australia|-37.8497|144.968| 10|http://en.wikiped...|
|       29|   adelaide|Adelaide Street C...| Adelaide|Australia|-34.9272|138.617| 58|http://en.wikiped...|
+---------+-----------+--------------------+---------+---------+--------+-------+---+--------------------+



## Aggregating Data

#### Count the Number of Circuits by Country

In [91]:
from pyspark.sql.functions import count, avg, max, min

count_by_country = circuits_df.groupBy("country").agg(count("circuitId").alias("num_circuits"))
count_by_country.show()

+------------+------------+
|     country|num_circuits|
+------------+------------+
|      Russia|           1|
|      Sweden|           1|
|    Malaysia|           1|
|   Singapore|           1|
|      Turkey|           1|
|     Germany|           3|
|      France|           7|
|   Argentina|           1|
|     Belgium|           3|
|       China|           1|
|       India|           1|
|       Italy|           4|
|       Spain|           6|
|      Monaco|           1|
|     Morocco|           1|
|         USA|          11|
|      Mexico|           1|
|  Azerbaijan|           1|
|          UK|           4|
|Saudi Arabia|           1|
+------------+------------+
only showing top 20 rows



#### Average Altitude of Circuits by Country

In [88]:
average_altitude_by_country = circuits_df.groupBy("country").agg(avg("alt").alias("average_altitude"))
average_altitude_by_country.show()


+------------+------------------+
|     country|  average_altitude|
+------------+------------------+
|      Russia|               2.0|
|      Sweden|             153.0|
|    Malaysia|              18.0|
|   Singapore|              18.0|
|      Turkey|             130.0|
|     Germany|244.66666666666666|
|      France|             310.0|
|   Argentina|               8.0|
|     Belgium|             192.0|
|       China|               5.0|
|       India|             194.0|
|       Italy|            145.75|
|       Spain|153.83333333333334|
|      Monaco|               7.0|
|     Morocco|              19.0|
|         USA|             243.0|
|      Mexico|            2227.0|
|  Azerbaijan|              -7.0|
|          UK|             101.5|
|Saudi Arabia|              15.0|
+------------+------------------+
only showing top 20 rows



#### Maximum and Minimum Latitude for Each Country

In [92]:
max_min_lat_by_country = circuits_df.groupBy("country").agg(
    max("lat").alias("max_latitude"),
    min("lat").alias("min_latitude")
)
max_min_lat_by_country.show()

+------------+------------+------------+
|     country|max_latitude|min_latitude|
+------------+------------+------------+
|      Russia|     43.4057|     43.4057|
|      Sweden|     57.2653|     57.2653|
|    Malaysia|     2.76083|     2.76083|
|   Singapore|      1.2914|      1.2914|
|      Turkey|     40.9517|     40.9517|
|     Germany|     52.4806|     49.3278|
|      France|     49.3306|     43.2506|
|   Argentina|    -34.6943|    -34.6943|
|     Belgium|     50.9894|     50.4372|
|       China|     31.3389|     31.3389|
|       India|     28.3487|     28.3487|
|       Italy|     45.6156|      42.475|
|       Spain|       41.57|     36.7083|
|      Monaco|     43.7347|     43.7347|
|     Morocco|     33.5786|     33.5786|
|         USA|     42.3369|     27.4547|
|      Mexico|     19.4042|     19.4042|
|  Azerbaijan|     40.3725|     40.3725|
|          UK|     53.4769|     51.3569|
|Saudi Arabia|     21.5433|     21.5433|
+------------+------------+------------+
only showing top

#### Count of Circuits and Average Altitude

In [93]:
count_and_avg_altitude_by_country = circuits_df.groupBy("country").agg(
    count("circuitId").alias("num_circuits"),
    avg("alt").alias("average_altitude")
)
count_and_avg_altitude_by_country.show()

+------------+------------+------------------+
|     country|num_circuits|  average_altitude|
+------------+------------+------------------+
|      Russia|           1|               2.0|
|      Sweden|           1|             153.0|
|    Malaysia|           1|              18.0|
|   Singapore|           1|              18.0|
|      Turkey|           1|             130.0|
|     Germany|           3|244.66666666666666|
|      France|           7|             310.0|
|   Argentina|           1|               8.0|
|     Belgium|           3|             192.0|
|       China|           1|               5.0|
|       India|           1|             194.0|
|       Italy|           4|            145.75|
|       Spain|           6|153.83333333333334|
|      Monaco|           1|               7.0|
|     Morocco|           1|              19.0|
|         USA|          11|             243.0|
|      Mexico|           1|            2227.0|
|  Azerbaijan|           1|              -7.0|
|          UK

## Join in pyspark

In [94]:
circuits_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

In [95]:
races_df.show()

+------+----+-----+---------+--------------------+----------+--------+--------------------+
|raceId|year|round|circuitId|                name|      date|    time|                 url|
+------+----+-----+---------+--------------------+----------+--------+--------------------+
|     1|2009|    1|        1|Australian Grand ...|2009-03-29|06:00:00|http://en.wikiped...|
|     2|2009|    2|        2|Malaysian Grand Prix|2009-04-05|09:00:00|http://en.wikiped...|
|     3|2009|    3|       17|  Chinese Grand Prix|2009-04-19|07:00:00|http://en.wikiped...|
|     4|2009|    4|        3|  Bahrain Grand Prix|2009-04-26|12:00:00|http://en.wikiped...|
|     5|2009|    5|        4|  Spanish Grand Prix|2009-05-10|12:00:00|http://en.wikiped...|
|     6|2009|    6|        6|   Monaco Grand Prix|2009-05-24|12:00:00|http://en.wikiped...|
|     7|2009|    7|        5|  Turkish Grand Prix|2009-06-07|12:00:00|http://en.wikiped...|
|     8|2009|    8|        9|  British Grand Prix|2009-06-21|12:00:00|http://en.

## Inner Join

In [96]:
inner_join_df = circuits_df.join(races_df, on="circuitId", how="inner")
inner_join_df.show(truncate=False)

+---------+-----------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+------+----+-----+---------------------+----------+--------+-------------------------------------------------------+
|circuitId|circuitRef |name                          |location    |country  |lat     |lng      |alt|url                                                              |raceId|year|round|name                 |date      |time    |url                                                    |
+---------+-----------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+------+----+-----+---------------------+----------+--------+-------------------------------------------------------+
|1        |albert_park|Albert Park Grand Prix Circuit|Melbourne   |Australia|-37.8497|144.968  |10 |http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_C

## Left Join

In [97]:
left_join_df = circuits_df.join(races_df, on="circuitId", how="left")
left_join_df.show(truncate=False)

+---------+-----------+------------------------------+---------+---------+--------+-------+---+---------------------------------------------------------+------+----+-----+---------------------+----------+--------+--------------------------------------------------------+
|circuitId|circuitRef |name                          |location |country  |lat     |lng    |alt|url                                                      |raceId|year|round|name                 |date      |time    |url                                                     |
+---------+-----------+------------------------------+---------+---------+--------+-------+---+---------------------------------------------------------+------+----+-----+---------------------+----------+--------+--------------------------------------------------------+
|1        |albert_park|Albert Park Grand Prix Circuit|Melbourne|Australia|-37.8497|144.968|10 |http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit|1051  |2021|21   |Australian Grand

## Right Join

In [98]:
right_join_df = circuits_df.join(races_df, on="circuitId", how="right")
right_join_df.show(truncate=False)

+---------+-----------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+------+----+-----+---------------------+----------+--------+-------------------------------------------------------+
|circuitId|circuitRef |name                          |location    |country  |lat     |lng      |alt|url                                                              |raceId|year|round|name                 |date      |time    |url                                                    |
+---------+-----------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+------+----+-----+---------------------+----------+--------+-------------------------------------------------------+
|1        |albert_park|Albert Park Grand Prix Circuit|Melbourne   |Australia|-37.8497|144.968  |10 |http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_C

## Full Outer Join

In [99]:
full_outer_join_df = circuits_df.join(races_df, on="circuitId", how="outer")
full_outer_join_df.show(truncate=False)

+---------+-----------+------------------------------+---------+---------+--------+-------+---+---------------------------------------------------------+------+----+-----+---------------------+----------+--------+--------------------------------------------------------+
|circuitId|circuitRef |name                          |location |country  |lat     |lng    |alt|url                                                      |raceId|year|round|name                 |date      |time    |url                                                     |
+---------+-----------+------------------------------+---------+---------+--------+-------+---+---------------------------------------------------------+------+----+-----+---------------------+----------+--------+--------------------------------------------------------+
|1        |albert_park|Albert Park Grand Prix Circuit|Melbourne|Australia|-37.8497|144.968|10 |http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit|1     |2009|1    |Australian Grand

## User Defined functions (UDF)

In [101]:
circuits_df.show()

+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|circuitId|    circuitRef|                name|    location|  country|     lat|      lng|alt|                 url|
+---------+--------------+--------------------+------------+---------+--------+---------+---+--------------------+
|        1|   albert_park|Albert Park Grand...|   Melbourne|Australia|-37.8497|  144.968| 10|http://en.wikiped...|
|        2|        sepang|Sepang Internatio...|Kuala Lumpur| Malaysia| 2.76083|  101.738| 18|http://en.wikiped...|
|        3|       bahrain|Bahrain Internati...|      Sakhir|  Bahrain| 26.0325|  50.5106|  7|http://en.wikiped...|
|        4|     catalunya|Circuit de Barcel...|    Montmeló|    Spain|   41.57|  2.26111|109|http://en.wikiped...|
|        5|      istanbul|       Istanbul Park|    Istanbul|   Turkey| 40.9517|   29.405|130|http://en.wikiped...|
|        6|        monaco|   Circuit de Monaco| Monte-Carlo|   Monaco| 43.7347| 

### UDF to Categorize Altitude

In [102]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def categorize_altitude(altitude):
    if altitude < 50:
        return "Low"
    elif 50 <= altitude <= 100:
        return "Medium"
    else:
        return "High"

# Register the UDF
categorize_altitude_udf = udf(categorize_altitude, StringType())

circuits_with_altitude_category_df = circuits_df.withColumn(
    "altitude_category", categorize_altitude_udf(col("alt"))
)

circuits_with_altitude_category_df.show(truncate=False)


+---------+--------------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+-----------------+
|circuitId|circuitRef    |name                          |location    |country  |lat     |lng      |alt|url                                                              |altitude_category|
+---------+--------------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+-----------------+
|1        |albert_park   |Albert Park Grand Prix Circuit|Melbourne   |Australia|-37.8497|144.968  |10 |http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit        |Low              |
|2        |sepang        |Sepang International Circuit  |Kuala Lumpur|Malaysia |2.76083 |101.738  |18 |http://en.wikipedia.org/wiki/Sepang_International_Circuit        |Low              |
|3        |bahrain       |Bahrain International Circuit |Sak

## Window Functions

In [103]:
from pyspark.sql.functions import col, row_number, rank, dense_rank, lag, lead, sum
from pyspark.sql.window import Window

#### Calculate the total number of races for each year using a window function.

In [104]:
# Define the window specification
window_spec_races = Window.partitionBy("year")

total_races_per_year_df = races_df.withColumn("total_races", sum("raceId").over(window_spec_races))

total_races_per_year_df.show(truncate=False)


+------+----+-----+---------+------------------+----------+----+----------------------------------------------------+-----------+
|raceId|year|round|circuitId|name              |date      |time|url                                                 |total_races|
+------+----+-----+---------+------------------+----------+----+----------------------------------------------------+-----------+
|833   |1950|1    |9        |British Grand Prix|1950-05-13|\N  |http://en.wikipedia.org/wiki/1950_British_Grand_Prix|5852       |
|834   |1950|2    |6        |Monaco Grand Prix |1950-05-21|\N  |http://en.wikipedia.org/wiki/1950_Monaco_Grand_Prix |5852       |
|835   |1950|3    |19       |Indianapolis 500  |1950-05-30|\N  |http://en.wikipedia.org/wiki/1950_Indianapolis_500  |5852       |
|836   |1950|4    |66       |Swiss Grand Prix  |1950-06-04|\N  |http://en.wikipedia.org/wiki/1950_Swiss_Grand_Prix  |5852       |
|837   |1950|5    |13       |Belgian Grand Prix|1950-06-18|\N  |http://en.wikipedia.org/wi

#### rank()

In [105]:
# Define the window specification
window_spec_rank = Window.partitionBy("year").orderBy("round")

# Apply the rank function
races_with_rank_df = races_df.withColumn("rank", rank().over(window_spec_rank))

races_with_rank_df.show(truncate=False)

+------+----+-----+---------+------------------+----------+----+----------------------------------------------------+----+
|raceId|year|round|circuitId|name              |date      |time|url                                                 |rank|
+------+----+-----+---------+------------------+----------+----+----------------------------------------------------+----+
|833   |1950|1    |9        |British Grand Prix|1950-05-13|\N  |http://en.wikipedia.org/wiki/1950_British_Grand_Prix|1   |
|834   |1950|2    |6        |Monaco Grand Prix |1950-05-21|\N  |http://en.wikipedia.org/wiki/1950_Monaco_Grand_Prix |2   |
|835   |1950|3    |19       |Indianapolis 500  |1950-05-30|\N  |http://en.wikipedia.org/wiki/1950_Indianapolis_500  |3   |
|836   |1950|4    |66       |Swiss Grand Prix  |1950-06-04|\N  |http://en.wikipedia.org/wiki/1950_Swiss_Grand_Prix  |4   |
|837   |1950|5    |13       |Belgian Grand Prix|1950-06-18|\N  |http://en.wikipedia.org/wiki/1950_Belgian_Grand_Prix|5   |
|838   |1950|6  

#### dense_rank()

In [107]:
# Define the window specification
window_spec_dense_rank = Window.partitionBy("year").orderBy("round")

# Apply the dense_rank function
races_with_dense_rank_df = races_df.withColumn("dense_rank", dense_rank().over(window_spec_dense_rank))

races_with_dense_rank_df.show(truncate=False)


+------+----+-----+---------+------------------+----------+----+----------------------------------------------------+----------+
|raceId|year|round|circuitId|name              |date      |time|url                                                 |dense_rank|
+------+----+-----+---------+------------------+----------+----+----------------------------------------------------+----------+
|833   |1950|1    |9        |British Grand Prix|1950-05-13|\N  |http://en.wikipedia.org/wiki/1950_British_Grand_Prix|1         |
|834   |1950|2    |6        |Monaco Grand Prix |1950-05-21|\N  |http://en.wikipedia.org/wiki/1950_Monaco_Grand_Prix |2         |
|835   |1950|3    |19       |Indianapolis 500  |1950-05-30|\N  |http://en.wikipedia.org/wiki/1950_Indianapolis_500  |3         |
|836   |1950|4    |66       |Swiss Grand Prix  |1950-06-04|\N  |http://en.wikipedia.org/wiki/1950_Swiss_Grand_Prix  |4         |
|837   |1950|5    |13       |Belgian Grand Prix|1950-06-18|\N  |http://en.wikipedia.org/wiki/1950

## SQL queries in pyspark

In [108]:
# Register DataFrames as SQL temporary views
circuits_df.createOrReplaceTempView("circuits")
races_df.createOrReplaceTempView("races")

#### Query all columns from the circuits table.

In [109]:
# Execute SQL query
simple_query_df = spark.sql("SELECT * FROM circuits")

# Show the result
simple_query_df.show(truncate=False)

+---------+--------------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+
|circuitId|circuitRef    |name                          |location    |country  |lat     |lng      |alt|url                                                              |
+---------+--------------+------------------------------+------------+---------+--------+---------+---+-----------------------------------------------------------------+
|1        |albert_park   |Albert Park Grand Prix Circuit|Melbourne   |Australia|-37.8497|144.968  |10 |http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit        |
|2        |sepang        |Sepang International Circuit  |Kuala Lumpur|Malaysia |2.76083 |101.738  |18 |http://en.wikipedia.org/wiki/Sepang_International_Circuit        |
|3        |bahrain       |Bahrain International Circuit |Sakhir      |Bahrain  |26.0325 |50.5106  |7  |http://en.wikipedia.org/wiki/Bahrain_Internatio

#### Find circuits located in "Australia".

In [110]:
# Execute SQL query with filtering
australia_circuits_df = spark.sql("SELECT * FROM circuits WHERE country = 'Australia'")

# Show the result
australia_circuits_df.show(truncate=False)


+---------+-----------+------------------------------+---------+---------+--------+-------+---+---------------------------------------------------------+
|circuitId|circuitRef |name                          |location |country  |lat     |lng    |alt|url                                                      |
+---------+-----------+------------------------------+---------+---------+--------+-------+---+---------------------------------------------------------+
|1        |albert_park|Albert Park Grand Prix Circuit|Melbourne|Australia|-37.8497|144.968|10 |http://en.wikipedia.org/wiki/Melbourne_Grand_Prix_Circuit|
|29       |adelaide   |Adelaide Street Circuit       |Adelaide |Australia|-34.9272|138.617|58 |http://en.wikipedia.org/wiki/Adelaide_Street_Circuit     |
+---------+-----------+------------------------------+---------+---------+--------+-------+---+---------------------------------------------------------+



####  Joining Tables

In [111]:
# Execute SQL query with join
race_circuit_info_df = spark.sql("""
    SELECT r.name AS race_name, c.name AS circuit_name
    FROM races r
    JOIN circuits c ON r.circuitId = c.circuitId
""")

# Show the result
race_circuit_info_df.show(truncate=False)

+---------------------+------------------------------+
|race_name            |circuit_name                  |
+---------------------+------------------------------+
|Australian Grand Prix|Albert Park Grand Prix Circuit|
|Malaysian Grand Prix |Sepang International Circuit  |
|Chinese Grand Prix   |Shanghai International Circuit|
|Bahrain Grand Prix   |Bahrain International Circuit |
|Spanish Grand Prix   |Circuit de Barcelona-Catalunya|
|Monaco Grand Prix    |Circuit de Monaco             |
|Turkish Grand Prix   |Istanbul Park                 |
|British Grand Prix   |Silverstone Circuit           |
|German Grand Prix    |Nürburgring                   |
|Hungarian Grand Prix |Hungaroring                   |
|European Grand Prix  |Valencia Street Circuit       |
|Belgian Grand Prix   |Circuit de Spa-Francorchamps  |
|Italian Grand Prix   |Autodromo Nazionale di Monza  |
|Singapore Grand Prix |Marina Bay Street Circuit     |
|Japanese Grand Prix  |Suzuka Circuit                |
|Brazilian

#### Aggregation and Grouping

In [112]:
# Execute SQL query with aggregation
race_count_per_circuit_df = spark.sql("""
    SELECT c.name AS circuit_name, COUNT(r.raceId) AS num_races
    FROM races r
    JOIN circuits c ON r.circuitId = c.circuitId
    GROUP BY c.name
""")

# Show the result
race_count_per_circuit_df.show(truncate=False)


+------------------------------+---------+
|circuit_name                  |num_races|
+------------------------------+---------+
|Fair Park                     |1        |
|Scandinavian Raceway          |6        |
|Istanbul Park                 |8        |
|Albert Park Grand Prix Circuit|25       |
|Circuito da Boavista          |2        |
|Circuit Gilles Villeneuve     |41       |
|Adelaide Street Circuit       |11       |
|Korean International Circuit  |4        |
|Suzuka Circuit                |32       |
|Buddh International Circuit   |3        |
|Autodromo Nazionale di Monza  |71       |
|Baku City Circuit             |5        |
|Silverstone Circuit           |56       |
|A1-Ring                       |25       |
|Yas Marina Circuit            |13       |
|Circuit Bremgarten            |5        |
|Rouen-Les-Essarts             |5        |
|Circuit de Barcelona-Catalunya|31       |
|Circuit de Spa-Francorchamps  |54       |
|Autódromo José Carlos Pace    |38       |
+----------

#### Ordering Results

In [113]:
# Execute SQL query with ordering
races_ordered_by_date_df = spark.sql("""
    SELECT * FROM races
    ORDER BY date
""")

# Show the result
races_ordered_by_date_df.show(truncate=False)


+------+----+-----+---------+------------------+----------+----+----------------------------------------------------+
|raceId|year|round|circuitId|name              |date      |time|url                                                 |
+------+----+-----+---------+------------------+----------+----+----------------------------------------------------+
|833   |1950|1    |9        |British Grand Prix|1950-05-13|\N  |http://en.wikipedia.org/wiki/1950_British_Grand_Prix|
|834   |1950|2    |6        |Monaco Grand Prix |1950-05-21|\N  |http://en.wikipedia.org/wiki/1950_Monaco_Grand_Prix |
|835   |1950|3    |19       |Indianapolis 500  |1950-05-30|\N  |http://en.wikipedia.org/wiki/1950_Indianapolis_500  |
|836   |1950|4    |66       |Swiss Grand Prix  |1950-06-04|\N  |http://en.wikipedia.org/wiki/1950_Swiss_Grand_Prix  |
|837   |1950|5    |13       |Belgian Grand Prix|1950-06-18|\N  |http://en.wikipedia.org/wiki/1950_Belgian_Grand_Prix|
|838   |1950|6    |55       |French Grand Prix |1950-07-