In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
spark = SparkSession.builder.config("spark.jars", "/Drivers/SQL_Sever/jdbc/postgresql-42.7.3.jar")\
.getOrCreate()

In [3]:
matches=spark.read.format('csv') \
    .option('header','True') \
    .option('inferschema','True') \
    .load('../data/matches.csv')

In [4]:
matches.show(3)

+-------+----+----------+-------------------+----------+----+------------+-------------+----------------+----------+-------------+--------------+
| Season|Tour|      Date|               Time|  Opponent|HoAw|ArsenalScore|OpponentScore|         Stadium|Attendance|        Coach|       Referee|
+-------+----+----------+-------------------+----------+----+------------+-------------+----------------+----------+-------------+--------------+
|2017/18|   1|2017-08-11|2025-03-27 20:45:00| Leicester|home|           4|            3|Emirates Stadium|     59387|Arsène Wenger|     Mike Dean|
|2017/18|   2|2017-08-19|2025-03-27 18:30:00|Stoke City|away|           0|            1|  bet365 Stadium|     29459|Arsène Wenger|Andre Marriner|
|2017/18|   3|2017-08-27|2025-03-27 17:00:00| Liverpool|away|           0|            4|         Anfield|     53206|Arsène Wenger|  Craig Pawson|
+-------+----+----------+-------------------+----------+----+------------+-------------+----------------+----------+--------

In [5]:
matches.schema

StructType([StructField('Season', StringType(), True), StructField('Tour', IntegerType(), True), StructField('Date', DateType(), True), StructField('Time', TimestampType(), True), StructField('Opponent', StringType(), True), StructField('HoAw', StringType(), True), StructField('ArsenalScore', IntegerType(), True), StructField('OpponentScore', IntegerType(), True), StructField('Stadium', StringType(), True), StructField('Attendance', IntegerType(), True), StructField('Coach', StringType(), True), StructField('Referee', StringType(), True)])

In [6]:
matches.createOrReplaceTempView("Matches")

In [7]:
distinct_matches=spark.sql('''
    select count(distinct Date) 
    from Matches

''').show()

+--------------------+
|count(DISTINCT Date)|
+--------------------+
|                 214|
+--------------------+



In [8]:
Matches=spark.sql("""
    select count(Date)
    from Matches
""").show()

+-----------+
|count(Date)|
+-----------+
|        214|
+-----------+



In [9]:
matches.columns

['Season',
 'Tour',
 'Date',
 'Time',
 'Opponent',
 'HoAw',
 'ArsenalScore',
 'OpponentScore',
 'Stadium',
 'Attendance',
 'Coach',
 'Referee']

In [10]:
DimMatch= matches.withColumn("MatchID", monotonically_increasing_id())

In [11]:
DimMatch.columns

['Season',
 'Tour',
 'Date',
 'Time',
 'Opponent',
 'HoAw',
 'ArsenalScore',
 'OpponentScore',
 'Stadium',
 'Attendance',
 'Coach',
 'Referee',
 'MatchID']

In [12]:
DimMatch.write.csv('../data/DimMatches', header=True)

#### Loading the DimMatch for DWH Schema in ArsenalFC Database in Post

In [12]:
DimMatch=spark.read.csv('../data/DimMatches/DimMatches.csv',header=True)

In [14]:
DimMatch.write.format("jdbc") \
    .option("url", "jdbc:postgresql://postgres:5432/mydatabase") \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "dwh.DimArsenalMatches") \
    .option("user", "postgres") \
    .option("password", "postgres") \
    .mode("overwrite") \
    .save()

In [15]:
from pyspark.sql.functions import to_date,date_format

In [16]:
DimMatch.columns

['Season',
 'Tour',
 'Date',
 'Time',
 'Opponent',
 'HoAw',
 'ArsenalScore',
 'OpponentScore',
 'Stadium',
 'Attendance',
 'Coach',
 'Referee',
 'MatchID']

In [17]:
DimMatch=DimMatch.withColumn("FormattedDate", date_format(to_date("Date", "yyyy-MM-dd"), "yyyy-MM-dd"))

In [18]:
DimMatch.columns

['Season',
 'Tour',
 'Date',
 'Time',
 'Opponent',
 'HoAw',
 'ArsenalScore',
 'OpponentScore',
 'Stadium',
 'Attendance',
 'Coach',
 'Referee',
 'MatchID',
 'FormattedDate']

In [19]:
DimMatch.createOrReplaceTempView("dimmatch")

In [20]:
spark.sql("""
    select distinct Date
    from dimmatch

""").show()

+----------+
|      Date|
+----------+
|2017-12-22|
|2020-09-12|
|2020-06-20|
|2018-12-22|
|2017-10-01|
|2019-09-22|
|2018-09-15|
|2022-02-19|
|2020-11-29|
|2018-12-16|
|2020-01-01|
|2022-04-04|
|2018-10-07|
|2021-12-06|
|2020-12-16|
|2022-05-16|
|2019-10-06|
|2018-11-11|
|2019-04-01|
|2019-12-09|
+----------+
only showing top 20 rows



In [21]:
spark.sql("""
    select distinct FormattedDate
    from dimmatch

""").show()

+-------------+
|FormattedDate|
+-------------+
|   2017-12-22|
|   2020-09-12|
|   2020-06-20|
|   2018-12-22|
|   2017-10-01|
|   2019-09-22|
|   2018-09-15|
|   2022-02-19|
|   2020-11-29|
|   2018-12-16|
|   2020-01-01|
|   2022-04-04|
|   2018-10-07|
|   2021-12-06|
|   2020-12-16|
|   2022-05-16|
|   2019-10-06|
|   2018-11-11|
|   2019-04-01|
|   2019-12-09|
+-------------+
only showing top 20 rows



In [22]:
DimMatch.schema

StructType([StructField('Season', StringType(), True), StructField('Tour', StringType(), True), StructField('Date', StringType(), True), StructField('Time', StringType(), True), StructField('Opponent', StringType(), True), StructField('HoAw', StringType(), True), StructField('ArsenalScore', StringType(), True), StructField('OpponentScore', StringType(), True), StructField('Stadium', StringType(), True), StructField('Attendance', StringType(), True), StructField('Coach', StringType(), True), StructField('Referee', StringType(), True), StructField('MatchID', StringType(), True), StructField('FormattedDate', StringType(), True)])

In [23]:
DimMatch.write.format("jdbc") \
    .option("url", "jdbc:postgresql://postgres:5432/mydatabase") \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "dwh.DimArsenalMatches") \
    .option("user", "postgres") \
    .option("password", "postgres") \
    .mode("overwrite") \
    .save()

In [24]:
DimMatch.write.format("jdbc") \
    .option("url", "jdbc:postgresql://postgres:5432/mydatabase") \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "dwh.DimArsenalMatches") \
    .option("user", "postgres") \
    .option("password", "postgres") \
    .mode("overwrite") \
    .save()