In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
# Reuse the active Spark session if there is one; otherwise create a new one
# with the builder's default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Load the raw match results. The first line of the file supplies the column
# names and the column types are inferred from the data.
# NOTE(review): the sample output below shows Time inferred as a timestamp that
# carries a 2025-03-25 date instead of the match date — presumably a time-only
# value completed with the run date. Confirm whether Time should be read as a
# plain string instead.
csv_reader = spark.read.options(header='True', inferschema='True')
matches = csv_reader.csv('../data/matches.csv')

In [4]:
# Preview the first three rows to sanity-check the load.
matches.show(3)

+-------+----+----------+-------------------+----------+----+------------+-------------+----------------+----------+-------------+--------------+
| Season|Tour|      Date|               Time|  Opponent|HoAw|ArsenalScore|OpponentScore|         Stadium|Attendance|        Coach|       Referee|
+-------+----+----------+-------------------+----------+----+------------+-------------+----------------+----------+-------------+--------------+
|2017/18|   1|2017-08-11|2025-03-25 20:45:00| Leicester|home|           4|            3|Emirates Stadium|     59387|Arsène Wenger|     Mike Dean|
|2017/18|   2|2017-08-19|2025-03-25 18:30:00|Stoke City|away|           0|            1|  bet365 Stadium|     29459|Arsène Wenger|Andre Marriner|
|2017/18|   3|2017-08-27|2025-03-25 17:00:00| Liverpool|away|           0|            4|         Anfield|     53206|Arsène Wenger|  Craig Pawson|
+-------+----+----------+-------------------+----------+----+------------+-------------+----------------+----------+-------------+--------------+

In [5]:
# Inspect the column names, types and nullability that were inferred on load.
matches.schema

StructType([StructField('Season', StringType(), True), StructField('Tour', IntegerType(), True), StructField('Date', DateType(), True), StructField('Time', TimestampType(), True), StructField('Opponent', StringType(), True), StructField('HoAw', StringType(), True), StructField('ArsenalScore', IntegerType(), True), StructField('OpponentScore', IntegerType(), True), StructField('Stadium', StringType(), True), StructField('Attendance', IntegerType(), True), StructField('Coach', StringType(), True), StructField('Referee', StringType(), True)])

In [6]:
# Register the DataFrame as a temporary view named "Matches" so that it can be
# queried through spark.sql; an existing view with that name is replaced.
matches.createOrReplaceTempView("Matches")

In [7]:
# Count the distinct match dates in the "Matches" view.
# The query result is bound to `distinct_matches` first and displayed in a
# separate step: DataFrame.show() prints the rows and returns None, so chaining
# it onto the assignment would leave the variable set to None.
distinct_matches = spark.sql('''
    select count(distinct Date) 
    from Matches

''')
distinct_matches.show()

+--------------------+
|count(DISTINCT Date)|
+--------------------+
|                 214|
+--------------------+



In [8]:
# Count the rows of the "Matches" view that have a Date value; comparing this
# with the distinct-date count shows whether any two rows share a date.
# The result DataFrame is kept in `Matches` and shown separately, because
# DataFrame.show() returns None and would otherwise leave the name bound to None.
# Note: the Python name `Matches` is unrelated to the temp view "Matches" and to
# the `matches` DataFrame.
Matches = spark.sql("""
    select count(Date)
    from Matches
""")
Matches.show()

+-----------+
|count(Date)|
+-----------+
|        214|
+-----------+



In [9]:
# List the column names of the raw matches DataFrame.
matches.columns

['Season',
 'Tour',
 'Date',
 'Time',
 'Opponent',
 'HoAw',
 'ArsenalScore',
 'OpponentScore',
 'Stadium',
 'Attendance',
 'Coach',
 'Referee']

In [10]:
# Build the match dimension: every original column plus a generated MatchID
# surrogate key appended as the last column.
# NOTE(review): monotonically_increasing_id() yields unique, increasing ids that
# are not guaranteed to be consecutive (they depend on partitioning) — confirm
# that this is acceptable for MatchID.
match_id = monotonically_increasing_id().alias("MatchID")
DimMatch = matches.select("*", match_id)

In [11]:
# List the columns of DimMatch to confirm that MatchID is present.
DimMatch.columns

['Season',
 'Tour',
 'Date',
 'Time',
 'Opponent',
 'HoAw',
 'ArsenalScore',
 'OpponentScore',
 'Stadium',
 'Attendance',
 'Coach',
 'Referee',
 'MatchID']

In [12]:
DimMatch.write.csv('../data/DimMatches', header=True)

AnalysisException: [PATH_ALREADY_EXISTS] Path file:/home/jovyan/work/data/DimMatches already exists. Set mode as "overwrite" to overwrite the existing path.

#### Loading the DimMatch for DWH Schema in ArsenalFC Database in Post

In [None]:
DimMatch=spark.read.csv('../data/DimMatches/DimMatches.csv',header=True)

In [None]:
# Write DimMatch to the dwh.DimArsenalMatches table over JDBC, replacing any
# existing table contents (mode "overwrite").
# Credentials are taken from the environment when POSTGRES_USER /
# POSTGRES_PASSWORD are set; the fallbacks keep the previous local values so the
# cell behaves as before when the variables are absent.
pg_user = os.environ.get("POSTGRES_USER", "postgres")
pg_password = os.environ.get("POSTGRES_PASSWORD", "postgres")

(
    DimMatch.write.format("jdbc")
    .option("url", "jdbc:postgresql://postgres:5432/mydatabase")
    .option("driver", "org.postgresql.Driver")
    .option("dbtable", "dwh.DimArsenalMatches")
    .option("user", pg_user)
    .option("password", pg_password)
    .mode("overwrite")
    .save()
)

In [None]:
from pyspark.sql.functions import to_date,date_format

In [None]:
# List the columns of DimMatch before the FormattedDate column is added.
DimMatch.columns

In [None]:
# Parse Date with the yyyy-MM-dd pattern, then render it back to text with the
# same pattern and store the result as FormattedDate.
# NOTE(review): date_format returns a string, and the output pattern equals the
# input pattern, so FormattedDate presumably mirrors Date for valid values —
# confirm whether a real date-typed column was intended here.
parsed_match_date = to_date("Date", "yyyy-MM-dd")
DimMatch = DimMatch.withColumn(
    "FormattedDate",
    date_format(parsed_match_date, "yyyy-MM-dd"),
)

In [None]:
# List the columns again to confirm that FormattedDate is now present.
DimMatch.columns

In [None]:
# Register DimMatch as the temporary view "dimmatch" for the SQL checks that
# follow; an existing view with that name is replaced.
DimMatch.createOrReplaceTempView("dimmatch")

In [None]:
# Show the distinct values of the original Date column
# (show() prints at most the first 20 rows by default).
distinct_date_rows = spark.sql("""
    select distinct Date
    from dimmatch

""")
distinct_date_rows.show()

In [None]:
# Show the distinct values of the derived FormattedDate column so they can be
# compared with the distinct Date values.
formatted_date_query = """
    select distinct FormattedDate
    from dimmatch

"""
spark.sql(formatted_date_query).show()

In [None]:
# Inspect the column types of DimMatch before it is written to the database.
DimMatch.schema

In [None]:
# Write the final DimMatch (now including FormattedDate) to the
# dwh.DimArsenalMatches table over JDBC, replacing any existing table contents
# (mode "overwrite").
# Credentials are taken from the environment when POSTGRES_USER /
# POSTGRES_PASSWORD are set; the fallbacks keep the previous local values so the
# cell behaves as before when the variables are absent.
pg_user = os.environ.get("POSTGRES_USER", "postgres")
pg_password = os.environ.get("POSTGRES_PASSWORD", "postgres")

(
    DimMatch.write.format("jdbc")
    .option("url", "jdbc:postgresql://postgres:5432/mydatabase")
    .option("driver", "org.postgresql.Driver")
    .option("dbtable", "dwh.DimArsenalMatches")
    .option("user", pg_user)
    .option("password", pg_password)
    .mode("overwrite")
    .save()
)