In [18]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [19]:
spark= SparkSession.builder.getOrCreate()

In [20]:
GoalKeepers = spark.read.format('csv')\
    .option("header", "True")\
    .option("inferSchema", "True")\
    .load("../data/goalkeepers.csv")

In [21]:
GoalKeepers.columns

['LastName',
 'FirstName',
 'Date',
 'Start',
 'Pos',
 'Min',
 'SoTA',
 'GA',
 'Saves',
 'PSxG',
 'PKatt',
 'PKA',
 'PKm',
 'PassAtt',
 'Throws',
 'AvgLen',
 'GKAtt',
 'GKAvgLen',
 'C']

In [22]:
GoalKeepers.createOrReplaceTempView('gk')

In [23]:
GoalKeepers=GoalKeepers.withColumn('fullname',concat_ws(' ',col('FirstName'),col('LastName')))

In [24]:
GoalKeepers.select('fullname').show(5,False)

+------------+
|fullname    |
+------------+
|David Ospina|
|Petr Cech   |
|Petr Cech   |
|David Ospina|
|David Ospina|
+------------+
only showing top 5 rows



In [46]:
goalkeep=spark.sql('''
    select distinct concat(firstname,' ',lastname) as fullname
    from gk
''')

In [47]:
gk_=spark.sql("""
    select count(distinct fullname)
    from ( select distinct concat(firstname,' ',lastname) as fullname
    from gk ) as kk
""").show()

+------------------------+
|count(DISTINCT fullname)|
+------------------------+
|                       7|
+------------------------+



In [48]:
goalkeep=goalkeep.withColumn('GKID',monotonically_increasing_id()+1)

In [49]:
goalkeep.show()

+-----------------+----+
|         fullname|GKID|
+-----------------+----+
|Emiliano Martinez|   1|
|   Aaron Ramsdale|   2|
|       Bernd Leno|   3|
|        Petr Cech|   4|
|  Runar Runarsson|   5|
|      Mathew Ryan|   6|
|     David Ospina|   7|
+-----------------+----+



In [51]:
DimGoalKeepers=GoalKeepers.join(goalkeep,on='fullname',how='inner')

In [52]:
DimGoalKeepers.show()

+------------+--------+---------+----------+-----+---+---+----+---+-----+----+-----+----+----+-------+------+------+-----+--------+---+----+
|    fullname|LastName|FirstName|      Date|Start|Pos|Min|SoTA| GA|Saves|PSxG|PKatt| PKA| PKm|PassAtt|Throws|AvgLen|GKAtt|GKAvgLen|  C|GKID|
+------------+--------+---------+----------+-----+---+---+----+---+-----+----+-----+----+----+-------+------+------+-----+--------+---+----+
|David Ospina|  Ospina|    David| 5/13/2018|    1| GK| 90|   3|  0|    3| 0.4|    0|   0|   0|     39|     8|  31.9|    9|    48.2|  0|   7|
|   Petr Cech|    Cech|     Petr|  5/9/2018|    1| GK| 90|  10|  3|    7| 3.2|    1|   1|   0|     26|     7|  34.5|   11|    66.0|  1|   4|
|   Petr Cech|    Cech|     Petr|  5/6/2018|    1| GK| 90|   2|  0|    2| 0.2|    0|   0|   0|     31|     8|  32.4|    2|    56.0|  1|   4|
|David Ospina|  Ospina|    David| 4/29/2018|    1| GK| 90|   2|  2|    0| 1.3|    0|   0|   0|     15|     4|  41.1|    5|    49.0|  0|   7|
|David Ospina

In [53]:
DimGoalKeepers.columns

['fullname',
 'LastName',
 'FirstName',
 'Date',
 'Start',
 'Pos',
 'Min',
 'SoTA',
 'GA',
 'Saves',
 'PSxG',
 'PKatt',
 'PKA',
 'PKm',
 'PassAtt',
 'Throws',
 'AvgLen',
 'GKAtt',
 'GKAvgLen',
 'C',
 'GKID']

In [54]:
DimGoalKeepers.write.csv('../data/DimGoalKeeper',header=True)

### Creating the FactGK

In [58]:
dimmatch=spark.read.format('csv')\
    .option('header','True') \
    .option('inferSchema','True')\
    .load('../data/DimMatches/DimMatches.csv')

In [59]:
factgk=dimmatch.join(DimGoalKeepers,on='Date',how='left')

In [63]:
factgk.columns

['Date',
 'Season',
 'Tour',
 'Time',
 'Opponent',
 'HoAw',
 'ArsenalScore',
 'OpponentScore',
 'Stadium',
 'Attendance',
 'Coach',
 'Referee',
 'MatchID',
 'fullname',
 'LastName',
 'FirstName',
 'Start',
 'Pos',
 'Min',
 'SoTA',
 'GA',
 'Saves',
 'PSxG',
 'PKatt',
 'PKA',
 'PKm',
 'PassAtt',
 'Throws',
 'AvgLen',
 'GKAtt',
 'GKAvgLen',
 'C',
 'GKID']

In [62]:
dimmatch.columns

['Season',
 'Tour',
 'Date',
 'Time',
 'Opponent',
 'HoAw',
 'ArsenalScore',
 'OpponentScore',
 'Stadium',
 'Attendance',
 'Coach',
 'Referee',
 'MatchID']

In [64]:
factgk=factgk.drop(
'Season',
 'Tour',
 'Time',
 'Opponent',
 'HoAw',
 'Stadium',
 'Coach',
 'Referee',
  'Pos',
    'C',
'fullname',
 'LastName',
 'FirstName'
    
)

In [65]:
factgk.columns

['Date',
 'ArsenalScore',
 'OpponentScore',
 'Attendance',
 'MatchID',
 'Start',
 'Min',
 'SoTA',
 'GA',
 'Saves',
 'PSxG',
 'PKatt',
 'PKA',
 'PKm',
 'PassAtt',
 'Throws',
 'AvgLen',
 'GKAtt',
 'GKAvgLen',
 'GKID']

In [66]:
factgk.write.csv('../data/FactGK', header=True)