In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [2]:
spark= SparkSession.builder.getOrCreate()

In [5]:
players=spark.read.format('csv')\
    .option('header','True') \
    .option('inferSchema','True') \
    .load('../data/players.csv')

In [6]:
players.show()

+---------+---------+---------+-----+---+---+---+---+---+---+---+---+---+---+-------+-------+----+------+---+----+---+------+-------+------+-------+------+----------+---+
| LastName|FirstName|     Date|Start|Pos|Min|  G|  A| PK|PKA|  S|SoT| YK| RK|Touches|Tackles|Ints|Blocks| xG|npxG|xAG|Passes|PassesA|PrgPas|Carries|PrgCar|      Line|  C|
+---------+---------+---------+-----+---+---+---+---+---+---+---+---+---+---+-------+-------+----+------+---+----+---+------+-------+------+-------+------+----------+---+
| Bellerin|   Hector|8/11/2017|    1| WB| 90|  0|  0|  0|  0|  1|  1|  0|  0|     79|      3|   0|     0|0.3| 0.3|0.0|    61|     70|     3|     51|     1|  Defender|  0|
|   Elneny|  Mohamed|8/11/2017|    1| CM| 66|  0|  1|  0|  0|  1|  0|  0|  0|     82|      4|   0|     2|0.0| 0.0|0.1|    65|     72|     4|     57|     0|Midfielder|  0|
|  Holding|      Rob|8/11/2017|    1| CB| 66|  0|  0|  0|  0|  0|  0|  0|  0|     75|      1|   1|     0|0.0| 0.0|0.0|    50|     60|     4|     

In [7]:
players.columns

['LastName',
 'FirstName',
 'Date',
 'Start',
 'Pos',
 'Min',
 'G',
 'A',
 'PK',
 'PKA',
 'S',
 'SoT',
 'YK',
 'RK',
 'Touches',
 'Tackles',
 'Ints',
 'Blocks',
 'xG',
 'npxG',
 'xAG',
 'Passes',
 'PassesA',
 'PrgPas',
 'Carries',
 'PrgCar',
 'Line',
 'C']

In [8]:
players.schema

StructType([StructField('LastName', StringType(), True), StructField('FirstName', StringType(), True), StructField('Date', StringType(), True), StructField('Start', IntegerType(), True), StructField('Pos', StringType(), True), StructField('Min', IntegerType(), True), StructField('G', IntegerType(), True), StructField('A', IntegerType(), True), StructField('PK', IntegerType(), True), StructField('PKA', IntegerType(), True), StructField('S', IntegerType(), True), StructField('SoT', IntegerType(), True), StructField('YK', IntegerType(), True), StructField('RK', IntegerType(), True), StructField('Touches', IntegerType(), True), StructField('Tackles', IntegerType(), True), StructField('Ints', IntegerType(), True), StructField('Blocks', IntegerType(), True), StructField('xG', DoubleType(), True), StructField('npxG', DoubleType(), True), StructField('xAG', DoubleType(), True), StructField('Passes', IntegerType(), True), StructField('PassesA', IntegerType(), True), StructField('PrgPas', Intege

In [9]:
players.createOrReplaceTempView("players")

In [12]:
spark.sql('''
        select concat(firstname," ",lastname) as fullname
        from players
          
          ''').count()

2741

In [16]:
distinct_players=spark.sql('''
        select distinct concat(firstname," ",lastname) as fullname
        from players
          
          ''')

In [17]:
players_dates=spark.sql('''
    select count(distinct Date)
    from players

''')

In [18]:
players_dates.show()

+--------------------+
|count(DISTINCT Date)|
+--------------------+
|                 214|
+--------------------+



## Here we make sure that the distinct count date for matches in DimMatches Equal the distinct count Date for DimPlayers 214

In [20]:
distinct_players=distinct_players.withColumn('PlayersID',monotonically_increasing_id())

In [21]:
distinct_players.show()

+-------------------+---------+
|           fullname|PlayersID|
+-------------------+---------+
|   Emile Smith Rowe|        0|
|    Folarin Balogun|        1|
|    Hector Bellerin|        2|
|        Joe Willock|        3|
|     William Saliba|        4|
|       Aaron Ramsey|        5|
|        Bukayo Saka|        6|
|     Kieran Tierney|        7|
|   Shkodran Mustafi|        8|
|    Daniel Ceballos|        9|
|Alexandre Lacazette|       10|
|     Kieran Willian|       11|
|      Gabriel Jesus|       12|
|      Danny Welbeck|       13|
| Gabriel Marquinhos|       14|
|         Pablo Mari|       15|
|          Ben White|       16|
|      Ethan Nwaneri|       17|
|     Sead Kolasinac|       18|
|     Calum Chambers|       19|
+-------------------+---------+
only showing top 20 rows



In [23]:
players=players.withColumn('fullname',concat_ws(' ',col('FirstName'),col('LastName')))

In [24]:
players.columns

['LastName',
 'FirstName',
 'Date',
 'Start',
 'Pos',
 'Min',
 'G',
 'A',
 'PK',
 'PKA',
 'S',
 'SoT',
 'YK',
 'RK',
 'Touches',
 'Tackles',
 'Ints',
 'Blocks',
 'xG',
 'npxG',
 'xAG',
 'Passes',
 'PassesA',
 'PrgPas',
 'Carries',
 'PrgCar',
 'Line',
 'C',
 'fullname']

In [28]:
players.select('fullname').show(5)

+-------------------+
|           fullname|
+-------------------+
|    Hector Bellerin|
|     Mohamed Elneny|
|        Rob Holding|
|     Sead Kolasinac|
|Alexandre Lacazette|
+-------------------+
only showing top 5 rows



In [29]:
dimplayer=players.join(distinct_players,on='fullname',how='inner')

In [31]:
dimplayer.columns

['fullname',
 'LastName',
 'FirstName',
 'Date',
 'Start',
 'Pos',
 'Min',
 'G',
 'A',
 'PK',
 'PKA',
 'S',
 'SoT',
 'YK',
 'RK',
 'Touches',
 'Tackles',
 'Ints',
 'Blocks',
 'xG',
 'npxG',
 'xAG',
 'Passes',
 'PassesA',
 'PrgPas',
 'Carries',
 'PrgCar',
 'Line',
 'C',
 'PlayersID']

In [33]:
dimplayer.write.csv('../data/Dimplayers.csv', header=True)

In [36]:
dimplayer.select('PlayersID',
'LastName',
 'FirstName',
 'Date',
 'Start',
 'Pos',
 'Min',
 'G',
 'A',
 'PK',
 'PKA' 
).show()

+---------+---------+---------+---------+-----+---+---+---+---+---+---+
|PlayersID| LastName|FirstName|     Date|Start|Pos|Min|  G|  A| PK|PKA|
+---------+---------+---------+---------+-----+---+---+---+---+---+---+
|        2| Bellerin|   Hector|8/11/2017|    1| WB| 90|  0|  0|  0|  0|
|       49|   Elneny|  Mohamed|8/11/2017|    1| CM| 66|  0|  1|  0|  0|
|       26|  Holding|      Rob|8/11/2017|    1| CB| 66|  0|  0|  0|  0|
|       18|Kolasinac|     Sead|8/11/2017|    1| CB| 90|  0|  1|  0|  0|
|       10|Lacazette|Alexandre|8/11/2017|    1| FW| 90|  1|  0|  0|  0|
|       21|  Monreal|    Nacho|8/11/2017|    1| CB| 90|  0|  0|  0|  0|
|       53|     Ozil|    Mesut|8/11/2017|    1| AM| 90|  0|  0|  0|  0|
|       13|  Welbeck|    Danny|8/11/2017|    1| AM| 74|  1|  0|  0|  0|
|       44|    Xhaka|   Granit|8/11/2017|    1| CM| 90|  0|  2|  0|  0|
|        5|   Ramsey|    Aaron|8/11/2017|    0| DM| 24|  1|  0|  0|  0|
|        2| Bellerin|   Hector|8/19/2017|    1| WB| 90|  0|  0| 