In [88]:
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.functions import * 
from datetime import datetime, timedelta

In [2]:
credentials_location = "/home/abdol/AFCON_2023_DE_Stats/code/mage-spark/keys/my-creds.json"

In [3]:
conf = SparkConf() \
    .setMaster('local[*]') \
    .setAppName('test') \
    .set("spark.jars", "/home/abdol/AFCON_2023_DE_Stats/lib/gcs-connector-hadoop3-2.2.5.jar") \
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)

In [4]:
sc = SparkContext(conf=conf)

24/04/04 00:14:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
hadoop_conf = sc._jsc.hadoopConfiguration()

In [7]:
hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

In [8]:
spark = SparkSession.builder \
    .config(conf=sc.getConf()) \
    .getOrCreate()

In [9]:
matches_df = spark.read.parquet("gs://cloud_bucket_dbt/acfon_matches.parquet")

                                                                                

In [10]:
matches_df.count()

                                                                                

52

In [11]:
matches_df.printSchema()

root
 |-- match_id: long (nullable = true)
 |-- match_date: string (nullable = true)
 |-- kick_off: string (nullable = true)
 |-- competition: string (nullable = true)
 |-- season: string (nullable = true)
 |-- home_team: string (nullable = true)
 |-- away_team: string (nullable = true)
 |-- home_score: long (nullable = true)
 |-- away_score: long (nullable = true)
 |-- match_week: long (nullable = true)
 |-- competition_stage: string (nullable = true)
 |-- stadium: string (nullable = true)
 |-- referee: string (nullable = true)
 |-- home_managers: string (nullable = true)
 |-- away_managers: string (nullable = true)
 |-- source: string (nullable = true)



In [20]:
matches_df.registerTempTable('Matches')



In [12]:
matches_df.show(n=5,truncate=False)

[Stage 4:>                                                          (0 + 1) / 1]

+--------+----------+------------+-------------------------------+------+-------------+-------------+----------+----------+----------+-----------------+-----------------------------------------+---------------------+-----------------------------+-----------------+-------------+
|match_id|match_date|kick_off    |competition                    |season|home_team    |away_team    |home_score|away_score|match_week|competition_stage|stadium                                  |referee              |home_managers                |away_managers    |source       |
+--------+----------+------------+-------------------------------+------+-------------+-------------+----------+----------+----------+-----------------+-----------------------------------------+---------------------+-----------------------------+-----------------+-------------+
|3923881 |2024-02-11|22:00:00.000|Africa - African Cup of Nations|2023  |Nigeria      |Côte d'Ivoire|1         |2         |8         |Final            |Stade Olymp

                                                                                

In [13]:
events_df = spark.read.parquet("gs://cloud_bucket_dbt/match_events.parquet")

                                                                                

In [14]:
events_df.count()

                                                                                

288671

In [15]:
events_df.printSchema()

root
 |-- foul_committed_penalty: boolean (nullable = true)
 |-- foul_won_penalty: boolean (nullable = true)
 |-- player: string (nullable = true)
 |-- player_id: double (nullable = true)
 |-- match_id: double (nullable = true)
 |-- team: string (nullable = true)
 |-- team_id: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- type: string (nullable = true)
 |-- location: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- play_pattern: string (nullable = true)
 |-- foul_committed_card: string (nullable = true)
 |-- foul_committed_offensive: boolean (nullable = true)
 |-- foul_committed_type: string (nullable = true)
 |-- foul_won_defensive: boolean (nullable = true)
 |-- minute: double (nullable = true)
 |-- pass_goal_assist: boolean (nullable = true)
 |-- pass_shot_assist: boolean (nullable = true)
 |-- pass_outcome: string (nullable = true)
 |-- pass_cross: boolean (nullable = true)
 |-- position: string (nullable = true)
 |-- possession:

#####  define dimensions

##### team dimension

In [33]:
Teams = spark.sql("""
            SELECT DISTINCT 
            home_team
            FROM Matches
    """)

In [29]:
Teams.count()

                                                                                

24

In [42]:
Teams.show(truncate=False)

[Stage 39:>                                                         (0 + 1) / 1]

+-------+------------------+
|Team_ID|Team_Name         |
+-------+------------------+
|0      |Côte d'Ivoire     |
|1      |Senegal           |
|2      |Equatorial Guinea |
|3      |Congo DR          |
|4      |Algeria           |
|5      |Angola            |
|6      |Ghana             |
|7      |Nigeria           |
|8      |Mauritania        |
|9      |Morocco           |
|10     |Tunisia           |
|11     |Namibia           |
|12     |Zambia            |
|13     |Guinea            |
|14     |Cape Verde Islands|
|15     |Mozambique        |
|16     |Gambia            |
|17     |Tanzania          |
|18     |Cameroon          |
|19     |Guinea-Bissau     |
+-------+------------------+
only showing top 20 rows



                                                                                

In [34]:
Teams = Teams.withColumn('Team_ID',monotonically_increasing_id())

In [37]:
Teams = Teams.withColumnRenamed('home_team','Team_Name')

In [40]:
Teams = Teams.select('Team_ID','Team_Name')

In [41]:
Teams.printSchema()

root
 |-- Team_ID: long (nullable = false)
 |-- Team_Name: string (nullable = true)



##### Staduims dimension

In [43]:
Staduim = spark.sql("""
            SELECT DISTINCT stadium
            FROM Matches
    """)

In [46]:
Staduim = Staduim.withColumn('Staduim_ID',monotonically_increasing_id())

In [47]:
Staduim = Staduim.select('Staduim_ID', 'stadium')

In [48]:
Staduim.show(truncate=False)

[Stage 48:>                                                         (0 + 1) / 1]

+----------+-----------------------------------------+
|Staduim_ID|stadium                                  |
+----------+-----------------------------------------+
|0         |Stade Laurent Pokou                      |
|1         |Stade Amadou Gon Coulibaly               |
|2         |Stade de Bouaké                          |
|3         |Stade Charles Konan Banny de Yamoussoukro|
|4         |Stade Félix Houphouët-Boigny             |
|5         |Stade Olympique Alassane Ouattara        |
+----------+-----------------------------------------+



                                                                                

##### Referee Dimension

In [51]:
Referee = spark.sql("""
                    SELECT DISTINCT referee
                    FROM Matches
            """)

In [52]:
Referee = Referee.withColumn('referee_id', monotonically_increasing_id())

In [53]:
Referee = Referee.select('referee_id', 'referee')

In [55]:
Referee.show(truncate=False)

[Stage 54:>                                                         (0 + 1) / 1]

+----------+---------------------------+
|referee_id|referee                    |
+----------+---------------------------+
|0         |Ibrahim Mutaz              |
|1         |Mustapha Ghorbal           |
|2         |Abdel Aziz Mohamed Bouh    |
|3         |Omar Abdulkadir Artan      |
|4         |Bouchra Karboubi           |
|5         |Mohamed Maarouf Eid Mansour|
|6         |Jalal Jayed                |
|7         |Redouane Jiyed             |
|8         |Mohamed Adel Elsaid        |
|9         |Abdulkadir Artan           |
|10        |Mahmood Ali Mahmood Ismail |
|11        |Peter Waweru Kamaku        |
|12        |Alhadi Allaou Mahamat      |
|13        |Pierre Ghislain Atcho      |
|14        |Daniel Nii Ayi Laryea      |
|15        |Mohamed Adel Hussein       |
|16        |Dahane Beida               |
|17        |Abongile Tom               |
|18        |Issa Sy                    |
|19        |Bamlak Tessema Weyesa      |
+----------+---------------------------+
only showing top

                                                                                

##### Manager Dimension

In [57]:
Manager = spark.sql("""
                    SELECT DISTINCT home_managers 
                    FROM Matches
            """)

In [58]:
Manager = Manager.withColumn("manager_id", monotonically_increasing_id())

In [59]:
Manager = Manager.withColumnRenamed('home_managers', 'manager')

In [60]:
Manager.count()

                                                                                

25

In [61]:
Manager.show(truncate=False)

[Stage 63:>                                                         (0 + 1) / 1]

+-------------------------------+----------+
|manager                        |manager_id|
+-------------------------------+----------+
|Emerse Faé                     |0         |
|Rui Carlos Pinho da Vitória    |1         |
|Avraham Grant                  |2         |
|Éric Chelle                    |3         |
|Djamel Belmadi                 |4         |
|Pedro Valdemar Soares Gonçalves|5         |
|José Vítor dos Santos Peseiro  |6         |
|Collin Benjamin                |7         |
|Chris Hughton                  |8         |
|Rigobert Song Bahanag          |9         |
|Baciro Candé                   |10        |
|Jalel Kadri                    |11        |
|Amir Abdou                     |12        |
|Aliou Cissé                    |13        |
|Francisco Queriol Conde Júnior |14        |
|Tom Saintfiet                  |15        |
|Hemed Suleiman Ali             |16        |
|Hugo Henri Broos               |17        |
|Juan Micha Obiang Bicogo       |18        |
|Pedro Lei

                                                                                

##### Date Dimension

In [96]:
def get_max_min_date_from_src():
    max_min_date_list = spark.sql("""
                        SELECT 
                                MIN(match_date) AS min_match_date,
                                MAX(match_date) AS max_match_date
                        FROM 
                                Matches
            """).collect()
    
    
    start_date = to_date(lit(max_min_date_list[0]['min_match_date']), "%Y-%m-%d")
    max_date = to_date(lit(max_min_date_list[0]['max_match_date']), "%Y-%m-%d")
    

    
    dates = spark.range(start_date.subtract(days=1), max_date.add(days=1), 1).select("id").alias("date_id")
    
    dates = dates.withColumn("date", date_add(start_date, col("date_id") - 1))

    
    dates = dates.withColumn("year", year(col("date")))
    dates = dates.withColumn("month", month(col("date")))
    dates = dates.withColumn("day_of_week", dayofweek(col("date")))
    dates = dates.withColumn("id", monotonically_increasing_id())                        
    return dates

In [97]:
get_max_min_date_from_src().show()

                                                                                

TypeError: 'Column' object is not callable

In [98]:
spark.stop()