In [1]:
import os

import pyspark
from pyspark.sql import SparkSession, Row, SQLContext
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Kept after the wildcard imports so these standard-library names win if a wildcard
# import ever exposes a name such as `random` or `datetime`.
from datetime import datetime, timedelta
import random

In [2]:
# Path of the service-account key file handed to the GCS connector settings.
# GOOGLE_APPLICATION_CREDENTIALS takes precedence so the notebook is not tied to one
# machine; the original local path remains the fallback.
credentials_location = os.environ.get(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "/home/abdol/AFCON_2023_DE_Stats/code/mage-spark/keys/my-creds.json",
)

In [3]:
# Spark configuration: local master, GCS connector jar, and service-account
# authentication pointed at the key file.
conf = SparkConf()
conf.setMaster('local[*]')
conf.setAppName('test')
conf.set("spark.jars", "/home/abdol/AFCON_2023_DE_Stats/lib/gcs-connector-hadoop3-2.2.5.jar")
conf.set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
conf.set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)

In [4]:
# Reuse the running SparkContext when this cell is executed again; constructing a second
# SparkContext in the same kernel raises an error.
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# Hadoop configuration of the JVM-side context, reached through the private `_jsc` handle.
hadoop_conf = sc._jsc.hadoopConfiguration()

In [6]:
# Register the GCS connector classes for the gs scheme and enable service-account auth.
for setting_name, setting_value in (
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.gs.auth.service.account.json.keyfile", credentials_location),
    ("fs.gs.auth.service.account.enable", "true"),
):
    hadoop_conf.set(setting_name, setting_value)

In [7]:
# Build (or fetch) the SparkSession using the configuration of the existing context.
session_builder = SparkSession.builder.config(conf=sc.getConf())
spark = session_builder.getOrCreate()

In [8]:
# Load the matches dataset (parquet) from the GCS bucket.
matches_df = spark.read.parquet("gs://cloud_bucket_dbt/acfon_matches.parquet")

In [None]:
matches_df.count()

In [22]:
matches_df.printSchema()

root
 |-- match_id: long (nullable = true)
 |-- match_date: string (nullable = true)
 |-- kick_off: string (nullable = true)
 |-- competition: string (nullable = true)
 |-- season: string (nullable = true)
 |-- home_team: string (nullable = true)
 |-- away_team: string (nullable = true)
 |-- home_score: long (nullable = true)
 |-- away_score: long (nullable = true)
 |-- match_week: long (nullable = true)
 |-- competition_stage: string (nullable = true)
 |-- stadium: string (nullable = true)
 |-- referee: string (nullable = true)
 |-- home_managers: string (nullable = true)
 |-- away_managers: string (nullable = true)
 |-- source: string (nullable = true)



In [9]:
# Expose the matches frame to Spark SQL as `Matches`. createOrReplaceTempView replaces the
# deprecated registerTempTable and matches the call used for the events fact view.
matches_df.createOrReplaceTempView('Matches')



In [None]:
matches_df.show(n=5,truncate=False)

In [50]:
# Load the match events dataset (parquet) from the GCS bucket.
events_df = spark.read.parquet("gs://cloud_bucket_dbt/match_events.parquet")

In [None]:
events_df.count()

In [51]:
events_df.printSchema()

root
 |-- foul_committed_penalty: boolean (nullable = true)
 |-- foul_won_penalty: boolean (nullable = true)
 |-- player: string (nullable = true)
 |-- player_id: double (nullable = true)
 |-- match_id: double (nullable = true)
 |-- team: string (nullable = true)
 |-- team_id: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- type: string (nullable = true)
 |-- location: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- play_pattern: string (nullable = true)
 |-- foul_committed_card: string (nullable = true)
 |-- foul_committed_offensive: boolean (nullable = true)
 |-- foul_committed_type: string (nullable = true)
 |-- foul_won_defensive: boolean (nullable = true)
 |-- minute: double (nullable = true)
 |-- pass_goal_assist: boolean (nullable = true)
 |-- pass_shot_assist: boolean (nullable = true)
 |-- pass_outcome: string (nullable = true)
 |-- pass_cross: boolean (nullable = true)
 |-- position: string (nullable = true)
 |-- possession:

In [None]:
events_df.filter(events_df.player_id.isNotNull()).show(n=1,truncate=False)

In [37]:
# Expose the events frame to Spark SQL as `events`. createOrReplaceTempView replaces the
# deprecated registerTempTable and matches the call used for the events fact view.
events_df.createOrReplaceTempView('events')

In [None]:
spark.sql("""
        SELECT DISTINCT shot_outcome 
        FROM events
    """).show(truncate=False)

In [None]:
import numpy as np

In [None]:
events_df.filter(events_df.player_id.isNull()).count()

In [None]:
events_df.filter(events_df.player_id.isNotNull()).count()

##### Define dimensions

##### Team dimension

In [11]:
# Team dimension seed: every team that appears in a match, as home or away side.
# Reading only home_team would miss a team that never played as the home side.
# UNION removes duplicates and the result column keeps the name `home_team` (taken from
# the first branch), so the rename of `home_team` to `team` still applies.
Team_dim = spark.sql("""
            SELECT home_team
            FROM Matches
            UNION
            SELECT away_team
            FROM Matches
    """)

In [None]:
Team_dim.count()

In [12]:
# Rename to `team` so the column can be used as the join key against the events data.
Team_dim = Team_dim.withColumnRenamed('home_team','team')

In [13]:
# Attach team_id from the distinct (team_id, team) pairs in the events data; the left outer
# join keeps teams without a matching pair, leaving their team_id null.
Team_dim  = Team_dim.join(events_df.select('team_id','team').distinct(),on='team',how='leftouter')

In [14]:
# events stores team_id as a double; the dimension keeps it as an integer.
team_id_as_int = Team_dim['team_id'].cast(IntegerType())
Team_dim = Team_dim.withColumn('team_id', team_id_as_int)

In [None]:
Team_dim.count()

In [None]:
Team_dim.printSchema()

In [15]:
# Put the id column first.
Team_dim = Team_dim.select('team_id', 'team')

In [None]:
Team_dim.filter(Team_dim.team_id.isNull()).show()

In [16]:
# Collect the existing team_id values to the driver as one Python list; these are the ids
# a generated id must not reuse.
excluded_teams = Team_dim.select(collect_list('team_id')).collect()[0][0]

In [17]:
# Broadcast the list of taken ids so it can be read through `.value`.
excluded_teams_ids = spark.sparkContext.broadcast(excluded_teams)

In [18]:
def generate_random_int_not_in_list(team_id,x_list=None):
    """Return ``team_id`` as an int, or draw a random unused id when it is missing.

    Parameters
    ----------
    team_id : number or None
        Existing team id; ``None`` means the team has no id yet.
    x_list : iterable of int, optional
        Ids that must not be generated. When omitted, the value of the
        ``excluded_teams_ids`` broadcast is read at call time (instead of being
        frozen into the default argument when the function is defined).

    Returns
    -------
    int
        ``int(team_id)`` when an id is given, otherwise a random integer in
        [1, 1000] that is not in ``x_list``.

    Raises
    ------
    ValueError
        If every integer in [1, 1000] is excluded; a retry loop would never
        terminate in that case.

    Notes
    -----
    Each call draws independently, so two missing ids can receive the same
    value and a repeated evaluation can return a different id.
    """
    if team_id is not None:
        return int(team_id)

    if x_list is None:
        x_list = excluded_teams_ids.value

    # Set membership keeps the lookup cheap regardless of how many ids are taken.
    taken = set(x_list)
    free_ids = [candidate for candidate in range(1, 1001) if candidate not in taken]
    if not free_ids:
        raise ValueError("no unused team id left in the range 1-1000")
    return random.choice(free_ids)


In [19]:
# Wrap the id generator as a Spark UDF returning an integer. The function draws random
# numbers, so it is flagged non-deterministic to stop the optimizer from treating repeated
# invocations as interchangeable.
generate_random_int_udf= udf(generate_random_int_not_in_list,IntegerType()).asNondeterministic()

In [20]:
# Fill the missing team ids, then materialise the result once. Without this, every later
# action re-runs the random UDF, so the same team could receive a different id each time
# Team_dim is used (it is joined to the events data twice further down).
Team_dim = Team_dim.withColumn("team_id", generate_random_int_udf("team_id")).localCheckpoint()

In [191]:
Team_dim.show(truncate=False)

+-------+------------------+
|team_id|team              |
+-------+------------------+
|3374   |Côte d'Ivoire     |
|787    |Senegal           |
|4992   |Equatorial Guinea |
|4881   |Congo DR          |
|4898   |Algeria           |
|4901   |Angola            |
|4885   |Ghana             |
|751    |Nigeria           |
|69     |Mauritania        |
|788    |Morocco           |
|917    |Tunisia           |
|965    |Namibia           |
|626    |Zambia            |
|4892   |Guinea            |
|106    |Cape Verde Islands|
|4978   |Mozambique        |
|66     |Gambia            |
|364    |Tanzania          |
|160    |Cameroon          |
|739    |Guinea-Bissau     |
+-------+------------------+
only showing top 20 rows



##### Stadiums dimension

In [24]:
# Stadium dimension seed: one row per distinct stadium value in Matches.
Staduim = spark.sql("""
            SELECT DISTINCT stadium
            FROM Matches
    """)

In [25]:
# Surrogate key for the stadium rows.
# NOTE(review): monotonically_increasing_id yields unique, increasing ids that are not
# guaranteed to be consecutive or stable across runs -- confirm that is acceptable here.
Staduim = Staduim.withColumn('Staduim_ID',monotonically_increasing_id())

In [26]:
# Put the id column first.
Staduim = Staduim.select('Staduim_ID', 'stadium')

In [27]:
Staduim.show(truncate=False)

+----------+-----------------------------------------+
|Staduim_ID|stadium                                  |
+----------+-----------------------------------------+
|0         |Stade Laurent Pokou                      |
|1         |Stade Amadou Gon Coulibaly               |
|2         |Stade de Bouaké                          |
|3         |Stade Charles Konan Banny de Yamoussoukro|
|4         |Stade Félix Houphouët-Boigny             |
|5         |Stade Olympique Alassane Ouattara        |
+----------+-----------------------------------------+



##### Referee Dimension

In [28]:
# Referee dimension seed: one row per distinct referee value in Matches.
Referee = spark.sql("""
                    SELECT DISTINCT referee
                    FROM Matches
            """)

In [29]:
# Surrogate key for the referee rows.
# NOTE(review): monotonically_increasing_id yields unique, increasing ids that are not
# guaranteed to be consecutive or stable across runs -- confirm that is acceptable here.
Referee = Referee.withColumn('referee_id', monotonically_increasing_id())

In [30]:
# Put the id column first.
Referee = Referee.select('referee_id', 'referee')

In [31]:
Referee.show(truncate=False)

+----------+---------------------------+
|referee_id|referee                    |
+----------+---------------------------+
|0         |Ibrahim Mutaz              |
|1         |Mustapha Ghorbal           |
|2         |Abdel Aziz Mohamed Bouh    |
|3         |Omar Abdulkadir Artan      |
|4         |Bouchra Karboubi           |
|5         |Mohamed Maarouf Eid Mansour|
|6         |Jalal Jayed                |
|7         |Redouane Jiyed             |
|8         |Mohamed Adel Elsaid        |
|9         |Abdulkadir Artan           |
|10        |Mahmood Ali Mahmood Ismail |
|11        |Peter Waweru Kamaku        |
|12        |Alhadi Allaou Mahamat      |
|13        |Pierre Ghislain Atcho      |
|14        |Daniel Nii Ayi Laryea      |
|15        |Mohamed Adel Hussein       |
|16        |Dahane Beida               |
|17        |Abongile Tom               |
|18        |Issa Sy                    |
|19        |Bamlak Tessema Weyesa      |
+----------+---------------------------+
only showing top

##### Manager Dimension

In [32]:
# Manager dimension seed: every manager value that appears on either side of a match.
# Reading only home_managers would miss a manager whose team never played as home side.
# UNION removes duplicates and the result column keeps the name `home_managers` (taken
# from the first branch), so the rename of `home_managers` to `manager` still applies.
Manager = spark.sql("""
                    SELECT home_managers
                    FROM Matches
                    UNION
                    SELECT away_managers
                    FROM Matches
            """)

In [33]:
# Surrogate key for the manager rows.
# NOTE(review): monotonically_increasing_id yields unique, increasing ids that are not
# guaranteed to be consecutive or stable across runs -- confirm that is acceptable here.
Manager = Manager.withColumn("manager_id", monotonically_increasing_id())

In [34]:
# Give the name column a side-neutral name.
Manager = Manager.withColumnRenamed('home_managers', 'manager')

In [None]:
Manager.count()

In [35]:
Manager.show(truncate=False)

+-------------------------------+----------+
|manager                        |manager_id|
+-------------------------------+----------+
|Emerse Faé                     |0         |
|Rui Carlos Pinho da Vitória    |1         |
|Avraham Grant                  |2         |
|Éric Chelle                    |3         |
|Djamel Belmadi                 |4         |
|Pedro Valdemar Soares Gonçalves|5         |
|José Vítor dos Santos Peseiro  |6         |
|Collin Benjamin                |7         |
|Chris Hughton                  |8         |
|Rigobert Song Bahanag          |9         |
|Baciro Candé                   |10        |
|Jalel Kadri                    |11        |
|Amir Abdou                     |12        |
|Aliou Cissé                    |13        |
|Francisco Queriol Conde Júnior |14        |
|Tom Saintfiet                  |15        |
|Hemed Suleiman Ali             |16        |
|Hugo Henri Broos               |17        |
|Juan Micha Obiang Bicogo       |18        |
|Pedro Lei

##### Player Dimension

In [43]:
# Player dimension seed: distinct (player_id, player) pairs found in the events view.
Player_dim = spark.sql("""

            SELECT DISTINCT player_id,player
            FROM events
    """)

In [39]:
Player_dim.show()

+---------+--------------------+
|player_id|              player|
+---------+--------------------+
|  10844.0|Alfredo Kulembe R...|
|  70835.0|Mohamed Hamdy Sharaf|
|   4388.0| Kouakou Herve Koffi|
|   5229.0|      Ayoub El Kaabi|
|  58878.0|     Boubacar Traore|
| 408160.0|       Dramane Salou|
| 103207.0|Loide António Aug...|
|  23331.0|    Boubakar Kouyate|
| 143165.0|       Mohamed Chibi|
|  88123.0|Cheikh Ahmadou Ba...|
|   4385.0|       Yves Bissouma|
|  12557.0|        Falaye Sacko|
|   5237.0|        Hakim Ziyech|
| 105765.0|Jesús Lázaro Owon...|
|  12111.0|     Rayan Aït Nouri|
|  52324.0|Saúl Basilio Coco...|
|   8992.0|     Ibrahim Sissoko|
|  10865.0|Witiness Chimoio ...|
|  10573.0|Jonathan Buatu Ma...|
| 176683.0|     Ibrahim Diakité|
+---------+--------------------+
only showing top 20 rows



In [40]:
Player_dim.printSchema()

root
 |-- player_id: double (nullable = true)
 |-- player: string (nullable = true)



In [44]:
# player_id arrives as a double; the dimension keeps it as an integer.
player_id_as_int = Player_dim['player_id'].cast(IntegerType())
Player_dim = Player_dim.withColumn('player_id', player_id_as_int)

In [45]:
# Row count while the null player_id row(s) are still present.
players_before_drop = Player_dim.count()
print(f'Count of distinct players in tournment before drop null: {players_before_drop}')

Count of distinct players in tournment before drop null: 210


In [46]:
# Discard rows that carry no player id.
has_player_id = col('player_id').isNotNull()
Player_dim = Player_dim.filter(has_player_id)

In [47]:
# Row count once the null player_id row(s) are gone.
players_after_drop = Player_dim.count()
print(f'Count of distinct players in tournment after drop null: {players_after_drop}')

Count of distinct players in tournment after drop null: 209


##### Date Dimension

In [None]:
def get_max_min_date_from_src():
    """Build a date dimension spanning the match dates in the ``Matches`` view.

    The calendar runs from one day before the earliest ``match_date`` to one
    day after the latest one, with one row per day.

    Returns
    -------
    pyspark.sql.DataFrame
        Columns ``id`` (0-based day offset from the first calendar day),
        ``date``, ``year``, ``month`` and ``day_of_week``.

    Raises
    ------
    ValueError
        If ``Matches`` holds no ``match_date`` values.
    """
    bounds = spark.sql("""
                        SELECT
                                MIN(match_date) AS min_match_date,
                                MAX(match_date) AS max_match_date
                        FROM
                                Matches
            """).collect()[0]

    if bounds['min_match_date'] is None or bounds['max_match_date'] is None:
        raise ValueError("Matches has no match_date values; cannot build the date dimension")

    # match_date is a string column; assumes 'YYYY-MM-DD' text -- TODO confirm against the data.
    # The bounds are parsed on the driver because spark.range needs plain integers, and
    # Column objects offer no date arithmetic methods such as subtract/add.
    date_format = "%Y-%m-%d"
    first_day = datetime.strptime(bounds['min_match_date'], date_format).date() - timedelta(days=1)
    last_day = datetime.strptime(bounds['max_match_date'], date_format).date() + timedelta(days=1)
    day_count = (last_day - first_day).days + 1

    # spark.range yields `id` = 0 .. day_count - 1, used as the day offset from first_day.
    dates = spark.range(day_count)
    dates = dates.withColumn(
        "date",
        expr(f"date_add(DATE'{first_day.isoformat()}', CAST(id AS INT))"),
    )
    dates = dates.withColumn("year", year(col("date")))
    dates = dates.withColumn("month", month(col("date")))
    dates = dates.withColumn("day_of_week", dayofweek(col("date")))
    return dates

In [None]:
get_max_min_date_from_src().show()

##### Event fact table

In [192]:
# Re-read the raw events so the fact table is built from the untouched source frame.
events_df = spark.read.parquet("gs://cloud_bucket_dbt/match_events.parquet")

In [170]:
events_df.printSchema()

root
 |-- foul_committed_penalty: boolean (nullable = true)
 |-- foul_won_penalty: boolean (nullable = true)
 |-- player: string (nullable = true)
 |-- player_id: double (nullable = true)
 |-- match_id: double (nullable = true)
 |-- team: string (nullable = true)
 |-- team_id: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- type: string (nullable = true)
 |-- location: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- play_pattern: string (nullable = true)
 |-- foul_committed_card: string (nullable = true)
 |-- foul_committed_offensive: boolean (nullable = true)
 |-- foul_committed_type: string (nullable = true)
 |-- foul_won_defensive: boolean (nullable = true)
 |-- minute: double (nullable = true)
 |-- pass_goal_assist: boolean (nullable = true)
 |-- pass_shot_assist: boolean (nullable = true)
 |-- pass_outcome: string (nullable = true)
 |-- pass_cross: boolean (nullable = true)
 |-- position: string (nullable = true)
 |-- possession:

In [193]:
# Keep only events that carry a player id, and drop the player name column.
Events_fact = events_df.filter(col('player_id').isNotNull()).drop('player')

In [194]:
### get the team ids from Team dimension
# Replace the source team_id with the Team dimension's id, matched on the team name.
# The join key is spelled `team`, exactly as the column is named, so the join does not
# depend on case-insensitive column resolution. The id is parked as `team_event_id`
# so it cannot clash with the `team_id` column of a later Team_dim join.
Events_fact = (
    Events_fact
    .drop('team_id')
    .join(Team_dim, on='team', how='leftouter')
    .drop('team')
    .withColumnRenamed('team_id', 'team_event_id')
)

In [176]:
Events_fact.printSchema()

root
 |-- foul_committed_penalty: boolean (nullable = true)
 |-- foul_won_penalty: boolean (nullable = true)
 |-- player_id: double (nullable = true)
 |-- match_id: double (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- type: string (nullable = true)
 |-- location: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- play_pattern: string (nullable = true)
 |-- foul_committed_card: string (nullable = true)
 |-- foul_committed_offensive: boolean (nullable = true)
 |-- foul_committed_type: string (nullable = true)
 |-- foul_won_defensive: boolean (nullable = true)
 |-- minute: double (nullable = true)
 |-- pass_goal_assist: boolean (nullable = true)
 |-- pass_shot_assist: boolean (nullable = true)
 |-- pass_outcome: string (nullable = true)
 |-- pass_cross: boolean (nullable = true)
 |-- position: string (nullable = true)
 |-- possession: double (nullable = true)
 |-- interception_outcome: string (nullable = true)
 |-- shot_outcome: string (nullable

In [195]:
### get possession team ids from team dimension
# Match the possession team's name to the Team dimension, keep only the resulting id,
# and expose it as possession_team_id.
possession_match = col("possession_team") == Team_dim.team
Events_fact = Events_fact.join(Team_dim, possession_match, how='leftouter')
Events_fact = Events_fact.drop(Team_dim.team).drop("possession_team")
Events_fact = Events_fact.withColumnRenamed("team_id","possession_team_id")

In [196]:
# Give the event team's id its final name now that possession_team_id has been derived.
Events_fact = Events_fact.withColumnRenamed('team_event_id', 'team_id')

In [197]:
# Remove the `source` column from the fact table.
# NOTE(review): the printed events schema is truncated, so confirm this column exists.
Events_fact = Events_fact.drop('source')

In [198]:
## Hash-based Surrogate Key for events fact table
# Columns whose values are combined, in this order, to form the event key.
hash_cols = ["player_id","match_id","team_id","type","timestamp"]

In [199]:
def create_hash(player_id, match_id, team_id, event, timestamp):
  """Join the five key values into one underscore-separated string.

  Each value is rendered with its default string form, so ``None`` becomes
  the text ``"None"``. No hashing happens here; the caller hashes the result.
  """
  parts = (player_id, match_id, team_id, event, timestamp)
  return "_".join(format(part) for part in parts)

# Spark UDF wrapper around create_hash, returning the combined key as a string.
create_hash_udf = udf(create_hash, StringType())

# event_id is the MD5 digest of the combined key built from the hash_cols columns.
Events_fact = Events_fact.withColumn("event_id", md5(create_hash_udf(*hash_cols)))

In [200]:
# Register the fact table (as it stands at this point) for the SQL duplicate checks.
Events_fact.createOrReplaceTempView('events_fact')

In [204]:
spark.sql("""SELECT *
    FROM (
            SELECT 
                row_number() OVER(PARTITION BY EVENT_ID ORDER BY EVENT_ID) AS ROW_NUM,
                *
            FROM EVENTS_FACT)L1
    WHERE L1.ROW_NUM > 1""").show()

+-------+----------------------+----------------+---------+---------+------------+-------------+-------------+--------------+-------------------+------------------------+-------------------+------------------+------+----------------+----------------+------------+----------+--------------+----------+--------------------+------------+-------+------------------+--------------------+
|ROW_NUM|foul_committed_penalty|foul_won_penalty|player_id| match_id|   timestamp|         type|     location|  play_pattern|foul_committed_card|foul_committed_offensive|foul_committed_type|foul_won_defensive|minute|pass_goal_assist|pass_shot_assist|pass_outcome|pass_cross|      position|possession|interception_outcome|shot_outcome|team_id|possession_team_id|            event_id|
+-------+----------------------+----------------+---------+---------+------------+-------------+-------------+--------------+-------------------+------------------------+-------------------+------------------+------+----------------+-

In [183]:
## DROP DUPLICATE EVENTS 
# Rows are numbered within each EVENT_ID partition and only the first row is kept, so the
# result holds one row per event_id. The helper ROW_NUM column remains in the result.
# NOTE(review): ordering by the partition key itself leaves the surviving row arbitrary.
Events_fact = spark.sql("""
    SELECT *
    FROM (
            SELECT 
                row_number() OVER(PARTITION BY EVENT_ID ORDER BY EVENT_ID) AS ROW_NUM,
                *
            FROM EVENTS_FACT)L1
    WHERE L1.ROW_NUM = 1
        
    """)

In [104]:
Events_fact.count()

25622

In [184]:
# Prefix the generic event columns with `event_`.
for current_name, final_name in (
    ('type', 'event_type'),
    ('timestamp', 'event_timestamp'),
    ('minute', 'event_minute'),
    ('location', 'event_location'),
):
    Events_fact = Events_fact.withColumnRenamed(current_name, final_name)

In [107]:
Events_fact.printSchema()

root
 |-- ROW_NUM: integer (nullable = false)
 |-- foul_committed_penalty: boolean (nullable = true)
 |-- foul_won_penalty: boolean (nullable = true)
 |-- player_id: double (nullable = true)
 |-- match_id: double (nullable = true)
 |-- event_timestamp: string (nullable = true)
 |-- event_type: string (nullable = true)
 |-- event_location: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- play_pattern: string (nullable = true)
 |-- foul_committed_card: string (nullable = true)
 |-- foul_committed_offensive: boolean (nullable = true)
 |-- foul_committed_type: string (nullable = true)
 |-- foul_won_defensive: boolean (nullable = true)
 |-- event_minute: double (nullable = true)
 |-- pass_goal_assist: boolean (nullable = true)
 |-- pass_shot_assist: boolean (nullable = true)
 |-- pass_outcome: string (nullable = true)
 |-- pass_cross: boolean (nullable = true)
 |-- position: string (nullable = true)
 |-- possession: double (nullable = true)
 |-- possession_team: 

In [185]:
# Final column order of the events fact table; columns not listed here are left out.
events_fact_columns = [
    # keys
    'event_id', 'player_id', 'match_id', 'team_id',
    # event description
    'event_type', 'event_timestamp', 'event_minute', 'event_location',
    'play_pattern', 'position',
    # pass attributes
    'pass_outcome', 'pass_cross', 'pass_goal_assist', 'pass_shot_assist',
    # foul attributes
    'foul_committed_type', 'foul_committed_card', 'foul_committed_offensive',
    'foul_won_defensive', 'foul_committed_penalty', 'foul_won_penalty',
    # outcomes and possession
    'shot_outcome', 'interception_outcome', 'possession', 'possession_team_id',
]
Events_fact = Events_fact.select(*events_fact_columns)

In [186]:
Events_fact.show(n=2)

+--------------------+---------+---------+-------+-------------+---------------+------------+--------------+--------------+--------------+------------+----------+----------------+----------------+-------------------+-------------------+------------------------+------------------+----------------------+----------------+------------+--------------------+----------+------------------+
|            event_id|player_id| match_id|team_id|   event_type|event_timestamp|event_minute|event_location|  play_pattern|      position|pass_outcome|pass_cross|pass_goal_assist|pass_shot_assist|foul_committed_type|foul_committed_card|foul_committed_offensive|foul_won_defensive|foul_committed_penalty|foul_won_penalty|shot_outcome|interception_outcome|possession|possession_team_id|
+--------------------+---------+---------+-------+-------------+---------------+------------+--------------+--------------+--------------+------------+----------+----------------+----------------+-------------------+--------------

In [188]:
Events_fact.select('event_location').distinct().count()

18483

In [190]:
# Preview the distinct event locations straight from the DataFrame. The `events_fact`
# temp view was registered before `location` was renamed to `event_location`, so querying
# that column through the view fails when the notebook is run top to bottom.
Events_fact.select('event_location').distinct().show()

+--------------+
|event_location|
+--------------+
|  [29.4, 16.3]|
|  [43.0, 14.7]|
|  [64.8, 66.5]|
|  [49.7, 16.5]|
|  [27.0, 53.9]|
|  [64.9, 69.6]|
|  [52.0, 45.2]|
|  [61.4, 76.1]|
|   [76.7, 9.9]|
|  [43.3, 74.4]|
|  [51.8, 69.5]|
|   [78.8, 6.1]|
|  [76.1, 21.4]|
|  [39.0, 51.3]|
|  [72.8, 73.3]|
|  [57.9, 33.2]|
|  [43.4, 71.2]|
|    [6.6, 3.7]|
|  [68.2, 32.0]|
|  [55.8, 39.1]|
+--------------+
only showing top 20 rows



In [None]:
# Stop the Spark session at the end of the notebook.
spark.stop()