In [45]:
import pyspark
from pyspark.sql import SparkSession, Row, SQLContext
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.functions import * 
from datetime import datetime, timedelta
from pyspark.sql.types import *
import random

In [2]:
credentials_location = "/home/abdol/keys/afcon_project/my-creds.json"

In [3]:
GCS_connector = "/home/abdol/AFCON_2023_DE_Stats/lib/gcs-connector-hadoop3-2.2.5.jar" 
GBQ_connector = "/home/abdol/AFCON_2023_DE_Stats/lib/spark-3.3-bigquery-0.36.1.jar"
conf_jars = f"{GCS_connector},{GBQ_connector}"

In [4]:
temp_GCS_Bucket = "afcon_datalake"
bucket_name = "afcon_datalake"

In [5]:
conf = SparkConf() \
    .setMaster('local[*]') \
    .setAppName('test') \
    .set("spark.jars", conf_jars) \
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location) \
    .set("temporaryGcsBucket",temp_GCS_Bucket)

In [6]:
sc = SparkContext(conf=conf)

In [7]:
hadoop_conf = sc._jsc.hadoopConfiguration()

In [8]:
hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

In [9]:
spark = SparkSession.builder \
    .config(conf=sc.getConf()) \
    .getOrCreate()

In [10]:
matches_df = spark.read.parquet("gs://{}/acfon_matches.parquet".format(bucket_name))

In [20]:
matches_df.printSchema()

root
 |-- match_id: long (nullable = true)
 |-- match_date: string (nullable = true)
 |-- kick_off: string (nullable = true)
 |-- competition: string (nullable = true)
 |-- season: string (nullable = true)
 |-- home_team: string (nullable = true)
 |-- away_team: string (nullable = true)
 |-- home_score: long (nullable = true)
 |-- away_score: long (nullable = true)
 |-- match_week: long (nullable = true)
 |-- competition_stage: string (nullable = true)
 |-- stadium: string (nullable = true)
 |-- referee: string (nullable = true)
 |-- home_managers: string (nullable = true)
 |-- away_managers: string (nullable = true)
 |-- source: string (nullable = true)
 |-- home_team_id: long (nullable = true)
 |-- away_team_id: long (nullable = true)



In [11]:
matches_df.registerTempTable('Matches')



In [None]:
matches_df.toPandas()

In [13]:
team_dim = spark.read.format('bigquery') \
  .option('parentProject', 'data-engineering-afcon-2023') \
  .option('table', 'data-engineering-afcon-2023:afcon_events.team_dim') \
  .load()

In [14]:
team_dim.createOrReplaceTempView('team')

In [None]:
spark.sql("""
            SELECT *
            FROM team
            WHERE team_id in (775,3374)
        """).show()

In [None]:
team_dim.show()

In [15]:
team_dim = broadcast(team_dim)

In [16]:
matches_df = matches_df.join(team_dim, col("home_team") == team_dim.team, how='leftouter') \
                        .drop('team') \
                        .withColumnRenamed('team_id', 'home_team_id') \
                        .join(team_dim, col("away_team") == team_dim.team, how='leftouter') \
                        .drop('team') \
                        .withColumnRenamed('team_id', 'away_team_id')

In [None]:
matches_df.toPandas()

In [17]:
manager_dim = spark.read.format('bigquery') \
  .option('parentProject', 'data-engineering-afcon-2023') \
  .option('table', 'data-engineering-afcon-2023:afcon_events.manager_dim') \
  .load()

In [25]:
manager_dim.registerTempTable("manager")



In [19]:
manager_dim.printSchema()

root
 |-- manager: string (nullable = true)
 |-- manager_id: long (nullable = false)



In [18]:
manager_dim = broadcast(manager_dim)

In [23]:
matches_df = matches_df.join(manager_dim, col("home_managers") == manager_dim.manager, how='leftouter') \
                        .drop("manager") \
                        .withColumnRenamed("manager_id", "home_manager_id") \
                        .join(manager_dim , col("away_managers") == manager_dim.manager, how='leftouter') \
                        .drop("manager") \
                        .withColumnRenamed("manager_id", "away_manager_id")

In [24]:
matches_df.toPandas()

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_week,competition_stage,stadium,referee,home_managers,away_managers,source,home_team_id,away_team_id,home_manager_id,away_manager_id
0,3923881,2024-02-11,22:00:00.000,Africa - African Cup of Nations,2023,Nigeria,Côte d'Ivoire,1,2,8,Final,Stade Olympique Alassane Ouattara,Dahane Beida,José Vítor dos Santos Peseiro,Emerse Faé,acfon_matches,775,3374,6,0.0
1,3923880,2024-02-10,22:00:00.000,Africa - African Cup of Nations,2023,South Africa,Congo DR,0,0,7,3rd Place Final,Stade Félix Houphouët-Boigny,Bamlak Tessema Weyesa,Hugo Henri Broos,Sébastien Desabre,acfon_matches,4976,4881,17,23.0
2,3922838,2024-02-07,22:00:00.000,Africa - African Cup of Nations,2023,Côte d'Ivoire,Congo DR,1,0,6,Semi-finals,Stade Olympique Alassane Ouattara,Ibrahim Mutaz,Emerse Faé,Sébastien Desabre,acfon_matches,3374,4881,0,23.0
3,3922837,2024-02-07,19:00:00.000,Africa - African Cup of Nations,2023,Nigeria,South Africa,1,1,6,Semi-finals,Stade de Bouaké,Amin Mohamed Omar,José Vítor dos Santos Peseiro,Hugo Henri Broos,acfon_matches,775,4976,6,17.0
4,3922242,2024-01-29,22:00:00.000,Africa - African Cup of Nations,2023,Senegal,Côte d'Ivoire,1,1,4,Round of 16,Stade Charles Konan Banny de Yamoussoukro,Pierre Ghislain Atcho,Aliou Cissé,Emerse Faé,acfon_matches,787,3374,13,0.0
5,3922239,2024-01-28,19:00:00.000,Africa - African Cup of Nations,2023,Equatorial Guinea,Guinea,0,1,4,Round of 16,Stade Olympique Alassane Ouattara,Omar Abdulkadir Artan,Juan Micha Obiang Bicogo,Kaba Diawara,acfon_matches,4992,4892,18,20.0
6,3922659,2024-02-03,19:00:00.000,Africa - African Cup of Nations,2023,Mali,Côte d'Ivoire,1,2,5,Quarter-finals,Stade de Bouaké,Mohamed Adel Elsaid,Éric Chelle,Emerse Faé,acfon_matches,4930,3374,3,0.0
7,3922658,2024-02-03,22:00:00.000,Africa - African Cup of Nations,2023,Cape Verde Islands,South Africa,0,0,5,Quarter-finals,Stade Charles Konan Banny de Yamoussoukro,Jean Jacques Ndala Ngambo,Pedro Leitão Brito,Hugo Henri Broos,acfon_matches,4924,4976,19,17.0
8,3922244,2024-01-30,19:00:00.000,Africa - African Cup of Nations,2023,Mali,Burkina Faso,2,1,4,Round of 16,Stade Amadou Gon Coulibaly,Ibrahim Mutaz,Éric Chelle,Hubert Velud,acfon_matches,4930,4921,3,21.0
9,3922356,2024-02-02,22:00:00.000,Africa - African Cup of Nations,2023,Congo DR,Guinea,3,1,5,Quarter-finals,Stade Olympique Alassane Ouattara,Mustapha Ghorbal,Sébastien Desabre,Kaba Diawara,acfon_matches,4881,4892,23,20.0


In [26]:
spark.sql("""
        SELECT *
        FROM MANAGER
        WHERE MANAGER_ID IN (6,0)
""").show()

+--------------------+----------+
|             manager|manager_id|
+--------------------+----------+
|          Emerse Faé|         0|
|José Vítor dos Sa...|         6|
+--------------------+----------+



In [28]:
stadium_dim = spark.read.format('bigquery') \
  .option('parentProject', 'data-engineering-afcon-2023') \
  .option('table', 'data-engineering-afcon-2023:afcon_events.stadium_dim') \
  .load()

In [31]:
stadium_dim.printSchema()

root
 |-- stadium_id: long (nullable = false)
 |-- stadium: string (nullable = true)



In [29]:
stadium_dim.createOrReplaceTempView('stadium')

In [32]:
matches_df = matches_df.join(broadcast(stadium_dim), on='stadium', how='leftouter') \
                        .drop('stadium')

In [37]:
matches_df.toPandas().head(2)

Unnamed: 0,match_id,match_date,kick_off,competition,season,home_team,away_team,home_score,away_score,match_week,competition_stage,home_managers,away_managers,source,home_team_id,away_team_id,home_manager_id,away_manager_id,stadium_id,referee_id
0,3923881,2024-02-11,22:00:00.000,Africa - African Cup of Nations,2023,Nigeria,Côte d'Ivoire,1,2,8,Final,José Vítor dos Santos Peseiro,Emerse Faé,acfon_matches,775,3374,6,0.0,5,16
1,3923880,2024-02-10,22:00:00.000,Africa - African Cup of Nations,2023,South Africa,Congo DR,0,0,7,3rd Place Final,Hugo Henri Broos,Sébastien Desabre,acfon_matches,4976,4881,17,23.0,4,19


In [35]:
referee_dim = spark.read.format('bigquery') \
  .option('parentProject', 'data-engineering-afcon-2023') \
  .option('table', 'data-engineering-afcon-2023:afcon_events.referee_dim') \
  .load()

In [39]:
referee_dim.printSchema()

root
 |-- referee_id: long (nullable = false)
 |-- referee: string (nullable = true)



In [36]:
matches_df = matches_df.join(broadcast(referee_dim), on='referee', how='leftouter') \
                        .drop('referee')

In [38]:
matches_df.printSchema()

root
 |-- match_id: long (nullable = true)
 |-- match_date: string (nullable = true)
 |-- kick_off: string (nullable = true)
 |-- competition: string (nullable = true)
 |-- season: string (nullable = true)
 |-- home_team: string (nullable = true)
 |-- away_team: string (nullable = true)
 |-- home_score: long (nullable = true)
 |-- away_score: long (nullable = true)
 |-- match_week: long (nullable = true)
 |-- competition_stage: string (nullable = true)
 |-- home_managers: string (nullable = true)
 |-- away_managers: string (nullable = true)
 |-- source: string (nullable = true)
 |-- home_team_id: long (nullable = true)
 |-- away_team_id: long (nullable = true)
 |-- home_manager_id: long (nullable = true)
 |-- away_manager_id: long (nullable = true)
 |-- stadium_id: long (nullable = true)
 |-- referee_id: long (nullable = true)



In [41]:
matches_columns = [
    
    'match_id',
    'home_team_id',
    'away_team_id',
    'home_manager_id',
    'away_manager_id',
    'stadium_id',
    'referee_id',
    'match_date',
    'match_week',
    'kick_off',
    'competition_stage',
    'home_score',
    'away_score'
    
]

In [42]:
matches_df = matches_df.select(matches_columns)

In [43]:
matches_df.printSchema()

root
 |-- match_id: long (nullable = true)
 |-- home_team_id: long (nullable = true)
 |-- away_team_id: long (nullable = true)
 |-- home_manager_id: long (nullable = true)
 |-- away_manager_id: long (nullable = true)
 |-- stadium_id: long (nullable = true)
 |-- referee_id: long (nullable = true)
 |-- match_date: string (nullable = true)
 |-- match_week: long (nullable = true)
 |-- kick_off: string (nullable = true)
 |-- competition_stage: string (nullable = true)
 |-- home_score: long (nullable = true)
 |-- away_score: long (nullable = true)



In [49]:
matches_df = matches_df.withColumn("match_id", matches_df.match_id.cast(IntegerType())) \
                      .withColumn("home_team_id", matches_df.home_team_id.cast(IntegerType())) \
                      .withColumn("away_team_id", matches_df.away_team_id.cast(IntegerType())) \
                      .withColumn("home_manager_id", matches_df.home_manager_id.cast(IntegerType())) \
                      .withColumn("away_manager_id", matches_df.away_manager_id.cast(IntegerType())) \
                      .withColumn("stadium_id", matches_df.stadium_id.cast(IntegerType())) \
                      .withColumn("referee_id", matches_df.referee_id.cast(IntegerType())) \
                      .withColumn("match_date", matches_df.match_date.cast(DateType())) \
                      .withColumn("match_week", matches_df.match_week.cast(IntegerType())) \
                      .withColumn("kick_off", matches_df.kick_off.cast(TimestampType())) \
                      .withColumn("home_score", matches_df.home_score.cast(IntegerType())) \
                      .withColumn("away_score", matches_df.away_score.cast(IntegerType()))

In [63]:
matches_df.printSchema()

root
 |-- match_id: integer (nullable = true)
 |-- home_team_id: integer (nullable = true)
 |-- away_team_id: integer (nullable = true)
 |-- home_manager_id: integer (nullable = true)
 |-- away_manager_id: integer (nullable = true)
 |-- stadium_id: integer (nullable = true)
 |-- referee_id: integer (nullable = true)
 |-- match_date: date (nullable = true)
 |-- match_week: integer (nullable = true)
 |-- kick_off: timestamp (nullable = true)
 |-- competition_stage: string (nullable = true)
 |-- home_score: integer (nullable = true)
 |-- away_score: integer (nullable = true)
 |-- goals_scored: integer (nullable = true)
 |-- penalties_finished: string (nullable = false)



In [55]:
matches_df = matches_df.withColumn('goals_scored', col("home_score") + col("away_score"))

In [62]:
matches_df.show(n=10)

+--------+------------+------------+---------------+---------------+----------+----------+----------+----------+-------------------+-----------------+----------+----------+------------+--------------------+
|match_id|home_team_id|away_team_id|home_manager_id|away_manager_id|stadium_id|referee_id|match_date|match_week|           kick_off|competition_stage|home_score|away_score|goals_scored|  penalties_finished|
+--------+------------+------------+---------------+---------------+----------+----------+----------+----------+-------------------+-----------------+----------+----------+------------+--------------------+
| 3923881|         775|        3374|              6|              0|         5|        16|2024-02-11|         8|2024-04-09 22:00:00|            Final|         1|         2|           3|No_Penalties_Fini...|
| 3923880|        4976|        4881|             17|             23|         4|        19|2024-02-10|         7|2024-04-09 22:00:00|  3rd Place Final|         0|         0|

In [59]:
spark.sql("""
            SELECT DISTINCT competition_stage
            FROM matches
    """).show()

+-----------------+
|competition_stage|
+-----------------+
|            Final|
|  3rd Place Final|
|      Round of 16|
|      Group Stage|
|      Semi-finals|
|   Quarter-finals|
+-----------------+



In [61]:
# define if match goes to penalites or not

matches_df = matches_df.withColumn('penalties_finished', \
                when((col("competition_stage") == "Group Stage"), 'No_Penalties_Allowed') \
                .when((col("competition_stage") != "Group Stage") & (col("goals_scored") == 0), 'Penalties_Finished') \
                .otherwise("No_Penalties_Finished"))

In [64]:
matches_df.printSchema()

root
 |-- match_id: integer (nullable = true)
 |-- home_team_id: integer (nullable = true)
 |-- away_team_id: integer (nullable = true)
 |-- home_manager_id: integer (nullable = true)
 |-- away_manager_id: integer (nullable = true)
 |-- stadium_id: integer (nullable = true)
 |-- referee_id: integer (nullable = true)
 |-- match_date: date (nullable = true)
 |-- match_week: integer (nullable = true)
 |-- kick_off: timestamp (nullable = true)
 |-- competition_stage: string (nullable = true)
 |-- home_score: integer (nullable = true)
 |-- away_score: integer (nullable = true)
 |-- goals_scored: integer (nullable = true)
 |-- penalties_finished: string (nullable = false)



In [None]:
matches_df.createOrReplaceView

In [68]:
## match_dim 
match_dim = matches_df.select(
                'match_id',
                'home_team_id',
                'away_team_id',
                'home_manager_id',
                'away_manager_id',
                'stadium_id',
                'referee_id',
                'match_week',
                'kick_off',
                'competition_stage'
        )

In [69]:
match_fact = matches_df.select(

            'match_id',
            'match_date',
            'competition_stage',
            'home_score',
            'away_score',
            'goals_scored',
            'penalties_finished'
    )

In [70]:
spark.stop()

In [None]:
# spark.stop()