In [1]:
import pyspark
from pyspark.sql import SparkSession, Row, SQLContext
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.functions import * 
from datetime import datetime, timedelta
from pyspark.sql.types import *

In [2]:
import os

# Path to the GCP service-account key file used by the GCS connector.
# It can be overridden with the GOOGLE_APPLICATION_CREDENTIALS environment
# variable so the notebook is not tied to one machine's directory layout;
# the original location is kept as the fallback.
credentials_location = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "/home/abdol/AFCON_2023_DE_Stats/code/mage-spark/keys/my-creds.json"
)

In [3]:
# Local Spark configuration with the GCS connector jar on the classpath and
# service-account authentication pointed at `credentials_location`.
conf = SparkConf().setMaster('local[*]').setAppName('test').setAll([
    ("spark.jars", "/home/abdol/AFCON_2023_DE_Stats/lib/gcs-connector-hadoop3-2.2.5.jar"),
    ("spark.hadoop.google.cloud.auth.service.account.enable", "true"),
    ("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location),
])

In [4]:
# Reuse the running SparkContext when this cell is executed again; constructing
# a second SparkContext in the same kernel raises an error.
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# Handle to the Hadoop configuration of the JVM-side SparkContext
# (reached through the private `_jsc` attribute).
hadoop_conf = sc._jsc.hadoopConfiguration()

In [6]:
# Register the GCS connector classes for the gs:// scheme and enable
# service-account authentication with the configured key file.
for hadoop_key, hadoop_value in (
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.gs.auth.service.account.json.keyfile", credentials_location),
    ("fs.gs.auth.service.account.enable", "true"),
):
    hadoop_conf.set(hadoop_key, hadoop_value)

In [None]:
# Create (or fetch) the SparkSession using the settings of the SparkContext
# that is already running.
session_builder = SparkSession.builder.config(conf=sc.getConf())
spark = session_builder.getOrCreate()

In [None]:
# Load the matches dataset from the GCS bucket.
# NOTE(review): the object name is spelled "acfon" — confirm it matches the file in the bucket.
matches_df = spark.read.parquet("gs://cloud_bucket_dbt/acfon_matches.parquet")

In [None]:
# Number of rows loaded into matches_df.
matches_df.count()

In [None]:
# Print the column names and types Spark read for matches_df.
matches_df.printSchema()

In [None]:
# Expose the matches data to Spark SQL under the view name `Matches`.
# createOrReplaceTempView supersedes registerTempTable (deprecated since
# Spark 2.0) and can safely be re-run.
matches_df.createOrReplaceTempView('Matches')

In [None]:
# Preview the first 5 match rows without truncating long cell values.
matches_df.show(n=5,truncate=False)

In [None]:
# Load the match-events dataset from the GCS bucket.
events_df = spark.read.parquet("gs://cloud_bucket_dbt/match_events.parquet")

In [None]:
# Number of rows loaded into events_df.
events_df.count()

In [None]:
# Print the column names and types Spark read for events_df.
events_df.printSchema()

In [None]:
# Show one untruncated sample event that has a player_id.
events_df.filter(events_df.player_id.isNotNull()).show(n=1,truncate=False)

In [None]:
# Expose the events data to Spark SQL under the view name `events`.
# createOrReplaceTempView supersedes registerTempTable (deprecated since
# Spark 2.0) and can safely be re-run.
events_df.createOrReplaceTempView('events')

In [None]:
# List the distinct values present in the shot_outcome column of the events view.
spark.sql("""
        SELECT DISTINCT shot_outcome 
        FROM events
    """).show(truncate=False)

In [None]:
import numpy as np

In [None]:
# Count events that have no player_id.
events_df.filter(events_df.player_id.isNull()).count()

In [None]:
# Count events that do have a player_id.
events_df.filter(events_df.player_id.isNotNull()).count()

##### Define dimensions

##### Team Dimension

In [None]:
# Seed the team dimension with the distinct home_team values from Matches.
# NOTE(review): only home_team is read here, so a team that never appears as
# the home side would be missing — confirm that is acceptable.
Team_dim = spark.sql("""
            SELECT DISTINCT 
            home_team 
            FROM Matches
    """)

In [None]:
# Number of distinct home teams.
Team_dim.count()

In [None]:
# Rename the column to `team`, the name events_df uses for the team name.
Team_dim = Team_dim.withColumnRenamed('home_team','team')

In [None]:
# Attach team_id by matching on the team name against the distinct
# (team_id, team) pairs found in events_df. The left outer join keeps teams
# with no match; their team_id is null.
Team_dim  = Team_dim.join(events_df.select('team_id','team').distinct(),on='team',how='leftouter')

In [None]:
# Cast team_id to an integer column.
Team_dim = Team_dim.withColumn('team_id',Team_dim.team_id.cast(IntegerType()))

In [None]:
# Row count after the join; a higher number than before the join would mean
# some team name matched more than one team_id.
Team_dim.count()

In [None]:
# Print the column names and types of the team dimension.
Team_dim.printSchema()

In [None]:
# Reorder the columns so the key comes first: (team_id, team).
Team_dim = Team_dim.select('team_id', 'team')

In [None]:
# Show the teams that ended up without a team_id.
Team_dim.filter(Team_dim.team_id.isNull()).show()

In [None]:
import random

In [None]:
# def generate_random_int_not_in_list(excluded_values, team_id):
#     if team_id is None:
#         while True:
#             random_int = random.randint(1, 1000)
#             if random_int not in excluded_values:
#                 return int(random_int)
#     else:
#         return int(team_id)

In [None]:
# Gather the team_id values of Team_dim into a single Python list on the driver.
excluded_teams = Team_dim.select(collect_list('team_id')).collect()[0][0]

In [None]:
# Wrap the list of existing team ids in a Spark broadcast variable.
excluded_teams_ids = spark.sparkContext.broadcast(excluded_teams)

In [None]:
def generate_random_int_not_in_list(team_id, x_list=None):
    """Return ``team_id`` as an int, or a random unused id when it is missing.

    Args:
        team_id: Existing team identifier, or ``None`` when the team has no id.
        x_list: Iterable of ids that must not be handed out. When omitted, the
            value of the ``excluded_teams_ids`` broadcast variable is looked up
            at call time (not at definition time, so it never goes stale).

    Returns:
        ``int(team_id)`` when ``team_id`` is not ``None``; otherwise an int in
        ``[1, 1000]`` that does not appear in ``x_list``.

    Raises:
        ValueError: If every id in ``[1, 1000]`` is already excluded. The
            previous retry loop would never terminate in that situation.

    Note:
        Each call draws independently, so two rows with a missing id can
        receive the same value.
    """
    if team_id is not None:
        return int(team_id)

    if x_list is None:
        x_list = excluded_teams_ids.value

    # Pick uniformly among the ids that are still free instead of retrying
    # until a free one happens to be drawn.
    free_ids = sorted(set(range(1, 1001)).difference(x_list))
    if not free_ids:
        raise ValueError("no free team id left in the range 1..1000")
    return random.choice(free_ids)


In [None]:
# The wrapped function draws random numbers, so mark the UDF as
# non-deterministic; otherwise Spark's optimizer is free to assume that
# repeated evaluations return the same value.
generate_random_int_udf = udf(generate_random_int_not_in_list, IntegerType()).asNondeterministic()

In [None]:
# Fill in missing team ids: rows that already have a team_id keep it, rows
# without one get a random id from the UDF.
Team_dim = Team_dim.withColumn("team_id", generate_random_int_udf("team_id"))

In [None]:
# Display the team dimension without truncating long values.
Team_dim.show(truncate=False)

##### Stadiums Dimension

In [None]:
# Distinct stadium values from the Matches view form the stadium dimension.
Staduim = spark.sql("""
            SELECT DISTINCT stadium
            FROM Matches
    """)

In [None]:
# Add a surrogate key column. Per the Spark API docs, monotonically_increasing_id()
# values are unique and increasing but not guaranteed to be consecutive.
Staduim = Staduim.withColumn('Staduim_ID',monotonically_increasing_id())

In [None]:
# Reorder the columns so the key comes first: (Staduim_ID, stadium).
Staduim = Staduim.select('Staduim_ID', 'stadium')

In [None]:
# Display the stadium dimension without truncating long values.
Staduim.show(truncate=False)

##### Referee Dimension

In [None]:
# Distinct referee values from the Matches view form the referee dimension.
Referee = spark.sql("""
                    SELECT DISTINCT referee
                    FROM Matches
            """)

In [None]:
# Add a surrogate key column. Per the Spark API docs, monotonically_increasing_id()
# values are unique and increasing but not guaranteed to be consecutive.
Referee = Referee.withColumn('referee_id', monotonically_increasing_id())

In [None]:
# Reorder the columns so the key comes first: (referee_id, referee).
Referee = Referee.select('referee_id', 'referee')

In [None]:
# Display the referee dimension without truncating long values.
Referee.show(truncate=False)

##### Manager Dimension

In [None]:
# Seed the manager dimension with the distinct home_managers values from Matches.
# NOTE(review): only home_managers is read here, so a manager who never appears
# on the home side would be missing — confirm that is acceptable.
Manager = spark.sql("""
                    SELECT DISTINCT home_managers 
                    FROM Matches
            """)

In [None]:
# Add a surrogate key column. Per the Spark API docs, monotonically_increasing_id()
# values are unique and increasing but not guaranteed to be consecutive.
Manager = Manager.withColumn("manager_id", monotonically_increasing_id())

In [None]:
# Give the name column a side-neutral name: home_managers -> manager.
Manager = Manager.withColumnRenamed('home_managers', 'manager')

In [None]:
# Number of rows in the manager dimension.
Manager.count()

In [None]:
# Display the manager dimension without truncating long values.
Manager.show(truncate=False)

##### Player Dimension

In [None]:
# Distinct (player_id, player) pairs from the events view form the player dimension.
# NOTE(review): the events view is not filtered here, so events without a
# player would contribute a null row — confirm whether that is intended.
Player_dim = spark.sql("""

            SELECT DISTINCT player_id,player
            FROM events
    """)

In [None]:
# Preview the player dimension.
Player_dim.show()

In [None]:
# Print the column names and types of the player dimension.
Player_dim.printSchema()

In [None]:
# Cast player_id to an integer column.
Player_dim = Player_dim.withColumn('player_id',Player_dim.player_id.cast(IntegerType()))

##### Date Dimension

In [None]:
def get_max_min_date_from_src():
    """Build a date dimension covering the match dates found in ``Matches``.

    The calendar runs from one day before the earliest ``match_date`` to one
    day after the latest ``match_date`` (both ends included), one row per day.

    Returns:
        DataFrame with the columns ``id`` (0-based position of the day within
        the range), ``date``, ``year``, ``month`` and ``day_of_week``. The
        result has no rows when ``Matches`` yields no non-null date.
    """
    # Compute the padded bounds inside Spark so they remain DATE values.
    # Column functions such as to_date()/date_add() only build expressions;
    # they cannot do arithmetic on values collected to the driver, and
    # spark.range() accepts integers only.
    bounds = spark.sql("""
                        SELECT
                                DATE_SUB(MIN(TO_DATE(match_date)), 1) AS start_date,
                                DATE_ADD(MAX(TO_DATE(match_date)), 1) AS end_date
                        FROM
                                Matches
            """)

    # sequence() over two dates steps one day at a time; posexplode() turns the
    # resulting array into one row per day together with its position.
    dates = bounds.select(
        posexplode(sequence(col("start_date"), col("end_date"))).alias("id", "date")
    )

    # Calendar attributes derived from each date.
    dates = dates.withColumn("year", year(col("date")))
    dates = dates.withColumn("month", month(col("date")))
    dates = dates.withColumn("day_of_week", dayofweek(col("date")))
    return dates

In [None]:
# Display the date dimension produced by get_max_min_date_from_src().
get_max_min_date_from_src().show()

##### Event fact table

In [None]:
# Keep only events attributed to a player; rows with a null player_id are dropped.
Events_fact = events_df.filter(events_df.player_id.isNotNull())

In [None]:
# Expose the player-level events to Spark SQL under the view name `Events_fact`.
# createOrReplaceTempView supersedes registerTempTable (deprecated since
# Spark 2.0) and can safely be re-run.
Events_fact.createOrReplaceTempView('Events_fact')

In [None]:
# Count the distinct player values in the Events_fact view.
spark.sql("""
            SELECT COUNT (DISTINCT PLAYER)
            FROM Events_fact
        """).show()

In [None]:
# Stop the Spark session at the end of the notebook.
spark.stop()