In [1]:
import os

# NOTE: the original order is kept on purpose - the star imports below can
# shadow earlier names, so later lines must keep winning.
import pyspark
from pyspark.sql import SparkSession, Row, SQLContext
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.functions import *
from datetime import datetime, timedelta
from pyspark.sql.types import *
import random
from pyspark.sql.functions import udf, col

In [2]:
# Path to the service-account key file handed to the GCS connector settings.
# GOOGLE_APPLICATION_CREDENTIALS wins when it is set (and non-empty) so the
# notebook is not tied to one machine; otherwise the original location is used.
credentials_location = (
    os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
    or "/home/abdol/AFCON_2023_DE_Stats/code/mage-spark/keys/my-creds.json"
)

In [3]:
# GCS connector jar added to the Spark classpath through `spark.jars`.
gcs_connector_jar = "/home/abdol/AFCON_2023_DE_Stats/lib/gcs-connector-hadoop3-2.2.5.jar"

# Local-mode Spark configuration with service-account authentication enabled
# and pointed at the key file chosen in the previous cell.
conf = (
    SparkConf()
    .setMaster('local[*]')
    .setAppName('test')
    .set("spark.jars", gcs_connector_jar)
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true")
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)
)

In [4]:
# getOrCreate keeps this cell re-runnable: constructing a second SparkContext
# in the same kernel raises an error. When a context is already running it is
# returned as-is and `conf` is not re-applied.
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# Hadoop configuration of the running context, reached through the private
# `_jsc` attribute; the gs:// filesystem options are set on it in the next cell.
hadoop_conf = sc._jsc.hadoopConfiguration()

In [6]:
# Hadoop-level options for the gs:// scheme, applied in the order listed:
# filesystem implementation classes first, then service-account auth.
gcs_hadoop_settings = (
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.gs.auth.service.account.json.keyfile", credentials_location),
    ("fs.gs.auth.service.account.enable", "true"),
)
for setting_name, setting_value in gcs_hadoop_settings:
    hadoop_conf.set(setting_name, setting_value)

In [7]:
# Build the SQL session from the settings of the SparkContext created above.
session_builder = SparkSession.builder.config(conf=sc.getConf())
spark = session_builder.getOrCreate()

In [8]:
# Match data, read as Parquet from the GCS bucket.
matches_path = "gs://cloud_bucket_dbt/acfon_matches.parquet"
matches_df = spark.read.parquet(matches_path)

In [9]:
# Expose matches_df to spark.sql() under the name `Matches`.
# createOrReplaceTempView replaces the deprecated registerTempTable and can be
# re-run safely because it overwrites an existing view of the same name.
matches_df.createOrReplaceTempView('Matches')



In [10]:
# Match-event data, read as Parquet from the same GCS bucket.
events_path = "gs://cloud_bucket_dbt/match_events.parquet"
events_df = spark.read.parquet(events_path)

In [11]:
# Distinct home-team names taken from the `Matches` temp view.
# NOTE(review): only `home_team` is queried - confirm that no team appears
# exclusively in another column of Matches.
home_teams = spark.sql("""
            SELECT DISTINCT 
            home_team 
            FROM Matches
    """)

# Distinct (team_id, team) pairs present in the events data.
event_teams = events_df.select('team_id', 'team').distinct()

# Team dimension: every home team with its team_id cast to integer. The left
# outer join keeps teams that have no row in the events data, leaving their
# team_id null.
Team_dim = (
    home_teams
    .withColumnRenamed('home_team', 'team')
    .join(event_teams, on='team', how='leftouter')
    .withColumn('team_id', col('team_id').cast(IntegerType()))
    .select('team_id', 'team')
)

In [12]:
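# Reference snapshot of team_ids to exclude (appears to be an earlier hard-coded
# version of the `excluded_list` that is now collected from Team_dim below):
# x_list = [3374, 787, 4992, 4881, 4898, 4901, 4885, 788, 4892, 4978, 774, 4921, 4930]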

In [13]:
# Aggregate the team_id column of Team_dim into one list on the driver; the
# aggregation yields a single row whose only field is that list.
team_id_rows = Team_dim.select(collect_list('team_id')).collect()
excluded_list = team_id_rows[0][0]

In [15]:
# Wrap the excluded ids in a Spark broadcast variable.
active_context = spark.sparkContext
x_list = active_context.broadcast(excluded_list)

In [16]:
def generate_random_int_not_in_list(team_id, x_list=x_list.value, low=1, high=1000):
    """Return ``team_id`` as an int, or a random replacement id when it is missing.

    Args:
        team_id: Existing identifier, or ``None`` when the team has no id.
        x_list: Ids that must not be handed out as replacements. The default
            is read from the ``x_list`` broadcast once, when this ``def``
            statement runs, so re-run this cell after rebuilding the broadcast.
        low: Smallest replacement id that may be generated (inclusive).
        high: Largest replacement id that may be generated (inclusive).

    Returns:
        ``int(team_id)`` when ``team_id`` is not ``None``; otherwise an integer
        drawn uniformly from ``[low, high]`` that is not in ``x_list``.

    Raises:
        ValueError: If every integer in ``[low, high]`` is already excluded.
            The previous retry loop would never have terminated in that case.

    Note:
        Each call draws independently, so two missing ids can receive the same
        replacement; uniqueness across rows has to be enforced by the caller.
    """
    if team_id is not None:
        return int(team_id)

    # Hash-based membership instead of scanning the list for every candidate.
    taken = set(x_list)
    candidates = [value for value in range(low, high + 1) if value not in taken]
    if not candidates:
        raise ValueError(
            "no unused team_id available between %d and %d" % (low, high)
        )
    return random.choice(candidates)


In [17]:
# Spark treats UDFs as deterministic by default, which lets the optimizer
# drop or repeat calls. This one draws random numbers, so it is declared
# non-deterministic.
generate_random_int_udf = udf(
    generate_random_int_not_in_list, IntegerType()
).asNondeterministic()

In [18]:
# Replace team_id with the UDF result: existing ids pass through as integers,
# missing ones receive a generated value.
team_id_filled = generate_random_int_udf(col("team_id"))
Team_dim = Team_dim.withColumn("team_id", team_id_filled)

In [19]:
# Print the first 20 rows of the team dimension without truncating values.
Team_dim.show(n=20, truncate=False)

+-------+------------------+
|team_id|team              |
+-------+------------------+
|3374   |CÃ´te d'Ivoire     |
|787    |Senegal           |
|4992   |Equatorial Guinea |
|4881   |Congo DR          |
|4898   |Algeria           |
|4901   |Angola            |
|4885   |Ghana             |
|652    |Nigeria           |
|44     |Mauritania        |
|788    |Morocco           |
|3      |Tunisia           |
|835    |Namibia           |
|437    |Zambia            |
|4892   |Guinea            |
|252    |Cape Verde Islands|
|4978   |Mozambique        |
|431    |Gambia            |
|760    |Tanzania          |
|931    |Cameroon          |
|999    |Guinea-Bissau     |
+-------+------------------+
only showing top 20 rows



In [20]:
# Shut down the Spark session at the end of the notebook.
spark.stop()