In [76]:
from pyspark.sql import SparkSession, types, functions as F
import pandas as pd

In [77]:
# Build (or reuse) the Spark session for this ETL run.
# "local[*]" runs Spark in-process with one worker thread per logical core,
# and the driver is bound to the loopback address 127.0.0.1.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('matches_etl')
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

In [78]:
# Load the raw JSON matched by the glob (two path levels below
# ./data/matches). The "multiline" option lets a single JSON record span
# several lines instead of requiring one record per line.
df_nested = (
    spark.read
    .option("multiline", "true")
    .json("./data/matches/*/*")
)

                                                                                

In [79]:
# Flatten the nested records into one flat row per input record.
# Ids, match week and scores are cast to int; names and labels to string.
# Nested struct fields are lifted to top-level columns through aliases, while
# match_date and kick_off are passed through unchanged.
# 'int' and 'string' are the type-name spellings of IntegerType / StringType.
df_flat = df_nested.select(
    # --- match ---
    F.col('match_id').cast('int'),
    F.col('match_date'),
    F.col('kick_off'),
    F.col('match_week').cast('int'),

    # --- season ---
    F.col('season.season_id').cast('int').alias('season_id'),
    F.col('season.season_name').cast('string').alias('season_name'),

    # --- competition and stage ---
    F.col('competition.competition_id').cast('int').alias('competition_id'),
    F.col('competition.competition_name').cast('string').alias('competition_name'),
    F.col('competition.country_name').cast('string').alias('competition_country'),
    F.col('competition_stage.id').cast('int').alias('competition_stage_id'),
    F.col('competition_stage.name').cast('string').alias('competition_stage_name'),

    # --- home team ---
    F.col('home_team.home_team_id').cast('int').alias('home_team_id'),
    F.col('home_team.home_team_name').cast('string').alias('home_team_name'),
    F.col('home_team.home_team_gender').cast('string').alias('home_team_gender'),
    F.col('home_team.home_team_group').cast('string').alias('home_team_group'),
    F.col('home_team.country.id').cast('int').alias('home_team_country_id'),
    F.col('home_team.country.name').cast('string').alias('home_team_country'),

    # --- away team ---
    F.col('away_team.away_team_id').cast('int').alias('away_team_id'),
    F.col('away_team.away_team_name').cast('string').alias('away_team_name'),
    F.col('away_team.away_team_gender').cast('string').alias('away_team_gender'),
    F.col('away_team.away_team_group').cast('string').alias('away_team_group'),
    F.col('away_team.country.id').cast('int').alias('away_team_country_id'),
    F.col('away_team.country.name').cast('string').alias('away_team_country'),

    # --- result ---
    F.col('home_score').cast('int'),
    F.col('away_score').cast('int'),

    # --- referee ---
    F.col('referee.id').cast('int').alias('referee_id'),
    F.col('referee.name').cast('string').alias('referee_name'),

    # --- stadium ---
    F.col('stadium.id').cast('int').alias('stadium_id'),
    F.col('stadium.name').cast('string').alias('stadium_name'),
    F.col('stadium.country.id').cast('int').alias('stadium_country_id'),
    F.col('stadium.country.name').cast('string').alias('stadium_country_name'),
)

In [80]:
# Preview the flattened frame: first 20 rows, long cell values truncated
# (these are the defaults of DataFrame.show, spelled out explicitly).
df_flat.show(n=20, truncate=True)

+--------+----------+------------+----------+---------+-----------+--------------+----------------+-------------------+--------------------+----------------------+------------+--------------------+----------------+---------------+--------------------+-----------------+------------+--------------------+----------------+---------------+--------------------+-----------------+----------+----------+----------+--------------------+----------+--------------------+------------------+--------------------+
|match_id|match_date|    kick_off|match_week|season_id|season_name|competition_id|competition_name|competition_country|competition_stage_id|competition_stage_name|home_team_id|      home_team_name|home_team_gender|home_team_group|home_team_country_id|home_team_country|away_team_id|      away_team_name|away_team_gender|away_team_group|away_team_country_id|away_team_country|home_score|away_score|referee_id|        referee_name|stadium_id|        stadium_name|stadium_country_id|stadium_country_na

In [81]:
@F.pandas_udf(types.TimestampType())
def get_datetime(date_str: pd.Series, time_str: pd.Series) -> pd.Series:
  """Combine a date-string Series and a time-string Series into timestamps.

  The two inputs are joined element-wise with a single space and the result
  is parsed by ``pd.to_datetime``, which infers the format.

  Args:
    date_str: Date strings (presumably ISO ``YYYY-MM-DD`` -- TODO confirm).
    time_str: Time-of-day strings (presumably ``HH:MM:SS[.fff]`` -- TODO confirm).

  Returns:
    A Series of parsed datetimes, exposed to Spark as a timestamp column.
  """
  return pd.to_datetime(date_str.str.cat(time_str, sep=' '))

In [82]:
# Replace the separate date and time string columns with one timestamp column.
# NOTE: this rebinds df_flat, so re-running this cell on its own fails because
# 'match_date' and 'kick_off' have already been dropped; re-run the select
# cell first.
df_flat = (
    df_flat
    .withColumn('match_datetime', get_datetime('match_date', 'kick_off'))
    .drop('match_date', 'kick_off')
)

In [83]:
# Persist the flattened matches as Parquet.
# Spark's default save mode is "errorifexists", so a second run of this
# notebook would fail because the output directory already exists. Using
# "overwrite" makes the cell re-runnable; note that it replaces whatever is
# currently stored under ./data/pq/matches.
df_flat.write.mode('overwrite').parquet('./data/pq/matches')

25/04/16 17:21:10 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers
                                                                                

In [84]:
# Shut down the Spark session now that the Parquet output has been written.
spark.stop()