In [85]:
from pyspark.sql import SparkSession, types, functions as F

In [106]:
# Create (or reuse, if one already exists) the Spark session for this notebook.
spark = (
    SparkSession.builder
    # "local[*]": run Spark in-process, using all cores of this machine.
    .master("local[*]")
    .appName('test')
    # Bind the driver to the loopback interface.
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

In [107]:
# Load every match file found two directory levels below ./data/matches into
# one DataFrame. No schema is supplied, so Spark infers it from the data.
matches_df_nested = (
    spark.read
    # "multiline" lets a single JSON document span several lines of a file
    # (the default reader expects one JSON record per line).
    .option("multiline", "true")
    .json("./data/matches/*/*")
)

                                                                                

In [109]:
# Print the schema of the raw nested matches DataFrame as a tree, to see which
# struct fields are available for flattening.
matches_df_nested.printSchema()

root
 |-- away_score: long (nullable = true)
 |-- away_team: struct (nullable = true)
 |    |-- away_team_gender: string (nullable = true)
 |    |-- away_team_group: string (nullable = true)
 |    |-- away_team_id: long (nullable = true)
 |    |-- away_team_name: string (nullable = true)
 |    |-- country: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- managers: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- country: struct (nullable = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- dob: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- nickname: string (nullable = true)
 |-- competition: struct (nullable = true)
 |    |-- competition_id: long (nullable = true)
 |    |-- competition_nam

In [None]:
# Derive flat, explicitly typed columns from the nested match records.
#
# withColumn() replaces a column when the name already exists and appends it
# otherwise, so the original nested struct columns (home_team, away_team,
# competition, ...) stay in the result next to the new flat columns, except
# `season`, which is overwritten with the season name.
#
# Fixes compared with the previous version of this cell:
#   * {home,away}_team_country_id / _country_name were copy-pasted from the
#     gender column; they now read the team's `country` struct
#     (away_team.country.{id,name} appear in the printed schema; home_team is
#     assumed to mirror that layout -- TODO confirm against the full schema).
#   * kickoff_time was a cast of the time-only `kick_off` string, which cannot
#     yield the real kickoff instant; it is now built from match_date + kick_off.
matches_df_flat = (
    matches_df_nested
    # --- match / season ---
    .withColumn('match_id', F.col('match_id').cast(types.LongType()))
    # season_id must be extracted before `season` is overwritten below.
    .withColumn('season_id', F.col('season.season_id').cast(types.IntegerType()))
    .withColumn('season', F.col('season.season_name').cast(types.StringType()))
    .withColumn('match_date', F.col('match_date').cast(types.DateType()))
    # concat() yields NULL if either part is NULL, so a missing date or time
    # produces a NULL kickoff_time rather than a misleading timestamp.
    .withColumn(
        'kickoff_time',
        F.concat(
            F.col('match_date').cast(types.StringType()),
            F.lit(' '),
            F.col('kick_off'),
        ).cast(types.TimestampType()),
    )
    .withColumn('match_week', F.col('match_week').cast(types.IntegerType()))
    # --- competition ---
    .withColumn('competition_id', F.col('competition.competition_id').cast(types.IntegerType()))
    .withColumn('competition_name', F.col('competition.competition_name').cast(types.StringType()))
    .withColumn('competition_country', F.col('competition.country_name').cast(types.StringType()))
    .withColumn('competition_stage_id', F.col('competition_stage.id').cast(types.IntegerType()))
    .withColumn('competition_stage_name', F.col('competition_stage.name').cast(types.StringType()))
    # --- home team ---
    .withColumn('home_team_id', F.col('home_team.home_team_id').cast(types.IntegerType()))
    .withColumn('home_team_name', F.col('home_team.home_team_name').cast(types.StringType()))
    .withColumn('home_team_gender', F.col('home_team.home_team_gender').cast(types.StringType()))
    .withColumn('home_team_country_id', F.col('home_team.country.id').cast(types.IntegerType()))
    .withColumn('home_team_country_name', F.col('home_team.country.name').cast(types.StringType()))
    .withColumn('home_team_group', F.col('home_team.home_team_group').cast(types.StringType()))
    # --- away team ---
    .withColumn('away_team_id', F.col('away_team.away_team_id').cast(types.IntegerType()))
    .withColumn('away_team_name', F.col('away_team.away_team_name').cast(types.StringType()))
    .withColumn('away_team_gender', F.col('away_team.away_team_gender').cast(types.StringType()))
    .withColumn('away_team_country_id', F.col('away_team.country.id').cast(types.IntegerType()))
    .withColumn('away_team_country_name', F.col('away_team.country.name').cast(types.StringType()))
    .withColumn('away_team_group', F.col('away_team.away_team_group').cast(types.StringType()))
    # --- result ---
    .withColumn('home_score', F.col('home_score').cast(types.IntegerType()))
    .withColumn('away_score', F.col('away_score').cast(types.IntegerType()))
    # --- officials / venue ---
    .withColumn('referee_id', F.col('referee.id').cast(types.IntegerType()))
    .withColumn('referee_name', F.col('referee.name'))
    .withColumn('stadium_id', F.col('stadium.id').cast(types.IntegerType()))
    .withColumn('stadium_name', F.col('stadium.name').cast(types.StringType()))
)

In [103]:
# Preview the flattened frame. show() prints the first 20 rows by default and
# truncates long cell values. The table is very wide because the original
# nested columns are still present alongside the derived ones.
matches_df_flat.show()

+----------+--------------------+--------------------+-------------------+----------+--------------------+------------+--------------------+----------------+----------+--------+------------+----------------+----------+-------------+--------------------+---------+--------------------+--------------+----------------+-------------------+--------------------+----------------------+----------------+------------+--------------------+--------------------+----------------------+---------------+----------------+------------+--------------------+--------------------+----------------------+---------------+-------------------+----------+--------------------+---------+----------+--------------------+
|away_score|           away_team|         competition|  competition_stage|home_score|           home_team|    kick_off|        last_updated|last_updated_360|match_date|match_id|match_status|match_status_360|match_week|     metadata|             referee|   season|             stadium|competition_id|comp

In [77]:
# Re-inspect the nested source schema before choosing which fields to select.
matches_df_nested.printSchema()

root
 |-- away_score: long (nullable = true)
 |-- away_team: struct (nullable = true)
 |    |-- away_team_gender: string (nullable = true)
 |    |-- away_team_group: string (nullable = true)
 |    |-- away_team_id: long (nullable = true)
 |    |-- away_team_name: string (nullable = true)
 |    |-- country: struct (nullable = true)
 |    |    |-- id: long (nullable = true)
 |    |    |-- name: string (nullable = true)
 |    |-- managers: array (nullable = true)
 |    |    |-- element: struct (containsNull = true)
 |    |    |    |-- country: struct (nullable = true)
 |    |    |    |    |-- id: long (nullable = true)
 |    |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- dob: string (nullable = true)
 |    |    |    |-- id: long (nullable = true)
 |    |    |    |-- name: string (nullable = true)
 |    |    |    |-- nickname: string (nullable = true)
 |-- competition: struct (nullable = true)
 |    |-- competition_id: long (nullable = true)
 |    |-- competition_nam

In [78]:
# Project a compact, flat view of each match. Selecting a nested field by its
# dotted path yields a column named after the leaf field only, so
# 'competition_stage.name' appears in the result simply as `name`.
# No casts are applied here; every column keeps its source type.
matches_df_flat = matches_df_nested.select(
    # identity and scheduling
    'match_id', 'match_date',
    # competition context
    'competition.competition_name', 'competition_stage.name', 'season.season_name',
    'match_week', 'kick_off',
    # teams
    'home_team.home_team_name', 'away_team.away_team_name',
    # result
    'home_score', 'away_score',
    # bookkeeping
    'last_updated',
)

In [79]:
# Preview the selected columns. show() prints the first 20 rows by default and
# truncates long cell values.
matches_df_flat.show()

+--------+----------+----------------+--------------+-----------+----------+------------+--------------------+--------------------+----------+----------+--------------------+
|match_id|match_date|competition_name|          name|season_name|match_week|    kick_off|      home_team_name|      away_team_name|home_score|away_score|        last_updated|
+--------+----------+----------------+--------------+-----------+----------+------------+--------------------+--------------------+----------+----------+--------------------+
| 3825848|2015-09-23|         La Liga|Regular Season|  2015/2016|         5|20:00:00.000|          Levante UD|               Eibar|         2|         2|2023-02-21T15:19:...|
| 3825895|2015-09-23|         La Liga|Regular Season|  2015/2016|         5|22:00:00.000|          Las Palmas|             Sevilla|         2|         0|2023-02-21T15:19:...|
| 3825894|2016-05-01|         La Liga|Regular Season|  2015/2016|        36|18:15:00.000|RC Deportivo La C...|              G

In [80]:
# Confirm the column names and types of the selected view. Nothing was cast in
# the select, so match_date, kick_off and last_updated are still strings.
matches_df_flat.printSchema()

root
 |-- match_id: long (nullable = true)
 |-- match_date: string (nullable = true)
 |-- competition_name: string (nullable = true)
 |-- name: string (nullable = true)
 |-- season_name: string (nullable = true)
 |-- match_week: long (nullable = true)
 |-- kick_off: string (nullable = true)
 |-- home_team_name: string (nullable = true)
 |-- away_team_name: string (nullable = true)
 |-- home_score: long (nullable = true)
 |-- away_score: long (nullable = true)
 |-- last_updated: string (nullable = true)



In [81]:
# Stop the underlying SparkContext and release its resources; run this last.
spark.stop()