In [324]:
import os

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, expr, from_json, length, lit, split, to_date
from pyspark.sql.types import ArrayType, MapType, StringType, StructField, StructType

In [325]:
# Directory holding the raw movie-review parquet files read by this notebook.
base_parquet_path = './raw_data/movies_review/'

# Reuse the active Spark session if one exists, otherwise start a new one.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [326]:
# Schema handed to from_json for the `results` column: an array of review
# objects. Only the three fields this notebook reads are declared -- the
# author name, the nested author_details.rating, and the created_at timestamp.
# All of them are kept as nullable strings.
schema = ArrayType(
    StructType()
    .add("author", StringType(), True)
    .add("author_details", StructType().add("rating", StringType(), True), True)
    .add("created_at", StringType(), True)
)

In [329]:
# Load the raw parquet data and produce one row per review.
# `results` is a JSON array stored as text; requiring length > 2 skips rows
# whose payload is too short to contain a review (for example "[]").
reviews_exploded = (
    spark.read.parquet(base_parquet_path)
    .filter(length("results") > 2)
    .withColumn("results_parsed", from_json(col("results"), schema))
    .withColumn("result_exploded", explode(col("results_parsed")))
)

# Pull the fields straight out of the parsed struct. The previous approach
# cast the struct to a string and split it on ", ", which
#   * shifted every field when an author name itself contained ", ",
#   * turned missing values into the literal text "null", and
#   * depended on the Spark-version-specific struct-to-string format.
# With direct field access, missing values stay real NULLs.
df = reviews_exploded.select(
    col("result_exploded.author").alias("author"),
    # Constant content-type marker. The column is named "movies" because that
    # is the name the un-aliased literal produced before; kept so the written
    # schema does not change.
    lit("movies").alias("movies"),
    col("id").alias("content_id"),
    col("result_exploded.author_details.rating").alias("rating"),
    # created_at is an ISO-8601 timestamp string; keep only the calendar date.
    to_date(
        col("result_exploded.created_at"),
        "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'",
    ).alias("rating_date"),
)

In [330]:
# Collapse the result to a single partition before writing, and replace any
# output left by a previous run.
(
    df.repartition(1)
      .write
      .mode('overwrite')
      .parquet("./parsed_data")
)

In [328]:
# Preview the first 20 rows (show's default) without truncating long values.
df.show(n=20, truncate=False)

+----------------+------+----------+------+-----------+
|author          |movies|content_id|rating|rating_date|
+----------------+------+----------+------+-----------+
|Gurre           |movies|15        |9.0   |2014-05-31 |
|talisencrw      |movies|15        |10.0  |2016-04-13 |
|Manuel São Bento|movies|15        |10.0  |2020-11-26 |
|r96sk           |movies|15        |9.0   |2021-02-26 |
|Wuchak          |movies|15        |6.0   |2022-06-19 |
|CinemaSerf      |movies|15        |7.0   |2022-06-25 |
|testr           |movies|15        |10.0  |2022-07-12 |
|JJJ222cool      |movies|15        |5.0   |2023-06-21 |
|badelf          |movies|15        |10.0  |2023-07-26 |
|James           |movies|15        |8.0   |2023-10-20 |
|Goddard         |movies|550       |null  |2018-06-09 |
|Brett Pascoe    |movies|550       |9.0   |2018-07-05 |
|Manuel São Bento|movies|550       |8.0   |2020-11-22 |
|r96sk           |movies|550       |7.0   |2021-01-13 |
|rsanek          |movies|550       |9.0   |2021-