In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length, from_json, expr, split, lit, to_date, explode, count
from pyspark.sql.types import StringType, StructType, StructField, MapType, ArrayType, DoubleType

In [2]:
import xml.etree.ElementTree as ET
import requests
import os
import collections
import time

import pandas as pd

In [3]:
spark = SparkSession.builder.getOrCreate()

In [4]:
!ls boardgame

collection  parsed_data  raw_data


# Boardgames

In [5]:
# Directory holding the raw collection XML files; each file is named `<user_id>.xml`
# (the user id is recovered from the file name when the XML is parsed).
BASE_XML_PATH = './boardgame/raw_data/collection'
# Directory where one parquet file per user is written, and from which Spark
# later reads the whole boardgame dataset.
BASE_PARQUET_PATH = './boardgame/parsed_data/collection'

In [6]:
def xml_to_dataframe(xml_file) -> pd.DataFrame:
    """Parse one user's boardgame collection XML file into a ratings DataFrame.

    Each child element of the XML root is treated as one collection entry.
    For every entry the function reads:

    * the ``objectid`` attribute (the boardgame identifier),
    * the ``value`` attribute of the first child of ``<stats>`` (the rating;
      the literal ``'N/A'`` means "not rated"),
    * the ``lastmodified`` attribute of ``<status>`` (used as the rating date;
      not really the rating date, but as close as the available data gets).

    Args:
        xml_file: Path to a ``<user_id>.xml`` file. The user id is taken from
            the file name without its extension.

    Returns:
        A DataFrame with columns ``user_id`` (str), ``type`` (category, always
        ``'boardgame'``), ``content_id`` (str), ``rating`` (float64, NaN when
        unrated) and ``rating_date`` (datetime64[ms], NaT when the entry has no
        ``<status>`` element).

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        KeyError: If an entry lacks ``objectid`` or a required attribute.
    """
    # ET.parse reads the raw bytes, so the encoding declared in the XML prolog
    # is honoured instead of whatever the platform's default text encoding is.
    root = ET.parse(xml_file).getroot()

    # File name without directory or extension, e.g. ".../elmozilla.xml" -> "elmozilla".
    user_id = os.path.splitext(os.path.basename(xml_file))[0]

    content_ids = []
    ratings = []
    rating_dates = []

    for bg in root:
        object_id = bg.attrib['objectid']  # This is the boardgame identifier

        # Reset per entry: otherwise an entry without <stats>/<status> would
        # silently inherit the values of the previous entry.
        rating_val = None
        date_of_rating = None

        for field in bg:
            if field.tag == 'stats':
                raw_rating = field[0].attrib['value']
                rating_val = None if raw_rating == 'N/A' else float(raw_rating)
            elif field.tag == 'status':
                date_of_rating = field.attrib['lastmodified']

        content_ids.append(object_id)
        ratings.append(rating_val)
        rating_dates.append(date_of_rating)

    n_rows = len(content_ids)
    return pd.DataFrame({
        'user_id': pd.Series([user_id] * n_rows, dtype='str'),
        'type': pd.Series(['boardgame'] * n_rows, dtype='category'),
        'content_id': pd.Series(content_ids, dtype='str'),
        'rating': pd.Series(ratings, dtype='float64'),
        'rating_date': pd.Series(rating_dates, dtype='datetime64[ms]')
    })

# user_id = 'eekspider'
# user_id = 'adammathys'
# xml_file = f'./boardgame/raw_data/collection/{user_id}.xml'
# xml_to_dataframe(xml_file)

In [7]:
def create_boardgames_parquet():
    """Convert every ``*.xml`` file in BASE_XML_PATH into a parquet file.

    For each ``<name>.xml`` found directly under ``BASE_XML_PATH`` the file is
    parsed with ``xml_to_dataframe`` and written to
    ``BASE_PARQUET_PATH/<name>.parquet``. Processing is best-effort: a file
    that fails to parse or to be written is reported on stdout and skipped so
    the remaining files are still converted.
    """
    # Without this, a fresh checkout (no output directory yet) makes every
    # to_parquet call fail and every file gets reported as invalid XML.
    os.makedirs(BASE_PARQUET_PATH, exist_ok=True)

    # sorted() makes the processing (and therefore the log) order deterministic;
    # os.listdir returns entries in arbitrary order.
    for xml in sorted(os.listdir(BASE_XML_PATH)):
        if not xml.endswith('.xml'):
            continue
        try:
            stem = os.path.splitext(xml)[0]
            parquet_path = f'{BASE_PARQUET_PATH}/{stem}.parquet'
            df = xml_to_dataframe(f'{BASE_XML_PATH}/{xml}')
            df.to_parquet(parquet_path)
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch.
            print(e)
            print(f'Error: Invalid xml file: {xml}')

In [8]:
def get_boardgames_df():
    """Return the parquet data stored under BASE_PARQUET_PATH as a Spark DataFrame."""
    return spark.read.parquet(BASE_PARQUET_PATH)

In [9]:
create_boardgames_parquet()

In [10]:
boardgames = get_boardgames_df()
boardgames.show(10)
boardgames.printSchema()

+---------+---------+----------+------+-------------------+
|  user_id|     type|content_id|rating|        rating_date|
+---------+---------+----------+------+-------------------+
|elmozilla|boardgame|     27708|  NULL|2020-05-04 23:13:49|
|elmozilla|boardgame|     68448|   6.0|2020-05-03 10:56:13|
|elmozilla|boardgame|    173346|   8.0|2023-12-22 10:46:22|
|elmozilla|boardgame|    346703|   8.0|2023-12-22 10:46:10|
|elmozilla|boardgame|    204516|  NULL|2020-05-07 21:24:14|
|elmozilla|boardgame|    155987|   8.0|2023-12-22 10:58:20|
|elmozilla|boardgame|         5|   7.0|2023-12-22 10:56:36|
|elmozilla|boardgame|     21569|   8.0|2023-12-22 10:12:04|
|elmozilla|boardgame|    264647|  NULL|2020-05-04 19:33:29|
|elmozilla|boardgame|     31260|   7.0|2020-05-03 10:54:14|
+---------+---------+----------+------+-------------------+
only showing top 10 rows

root
 |-- user_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- rating:

# Movies

In [11]:
def get_movies_df():
    """Build the movie ratings DataFrame from the raw movie review parquet data.

    The raw data has an ``id`` column (the movie id) and a ``results`` column
    holding a JSON array of reviews as a string. Each review contributes one
    output row with:

    * ``user_id``     - the review's ``author``,
    * ``type``        - the literal ``'movie'``,
    * ``content_id``  - ``id`` cast to string,
    * ``rating``      - ``author_details.rating`` cast to double (NULL if absent),
    * ``rating_date`` - the date part of ``created_at``.

    Side effect: the result is also written (overwriting) as a single
    partition to ``./movie/parsed_data.parquet``.

    Returns:
        The Spark DataFrame described above.
    """
    base_parquet_path = './movie/raw_data/movies_review/'
    review_schema = ArrayType(
        StructType([
            StructField("author", StringType(), True),
            StructField("author_details", StructType([
                StructField("rating", StringType(), True)
            ]), True),
            StructField("created_at", StringType(), True),
        ])
    )

    reviews = (
        spark.read.parquet(base_parquet_path)
        # Skip rows whose JSON text is 2 characters or shorter
        # (presumably the empty array "[]" - TODO confirm against the raw data).
        .filter(length("results") > 2)
        .withColumn("results_parsed", from_json(col("results"), review_schema))
        # One row per review; rows whose JSON failed to parse are dropped by explode.
        .withColumn("review", explode(col("results_parsed")))
    )

    # Read the struct fields directly. Rendering the struct as a string and
    # splitting it on ", " breaks as soon as an author name contains ", " and
    # depends on how Spark happens to format structs as text.
    df = reviews.select(
        col("review.author").alias("user_id"),
        lit("movie").alias("type"),
        col("id").cast(StringType()).alias("content_id"),
        col("review.author_details.rating").cast(DoubleType()).alias("rating"),
        to_date(col("review.created_at"), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'").alias("rating_date"),
    )

    df.repartition(1).write.mode('overwrite').parquet("./movie/parsed_data.parquet")

    return df

movies = get_movies_df()
movies.show()

# TODO delete
movies.printSchema()

+------------------+-----+----------+------+-----------+
|           user_id| type|content_id|rating|rating_date|
+------------------+-----+----------+------+-----------+
|             Gurre|movie|        15|   9.0| 2014-05-31|
|        talisencrw|movie|        15|  10.0| 2016-04-13|
|  Manuel São Bento|movie|        15|  10.0| 2020-11-26|
|             r96sk|movie|        15|   9.0| 2021-02-26|
|            Wuchak|movie|        15|   6.0| 2022-06-19|
|        CinemaSerf|movie|        15|   7.0| 2022-06-25|
|             testr|movie|        15|  10.0| 2022-07-12|
|        JJJ222cool|movie|        15|   5.0| 2023-06-21|
|            badelf|movie|        15|  10.0| 2023-07-26|
|             James|movie|        15|   8.0| 2023-10-20|
|          markuspm|movie|        62|  NULL| 2013-12-23|
|           izgzhen|movie|        62|   8.0| 2017-09-28|
|      tmdb47633491|movie|        62|  10.0| 2018-05-18|
|Per Gunnar Jonsson|movie|        62|   6.0| 2019-01-27|
|            Wuchak|movie|     

# Anime

In [12]:
# Load the anime sample parquet and keep only the five rating columns
# (user_id, type, content_id, rating, rating_date) that the boardgame and
# movie frames also expose, so the three sources can be combined.
anime = spark.read.parquet('./anime/data/sample.parquet').select('user_id','type','content_id','rating','rating_date')
anime.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- rating_date: date (nullable = true)



# Merging all content

In [15]:
# Combine the three sources by column *name*. A plain union() matches columns
# by position, so it would silently mix columns up if any source ever changed
# its column order.
merged = (
    boardgames
    .unionByName(movies)
    .unionByName(anime)
)
merged.printSchema()
# A fixed seed keeps the displayed sample stable across kernel restarts.
merged.sample(False, 0.1, seed=42).orderBy(col('rating_date')).show()

root
 |-- user_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- rating_date: timestamp_ntz (nullable = true)

+-----------------+---------+----------+------+-------------------+
|          user_id|     type|content_id|rating|        rating_date|
+-----------------+---------+----------+------+-------------------+
|       talisencrw|    movie|      3924|   9.0|2016-06-30 00:00:00|
|    misterferrari|    anime|      7739|   2.0|2017-02-06 00:00:00|
|Potential Kermode|    movie|        87|   9.0|2017-02-13 00:00:00|
|     tmdb13206453|    movie|        24|  10.0|2017-06-10 00:00:00|
|            Gimly|    movie|        78|   8.0|2017-10-27 00:00:00|
|    misterferrari|    anime|      2864|   1.0|2018-05-01 00:00:00|
|    misterferrari|    anime|     12589|   3.0|2018-12-12 00:00:00|
|       John Chard|    movie|        87|   9.0|2018-12-24 00:00:00|
|       John Chard|    movie|        95| 

In [14]:
# Number of ratings per user, most active users first.
merged.groupBy('user_id').count().orderBy(col('count').desc()).show()

+------------------+-----+
|           user_id|count|
+------------------+-----+
|         elmozilla|  325|
|     misterferrari|  100|
|        adammathys|   94|
|        CinemaSerf|   30|
|        John Chard|   20|
|             r96sk|   13|
|            Wuchak|   12|
|Filipe Manuel Neto|   11|
|    Andre Gonzales|   10|
|             Gimly|    9|
|            JPV852|    7|
|           Kamurai|    6|
|     The Movie Mob|    5|
|        talisencrw|    4|
|               Rob|    4|
|            badelf|    3|
|         Ian Beale|    3|
|      Peter McGinn|    3|
|             James|    2|
| GenerationofSwine|    2|
+------------------+-----+
only showing top 20 rows

