In [1]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, length, from_json, expr, split, lit, to_date, explode, count, lower, trim, regexp_replace
from pyspark.sql.functions import substring, max as spark_max, ceil, input_file_name, from_unixtime, regexp_extract
from pyspark.sql.types import StringType, StructType, StructField, MapType, ArrayType, DoubleType, DateType, IntegerType

In [2]:
import collections
import html
import json
import os
import re
import time
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests
import yake
from tqdm import tqdm

In [3]:
import utils

In [4]:
# Settings for an S3-compatible endpoint reached over plain HTTP on port 4566.
# NOTE(review): these look like placeholder credentials and none of the four values is
# referenced by the cells visible here — presumably `utils.S3_conn` has its own
# configuration; confirm before relying on them.
AWS_ACCESS_KEY_ID = 'test_key_id'
AWS_SECRET_ACCESS_KEY = 'test_access_key'
HOST = 's3'
ENDPOINT_URL = f'http://{HOST}:4566'

# Local directory that downloaded raw objects are written under.
TEMP_DIR = './local_data'
# When False, the download cell below does nothing.
DOWNLOAD_FROM_S3 = True

In [5]:
# Create (or reuse) the Spark session used by every transform below; the Neo4j
# Spark connector is requested through `spark.jars.packages`.
spark = (
    SparkSession
    .builder
    .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.0_for_spark_3")
    .getOrCreate()
)

# Collect data from S3

In [6]:
# CONTENTS = ['movie/review', 'movie/info', 'boardgame/boardgame', 'boardgame/collection', 'videogame', 'anime/user_info', 'anime/info']
# CONTENTS = ['anime/user_info', 'anime/info']
# Key prefixes of the `raw-data` bucket to download; the commented-out lists above are
# narrower alternatives kept for partial runs.
CONTENTS = ['movie', 'boardgame', 'videogame', 'anime']

In [7]:
def download_raw_data_of_content(content):
    """Download every object stored under ``<content>/`` in the ``raw-data`` bucket.

    Each object is written below ``{TEMP_DIR}/{content}``; the part of the key
    that follows the content prefix is kept as the relative local path.

    Args:
        content: Key prefix identifying the data to fetch (e.g. ``'movie'``).
    """
    print(f'Downloading raw-data of {content}...')

    target_dir = f"{TEMP_DIR}/{content}"
    os.makedirs(target_dir, exist_ok=True)

    # Anchor the listing on "<content>/" so keys that merely share the same leading
    # characters (another prefix starting with the same text) are neither listed
    # nor written to a wrong local path.
    prefix = f"{content.rstrip('/')}/"

    s3 = utils.S3_conn()
    paginator = s3.s3_client.get_paginator('list_objects_v2')
    for page in paginator.paginate(Bucket='raw-data', Prefix=prefix):
        # A page without 'Contents' holds no objects.
        if 'Contents' not in page:
            continue
        for obj in tqdm(page['Contents']):
            key = obj['Key']
            relative_key = key[len(prefix):]
            # Keys equal to the prefix or ending in "/" name no file; there is
            # nothing to write for them.
            if not relative_key or relative_key.endswith('/'):
                continue

            local_file_path = f'{target_dir}/{relative_key}'
            os.makedirs(os.path.dirname(local_file_path), exist_ok=True)
            s3.s3_client.download_file('raw-data', key, local_file_path)



    # keys = s3.get_keys_with_prefix('raw-data', content)

    
    # for key in tqdm(keys):
    #     local_file_path = f'{target_dir}/{key[len(content) + 1:]}'# os.path.join(target_dir, key[len(kind):])
    #     local_file_dir = os.path.dirname(local_file_path)
    #     if not os.path.exists(local_file_dir):
    #         os.makedirs(local_file_dir)
    
    #     s3.s3_client.download_file('raw-data', key, local_file_path)

In [9]:
%%time
# Download the raw data of every configured content type unless downloading is disabled.
if DOWNLOAD_FROM_S3:
    for content in CONTENTS:
        download_raw_data_of_content(content)

Downloading raw-data of movie...


100%|██████████| 1000/1000 [00:11<00:00, 88.44it/s]
100%|██████████| 1000/1000 [00:10<00:00, 91.46it/s]


Downloading raw-data of boardgame...


100%|██████████| 1000/1000 [00:11<00:00, 89.72it/s]
100%|██████████| 1000/1000 [00:11<00:00, 90.47it/s]
100%|██████████| 1000/1000 [00:10<00:00, 91.77it/s]
100%|██████████| 1000/1000 [00:10<00:00, 92.36it/s]
100%|██████████| 1000/1000 [00:10<00:00, 96.82it/s]
100%|██████████| 1000/1000 [00:10<00:00, 91.63it/s]
100%|██████████| 1000/1000 [00:11<00:00, 88.23it/s]
100%|██████████| 1000/1000 [00:11<00:00, 87.77it/s]
100%|██████████| 1000/1000 [00:11<00:00, 86.53it/s]
100%|██████████| 1000/1000 [00:11<00:00, 88.74it/s]
100%|██████████| 1000/1000 [00:11<00:00, 90.58it/s]
100%|██████████| 1000/1000 [00:12<00:00, 81.74it/s]
100%|██████████| 884/884 [00:12<00:00, 73.46it/s] 


Downloading raw-data of videogame...


100%|██████████| 3/3 [00:02<00:00,  1.49it/s]


Downloading raw-data of anime...


100%|██████████| 1000/1000 [00:12<00:00, 77.10it/s]
100%|██████████| 1000/1000 [00:12<00:00, 79.14it/s]
100%|██████████| 1000/1000 [00:10<00:00, 91.26it/s]
100%|██████████| 1000/1000 [00:11<00:00, 85.16it/s]
100%|██████████| 1000/1000 [00:11<00:00, 87.28it/s]
100%|██████████| 1000/1000 [00:10<00:00, 92.62it/s]
100%|██████████| 1000/1000 [00:10<00:00, 91.14it/s]
100%|██████████| 1000/1000 [00:10<00:00, 94.32it/s]
100%|██████████| 1000/1000 [00:10<00:00, 96.43it/s]
100%|██████████| 1000/1000 [00:10<00:00, 98.93it/s]
100%|██████████| 1000/1000 [00:10<00:00, 98.36it/s]
100%|██████████| 1000/1000 [00:10<00:00, 99.47it/s]
100%|██████████| 1000/1000 [00:09<00:00, 103.04it/s]
100%|██████████| 1000/1000 [00:10<00:00, 96.04it/s]
100%|██████████| 1000/1000 [00:10<00:00, 98.83it/s]
100%|██████████| 1000/1000 [00:10<00:00, 99.52it/s]
100%|██████████| 1000/1000 [00:10<00:00, 99.15it/s]
100%|██████████| 1000/1000 [00:10<00:00, 96.27it/s]
100%|██████████| 1000/1000 [00:10<00:00, 97.90it/s]
100%|██████

CPU times: user 3min 55s, sys: 38.4 s, total: 4min 33s
Wall time: 7min 41s





# Transform data

In [10]:
# Module-level S3 connection; `store_processed_parquet` uploads through its `s3_client`.
s3 = utils.S3_conn()

In [11]:
def store_processed_parquet(local_directory, prefix):
    """Upload every file found under ``local_directory`` to the ``processed-data`` bucket.

    The object key of each file is ``prefix`` joined with the file's path
    relative to ``local_directory``, written with forward slashes.

    Args:
        local_directory: Directory tree whose files are uploaded.
        prefix: Key prefix prepended to every uploaded object.
    """
    destination_bucket = 'processed-data'

    for current_dir, _subdirs, file_names in tqdm(os.walk(local_directory)):
        for file_name in file_names:
            source_path = os.path.join(current_dir, file_name)
            path_in_tree = os.path.relpath(source_path, local_directory)
            # Object keys use "/" whatever the local path separator is.
            object_key = os.path.join(prefix, path_in_tree).replace("\\", "/")
            s3.s3_client.upload_file(source_path, destination_bucket, object_key)

## Boardgames

In [12]:
# Raw inputs: a directory of per-user collection XML files, and a directory holding one
# folder per boardgame (each read as `<folder>/1.xml`).
# Outputs: the two processed parquet locations (both are created as directories).
BOARDGAME_USERS_XML_PATH = './local_data/boardgame/collection'
BOARDGAME_USERS_PARQUET_PATH = './local_data/boardgame/processed_data/boardgame_users.parquet'
BOARDGAME_CONTENT_XML_PATH = './local_data/boardgame/boardgame'
BOARDGAME_CONTENT_PARQUET_PATH = './local_data/boardgame/processed_data/boardgame_content.parquet'

In [13]:
def xml_collection_to_dataframe(xml_file) -> pd.DataFrame:
    """Convert one user's collection XML file into a ratings DataFrame.

    The user id is the file name without its extension, and every child element
    of the XML root becomes one row.

    Args:
        xml_file: Path to the collection XML file.

    Returns:
        DataFrame with the columns ``user_id``, ``type`` (always ``'boardgame'``),
        ``content_id``, ``rating`` (missing when absent or ``'N/A'``) and
        ``rating_date`` (missing when the item has no ``status`` element).

    Raises:
        KeyError: If a child of the root lacks the ``collid`` or ``objectid``
            attribute, i.e. the file does not hold a collection listing.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    # Let ElementTree read the bytes itself so the encoding declared in the XML
    # prolog is honoured instead of the platform's default text encoding.
    root = ET.parse(xml_file).getroot()

    user_id = os.path.splitext(os.path.basename(xml_file))[0]

    content_ids = []
    ratings = []
    rating_dates = []

    for item in root:
        # Looked up only as validation: nodes without `collid` must keep failing
        # with KeyError so the caller can report the file as invalid.
        _ = item.attrib['collid']
        object_id = item.attrib['objectid']  # This is the boardgame identifier

        # Reset for every item so a missing element can never inherit the value
        # found on the previous item.
        rating_val = None
        date_of_rating = None
        for field in item:
            if field.tag == 'stats':
                rating_val = field[0].attrib['value']
                if rating_val == 'N/A':
                    rating_val = None
            elif field.tag == 'status':
                # Not really the rating date, but it is as close as possible
                # with the current information.
                date_of_rating = field.attrib['lastmodified']

        content_ids.append(object_id)
        ratings.append(rating_val)
        rating_dates.append(date_of_rating)

    row_count = len(content_ids)
    return pd.DataFrame({
        'user_id': pd.Series([user_id] * row_count, dtype='str'),
        'type': pd.Series(['boardgame'] * row_count, dtype='category'),
        'content_id': pd.Series(content_ids, dtype='str'),
        'rating': pd.Series(ratings, dtype='float64'),
        'rating_date': pd.Series(rating_dates, dtype='datetime64[ms]')
    })

In [14]:
def create_boardgame_users_parquet():
    """Write one parquet file per user collection XML into BOARDGAME_USERS_PARQUET_PATH.

    Only ``*.xml`` entries of BOARDGAME_USERS_XML_PATH are processed. A file that
    fails to convert or to be written is reported on stdout and skipped.
    """
    if not os.path.exists(BOARDGAME_USERS_PARQUET_PATH):
        os.makedirs(BOARDGAME_USERS_PARQUET_PATH)

    for file_name in os.listdir(BOARDGAME_USERS_XML_PATH):
        if not file_name.endswith('.xml'):
            continue
        # File name without the ".xml" suffix names the output parquet file.
        user_name = file_name[:-4]
        try:
            ratings = xml_collection_to_dataframe(f'{BOARDGAME_USERS_XML_PATH}/{file_name}')
            ratings.to_parquet(f'{BOARDGAME_USERS_PARQUET_PATH}/{user_name}.parquet')
        except Exception as e:
            print(e)
            print(f'Error: Invalid xml file: {file_name}')

In [15]:
def xml_boardgame_to_dataframe(xml_root_dir=None):
    """Collect description and release year of every boardgame into one DataFrame.

    Every entry of ``xml_root_dir`` is treated as a game folder whose name is the
    content id and whose data lives in ``<folder>/1.xml``. Exactly one row is
    produced per folder, so a game lacking a description or a year gets a missing
    value instead of shifting later games' values onto the wrong id.

    Args:
        xml_root_dir: Directory holding the game folders. Defaults to
            ``BOARDGAME_CONTENT_XML_PATH``.

    Returns:
        DataFrame with the columns ``content_id``, ``description`` (HTML entities
        unescaped) and ``release_year`` (nullable ``Int16``).
    """
    if xml_root_dir is None:
        xml_root_dir = BOARDGAME_CONTENT_XML_PATH

    content_ids = []
    descriptions = []
    release_years = []

    for folder in os.listdir(xml_root_dir):
        # ElementTree reads the bytes itself, honouring the declared XML encoding.
        root = ET.parse(f"{xml_root_dir}/{folder}/1.xml").getroot()

        description = None
        release_year = None
        for item in root:
            for field in item:
                # Keep the first usable value of each kind found in the file.
                if field.tag == 'description' and description is None:
                    # An empty <description/> has text None, which unescape rejects.
                    if field.text is not None:
                        description = html.unescape(field.text)
                elif field.tag == 'yearpublished' and release_year is None:
                    release_year = int(field.attrib['value'])

        content_ids.append(folder)
        descriptions.append(description)
        release_years.append(release_year)

    return pd.DataFrame({
        'content_id': pd.Series(content_ids, dtype='str'),
        'description': pd.Series(descriptions, dtype='str'),
        'release_year': pd.Series(release_years, dtype='Int16')
    })

In [16]:
def create_boardgame_content_parquet():
    """Build the boardgame content dataset and write it to BOARDGAME_CONTENT_PARQUET_PATH.

    The pandas frame returned by ``xml_boardgame_to_dataframe`` is converted to a
    Spark DataFrame with an explicit schema, tagged with ``type = 'boardgame'``
    and written as parquet, overwriting any previous output.
    """
    os.makedirs(BOARDGAME_CONTENT_PARQUET_PATH, exist_ok=True)

    schema = StructType([
        StructField("content_id", StringType(), True),
        StructField("description", StringType(), True),
        StructField("release_year", IntegerType(), True)
    ])

    df = xml_boardgame_to_dataframe()
    # On pandas versions where `astype('str')` stringifies missing values, a
    # missing description would be stored as the literal text 'nan'/'None'.
    # Stringify real values only and keep missing ones as None.
    df['description'] = df['description'].map(
        lambda value: None if pd.isna(value) else str(value)
    )
    df = df.replace([np.nan], [None])

    boardgame_content = (
        spark
        .createDataFrame(df, schema=schema)
        .withColumn('type', lit('boardgame'))
    )

    # Save parquet to processed-data zone
    boardgame_content.write.mode('overwrite').parquet(BOARDGAME_CONTENT_PARQUET_PATH)

In [17]:
def get_boardgame_users_df():
    """Read the parquet data at BOARDGAME_USERS_PARQUET_PATH into a Spark DataFrame."""
    return spark.read.parquet(BOARDGAME_USERS_PARQUET_PATH)

In [18]:
def get_boardgame_content_df():
    """Read the parquet data at BOARDGAME_CONTENT_PARQUET_PATH into a Spark DataFrame."""
    return spark.read.parquet(BOARDGAME_CONTENT_PARQUET_PATH)

In [19]:
# Convert every collection XML into a per-user parquet file; files that fail are
# printed below as "Invalid xml file" and skipped.
create_boardgame_users_parquet()

'collid'
Error: Invalid xml file: Century.xml
'collid'
Error: Invalid xml file: Icythistle.xml
'collid'
Error: Invalid xml file: ItsCharlieVP.xml
'collid'
Error: Invalid xml file: nugenet.xml
'collid'
Error: Invalid xml file: marioymia.xml
'collid'
Error: Invalid xml file: RobMcWiz.xml
'collid'
Error: Invalid xml file: zigooloo.xml
'collid'
Error: Invalid xml file: Halenor.xml


In [20]:
# Build the boardgame content parquet dataset (overwrites any previous output).
create_boardgame_content_parquet()

In [21]:
# Sanity check: preview five rows and the schema of the boardgame ratings just written.
boardgame_users = get_boardgame_users_df()
boardgame_users.show(5)
boardgame_users.printSchema()

+-----------+---------+----------+------+-------------------+
|    user_id|     type|content_id|rating|        rating_date|
+-----------+---------+----------+------+-------------------+
|zefquaavius|boardgame|    322232|   6.0|2023-08-01 14:52:32|
|zefquaavius|boardgame|    296402|   8.0|2023-08-02 14:18:24|
|zefquaavius|boardgame|    336537|  null|2023-08-02 14:18:38|
|zefquaavius|boardgame|    314445|  null|2023-08-02 14:18:54|
|zefquaavius|boardgame|    296404|  null|2023-08-02 14:19:11|
+-----------+---------+----------+------+-------------------+
only showing top 5 rows

root
 |-- user_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- rating_date: timestamp_ntz (nullable = true)



In [22]:
# Sanity check: preview five rows and the schema of the boardgame content just written.
boardgame_content = get_boardgame_content_df()
boardgame_content.show(5)
boardgame_content.printSchema()

+----------+--------------------+------------+---------+
|content_id|         description|release_year|     type|
+----------+--------------------+------------+---------+
|    189314|Set of seven prom...|        2015|boardgame|
|    157661|Grifters is a han...|        2015|boardgame|
|    174391|Exposed is a quic...|        2016|boardgame|
|    168054|Alone is a sci-fi...|        2019|boardgame|
|    191932|From the official...|        2012|boardgame|
+----------+--------------------+------------+---------+
only showing top 5 rows

root
 |-- content_id: string (nullable = true)
 |-- description: string (nullable = true)
 |-- release_year: integer (nullable = true)
 |-- type: string (nullable = true)



In [23]:
# Upload the boardgame ratings files to the `processed-data` bucket.
# NOTE(review): keys are built relative to the dataset directory, so files land directly
# under `boardgame/` — the same prefix the content upload uses; confirm that is intended.
store_processed_parquet(BOARDGAME_USERS_PARQUET_PATH, prefix='boardgame')

1it [00:03,  3.61s/it]


In [24]:
# Upload the boardgame content files to the `processed-data` bucket.
# NOTE(review): files land directly under `boardgame/`, next to the ratings files
# uploaded with the same prefix; confirm the two datasets are meant to share it.
store_processed_parquet(BOARDGAME_CONTENT_PARQUET_PATH, prefix='boardgame')

1it [00:00,  4.49it/s]


## Movies

In [25]:
# Raw inputs (both read with `spark.read.parquet`): movie reviews and movie info.
# Outputs: the processed ratings and content parquet locations.
MOVIE_BASE_PARQUET_PATH = './local_data/movie/review'
MOVIE_BASE_INFO_PATH = './local_data/movie/info'
MOVIE_USERS_PARQUET_PATH = "./local_data/movie/processed_data/movie_users.parquet"
MOVIE_CONTENT_PARQUET_PATH = "./local_data/movie/processed_data/movie_content.parquet"

In [26]:
def create_movie_users_parquet():
    """Flatten the raw movie reviews into one rating row per review and write them as parquet.

    The ``results`` column of the raw review data holds a JSON array of reviews
    as text. It is parsed, exploded to one row per review, and reduced to the
    columns ``user_id`` (review author), ``type`` (``'movie'``), ``content_id``
    (the movie ``id`` as string), ``rating`` (double) and ``rating_date`` (date).
    The result is written to MOVIE_USERS_PARQUET_PATH as a single partition,
    overwriting any previous output.
    """
    review_schema = ArrayType(
        StructType([
            StructField("author", StringType(), True),
            StructField("author_details", StructType([
                StructField("rating", StringType(), True)
            ]), True),
            StructField("created_at", StringType(), True),
        ])
    )

    reviews = (
        spark.read.parquet(MOVIE_BASE_PARQUET_PATH)
        # Presumably drops empty arrays such as "[]" — TODO confirm against the raw data.
        .filter(length("results") > 2)
        .withColumn("result_exploded", explode(from_json(col("results"), review_schema)))
    )

    # Read the parsed struct fields directly. Casting the struct to text and
    # splitting on ", " breaks as soon as an author name contains a comma and
    # turns null authors into the text "null".
    movie_users = reviews.select(
        col('result_exploded.author').alias('user_id'),
        lit('movie').alias('type'),
        col('id').cast(StringType()).alias('content_id'),
        col('result_exploded.author_details.rating').cast(DoubleType()).alias('rating'),
        to_date(
            col('result_exploded.created_at'), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"
        ).alias('rating_date'),
    )

    os.makedirs(MOVIE_USERS_PARQUET_PATH, exist_ok=True)

    movie_users.repartition(1).write.mode('overwrite').parquet(MOVIE_USERS_PARQUET_PATH)

In [27]:
def create_movie_content_parquet():
    """Write the movie content dataset to MOVIE_CONTENT_PARQUET_PATH.

    Selects ``id`` as ``content_id``, ``overview`` as ``description`` and the
    first four characters of ``release_date`` as ``release_year``, adds
    ``type = 'movie'`` and writes a single-partition parquet (overwrite).
    """
    if not os.path.exists(MOVIE_CONTENT_PARQUET_PATH):
        os.makedirs(MOVIE_CONTENT_PARQUET_PATH)

    raw_info = spark.read.parquet(MOVIE_BASE_INFO_PATH)
    # NOTE(review): `content_id` keeps the source type of `id`, whereas
    # `create_movie_users_parquet` casts it to string — confirm consumers cope.
    content = raw_info.select(
        col('id').alias('content_id'),
        col('overview').alias('description'),
        substring(col('release_date'), 1, 4).alias('release_year'),
        lit('movie').alias('type'),
    )
    content.repartition(1).write.mode('overwrite').parquet(MOVIE_CONTENT_PARQUET_PATH)

In [28]:
def get_movie_users_df():
    """Read the parquet data at MOVIE_USERS_PARQUET_PATH into a Spark DataFrame."""
    return spark.read.parquet(MOVIE_USERS_PARQUET_PATH)

In [29]:
def get_movie_content_df():
    """Read the parquet data at MOVIE_CONTENT_PARQUET_PATH into a Spark DataFrame."""
    return spark.read.parquet(MOVIE_CONTENT_PARQUET_PATH)

In [30]:
# Build the movie ratings parquet dataset (overwrites any previous output).
create_movie_users_parquet()

In [31]:
# Build the movie content parquet dataset (overwrites any previous output).
create_movie_content_parquet()

In [32]:
# Sanity check: preview five rows and the schema of the movie ratings just written.
movie_users = get_movie_users_df()
movie_users.show(5)
movie_users.printSchema()

+------------------+-----+----------+------+-----------+
|           user_id| type|content_id|rating|rating_date|
+------------------+-----+----------+------+-----------+
|        John Chard|movie|       576|  10.0| 2017-02-10|
|      tmdb28039023|movie|       576|   6.0| 2022-08-28|
|Filipe Manuel Neto|movie|       576|   5.0| 2023-10-15|
|  Manuel São Bento|movie|    850165|   7.0| 2023-12-21|
|             r96sk|movie|    850165|   9.0| 2024-02-09|
+------------------+-----+----------+------+-----------+
only showing top 5 rows

root
 |-- user_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- rating_date: date (nullable = true)



In [33]:
# Sanity check: preview five rows and the schema of the movie content just written.
# NOTE(review): the printed schema shows `content_id` as long here but string in the
# movie ratings above — confirm before joining the two on `content_id`.
movie_content = get_movie_content_df()
movie_content.show(5)
movie_content.printSchema()

+----------+--------------------+------------+-----+
|content_id|         description|release_year| type|
+----------+--------------------+------------+-----+
|     43969|Nogreh is a young...|        2003|movie|
|    651102|Since its first p...|        1971|movie|
|     80957|Brian, (Luke Goss...|        2011|movie|
|    936897|Goldy is a spirit...|        2022|movie|
|    146536|A journey back in...|        1986|movie|
+----------+--------------------+------------+-----+
only showing top 5 rows

root
 |-- content_id: long (nullable = true)
 |-- description: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- type: string (nullable = true)



In [34]:
# Upload the movie ratings files to the `processed-data` bucket.
# NOTE(review): files land directly under `movie/`, the same prefix the content upload
# uses; confirm the two datasets are meant to share it.
store_processed_parquet(MOVIE_USERS_PARQUET_PATH, prefix='movie')

1it [00:00, 15.24it/s]


In [35]:
# Upload the movie content files to the `processed-data` bucket (prefix `movie/`,
# shared with the ratings upload — NOTE(review): confirm that is intended).
store_processed_parquet(MOVIE_CONTENT_PARQUET_PATH, prefix='movie')

1it [00:00, 16.44it/s]


In [36]:
# GC had this code to enrich movie_users, I don't see the use yet so I am leaving this commented out.
# movie_users = movie_users.join(movie_content, ['content_id'],'left')
# movie_users.repartition(1).write.mode('overwrite').parquet(MOVIE_CONTENT_PARQUET_PATH)

In [37]:
# schema = ArrayType(
#     StructType([
#         StructField("author", StringType(), True),
#         StructField("author_details", StructType([
#             StructField("rating", StringType(), True)
#         ]), True),
#         StructField("created_at", StringType(), True),
#     ])
# )

# movie_users = spark.read.parquet(MOVIE_BASE_PARQUET_PATH)\
#           .filter(length("results")>2)\
#           .withColumn("results_test", col('results'))\
#           .withColumn("results_parsed", from_json(col("results_test"), schema))\
#           .withColumn("result_exploded", explode(col("results_parsed")))\
#           .withColumn('result_exploded', col("result_exploded").cast(StringType()))

# split_col = split(movie_users['result_exploded'], ', ')

# movie_users = movie_users.withColumn('author', split_col.getItem(0)) \
#            .withColumn('author', expr("substring(author,2, length(author) -1)")) \
#            .withColumn('rating', split_col.getItem(1)) \
#            .withColumn("rating", expr("substring(rating, 2, length(rating) - 2)"))\
#            .withColumn("rating", col('rating').cast(DoubleType()))\
#            .withColumn('rating_date', split_col.getItem(2))\
#            .withColumn('rating_date', expr("substring(rating_date,1, length(rating_date) -1)"))\
#            .withColumn("rating_date", to_date(col("rating_date"), "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))\
#            .select(col('author').alias('user_id'), lit('movie').alias('type'), col('id').alias('content_id').cast(StringType()), 'rating', 'rating_date')

# movie_content = spark.read.parquet(MOVIE_BASE_INFO_PATH)\
#               .select(col('id').alias('content_id'),col('overview').alias('description'), col('release_date').alias('release_year'))\
#               .withColumn('release_year', substring("release_year", 1, 4))

# # movie_content.repartition(1).write.mode('overwrite').parquet("./parsed_data/movies_descript.parquet")

# # Re-lectura

# # dfIni = spark.read.parquet("./parsed_data/movies_user.parquet")
# # dfDesc = spark.read.parquet("./parsed_data/movies_descript.parquet")



## Anime

In [38]:
# Raw inputs (JSON files): anime info and per-user anime lists.
# ANIME_TEMP_PARQUET_PATH holds the intermediate per-batch parquet folders written while
# building the content dataset; the last two paths are the processed outputs.
ANIME_BASE_CONTENT_PATH = './local_data/anime/info'
ANIME_BASE_USERS_PATH = './local_data/anime/user_info'
ANIME_TEMP_PARQUET_PATH = './local_data/anime/temp'
ANIME_USERS_PARQUET_PATH = "./local_data/anime/processed_data/anime_users.parquet"
ANIME_CONTENT_PARQUET_PATH = "./local_data/anime/processed_data/anime_content.parquet"

In [39]:
def create_anime_users_parquet():
    """Turn the per-user anime list JSON files into one ratings parquet dataset.

    Every file in ANIME_BASE_USERS_PATH is read as multi-line JSON (malformed
    records are dropped). The output columns are ``rating_date`` (``updated_at``
    formatted by ``from_unixtime``), ``rating`` (``score``), ``content_id``
    (``anime_id``), ``user_id`` (the source file name without ``.json``) and
    ``type`` (``'anime'``). The result is written to ANIME_USERS_PARQUET_PATH as
    a single partition, overwriting any previous output.
    """
    os.makedirs(ANIME_USERS_PARQUET_PATH, exist_ok=True)

    user_list_files = [
        f'{ANIME_BASE_USERS_PATH}/{file_name}'
        for file_name in os.listdir(ANIME_BASE_USERS_PATH)
    ]

    anime_users = (
        spark.read.json(path=user_list_files, multiLine=True, mode='DROPMALFORMED')
        .withColumn('file_name', input_file_name())
        .select(
            from_unixtime(col('updated_at')).alias('rating_date'),
            col('score').alias('rating'),
            col('anime_id').alias('content_id'),
            # Raw string: the same pattern text as before, without relying on
            # Python passing invalid escape sequences through unchanged.
            regexp_extract(col('file_name'), r'\/([^\/]+)\.json$', 1).alias('user_id'),
        )
        .withColumn('type', lit('anime'))
    )

    anime_users.coalesce(1).write.mode('overwrite').parquet(ANIME_USERS_PARQUET_PATH)

In [40]:
def create_anime_content_parquet():
    """Build the anime content dataset from the raw info JSON files.

    The JSON files in ANIME_BASE_CONTENT_PATH are read in batches of 1000 files
    (multi-line JSON, malformed records dropped) and each batch is written as an
    intermediate parquet folder under ANIME_TEMP_PARQUET_PATH. The batches
    written by this call are then read back together, reduced to
    ``description``, ``title``, ``content_id`` (string), ``release_year`` and
    ``type`` (``'anime'``), and written to ANIME_CONTENT_PARQUET_PATH as a single
    partition, overwriting any previous output.
    """
    os.makedirs(ANIME_CONTENT_PARQUET_PATH, exist_ok=True)
    os.makedirs(ANIME_TEMP_PARQUET_PATH, exist_ok=True)

    # Build the full path list once instead of once per batch.
    anime_files = [
        f'{ANIME_BASE_CONTENT_PATH}/{file_name}'
        for file_name in os.listdir(ANIME_BASE_CONTENT_PATH)
    ]
    batch_size = 1000

    # Remember exactly which batch folders this run produced: listing the temp
    # directory instead would also pick up stale folders left by an earlier run
    # that had more batches.
    batch_paths = []
    for batch_index, start in enumerate(range(0, len(anime_files), batch_size)):
        batch_path = f'{ANIME_TEMP_PARQUET_PATH}/{batch_index}'
        batch_df = (
            spark.read.json(
                path=anime_files[start:start + batch_size],
                multiLine=True,
                mode='DROPMALFORMED'
            )
            .dropna(subset=['data.aired.prop.from.year'])
        )
        batch_df.write.mode('overwrite').parquet(batch_path)
        batch_paths.append(batch_path)

    anime_content = (
        spark.read.parquet(*batch_paths)
        .select(
            col('data.synopsis').alias('description'),
            col('data.title').alias('title'),
            col('data.mal_id').cast(StringType()).alias('content_id'),
            col('data.aired.prop.from.year').alias('release_year')
        )
        .withColumn('type', lit('anime'))
    )
    anime_content.coalesce(1).write.mode('overwrite').parquet(ANIME_CONTENT_PARQUET_PATH)

In [41]:
def get_anime_users_df():
    """Read the parquet data at ANIME_USERS_PARQUET_PATH into a Spark DataFrame."""
    return spark.read.parquet(ANIME_USERS_PARQUET_PATH)

In [42]:
def get_anime_content_df():
    """Read the parquet data at ANIME_CONTENT_PARQUET_PATH into a Spark DataFrame."""
    return spark.read.parquet(ANIME_CONTENT_PARQUET_PATH)

In [43]:
# Build the anime ratings parquet dataset (overwrites any previous output).
create_anime_users_parquet()

In [44]:
# Build the anime content parquet dataset (overwrites any previous output).
create_anime_content_parquet()

In [45]:
# Sanity check: preview five rows and the schema of the anime ratings just written.
# NOTE(review): per the printed schema, `rating_date` is a string and `content_id` a
# long here, unlike the boardgame and movie ratings; confirm consumers normalise them.
anime_users = get_anime_users_df()
anime_users.show(5)
anime_users.printSchema()

+-------------------+------+----------+---------+-----+
|        rating_date|rating|content_id|  user_id| type|
+-------------------+------+----------+---------+-----+
|2023-03-17 00:35:34|     0|       918|Nabil_967|anime|
|2023-08-13 12:15:08|    10|        21|Nabil_967|anime|
|2022-07-05 23:35:51|     0|     48583|Nabil_967|anime|
|2023-07-29 14:19:26|     9|     52034|Nabil_967|anime|
|2022-03-01 19:53:09|     5|     41380|Nabil_967|anime|
+-------------------+------+----------+---------+-----+
only showing top 5 rows

root
 |-- rating_date: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- content_id: long (nullable = true)
 |-- user_id: string (nullable = true)
 |-- type: string (nullable = true)



In [46]:
# Sanity check: preview five rows and the schema of the anime content just written.
anime_content = get_anime_content_df()
anime_content.show(5)
anime_content.printSchema()

+--------------------+--------------------+----------+------------+-----+
|         description|               title|content_id|release_year| type|
+--------------------+--------------------+----------+------------+-----+
|During their ques...|InuYasha Movie 1:...|       452|        2001|anime|
|In the year Cosmi...|Kidou Senshi Gund...|        93|        2002|anime|
|The final battle ...|Sword Art Online:...|     40540|        2020|anime|
|On his way to a c...|Tensei Kizoku no ...|     52608|        2023|anime|
|Awaking to absolu...|Sokushi Cheat ga ...|     53730|        2024|anime|
+--------------------+--------------------+----------+------------+-----+
only showing top 5 rows

root
 |-- description: string (nullable = true)
 |-- title: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- release_year: long (nullable = true)
 |-- type: string (nullable = true)



In [47]:
# Upload the anime ratings files to the `processed-data` bucket.
# NOTE(review): files land directly under `anime/`, the same prefix the content upload
# uses; confirm the two datasets are meant to share it.
store_processed_parquet(ANIME_USERS_PARQUET_PATH, prefix='anime')

1it [00:00, 10.31it/s]


In [48]:
# Upload the anime content files to the `processed-data` bucket (prefix `anime/`,
# shared with the ratings upload — NOTE(review): confirm that is intended).
store_processed_parquet(ANIME_CONTENT_PARQUET_PATH, prefix='anime')

1it [00:00,  9.11it/s]


## Videogames

In [49]:
# Raw inputs (JSON): player summaries, per-player played games, and the game catalogue.
# NOTE(review): the constant names and file names look crossed (SUMMARIES ->
# player_profile.json, PROFILES -> games_played.json); the loader uses the first for
# `personaname` lookups and the second for the per-game playtime lists.
# The last two paths are the processed outputs.
VIDEOGAME_BASE_SUMMARIES_PATH = './local_data/videogame/player_profile.json'
VIDEOGAME_BASE_PROFILES_PATH = './local_data/videogame/games_played.json'
VIDEOGAME_BASE_GAMES_PATH = './local_data/videogame/steam_games.json'
VIDEOGAME_USERS_PARQUET_PATH = "./local_data/videogame/processed_data/v_users.parquet"
VIDEOGAME_CONTENT_PARQUET_PATH = "./local_data/videogame/processed_data/videogame_content.parquet"

In [50]:
def create_videogame_users_parquet():
    """Derive per-user game ratings from playtime and write them as parquet.

    For every profile whose steamid has a non-empty player summary, each game
    with a positive ``playtime_forever`` yields one row keyed by the player's
    ``personaname`` (``'Unknown'`` when absent). The rating is the playtime
    divided by that user's maximum playtime, scaled to 10 and rounded up.
    ``rating_date`` is always null. The result is written to
    VIDEOGAME_USERS_PARQUET_PATH, overwriting any previous output.
    """
    # Read both inputs as UTF-8, matching how the game catalogue is loaded,
    # instead of depending on the platform's default encoding.
    with open(VIDEOGAME_BASE_SUMMARIES_PATH, 'r', encoding='utf-8') as f:
        player_summaries_data = json.load(f)

    with open(VIDEOGAME_BASE_PROFILES_PATH, 'r', encoding='utf-8') as f:
        steam_profiles_data = json.load(f)

    # Index the summaries by steamid once instead of scanning the whole list for
    # every profile; setdefault keeps the first summary seen for an id, which is
    # what a first-match scan returns.
    summary_by_steamid = {}
    for summary in player_summaries_data:
        for summary_steamid in summary:
            summary_by_steamid.setdefault(summary_steamid, summary)

    common_rows = []
    for steam_profile in steam_profiles_data:
        # Each profile entry is keyed by its steamid; skip entries with no key.
        steamid = next(iter(steam_profile), None)
        if steamid is None:
            continue
        games = steam_profile[steamid]
        player_summary = summary_by_steamid.get(steamid)
        # Check if the player summary data is available and not empty
        if not (player_summary and player_summary[steamid]):
            continue
        personaname = player_summary[steamid].get('personaname', 'Unknown')
        for game in games:
            playtime_forever = game['playtime_forever']
            if playtime_forever > 0:  # Skip if playtime_forever is 0
                common_rows.append({
                    'user_id': personaname,
                    'type': 'videogame',
                    'content_id': game['appid'],
                    'temp_rating': playtime_forever,
                })

    common_df = spark.createDataFrame(common_rows)

    # Highest playtime per user, used as the normalisation base.
    max_playtime = common_df.groupBy('user_id').agg(spark_max('temp_rating').alias('max_temp_rating'))

    common_df = (
        common_df
        .join(max_playtime, on='user_id')
        # Scale playtime to (0, 10] relative to the user's own maximum, rounded up.
        .withColumn('rating', ceil((col('temp_rating') / col('max_temp_rating')) * 10))
        .drop('temp_rating', 'max_temp_rating')
        # No rating date is available for playtime-derived ratings.
        .withColumn('rating_date', lit(None).cast('string'))
    )

    common_df.write.mode('overwrite').parquet(VIDEOGAME_USERS_PARQUET_PATH)

In [51]:
def _extract_release_year(release_date):
    """Return the last standalone 4-digit number in ``release_date`` as text, or ''."""
    if not isinstance(release_date, str):
        return ''
    years = re.findall(r'(?<!\d)\d{4}(?!\d)', release_date)
    return years[-1] if years else ''


def create_videogame_content_parquet():
    """Write the videogame content dataset to VIDEOGAME_CONTENT_PARQUET_PATH.

    The game catalogue JSON maps an app id to a dict of game details. Each game
    becomes one row with ``content_id``, ``content_title`` (``name``),
    ``release_year`` (the 4-digit year found in ``release_date``, or ``''``
    when there is none), ``description`` (``detailed_description``) and
    ``type`` (``'videogame'``). A missing or empty catalogue file produces an
    empty dataset. Any previous output is overwritten.
    """
    dataset = {}
    if os.path.exists(VIDEOGAME_BASE_GAMES_PATH):
        with open(VIDEOGAME_BASE_GAMES_PATH, 'r', encoding='utf-8') as fin:
            text = fin.read()
        if len(text) > 0:
            dataset = json.loads(text)

    rows = [
        Row(
            content_id=app_id,
            content_title=game_info.get('name', ''),
            # Taking the last whitespace token would keep non-year text such as
            # "soon" from "Coming soon"; extract an actual 4-digit year instead.
            release_year=_extract_release_year(game_info.get('release_date', '')),
            description=game_info.get('detailed_description', ''),
        )
        for app_id, game_info in dataset.items()
    ]

    if rows:
        df = spark.createDataFrame(rows)
    else:
        # Schema inference needs at least one row, so spell the schema out for
        # the empty case that the missing-file default above allows.
        empty_schema = StructType([
            StructField(column_name, StringType(), True)
            for column_name in ('content_id', 'content_title', 'release_year', 'description')
        ])
        df = spark.createDataFrame([], schema=empty_schema)
    df = df.withColumn('type', lit('videogame'))

    df.write.mode('overwrite').parquet(VIDEOGAME_CONTENT_PARQUET_PATH)
    # Stop Spark session
    # spark.stop()

In [52]:
def get_videogame_users_df():
    """Read the parquet data at VIDEOGAME_USERS_PARQUET_PATH into a Spark DataFrame."""
    return spark.read.parquet(VIDEOGAME_USERS_PARQUET_PATH)

In [53]:
def get_videogame_content_df():
    """Read the Parquet dataset at ``VIDEOGAME_CONTENT_PARQUET_PATH`` and return it as a Spark DataFrame."""
    return spark.read.parquet(VIDEOGAME_CONTENT_PARQUET_PATH)

In [54]:
# Run the videogame users Parquet builder.
create_videogame_users_parquet()

In [55]:
# Run the videogame content Parquet builder (overwrites VIDEOGAME_CONTENT_PARQUET_PATH).
create_videogame_content_parquet()

In [56]:
# Read the users dataset back and preview five rows plus the schema as a sanity check.
videogame_users = get_videogame_users_df()
videogame_users.show(5)
videogame_users.printSchema()

+-------+----------+---------+------+-----------+
|user_id|content_id|     type|rating|rating_date|
+-------+----------+---------+------+-----------+
|   Fooo|       300|videogame|     1|       null|
|   Fooo|      4000|videogame|     1|       null|
|   Fooo|      2600|videogame|     1|       null|
|   Fooo|       220|videogame|     1|       null|
|   Fooo|       500|videogame|     1|       null|
+-------+----------+---------+------+-----------+
only showing top 5 rows

root
 |-- user_id: string (nullable = true)
 |-- content_id: long (nullable = true)
 |-- type: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- rating_date: string (nullable = true)



In [57]:
# Read the content dataset back and preview five rows plus the schema as a sanity check.
videogame_content = get_videogame_content_df()
videogame_content.show(5)
videogame_content.printSchema()

+----------+--------------------+------------+--------------------+---------+
|content_id|       content_title|release_year|         description|     type|
+----------+--------------------+------------+--------------------+---------+
|    837390|        My zero trip|        2018|My zero trip is a...|videogame|
|   1564580|Nevertales: Faryo...|        2021|Mad Head Games re...|videogame|
|    263560|      Paper Sorcerer|        2014|Paper Sorcerer is...|videogame|
|    831230|    Doors Quest Demo|        2018|Doors Quest Demo ...|videogame|
|   1738500|       velvet clouds|        2021|Velvet Clouds - a...|videogame|
+----------+--------------------+------------+--------------------+---------+
only showing top 5 rows

root
 |-- content_id: string (nullable = true)
 |-- content_title: string (nullable = true)
 |-- release_year: string (nullable = true)
 |-- description: string (nullable = true)
 |-- type: string (nullable = true)



In [58]:
# Hand the users Parquet output to store_processed_parquet under the 'videogame' prefix
# (helper defined outside this section; presumably an upload of the processed files — TODO confirm).
store_processed_parquet(VIDEOGAME_USERS_PARQUET_PATH, prefix='videogame')

1it [00:00, 13.48it/s]


In [59]:
# Hand the content Parquet output to store_processed_parquet under the 'videogame' prefix
# (helper defined outside this section; presumably an upload of the processed files — TODO confirm).
store_processed_parquet(VIDEOGAME_CONTENT_PARQUET_PATH, prefix='videogame')

1it [00:00,  1.38it/s]


# Merging all content

In [60]:
# Stack the per-domain rating frames into one table. union() pairs columns by
# position, so each appended frame is first projected to the same column order;
# boardgame_users is used as-is and provides the column names.
merged_users = (
    boardgame_users
    .union(movie_users.select('user_id', 'type', 'content_id', 'rating', 'rating_date'))
    .union(anime_users.select('user_id', 'type', 'content_id', 'rating', 'rating_date'))
    .union(videogame_users.select('user_id', 'type', 'content_id', 'rating', 'rating_date'))
    # Lower-case and trim user ids, lower-case the content type.
    .withColumn('user_id', trim(lower(col('user_id'))))
    .withColumn('type', lower(col('type')))
)

merged_users.printSchema()
merged_users.show()

root
 |-- user_id: string (nullable = true)
 |-- type: string (nullable = true)
 |-- content_id: string (nullable = true)
 |-- rating: double (nullable = true)
 |-- rating_date: string (nullable = true)

+-----------+---------+----------+------+-------------------+
|    user_id|     type|content_id|rating|        rating_date|
+-----------+---------+----------+------+-------------------+
|zefquaavius|boardgame|    322232|   6.0|2023-08-01 14:52:32|
|zefquaavius|boardgame|    296402|   8.0|2023-08-02 14:18:24|
|zefquaavius|boardgame|    336537|  null|2023-08-02 14:18:38|
|zefquaavius|boardgame|    314445|  null|2023-08-02 14:18:54|
|zefquaavius|boardgame|    296404|  null|2023-08-02 14:19:11|
|zefquaavius|boardgame|    296406|  null|2023-08-01 14:56:42|
|zefquaavius|boardgame|    322429|  null|2023-08-02 14:19:32|
|zefquaavius|boardgame|    309782|   6.0|2023-08-01 14:57:01|
|zefquaavius|boardgame|    314446|  null|2023-08-02 14:19:57|
|zefquaavius|boardgame|    296407|  null|2023-08-01 

In [61]:
# Preview the boardgame content frame (column order matters for the positional union below).
boardgame_content.show()

+----------+--------------------+------------+---------+
|content_id|         description|release_year|     type|
+----------+--------------------+------------+---------+
|    189314|Set of seven prom...|        2015|boardgame|
|    157661|Grifters is a han...|        2015|boardgame|
|    174391|Exposed is a quic...|        2016|boardgame|
|    168054|Alone is a sci-fi...|        2019|boardgame|
|    191932|From the official...|        2012|boardgame|
|    181495|The theme of the ...|        2015|boardgame|
|    131581|A quick trivia ga...|        2014|boardgame|
|    210179|Description from ...|        2016|boardgame|
|    168314|One Night Ultimat...|        2015|boardgame|
|    209320|Promotional card ...|        2017|boardgame|
|    163166|From the publishe...|        2015|boardgame|
|    214319|From the publishe...|        2005|boardgame|
|    181613|Dead Man's Draw i...|        2015|boardgame|
|     16620|Jinx is a new Gam...|        2012|boardgame|
|    149155|This is a promoti..

In [62]:
# Preview the anime content frame before it is merged with the other content types.
anime_content.show()

+--------------------+--------------------+----------+------------+-----+
|         description|               title|content_id|release_year| type|
+--------------------+--------------------+----------+------------+-----+
|During their ques...|InuYasha Movie 1:...|       452|        2001|anime|
|In the year Cosmi...|Kidou Senshi Gund...|        93|        2002|anime|
|The final battle ...|Sword Art Online:...|     40540|        2020|anime|
|On his way to a c...|Tensei Kizoku no ...|     52608|        2023|anime|
|Awaking to absolu...|Sokushi Cheat ga ...|     53730|        2024|anime|
|Eager to know why...|Dorei-ku The Anim...|     36525|        2018|anime|
|Unleashing a deva...|Naruto: Shippuude...|      4437|        2008|anime|
|Running into your...|Shinmai Maou no T...|     23233|        2015|anime|
|There exist few h...|Rakudai Kishi no ...|     30296|        2015|anime|
|In the wake of Er...|Shingeki no Kyoji...|     51535|        2023|anime|
|Yukiteru Amano is...|    Mirai Nikki 

In [63]:
# Stack the per-domain content frames into one table. union() pairs columns by
# position, so each appended frame is projected to the same four columns first;
# boardgame_content is used as-is and provides the column names.
merged_content = (
    boardgame_content
    .union(movie_content.select('content_id', 'description', 'release_year', 'type'))
    .union(anime_content.select('content_id', 'description', 'release_year', 'type'))
    .union(videogame_content.select('content_id', 'description', 'release_year', 'type'))
)

merged_content.show()

+----------+--------------------+------------+---------+
|content_id|         description|release_year|     type|
+----------+--------------------+------------+---------+
|    189314|Set of seven prom...|        2015|boardgame|
|    157661|Grifters is a han...|        2015|boardgame|
|    174391|Exposed is a quic...|        2016|boardgame|
|    168054|Alone is a sci-fi...|        2019|boardgame|
|    191932|From the official...|        2012|boardgame|
|    181495|The theme of the ...|        2015|boardgame|
|    131581|A quick trivia ga...|        2014|boardgame|
|    210179|Description from ...|        2016|boardgame|
|    168314|One Night Ultimat...|        2015|boardgame|
|    209320|Promotional card ...|        2017|boardgame|
|    163166|From the publishe...|        2015|boardgame|
|    214319|From the publishe...|        2005|boardgame|
|    181613|Dead Man's Draw i...|        2015|boardgame|
|     16620|Jinx is a new Gam...|        2012|boardgame|
|    149155|This is a promoti..

In [64]:
# Number of rating rows per content type, largest first.
(
    merged_users
    .groupBy('type')
    .count()
    .orderBy(col('count').desc())
    .show()
)

+---------+------+
|     type| count|
+---------+------+
|    anime|388227|
|videogame|289674|
|boardgame|258496|
|    movie|    36|
+---------+------+



In [65]:
# Number of content items per content type, largest first.
(
    merged_content
    .groupBy('type')
    .count()
    .orderBy(col('count').desc())
    .show()
)

+---------+-----+
|     type|count|
+---------+-----+
|videogame|85103|
|    anime|25039|
|boardgame|11226|
|    movie| 1000|
+---------+-----+



In [66]:
# NOTE(review): `stop` is not defined anywhere visible; evaluating it raises NameError,
# which appears to be used as a barrier so that "Run All" halts here — TODO confirm and
# consider an explicit, documented stop instead.
stop

NameError: name 'stop' is not defined

In [None]:
# Number of rating rows per user, most active users first.
(
    merged_users
    .groupBy('user_id')
    .count()
    .orderBy(col('count').desc())
    .show()
)

In [None]:
# For each user id (lower-cased, stripped of every non-alphanumeric character),
# count how many distinct content types it appears in, highest first.
(
    merged_users
    .select(
        trim(regexp_replace(lower(col('user_id')), '[^a-zA-Z0-9]', '')).alias('user_id'),
        'type'
    )
    .distinct()
    .groupBy('user_id')
    .count()
    .orderBy(col('count').desc())
    .show()
)

In [None]:
# Rating rows per content type for the single user id 'daimyo'.
(
    merged_users
    .where(col('user_id') == 'daimyo')
    .groupBy('type')
    .count()
    .show()
)

# Yake

In [None]:
# merged.show()

In [None]:
# boardgame_content.show()

In [None]:
# r = boardgame_content.rdd.map(lambda x: (x[2], get_kw(x[0])))

In [None]:
# spark.createDataFrame(r).show()

In [None]:
def get_kw(text):
    """
    Extract up to five lower-cased YAKE keywords (1- or 2-grams) from ``text``.

    :param text: English text to analyse; may be ``None`` or empty.
    :return: list of keyword strings in the order YAKE returns them,
             or ``[]`` when there is no text to analyse.
    """
    # Rows without a description must not reach the extractor: return early
    # instead of relying on how the library reacts to None / empty input.
    if not text:
        return []

    kw_extractor = yake.KeywordExtractor(
        lan='en',
        n=2,  # Max n-gram size
        top=5  # Number of keywords
    )

    # extract_keywords yields (keyword, score) pairs; only the keyword is kept.
    return [pair[0].lower() if pair else '' for pair in kw_extractor.extract_keywords(text)]


In [None]:
# Frame whose `description` column feeds the keyword extraction in the next cell.
df = boardgame_content

In [None]:
# Pair every content id with the keyword list extracted from its description.
rddK = df.rdd.map(lambda row: (row['content_id'], get_kw(row['description'])))
# Back to a DataFrame with readable column names (note: rddK is a DataFrame from here on).
rddK = spark.createDataFrame(rddK).toDF('content_id', 'keyword')
# Spread the keyword array over five fixed columns, keyword_1 .. keyword_5.
dfK = rddK.select(
    'content_id',
    *[expr(f"keyword[{position}]").alias(f"keyword_{position + 1}") for position in range(5)]
)
dfK.show()

- [ ] connect Spark directly to Neo4j (using the right connector)
- [ ] maybe provide some analytics about the users' profiles

In [None]:
# Check what the RS does for the NULL values.
# - We could impute something, like the average score the user gives.

# Neo4j

In [None]:
# Bolt endpoint passed as the "url" option to the Neo4j Spark connector writes below.
NEO4J_URL = 'bolt://neo4j:7687'

In [None]:
# Neo4j label for the nodes written below; its lower-cased form selects the
# matching rows of merged_content. Deliberately not named `type`, which would
# shadow the built-in type() for every cell executed afterwards.
content_label = 'Boardgame'
(
    merged_content
    .filter(merged_content['type'] == content_label.lower())
    .write
    .format("org.neo4j.spark.DataSource")
    .mode("Append")
    .option("labels", f":{content_label}")
    .option("url", NEO4J_URL)
    .save()
)

In [None]:
# NOTE(review): `STOP` is not defined anywhere visible; evaluating it raises NameError,
# which appears to be used as a barrier so that "Run All" halts here — TODO confirm.
STOP

In [None]:
# spark = (
#     SparkSession
#     .builder
#     .appName("Neo4j-Spark Connector")
#     # .config("spark.jars.packages", "neo4j-contrib:neo4j-spark-connector:5.3.0_for_spark_3")
#     .config("spark.jars.packages", "org.neo4j:neo4j-connector-apache-spark_2.12:5.3.0_for_spark_3")
#     # $SPARK_HOME/bin/pyspark --packages org.neo4j:neo4j-connector-apache-spark_2.12:5.3.0_for_spark_3
#     .config("spark.neo4j.bolt.url", NEO4J_URL)
#     # .config("spark.neo4j.bolt.url", "bolt://neo4j")
#     # .config("spark.neo4j.bolt.user", "neo4j")
#     # .config("spark.neo4j.bolt.password", "password")
#     .getOrCreate()
# )

In [None]:
# NOTE(review): nodes_df is assigned in the following cell; unless it is also defined
# earlier in the notebook, this cell fails on a fresh kernel run from top to bottom.
nodes_df.show()

In [None]:
# Two-row sample frame used to try out the Neo4j Spark connector.
nodes_df = spark.createDataFrame(
    [(1, "Alice"), (2, "Bob")],
    ['id', 'name']
)

# relationships_df = spark.createDataFrame([
#     Row(src=1, dst=2, relationship="KNOWS")
# ])

# Append the sample rows to Neo4j as :Person nodes.
person_writer = nodes_df.write.format("org.neo4j.spark.DataSource")
(
    person_writer
    # .mode("Overwrite")
    .mode("Append")
    .option("labels", ":Person")
    .option("url", NEO4J_URL)
    .save()
)

In [None]:
import neo4j

In [None]:
# Shared Neo4j driver used by execute() below.
DRIVER = neo4j.GraphDatabase.driver(uri="neo4j://neo4j")


def execute(query: str):
    """
    Run the Cypher @query through the shared DRIVER and hand back whatever the driver returns.
    """
    return DRIVER.execute_query(query)

In [None]:
execute('MATCH (n1)-[r]->(n2) RETURN r, n1, n2 LIMIT 25')

In [None]:
import os
import json
import random

import neo4j
import yake
import numpy as np
from tqdm import tqdm

# Seed applied to both `random` and NumPy below (both are used by link_reviewer_to_paper).
SEED = 13
# Directory holding one JSON file per paper; the commented line is the alternative data set.
# SEMANTIC_PATH = '../semanticscholar_raw_data'
SEMANTIC_PATH = '../small_sample'
# Placeholder returned by parse_journal_name when a paper has no usable journal.
DEFAULT_JOURNAL_NAME = 'Unknown'

random.seed(SEED)
np.random.seed(SEED)

# NOTE(review): rebinds DRIVER; an earlier cell creates it against "neo4j://neo4j",
# this one targets "neo4j://localhost" — confirm which host is intended.
DRIVER = neo4j.GraphDatabase.driver(uri="neo4j://localhost")


def execute(query: str):
    """
    Run the Cypher @query through the shared DRIVER and hand back whatever the driver returns.
    """
    return DRIVER.execute_query(query)


def delete_graph() -> None:
    """
    Wipe the graph: detach-delete every node, which removes the edges as well.
    """
    execute("""
        MATCH (n)
        DETACH DELETE n;
    """)


def parse_journal_name(paper) -> str:
    """
    Return the journal name of a paper with all single and double quotes removed.

    Not every file has a usable 'journal' field in the json. `DEFAULT_JOURNAL_NAME`
    is returned when the field is missing or empty, when the journal has no
    'name' key, or when that name is null.

    :param paper: dict parsed from one paper JSON file.
    :return: the cleaned journal name, or `DEFAULT_JOURNAL_NAME`.
    """
    journal = paper.get('journal')
    if not journal:
        return DEFAULT_JOURNAL_NAME

    name = journal.get('name')
    # A null name cannot be cleaned with .replace(); treat it like a missing one.
    if name is None:
        return DEFAULT_JOURNAL_NAME

    # Quotes are dropped because the name is later spliced into quoted Cypher strings.
    return name.replace("'", '').replace('"', '')


def sanitize_abstract(abstract: str) -> str:
    """
    Make an abstract safe to splice into a double-quoted Cypher string.

    Double quotes become single quotes, then every backslash is doubled.
    Falsy input (None or '') is handed back untouched.
    """
    if not abstract:
        return abstract

    without_double_quotes = abstract.replace('"', "'")
    return without_double_quotes.replace('\\', '\\\\')


def create_papers():
    """
    Create one node of label `Paper` per JSON file found in `SEMANTIC_PATH`.

    Each node stores the paper id, a cleaned title, the year (-1 when absent),
    the publication date ('1970-01-01' when absent), the sanitized abstract and
    up to five lower-cased YAKE keywords extracted from the abstract.

    A statement that fails is reported (error and query text) and skipped, so one
    bad paper does not abort the import. Only `Exception` is caught: Ctrl-C and
    interpreter shutdown still stop the loop.
    """
    # This is used to extract the keywords from the abstract.
    kw_extractor = yake.KeywordExtractor(
        lan='en',
        n=3,  # Max n-gram size
        top=5  # Number of keywords
    )

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(f'{SEMANTIC_PATH}/{fname}') as f:
            paper = json.loads(f.read())

        # Backslashes are dropped and double quotes replaced so the title fits
        # inside the double-quoted Cypher string below.
        title = paper['title'].replace('\\', '').replace('"', "'")
        # Papers without an abstract get an empty keyword list.
        keywords = kw_extractor.extract_keywords(paper['abstract']) if paper['abstract'] else ''
        keywords = list(map(lambda x: str.lower(x[0]) if x else '', keywords))

        # Properties considered but currently not stored:
        # publication_venue: "{paper['publicationVenue']}",
        # venue: "{paper['venue']}",
        # fieldsOfStudy: {paper['fieldsOfStudy'] if paper['fieldsOfStudy'] else '[]'},
        # NOTE(review): the statement is assembled by string formatting; the
        # keyword list is rendered with Python's list repr.
        query = f"""
        CREATE (n:Paper {{
            paper_id: "{paper['paperId']}",
            title: "{title}",
            year: toInteger({paper['year'] if paper['year'] else -1}),
            publicationDate: date("{paper['publicationDate'] if paper['publicationDate'] else '1970-01-01'}"),
            abstract: "{sanitize_abstract(paper['abstract'])}",
            keywords: {keywords}
        }})
        """
        try:
            execute(query)
        except Exception as exc:
            print(f'Failed to create the Paper node for {fname}: {exc!r}')
            print(query)


def create_paper__paper_id__range_index():
    """
    Create, if it does not exist yet, the range index on `Paper.paper_id`.
    """
    execute("""
        CREATE RANGE INDEX paper__paper_id__range_index IF NOT EXISTS
        FOR (n:Paper)
        ON (n.paper_id)
    """)


def create_author__author_id__range_index():
    """
    Create, if it does not exist yet, the range index on `Author.author_id`.
    """
    execute("""
        CREATE RANGE INDEX author__author_id__range_index IF NOT EXISTS
        FOR (n:Author)
        ON (n.author_id)
    """)


def create_authors() -> None:
    """
    For each paper, generate a node with label `Author` for every listed author.
    We are using the MERGE here since we don't want to duplicate authors.

    The name and id are sent as query parameters rather than spliced into the
    Cypher text, so names containing quotes or backslashes can neither break
    nor alter the statement.
    """
    query = """
        MERGE (n:Author {
            name: $name,
            author_id: $author_id
        })
    """

    for fname in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(f'{SEMANTIC_PATH}/{fname}') as f:
            paper = json.loads(f.read())

        for author in paper['authors']:
            DRIVER.execute_query(
                query,
                # str() keeps both properties as strings (a missing id is stored
                # as 'None'), matching the quoted ids the linking queries use.
                parameters_={
                    'name': str(author['name']),
                    'author_id': str(author['authorId']),
                },
            )


def link_author_to_paper() -> None:
    """
    Link Authors to Papers with `Wrote` edges, plus one `IsCorrespondingAuthor` edge per paper.
    The author listed first is treated as the corresponding author.
    """
    for filename in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(f'{SEMANTIC_PATH}/{filename}') as handle:
            paper = json.load(handle)

            for position, author in enumerate(paper['authors']):
                author_id = author['authorId']

                if position == 0:
                    # Only the first listed author gets the corresponding-author edge.
                    execute(f"""
                        MATCH (a:Author {{author_id: '{author_id}'}})
                        WITH a
                        MATCH (p:Paper {{paper_id: '{paper['paperId']}'}})
                        WITH a, p
                        CREATE (a)-[e:IsCorrespondingAuthor]->(p);
                    """)

                # Every author, the first one included, gets a Wrote edge.
                execute(f"""
                    MATCH (a:Author {{author_id: '{author_id}'}})
                    WITH a
                    MATCH (p:Paper {{paper_id: '{paper['paperId']}'}})
                    WITH a, p
                    CREATE (a)-[e:Wrote]->(p);
                """)


def link_citations_between_papers() -> None:
    """
    Create the `Cites` edges between Papers.

    For every entry in a paper's 'citations' list, an edge is drawn from the
    listed paper to the paper whose file is being read.
    """
    for filename in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(f'{SEMANTIC_PATH}/{filename}') as handle:
            paper = json.load(handle)

        for citation in paper.get('citations', []):
            execute(f"""
                MATCH (a:Paper {{paper_id: '{citation['paperId']}'}})
                WITH a
                MATCH (p:Paper {{paper_id: '{paper['paperId']}'}})
                CREATE (a)-[e:Cites]->(p);
            """)


def create_journals() -> None:
    """
    Merge a `Journal` node for every paper file that names a journal.

    Nodes are identified by (name, year); a paper without a year uses -1.
    Papers whose journal resolves to `DEFAULT_JOURNAL_NAME` are skipped.
    """
    for filename in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(f'{SEMANTIC_PATH}/{filename}') as handle:
            paper = json.load(handle)

        journal_name = parse_journal_name(paper)

        if journal_name != DEFAULT_JOURNAL_NAME:
            year = paper['year'] or -1
            execute(f"""
                MERGE (n:Journal {{
                    year: toInteger({year}),
                    name: "{journal_name}"
                }})
            """)


def link_journals()-> None:
    """
    Create the `PublishedIn` edge from each Paper to the Journal node with the
    same name and year (-1 when the paper has no year).
    """
    for filename in tqdm(os.listdir(SEMANTIC_PATH)):
        with open(f'{SEMANTIC_PATH}/{filename}') as handle:
            paper = json.load(handle)

        paper_id = paper['paperId']
        journal_name = parse_journal_name(paper)
        year = paper['year'] or -1
        execute(f"""
            MATCH (p:Paper {{paper_id: '{paper_id}'}})
            WITH p
            MATCH (j:Journal {{name: '{journal_name}', year: toInteger({year})}})
            WITH p, j
            CREATE (p)-[e:PublishedIn]->(j);
        """)


def change_to_conference() -> None:
    """
    Relabel `Journal` nodes as `ConfWork` when their lower-cased name matches
    'conference', 'workshop' or 'proc.'.
    """
    # The backslash is written as `\\` so Python does not see the invalid escape
    # sequence `\.`; the text sent to the server is unchanged (one backslash
    # followed by a dot).
    # NOTE(review): Cypher also treats the backslash as an escape character inside
    # string literals — confirm the server receives the intended regular expression.
    query = """
        MATCH (j:Journal)
        WHERE toLower(j.name) =~ '.*conference.*'
           OR toLower(j.name) =~ '.*workshop.*'
           OR toLower(j.name) =~ '.*proc\\..*'
        REMOVE j:Journal
        SET j:ConfWork
    """

    execute(query)


def get_possible_reviewers():
    """
    Auxiliary query: for each paper, collect the ids of its possible reviewers.

    A possible reviewer of a paper is an author who:
    1. wrote at least one paper that the paper in question cites; and
    2. did not write the paper in question.

    Returns the result of `execute` unchanged.
    """
    return execute("""
        MATCH (a:Author)-[w1:Wrote]->(mp:Paper)-[c:Cites]->(cp:Paper)
        WITH mp, cp, a
        MATCH (wcp:Author)-[w2:Wrote]->(cp)
        WHERE NOT (wcp)-[:Wrote]->(mp)
        RETURN mp.paper_id AS paper_id, collect(wcp.author_id) AS possible_reviewer_ids;
    """)


def link_reviewer_to_paper() -> None:
    """
    Generate synthetic `Reviewed` edges from randomly chosen possible reviewers to papers.
    """
    candidates_per_paper = get_possible_reviewers()[0]

    for paper_id, possible_reviewers in tqdm(candidates_per_paper):
        # Draw a reviewer count between 1 and 4 with the probabilities in `p`,
        # capped by the number of candidates actually available for the paper
        # (so a paper with no candidates gets no reviewer).
        drawn_qty = np.random.choice(np.arange(1, 5), p=[0.1, 0.3, 0.5, 0.1])
        reviewer_qty = min(drawn_qty, len(possible_reviewers))

        for reviewer in random.sample(possible_reviewers, reviewer_qty):
            execute(f"""
                MATCH (a:Author {{author_id: '{reviewer}'}})
                WITH a
                MATCH (p:Paper {{paper_id: '{paper_id}'}})
                CREATE (a)-[e:Reviewed]->(p);
            """)


if __name__ == '__main__':
    # End-to-end run: wipe the graph, load it from the JSON files in SEMANTIC_PATH,
    # then issue the analysis queries. The query results are not stored or printed.
    print("Deleting graph")
    delete_graph()
    print("Papers")
    create_papers()
    create_paper__paper_id__range_index()
    print("Authors")
    create_author__author_id__range_index()
    create_authors()
    print("Wrote")
    link_author_to_paper()
    print("Citations")
    link_citations_between_papers()
    print("Journals/Conferences")
    create_journals()
    print("Linking journals")
    link_journals()
    change_to_conference()
    print("Reviewers")
    link_reviewer_to_paper()


    print("Querying...")

    # The loader creates citation edges with type `Cites`; relationship types are
    # case-sensitive, so the pattern must use exactly that type to match anything.
    # NOTE(review): Paper nodes are created without a `name` property, so `p.name`
    # is null here — confirm what the grouping key is meant to be.
    print("Query 1")
    execute("""
MATCH (p:Paper)-[:Cites]->(cited:Paper) WITH p.name AS journal, p.title AS title, COUNT(*) AS num_citations ORDER BY journal, num_citations DESC WITH journal, COLLECT({title: title, num_citations: num_citations}) AS papers WITH journal, papers, [i IN RANGE(1, SIZE(papers)) | i] AS ranks UNWIND ranks AS rank WITH journal, papers[rank - 1].title AS title, papers[rank - 1].num_citations AS num_citations, rank WHERE rank <= 3 RETURN journal, title, num_citations, rank ORDER BY journal, rank
    """)

    print("Query 2")
    execute("""
MATCH (a:Author)-[:Wrote]->(p:Paper)-[:PublishedIn]->(c:ConfWork)
WITH a.name AS author, collect(DISTINCT c.year) AS years, c.name AS conference
WHERE size(years) > 3
RETURN author, years, conference
ORDER BY author, conference
    """)

    print("Query 3")
    execute("""
MATCH (citing_paper:Paper)-[:Cites]->(published_paper:Paper {year: j.year})-[:PublishedIn]->(j:Journal)
WITH COUNT(DISTINCT citing_paper) AS total_citations, j.name AS journal_name, j AS j1
MATCH (j2: Journal)<-[:PublishedIn]-(p:Paper)
WHERE j2.year IN [j1.year - 1, j1.year - 2]
      AND j1.name = j2.name
WITH j1.year AS year,
     COUNT(p.title) AS past_publications,
     j1.name AS journal_name,
     total_citations
RETURN year, journal_name, total_citations, past_publications, 1.0 * total_citations / past_publications
ORDER BY journal_name, year;
    """)

    # Uses the `Cites` type for the same reason as Query 1.
    # NOTE(review): the pattern follows the edges leaving `p`; confirm whether
    # outgoing or incoming citations are meant to be counted for the h-index.
    print("Query 4")
    execute("""
MATCH (a:Author)-[:Wrote]->(p:Paper)-[:Cites]->(cited:Paper) WITH a, p, COUNT(*) AS num_citations ORDER BY num_citations DESC WITH a, COLLECT(num_citations) AS citation_counts WITH a, [i IN RANGE(1, SIZE(citation_counts)) | CASE WHEN citation_counts[i - 1] >= i THEN i ELSE 0 END] AS h_values WITH a, MAX(h_values) AS h_index WITH a, MAX(REDUCE(s = 0, h IN h_index | CASE WHEN h > s THEN h ELSE s END)) AS max_h_index RETURN a.author_id AS author_id, a.name AS author_name, max_h_index
    """)

    print("Recommendation system")
    print("Part 1")
    execute("""
// First we are looking for papers containing any of those keywords.
MATCH (p:Paper)
WHERE
    // Could've been an array intersection, but APOC was giving us some setup issues.
    'data management' IN p.keywords
    OR 'indexing' IN p.keywords
    OR 'data modeling' IN p.keywords
    OR 'big data' IN p.keywords
    OR 'data processing' IN p.keywords
    OR 'data storage' IN p.keywords
    OR 'data querying' IN p.keywords
RETURN *
    """)

    print("Part 2")
    execute("""
// Now we want the conferences or journals with at least 90% of published papers being related to databases.
MATCH (p:Paper)-[:PublishedIn]->(jc)
WITH p, (
        'data management' IN p.keywords
        OR 'indexing' IN p.keywords
        OR 'data modeling' IN p.keywords
        OR 'big data' IN p.keywords
        OR 'data processing' IN p.keywords
        OR 'data storage' IN p.keywords
        OR 'data querying' IN p.keywords
    ) AS in_db_community,
    jc
WITH COUNT(p) AS total_published_papers, SUM(CASE in_db_community WHEN TRUE THEN 1 ELSE 0 END) AS db_comm_papers, jc.name AS jc_name
WHERE 100.0 * db_comm_papers / total_published_papers > 90.0
RETURN total_published_papers, db_comm_papers, 100.0 * db_comm_papers / total_published_papers AS percentage_of_db_papers, jc_name
LIMIT 50
    """)

    print("Part 3")
    execute("""
// Let's now grab the top 100 most cited papers in the Database community.
MATCH (p:Paper)-[:PublishedIn]->(jc)
WITH p, (
        'data management' IN p.keywords
        OR 'indexing' IN p.keywords
        OR 'data modeling' IN p.keywords
        OR 'big data' IN p.keywords
        OR 'data processing' IN p.keywords
        OR 'data storage' IN p.keywords
        OR 'data querying' IN p.keywords
    ) AS in_db_community,
    jc
WITH COUNT(p) AS total_published_papers, SUM(CASE in_db_community WHEN TRUE THEN 1 ELSE 0 END) AS db_comm_papers, jc.name AS jc_name, jc
WHERE 100.0 * db_comm_papers / total_published_papers > 90.0
WITH collect(jc.name) AS db_comm_conferences

MATCH (citing_paper:Paper)-[:Cites]->(cited_paper:Paper)-[:PublishedIn]->(jc1), (citing_paper)-[:PublishedIn]->(jc2)
WHERE jc1.name IN db_comm_conferences
  AND jc2.name IN db_comm_conferences
WITH cited_paper, jc1, COUNT(DISTINCT citing_paper) AS c
RETURN c, jc1.name, cited_paper.title
ORDER BY c DESC
LIMIT 100
    """)

    # NOTE(review): the `[1..100]` slice starts at index 1 and the collected list is
    # not ordered by citation count — confirm that this selects the intended papers.
    print("Part 4")
    execute("""
// Now, we will find the gurus of the community.
MATCH (p:Paper)-[:PublishedIn]->(jc)
WITH p, (
        'data management' IN p.keywords
        OR 'indexing' IN p.keywords
        OR 'data modeling' IN p.keywords
        OR 'big data' IN p.keywords
        OR 'data processing' IN p.keywords
        OR 'data storage' IN p.keywords
        OR 'data querying' IN p.keywords
    ) AS in_db_community,
    jc
WITH COUNT(p) AS total_published_papers, SUM(CASE in_db_community WHEN TRUE THEN 1 ELSE 0 END) AS db_comm_papers, jc.name AS jc_name, jc
WHERE 100.0 * db_comm_papers / total_published_papers > 90.0
WITH collect(jc.name) AS db_comm_conferences

MATCH (citing_paper:Paper)-[:Cites]->(cited_paper:Paper)-[:PublishedIn]->(jc1), (citing_paper)-[:PublishedIn]->(jc2)
WHERE jc1.name IN db_comm_conferences
  AND jc2.name IN db_comm_conferences
WITH cited_paper, jc1, COUNT(DISTINCT citing_paper) AS c
WITH COLLECT(cited_paper.paper_id)[1..100] AS most_cited_papers// UNWIND most_cited_papers AS most_cited_paper

MATCH (p1:Paper)<-[:Wrote]-(a:Author)-[:Wrote]->(p2:Paper)
WHERE p1 <> p2
AND p1.paper_id IN most_cited_papers
AND p2.paper_id IN most_cited_papers
RETURN a
LIMIT 100
    """)


    # Each GDS section drops its named projection first (FALSE = do not fail when
    # it does not exist), projects it again, then streams the algorithm over it.
    print("Graph Algorithms")
    print("Article Rank")
    execute("""
CALL gds.graph.drop('part_d_1', FALSE);
    """)


    execute("""
CALL gds.graph.project(
  'part_d_1',
  'Paper',
  ['Cites', 'PublishedIn']
);
    """)


    execute("""
CALL gds.articleRank.stream('part_d_1')
YIELD nodeId, score
RETURN gds.util.asNode(nodeId).title AS name, score
ORDER BY score DESC, name ASC;
    """)

    print("Node Similarity")
    execute("""
CALL gds.graph.drop('part_d_2', FALSE);
    """)


    execute("""
CALL gds.graph.project(
  'part_d_2',
  ['Author', 'Paper'],
  ['Wrote', 'IsCorrespondingAuthor']
);
    """)


    execute("""
CALL gds.nodeSimilarity.stream('part_d_2')
YIELD node1, node2, similarity
RETURN gds.util.asNode(node1).name AS Author_1, gds.util.asNode(node2).name AS Author_2, similarity
ORDER BY similarity DESC, Author_1, Author_2
    """)


    # Examples of extending the graph with a new node label and an edge carrying properties.
    print("Evolving the graph examples")
    execute("""
// Adding an affiliation
MATCH (a:Author {author_id: '2174735571'})
CREATE (a)-[:Affiliated]->(:University {name: 'UPC'});
    """)

    execute("""
// Adding a new review
MATCH (a:Author {author_id: '2174735571'}) WITH a
MATCH (p:Paper {paper_id: '1da6ce9007a17c60697ca563419d7cc7949ab639'})
CREATE (a)-[:Reviewed {review_text: 'Some comments about xyz...', accepted: TRUE}]->(p);
    """)