# ETL para Redis

## Instalações

In [1]:
%pip install tqdm
%pip install pyarrow

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Importações

In [2]:
import os
import sys
from tqdm import tqdm

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

from neo4j_consulta import generate_recommendations

## Constantes

In [3]:
# Root folder holding the CSV files, relative to this notebook.
DATASET_PATH = '../../dataset'

# Each input file lives directly under DATASET_PATH.
USER_PATH, USER_FILTERED_PATH, ANIME_FILTERED_PATH = (
    os.path.join(DATASET_PATH, file_name)
    for file_name in (
        'users-details-2023.csv',
        'user-filtered.csv',
        'anime-filtered.csv',
    )
)

## Código para resolver um problema de versão encontrado pelo PySpark

A resolução do erro foi encontrada em uma resposta no [StackOverflow](https://stackoverflow.com/a/65010346)

In [4]:
# Point both PySpark interpreter variables at the interpreter running this
# kernel (the workaround for the version problem described above).
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

## Configurando inicialização do "conector" Redis Spark

In [5]:
# Create (or reuse) the session with the locally built spark-redis jar on spark.jars.
spark = (
    SparkSession.builder
    .config("spark.jars", "../../spark-redis/target/spark-redis_2.12-3.1.0-SNAPSHOT-jar-with-dependencies.jar")
    .getOrCreate()
)

## Lendo o arquivo users-details-2023.csv

In [6]:
# Read the user details CSV (header row, inferred schema, quoted multi-line
# fields) and keep only the columns used to build the per-user aggregates.
df_user = (
    spark.read
    .csv(USER_PATH, header=True, inferSchema=True, multiLine=True, escape='"')
    .select(['mal ID', 'username', 'gender', 'days watched', 'completed', 'dropped', 'location'])
)

## Lendo o arquivo anime-filtered.csv

In [7]:
# Read the anime CSV and keep only the id -> name columns.
df_anime_filtered = (
    spark.read
    .csv(ANIME_FILTERED_PATH, header=True, inferSchema=True, multiLine=True, escape='"')
    .select(['anime_id', 'name'])
)

## Lendo o arquivo user-filtered.csv

In [8]:
# Read the user/anime CSV and keep only the (anime_id, user_id) pairs.
df_user_filtered = (
    spark.read
    .csv(USER_FILTERED_PATH, header=True, inferSchema=True, multiLine=True, escape='"')
    .select(['anime_id', 'user_id'])
)

## Fazendo join entre anime-filtered.csv e user-filtered.csv

In [9]:
# Attach each anime's name to the (anime_id, user_id) rows through the shared anime_id column.
df_consumed_animes = df_user_filtered.join(df_anime_filtered, on='anime_id', how='inner')

## Gerando os agregados

### Coletando as recomendações para os usuários

In [10]:
# generate_recommendations comes from the local neo4j_consulta module.
# The result is indexed by user id later in this notebook; per the sample
# output, each entry is a list of {'id', 'name'} dicts.
recommendations = generate_recommendations()

In [11]:
# Spot check: the recommendations stored under user id 148.
print(recommendations[148])

[{'id': 1735, 'name': 'Naruto: Shippuuden'}, {'id': 1535, 'name': 'Death Note'}, {'id': 5114, 'name': 'Fullmetal Alchemist: Brotherhood'}]


### Criando os agregados iniciais

In [12]:
# Per-user aggregate, keyed by the user's 'mal ID'.
agregados = {}

# Every user row is pulled to the driver so the aggregates can be built in plain Python.
users = df_user.collect()
qnt_users = len(users)

for user in tqdm(users, total=qnt_users):
    user_id = user['mal ID']

    # Profile columns are copied as-is, in this fixed order.
    profile = {
        field: user[field]
        for field in ('username', 'gender', 'days watched', 'completed', 'dropped', 'location')
    }

    # Recommendations are optional: the key is only present for users that have them.
    if user_id in recommendations:
        profile['recomendations'] = recommendations[user_id]

    # Placeholder; a later cell fills it from the watched-anime aggregation.
    profile['watched animes'] = []

    agregados[user_id] = profile

100%|██████████| 731290/731290 [00:11<00:00, 62442.57it/s] 


In [13]:
# Spot check: aggregate for user id 1 ('watched animes' starts out as an empty list).
agregados[1]

{'username': 'Xinil',
 'gender': 'Male',
 'days watched': 142.3,
 'completed': 233.0,
 'dropped': 93.0,
 'location': 'California',
 'recomendations': [{'id': 5114, 'name': 'Fullmetal Alchemist: Brotherhood'},
  {'id': 9253, 'name': 'Steins;Gate'},
  {'id': 2904, 'name': 'Code Geass: Hangyaku no Lelouch R2'}],
 'watched animes': []}

## Adicionando os últimos animes assistidos aos usuários

In [15]:
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import *

@pandas_udf(
    ArrayType(StructType([
        StructField('id', IntegerType()),
        StructField('name', StringType()),
        StructField('img', StringType()),
    ])),
    PandasUDFType.GROUPED_AGG,
)
def last_3_udf(anime_ids, anime_names):
    """Grouped-aggregate UDF: collect the last three (id, name) pairs of a group.

    Args:
        anime_ids: anime ids of one group (presumably a pandas Series -- TODO confirm).
        anime_names: anime names of the same group, aligned with ``anime_ids``.

    Returns:
        A list of at most three ``{'id', 'name', 'img'}`` dicts. ``'img'`` is
        always the placeholder string ``'img-path'``.

    NOTE(review): "last" means the last values as they arrive in the group; no
    ordering is applied here, so which three are picked depends on row order.
    """
    return [
        {'id': anime_id, 'name': anime_name, 'img': 'img-path'}
        for anime_id, anime_name in zip(anime_ids[-3:], anime_names[-3:])
    ]

# One row per user_id, holding the array built by last_3_udf for that user.
# The aggregate is left un-aliased, so the column keeps its generated name.
# collect() brings every row back to the driver.
results = (
    df_consumed_animes
    .groupby('user_id')
    .agg(last_3_udf(df_consumed_animes.anime_id, df_consumed_animes.name))
    .collect()
)

325770




In [20]:
# Copy each user's aggregated anime list into the matching entry of `agregados`.
#
# The aggregate in `results` is not aliased, so the column is named after the
# UDF that produced it. That UDF is `last_3_udf`; there is no `min_udf` in this
# notebook, so the old 'min_udf(anime_id, name)' key fails on a fresh kernel.
last_animes_col = 'last_3_udf(anime_id, name)'

for result in results:
    watcher_id = result['user_id']

    # Users that appear only in the watch data (not in df_user) are skipped.
    if watcher_id in agregados:
        # Convert the Spark Row structs into plain dicts.
        agregados[watcher_id]['watched animes'] = [
            {'id': row['id'], 'name': row['name'], 'img': row['img']}
            for row in result[last_animes_col]
        ]

In [23]:
# Spot check: aggregate for user id 3 after the watched-anime lists were merged in.
agregados[3]

{'username': 'Aokaado',
 'gender': 'Male',
 'days watched': 68.6,
 'completed': 137.0,
 'dropped': 44.0,
 'location': 'Oslo, Norway',
 'recomendations': [{'id': 32281, 'name': 'Kimi no Na wa.'},
  {'id': 9253, 'name': 'Steins;Gate'},
  {'id': 28851, 'name': 'Koe no Katachi'}],
 'watched animes': [{'id': 16524,
   'name': 'Suisei no Gargantia',
   'img': 'img-path'},
  {'id': 25, 'name': 'Sunabouzu', 'img': 'img-path'},
  {'id': 759, 'name': 'Tokyo Godfathers', 'img': 'img-path'}]}

### Transformando agregados de dicionário para DataFrame Spark

### Salvando os dados no Redis