In [1]:
import datetime
import json
import os

import requests

In [2]:
class Collector:
    """Download a paginated REST endpoint and store every page as a JSON file.

    Each successfully fetched page is written to
    ``./datalake/pokemon/<instance>/<timestamp>.json``, where ``<instance>``
    is the last path segment of the endpoint URL.

    Attributes:
        url: Endpoint URL that is requested.
        instance: Last path segment of ``url``; used as the output sub-folder.
        timeout: Seconds to wait for the server before ``requests`` gives up.
    """

    def __init__(self, url, timeout=30) -> None:
        """Store the endpoint URL and derive the output sub-folder name.

        Args:
            url: Endpoint URL to collect from.
            timeout: Timeout in seconds passed to ``requests.get``. Without a
                timeout a stalled connection would block the notebook forever.
        """
        self.url = url
        # The instance is the final segment of the endpoint path.
        self.instance = url.strip("/").split("/")[-1]
        self.timeout = timeout

    # extract
    def get_endpoint(self, **kwargs):
        """Send a GET request to the endpoint.

        Args:
            **kwargs: Sent as query-string parameters (e.g. ``limit``, ``offset``).

        Returns:
            The ``requests`` response object, whatever its status code.
        """
        return requests.get(self.url, params=kwargs, timeout=self.timeout)

    # ingestion
    def save_data(self, data):
        """Stamp ``data`` with the ingestion time and write it to the datalake.

        The dict is modified in place: an ``ingestion_date`` key is added.

        Args:
            data: JSON-serialisable dict to persist.
        """
        # Read the clock once so the file name and the stored timestamp agree.
        now = datetime.datetime.now()
        data['ingestion_date'] = now.strftime("%Y-%m-%d %H:%M:%S")

        target_dir = f"./datalake/pokemon/{self.instance}"
        # Create the folder on first use so a fresh checkout can run the notebook.
        os.makedirs(target_dir, exist_ok=True)

        stamp = now.strftime("%Y%m%d_%H%M%S.%f")
        file_name = f"{target_dir}/{stamp}.json"
        with open(file_name, "w", encoding="utf-8") as opened_file:
            json.dump(data, opened_file)

    def get_and_save(self, **kwargs):
        """Fetch one page and persist it when the request succeeds.

        Args:
            **kwargs: Query-string parameters forwarded to ``get_endpoint``.

        Returns:
            The parsed JSON payload (including the added ``ingestion_date``)
            when the status code is 200, otherwise an empty dict.
        """
        resp = self.get_endpoint(**kwargs)
        if resp.status_code != 200:
            return {}
        data = resp.json()
        self.save_data(data)
        return data

    def auto_exec(self, limit=100):
        """Walk the endpoint page by page until there is no next page.

        The current offset is printed before each request. The loop also stops
        when a request fails, because ``get_and_save`` then returns an empty
        dict that has no ``next`` entry.

        Args:
            limit: Page size; also the amount the offset advances per page.
        """
        offset = 0
        while True:
            print(offset)
            data = self.get_and_save(limit=limit, offset=offset)

            # Leave the loop when the API reports no next page, or when the
            # request failed and an empty dict came back.
            if data.get("next") is None:
                break
            offset += limit


In [3]:
# Endpoint to collect; Collector uses its last path segment ("pokemon")
# as the sub-folder name for the saved pages.
url = "https://pokeapi.co/api/v2/pokemon"
collector = Collector(url)
# Page through the endpoint with the default page size of 100;
# each number printed below is the offset being requested.
collector.auto_exec()

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300


Spark

In [4]:
from pyspark.sql import SparkSession

# Start a SparkSession for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("Pokemon")
    .getOrCreate()
)

# Display DataFrames eagerly in the notebook, the way pandas does
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

# Read the JSON files under the collector's output folder into a DataFrame
df = spark.read.json('./datalake/pokemon/pokemon')

df.show()

24/09/28 18:42:36 WARN Utils: Your hostname, everton-desktop resolves to a loopback address: 127.0.1.1; using 192.168.15.8 instead (on interface enp4s0)
24/09/28 18:42:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/28 18:42:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


+-----+------------------+--------------------+--------------------+--------------------+
|count|    ingestion_date|                next|            previous|             results|
+-----+------------------+--------------------+--------------------+--------------------+
| 1302|2024-09-28 18:4235|https://pokeapi.c...|https://pokeapi.c...|[{darumaka-galar,...|
| 1302|2024-09-28 18:4234|https://pokeapi.c...|https://pokeapi.c...|[{metagross-mega,...|
| 1302|2024-09-28 18:4234|https://pokeapi.c...|https://pokeapi.c...|[{wo-chien, https...|
| 1302|2024-09-28 18:4234|https://pokeapi.c...|https://pokeapi.c...|[{ursaluna, https...|
| 1302|2024-09-28 18:4233|https://pokeapi.c...|https://pokeapi.c...|[{magearna, https...|
| 1302|2024-09-28 18:4233|https://pokeapi.c...|https://pokeapi.c...|[{hawlucha, https...|
| 1302|2024-09-28 18:4232|https://pokeapi.c...|https://pokeapi.c...|[{klinklang, http...|
| 1302|2024-09-28 18:4232|https://pokeapi.c...|https://pokeapi.c...|[{oshawott, https...|
| 1302|202

In [5]:
# Register the DataFrame as a temporary view named "pokemon" so it can be
# queried with SQL. createOrReplaceTempView returns None, so its result is
# deliberately not assigned back to `df`; doing so would discard the DataFrame
# and make this cell fail when it is run a second time.
df.createOrReplaceTempView("pokemon")

# SQL query: keep every column and add one row per element of `results`
query = "SELECT *, explode(results) as pokemon FROM pokemon"

result = spark.sql(query)

result.show()

+-----+------------------+--------------------+--------------------+--------------------+--------------------+
|count|    ingestion_date|                next|            previous|             results|             pokemon|
+-----+------------------+--------------------+--------------------+--------------------+--------------------+
| 1302|2024-09-28 18:4235|https://pokeapi.c...|https://pokeapi.c...|[{darumaka-galar,...|{darumaka-galar, ...|
| 1302|2024-09-28 18:4235|https://pokeapi.c...|https://pokeapi.c...|[{darumaka-galar,...|{darmanitan-galar...|
| 1302|2024-09-28 18:4235|https://pokeapi.c...|https://pokeapi.c...|[{darumaka-galar,...|{darmanitan-galar...|
| 1302|2024-09-28 18:4235|https://pokeapi.c...|https://pokeapi.c...|[{darumaka-galar,...|{yamask-galar, ht...|
| 1302|2024-09-28 18:4235|https://pokeapi.c...|https://pokeapi.c...|[{darumaka-galar,...|{stunfisk-galar, ...|
| 1302|2024-09-28 18:4235|https://pokeapi.c...|https://pokeapi.c...|[{darumaka-galar,...|{zygarde-10, http...|
|