In [17]:
import requests
import datetime
import json

In [18]:
class Collector:

    def __init__(self, url) -> None:
        self.url = url
        # instancia é o final do meu endpoint
        self.instance = url.strip("/").split("/")[-1]

    # extract
    def get_endpoint(self, **kwargs):
        response = requests.get(self.url, params=kwargs)
        return response

    # ingestion
    def save_data(self, data):
        now = datetime.datetime.now().strftime("%Y%m%d_%H%M%S.%f")
        data['ingestion_date'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M%S")
        file_name = f"./datalake/pokemon/{self.instance}/{now}.json"
        with open(file_name, "w") as opened_file:
            json.dump(data, opened_file)

    def get_and_save(self, **kwargs):
        resp = self.get_endpoint(**kwargs)
        if(resp.status_code == 200):
            data = resp.json()
            self.save_data(data)
            return data
        else:
            return {}
        
    def auto_exec(self, limit=100):
        offset = 0
        while True:
            print(offset)
            data = self.get_and_save(limit=limit, offset = offset)

            # caso não tenha próxima pagina na API, sai do loop
            if(data["next"]) == None:
                break
            else:
                offset += limit


In [19]:
url = "https://pokeapi.co/api/v2/pokemon"
collector = Collector(url)
collector.auto_exec()

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300


Spark

In [26]:
from pyspark.sql import SparkSession

# Iniciar uma SparkSession
spark = SparkSession.builder \
    .appName("Pokemon") \
    .getOrCreate()

# Motra a tabela como o Pandas faz
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)

df = spark.read.json('./datalake/pokemon/pokemon')

df.show()

+-----+------------------+--------------------+--------------------+--------------------+
|count|    ingestion_date|                next|            previous|             results|
+-----+------------------+--------------------+--------------------+--------------------+
| 1302|2024-09-28 18:5431|https://pokeapi.c...|https://pokeapi.c...|[{darumaka-galar,...|
| 1302|2024-09-28 18:5430|https://pokeapi.c...|https://pokeapi.c...|[{metagross-mega,...|
| 1302|2024-09-28 18:5430|https://pokeapi.c...|https://pokeapi.c...|[{wo-chien, https...|
| 1302|2024-09-28 18:5430|https://pokeapi.c...|https://pokeapi.c...|[{ursaluna, https...|
| 1302|2024-09-28 18:5429|https://pokeapi.c...|https://pokeapi.c...|[{magearna, https...|
| 1302|2024-09-28 18:5429|https://pokeapi.c...|https://pokeapi.c...|[{hawlucha, https...|
| 1302|2024-09-28 18:5428|https://pokeapi.c...|https://pokeapi.c...|[{klinklang, http...|
| 1302|2024-09-28 18:5428|https://pokeapi.c...|https://pokeapi.c...|[{oshawott, https...|
| 1302|202

In [27]:
# cria uma view temporária chamada pokemon, deixando eu fazer query sql aqui dentro
df = df.createOrReplaceTempView("pokemon")

# query sql | trabalhando com colunas específicas
query = "SELECT count, ingestion_date, explode(results) as pokemon FROM pokemon"

result = spark.sql(query)

result.show()

+-----+------------------+--------------------+
|count|    ingestion_date|             pokemon|
+-----+------------------+--------------------+
| 1302|2024-09-28 18:5431|{darumaka-galar, ...|
| 1302|2024-09-28 18:5431|{darmanitan-galar...|
| 1302|2024-09-28 18:5431|{darmanitan-galar...|
| 1302|2024-09-28 18:5431|{yamask-galar, ht...|
| 1302|2024-09-28 18:5431|{stunfisk-galar, ...|
| 1302|2024-09-28 18:5431|{zygarde-10, http...|
| 1302|2024-09-28 18:5431|{cramorant-gulpin...|
| 1302|2024-09-28 18:5431|{cramorant-gorgin...|
| 1302|2024-09-28 18:5431|{toxtricity-low-k...|
| 1302|2024-09-28 18:5431|{eiscue-noice, ht...|
| 1302|2024-09-28 18:5431|{indeedee-female,...|
| 1302|2024-09-28 18:5431|{morpeko-hangry, ...|
| 1302|2024-09-28 18:5431|{zacian-crowned, ...|
| 1302|2024-09-28 18:5431|{zamazenta-crowne...|
| 1302|2024-09-28 18:5431|{eternatus-eterna...|
| 1302|2024-09-28 18:5431|{urshifu-rapid-st...|
| 1302|2024-09-28 18:5431|{zarude-dada, htt...|
| 1302|2024-09-28 18:5431|{calyrex-ice, 

Só Pokemons