In [7]:
# data_ingestion/batch/extractors.py
import os
import sys
import tempfile
from typing import Dict, Optional

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../../..')))

import requests
import yfinance as yf
from pyspark.sql import DataFrame

class BatchDataExtractor:
    """Build Spark DataFrames from the batch sources used by the pipeline.

    Each ``from_*`` method reads one kind of source (an HTTP API, stock
    quotes fetched through yfinance, a JDBC database, or a file) and returns
    the result as a DataFrame created through the session given to the
    constructor.
    """

    def __init__(self, spark):
        """Store the Spark session used by every reader method."""
        self.spark = spark

    def from_government_api(
        self,
        endpoint: Optional[str] = None,
        params: Optional[Dict] = None,
        *,
        timeout: float = 30.0,
        verify: bool = False,
    ) -> DataFrame:
        """Download a JSON payload over HTTP and load it as a DataFrame.

        Args:
            endpoint: URL to request. When ``None``, the URL stored under
                ``config.Config.DATA_SOURCES["government_api"]`` is used.
            params: Optional query-string parameters passed to ``requests``.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot block the batch job forever.
            verify: Whether to verify the server's TLS certificate.

        Returns:
            The DataFrame produced by ``spark.read.json`` on the saved
            response body.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            requests.Timeout: If the server does not answer within
                ``timeout`` seconds.
        """
        # NOTE(review): ``config`` is not imported in the visible part of this
        # file; it must be brought into scope elsewhere for the default
        # endpoint lookup to work.
        url = config.Config.DATA_SOURCES["government_api"] if endpoint is None else endpoint

        # NOTE(review): certificate verification stays off by default to keep
        # the previous behaviour; pass verify=True once the endpoint's
        # certificate chain is trusted, since verify=False allows
        # man-in-the-middle tampering with the payload.
        response = requests.get(url, params=params, timeout=timeout, verify=verify)
        response.raise_for_status()

        # A unique file per call: Spark reads lazily, so reusing one fixed
        # path would let a later call overwrite the data behind a DataFrame
        # returned earlier. The file is intentionally not deleted here because
        # the returned DataFrame may still need to read it.
        # UTF-8 is written explicitly because that is what Spark's JSON reader
        # expects, regardless of the local default encoding.
        with tempfile.NamedTemporaryFile(
            mode="w",
            suffix=".json",
            prefix="api_response_",
            encoding="utf-8",
            delete=False,
        ) as tmp:
            tmp.write(response.text)

        return self.spark.read.json(tmp.name)

    def from_stock_market(self, ticker: str, period: str = "1mo") -> DataFrame:
        """Fetch price history for ``ticker`` via yfinance as a DataFrame.

        Args:
            ticker: Symbol passed to ``yf.Ticker``.
            period: History window passed to ``Ticker.history``.

        Returns:
            A Spark DataFrame built from the history table, with its index
            turned into a regular column.
        """
        data = yf.Ticker(ticker)
        hist = data.history(period=period)
        # reset_index() moves the index into a column so it is kept when the
        # pandas frame is converted to a Spark DataFrame.
        return self.spark.createDataFrame(hist.reset_index())

    def from_database(self, query: str) -> DataFrame:
        """Run ``query`` through the configured JDBC connection.

        Args:
            query: SQL text handed to the JDBC reader's ``query`` option.
                It is sent as-is, so it must come from a trusted source.

        Returns:
            The DataFrame loaded from the JDBC source.
        """
        # NOTE(review): ``config`` is not imported in the visible part of this
        # file; it must be in scope for the JDBC URL lookup to work.
        return self.spark.read \
            .format("jdbc") \
            .option("url", config.Config.DATA_SOURCES["jdbc_url"]) \
            .option("query", query) \
            .load()

    def from_file(self, path: str, file_type: str = "csv") -> DataFrame:
        """Read a CSV, Parquet or JSON file into a DataFrame.

        Args:
            path: Location of the file to read.
            file_type: One of ``"csv"``, ``"parquet"`` or ``"json"``.

        Returns:
            The DataFrame produced by the matching Spark reader. CSV files
            are read with a header row and schema inference enabled.

        Raises:
            ValueError: If ``file_type`` is not one of the supported values.
        """
        if file_type == "csv":
            return self.spark.read.csv(path, header=True, inferSchema=True)
        if file_type == "parquet":
            return self.spark.read.parquet(path)
        if file_type == "json":
            return self.spark.read.json(path)
        raise ValueError(f"Tipo de arquivo não suportado: {file_type}")