## Llamado a la API

In [None]:
import requests
import pandas as pd

# Request URL for the EIA API v2 `petroleum/pri/gnd` route: weekly frequency,
# the `value` data column, sorted by `period` descending, first 1000 rows.
# NOTE(review): the API key is embedded in this URL and therefore lives in the
# source; consider loading it from the environment and rotating this key.
url = "https://api.eia.gov/v2/petroleum/pri/gnd/data/?frequency=weekly&data[0]=value&sort[0][column]=period&sort[0][direction]=desc&offset=0&length=1000&api_key=bqwjaJLDl8NGnarM5gvFz7iDmIGNyKK47vtgmX91"

# Call the API. The timeout keeps the cell from blocking forever when the
# server never answers.
response = requests.get(url, timeout=30)

# Stays None unless the request succeeds AND the payload has the expected
# shape, so the save step below knows whether there is anything to write.
df = None

# Only a 200 status code is treated as a successful call.
if response.status_code == 200:
    # Decode the JSON body of the response.
    data = response.json()

    # The records are expected under data['response']['data'].
    if 'response' in data and 'data' in data['response']:
        records = data['response']['data']

        # Load the records into a pandas DataFrame.
        df = pd.DataFrame(records)

        # Show the first rows and the number of records received.
        print(df.head())
        print(f"Total de registros obtenidos: {len(df)}")
    else:
        print("Error: La estructura del JSON no contiene los datos esperados.")
else:
    print(f"Error: Falló el llamado a la API con el código de estado {response.status_code}")

# Persist the raw extract only when a DataFrame was actually built; previously
# a failed call ended in a NameError here (or silently re-saved a stale `df`
# left over from an earlier run of the notebook).
if df is not None:
    df.to_csv('Data/Raws/petroleum2.csv', index=False)

## Transformaciones API

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine, inspect
from dotenv import load_dotenv
import os

# Load the raw petroleum extract.
df = pd.read_csv("../Data/Raws/petroleum.csv", delimiter=',', encoding='unicode_escape')

############################################### Drop / Rename columns

# Discard the columns that are not used further on, then give the price
# column a name that carries its unit.
drop_columns = ['duoarea', 'units', 'series']
df = df.drop(columns=drop_columns).rename(columns={'value': 'value($/GAL)'})

############################################### Formats

# 'period' arrives as text in YYYY-MM-DD form; parse it into datetimes.
df['period'] = pd.to_datetime(df['period'], format='%Y-%m-%d')

# Every column still typed as generic `object` becomes pandas' 'string' dtype.
object_columns = df.select_dtypes(include='object').columns
df = df.astype(dict.fromkeys(object_columns, 'string'))


############################################### Cleaning / Replace values
# Map the PADD district codes found in 'area-name' to descriptive region names.
# The East Coast sub-districts run north to south: PADD 1A is New England,
# PADD 1B the Central Atlantic and PADD 1C the Lower Atlantic, so the
# North / Central / South labels follow that order (the earlier mapping had
# the three labels rotated).
replaces = {
    'PADD 1': 'East Coast',
    'PADD 1A': 'East Coast (North)',
    'PADD 1B': 'East Coast (Central)',
    'PADD 1C': 'East Coast (South)',
    'PADD 2': 'Midwest',
    'PADD 3': 'Gulf Coast',
    'PADD 4': 'Rocky Mountain',
    'PADD 5': 'West Coast',
    'PADD 5 EXCEPT CALIFORNIA': 'West Coast (except California)',
}

# A dict passed to Series.replace swaps whole cell values only, so 'PADD 1'
# never touches 'PADD 1A' and the order of the keys does not matter.
df['area-name'] = df['area-name'].replace(replaces)



# Known 'area-name' values, grouped by the kind of geography they refer to.
city_list = [
    'DENVER', 'NEW YORK CITY', 'SAN FRANCISCO', 'MIAMI', 'CLEVELAND',
    'CHICAGO', 'SEATTLE', 'HOUSTON', 'LOS ANGELES', 'BOSTON',
]

state_list = [
    'TEXAS', 'NEW YORK', 'COLORADO', 'CALIFORNIA', 'MINNESOTA', 'FLORIDA',
    'MASSACHUSETTS', 'WASHINGTON', 'OHIO',
]

region_list = [
    'West Coast (except California)', 'Rocky Mountain', 'Midwest', 'West Coast',
    'Gulf Coast', 'East Coast (Central)', 'East Coast (North)', 'U.S.',
    'East Coast (South)', 'East Coast',
]

# Build the 'area' column from 'area-name': 'City', 'State' or 'Region'
# depending on which list the name belongs to. Names found in none of the
# lists are carried over unchanged.
in_cities = df['area-name'].isin(city_list)
in_states = df['area-name'].isin(state_list)
in_regions = df['area-name'].isin(region_list)

# Layered from lowest to highest priority, so a city match wins over a state
# match, which in turn wins over a region match.
area_kind = np.where(in_regions, 'Region', df['area-name'])
area_kind = np.where(in_states, 'State', area_kind)
df['area'] = np.where(in_cities, 'City', area_kind)




# Product codes that are collapsed into the two fuel families.
gasoline_codes = [
    'EPM0', 'EPMM', 'EPMP', 'EPMR', 'EPMMR', 'EPMRR',
    'EPM0R', 'EPMMU', 'EPMPR', 'EPMPU', 'EPM0U', 'EPMRU',
]
diesel_codes = ['EPD2DXL0', 'EPD2D']

# Overwrite 'product' with 'Gasoline' or 'Diesel' when the code is listed
# above; any other code is left untouched. Gasoline is applied last so it
# takes precedence, exactly as in a gasoline-first if/elif chain.
is_gasoline = df['product'].isin(gasoline_codes)
is_diesel = df['product'].isin(diesel_codes)

fuel_label = np.where(is_diesel, 'Diesel', df['product'])
df['product'] = np.where(is_gasoline, 'Gasoline', fuel_label)


############################################### Nulls cleaning


# Rows without a price are of no use downstream, so they are removed.
df = df.dropna(subset=['value($/GAL)'])


############################################### Save

# Write the cleaned dataset without the DataFrame index.
df.to_csv('../Data/Clean/petroleum_clean.csv', index=False)


# Pull the database settings from the .env file / process environment.
load_dotenv()

localhost = os.getenv('LOCALHOST')
port = os.getenv('PORT')
nameDB = os.getenv('DB_NAME')
userDB = os.getenv('DB_USER')
passDB = os.getenv('DB_PASS')

# Name of the table that receives the cleaned API data.
clean_table_database = "api_petroleum"

# PostgreSQL connection (psycopg2 driver) assembled from the settings above.
connection_url = f'postgresql+psycopg2://{userDB}:{passDB}@{localhost}:{port}/{nameDB}'
engine = create_engine(connection_url)

# Upload the cleaned frame, replacing the table when it already exists.
# Failures are reported rather than raised, and the engine is always released.
try:
    df.to_sql(clean_table_database, engine, if_exists='replace', index=False)
except Exception as e:
    print(f"Error al subir los datos: {e}")
else:
    print(f"Tabla '{clean_table_database}' actualizada correctamente.")
finally:
    engine.dispose()

## Modelo dimensional

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine, inspect
from dotenv import load_dotenv
import os

# Cleaned inputs: the cars dataset and the petroleum (API) dataset.
df = pd.read_csv('Data/Clean/cars_clean.csv')
apidf = pd.read_csv('Data/Clean/petroleum_clean.csv')


############################################# DIMENSIONS FOR CARS DATASET

# Columns that identify a row of each dimension. The same lists are used to
# build the dimension and to join it back onto the original rows.
car_keys = ['Year', 'Make', 'Model', 'Drivetrain', 'MinMPG', 'MaxMPG',
            'FuelType', 'Transmission', 'Engine', 'ExteriorColor', 'InteriorColor',
            'Used', 'VIN', 'Stock#']
seller_keys = ['SellerName', 'SellerType', 'State', 'Zipcode', 'StreetName']
rating_keys = ['ConsumerRating', 'SellerRating', 'ComfortRating', 'InteriorDesignRating',
               'PerformanceRating', 'ValueForMoneyRating', 'ExteriorStylingRating',
               'ReliabilityRating', 'DealType']

# Each dimension is the set of distinct key combinations, numbered from 1 in
# order of first appearance.
car_dim = (
    df[car_keys]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(ID_Car=lambda dim: dim.index + 1)
)

seller_dim = (
    df[seller_keys]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(ID_Seller=lambda dim: dim.index + 1)
)

rating_dim = (
    df[rating_keys]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(ID_Rating=lambda dim: dim.index + 1)
)

# Left-join every dimension onto the original rows so that each row picks up
# the surrogate ID of its dimension record.
df_hechos_vendedor = df.merge(seller_dim, how='left', on=seller_keys)
df_hechos_vehiculo = df.merge(car_dim, how='left', on=car_keys)
df_hechos_ratings = df.merge(rating_dim, how='left', on=rating_keys)

# Fact table: the measures, the three dimension IDs and a 1-based sale ID,
# with the IDs placed in front of the measures.
fact_columns = ['ID_Sell', 'ID_Car', 'ID_Seller', 'ID_Rating', 'Price',
                'Mileage', 'ConsumerReviews', 'SellerReviews']

tabla_hechos = df_hechos_vendedor[['Price', 'Mileage', 'ConsumerReviews', 'SellerReviews']].assign(
    ID_Car=df_hechos_vehiculo['ID_Car'],
    ID_Rating=df_hechos_ratings['ID_Rating'],
    ID_Seller=df_hechos_vendedor['ID_Seller'],
    ID_Sell=df.index + 1,
)[fact_columns]


############################################# DIMENSIONS FOR API

# Columns that identify a row of each API dimension. The same lists are used
# to build the dimension and to join it back onto the API rows.
area_keys = ['area', 'area-name']
product_keys = ['product', 'product-name']
details_keys = ['process', 'process-name', 'series-description']

# Each dimension is the set of distinct key combinations, numbered from 1 in
# order of first appearance.
area_dim = (
    apidf[area_keys]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(area_ID=lambda dim: dim.index + 1)
)

product_dim = (
    apidf[product_keys]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(product_ID=lambda dim: dim.index + 1)
)

details_dim = (
    apidf[details_keys]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(details_ID=lambda dim: dim.index + 1)
)

# Left-join every dimension onto the API rows so that each row picks up the
# surrogate ID of its dimension record.
df_fuel_area = apidf.merge(area_dim, how='left', on=area_keys)
df_fuel_product = apidf.merge(product_dim, how='left', on=product_keys)
df_fuel_details = apidf.merge(details_dim, how='left', on=details_keys)

# Fact table: period and price, the three dimension IDs and a 1-based row ID,
# arranged in the final column order.
fuel_fact = df_fuel_area[['period', 'value($/GAL)']].assign(
    area_ID=df_fuel_area['area_ID'],
    product_ID=df_fuel_product['product_ID'],
    details_ID=df_fuel_details['details_ID'],
    fuel_ID=lambda fact: fact.index + 1,
)[['fuel_ID', 'period', 'area_ID', 'product_ID', 'details_ID', 'value($/GAL)']]



# Write every table of the dimensional model to its CSV file (index omitted):
# first the cars star schema, then the fuel star schema.
csv_outputs = [
    (tabla_hechos, 'Data/Dimensional_model/sells_fact.csv'),
    (car_dim, 'Data/Dimensional_model/car_dim.csv'),
    (seller_dim, 'Data/Dimensional_model/seller_dim.csv'),
    (rating_dim, 'Data/Dimensional_model/rating_dim.csv'),
    (fuel_fact, 'Data/Dimensional_model/fuel_fact.csv'),
    (area_dim, 'Data/Dimensional_model/area_dim.csv'),
    (product_dim, 'Data/Dimensional_model/product_dim.csv'),
    (details_dim, 'Data/Dimensional_model/details_dim.csv'),
]

for model_table, csv_path in csv_outputs:
    model_table.to_csv(csv_path, index=False)


############################################# SAVE IN DB


# Pull the database settings from the .env file / process environment.
load_dotenv()

# Database connection settings.
localhost = os.getenv('LOCALHOST')
port = os.getenv('PORT')
nameDB = os.getenv('DB_NAME')
userDB = os.getenv('DB_USER')
passDB = os.getenv('DB_PASS')

engine = create_engine(f'postgresql+psycopg2://{userDB}:{passDB}@{localhost}:{port}/{nameDB}')

# Folder holding one CSV per table of the dimensional model.
csv_directory = 'Data/Dimensional_model'

# Upload every CSV in the folder as a table named after the file. The engine
# is disposed once, after the last file, instead of after each upload, so its
# connection pool is not torn down and rebuilt for every table; the outer
# `finally` also releases it if listing the folder fails.
try:
    # Sorted so the tables are always processed in the same order
    # (os.listdir returns entries in arbitrary order).
    for csv_file in sorted(os.listdir(csv_directory)):
        if not csv_file.endswith('.csv'):
            continue

        # Strip only the trailing extension; str.replace would also remove a
        # '.csv' occurring in the middle of the file name.
        table_name = os.path.splitext(csv_file)[0]

        location_file = os.path.join(csv_directory, csv_file)

        # A failure on one file is reported and the remaining files are still
        # attempted.
        try:
            # Separate name so the cars DataFrame held in `df` is not
            # overwritten by whichever CSV happens to be loaded last.
            table_df = pd.read_csv(location_file, sep=",")

            table_df.to_sql(table_name, engine, if_exists='replace', index=False)

            print(f"Tabla '{table_name}' creada y datos subidos exitosamente.")

        except Exception as e:
            print(f"Error al subir los datos del archivo '{csv_file}': {e}")
finally:
    engine.dispose()
