In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine, inspect
from sqlalchemy.engine import URL

In [2]:
# Load the two input datasets from the Data/Clean folder:
# `df` holds the car listings and `apidf` the petroleum price series.
df = pd.read_csv('Data/Clean/cars_clean.csv')
apidf = pd.read_csv('Data/Clean/petroleum_clean.csv')

In [3]:
# Print the column names, non-null counts and dtypes of the cars dataset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8452 entries, 0 to 8451
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   8452 non-null   int64  
 1   Make                   8452 non-null   object 
 2   Model                  8452 non-null   object 
 3   Used                   8452 non-null   bool   
 4   Price                  8452 non-null   int64  
 5   ConsumerRating         8452 non-null   float64
 6   ConsumerReviews        8452 non-null   int64  
 7   SellerType             8452 non-null   object 
 8   SellerName             8452 non-null   object 
 9   SellerRating           8452 non-null   float64
 10  SellerReviews          8452 non-null   int64  
 11  StreetName             8452 non-null   object 
 12  State                  8452 non-null   object 
 13  Zipcode                8452 non-null   object 
 14  DealType               8247 non-null   object 
 15  Comf

# CAR SALES

## Dimensions of cars dataset

### Car

In [4]:
# Car dimension: one row per distinct combination of the vehicle attributes
# listed below, taken from the cars dataset.
car_dim = (
    df.loc[:, [
        'Year', 'Make', 'Model', 'Drivetrain', 'MinMPG', 'MaxMPG',
        'FuelType', 'Transmission', 'Engine', 'ExteriorColor', 'InteriorColor',
        'Used', 'VIN', 'Stock#',
    ]]
    .drop_duplicates()
    .reset_index(drop=True)
    # Surrogate key: 1-based position of each row in the deduplicated frame.
    .assign(ID_Car=lambda cars: cars.index + 1)
)

car_dim.head()

Unnamed: 0,Year,Make,Model,Drivetrain,MinMPG,MaxMPG,FuelType,Transmission,Engine,ExteriorColor,InteriorColor,Used,VIN,Stock#,ID_Car
0,2019,Toyota,Sienna SE,FWD,19,27,Gasoline,Automatic,3.5L V6 24V PDI DOHC,Red,Black,True,5TDXZ3DC2KS015402,22998646,1
1,2018,Ford,F-150 Lariat,4WD,19,24,Gasoline,Automatic,3.5L V6 24V PDI DOHC Twin Turbo,Shadow Black,Black,True,1FTEW1EG2JFD44217,22418A,2
2,2017,Ram,1500 Laramie,4WD,15,21,Gasoline,Automatic,5.7L V8 16V MPFI OHV,Granite Crystal Clearcoat Metallic,Black,True,1C6RR7VT5HS842283,NG277871G,3
3,2021,Honda,Accord Sport SE,FWD,29,35,Gasoline,CVT,1.5L I4 16V GDI DOHC Turbo,Gray,Â,True,1HGCV1F49MA038035,54237,4
4,2020,Lexus,RX 350,FWD,20,27,Gasoline,Automatic,3.5L V6 24V PDI DOHC,Eminent White Pearl,Birch,True,2T2AZMAA8LC156270,HDT4181A,5


### Seller

In [20]:
# Seller dimension: one row per distinct seller name / type / address combination.
seller_dim = (
    df.loc[:, ['SellerName', 'SellerType', 'State', 'Zipcode', 'StreetName']]
    .drop_duplicates()
    .reset_index(drop=True)
    # Surrogate key: 1-based position of each row in the deduplicated frame.
    .assign(ID_Seller=lambda sellers: sellers.index + 1)
)

seller_dim.head()

Unnamed: 0,SellerName,SellerType,State,Zipcode,StreetName,ID_Seller
0,CarMax Murrieta - Now offering Curbside Pickup...,Dealer,CA,92562,25560 Madison Ave Murrieta,1
1,Giant Chevrolet,Dealer,CA,93292,1001 S Ben Maddox Way Visalia,2
2,Gill Auto Group Madera,Dealer,CA,93637,1100 S Madera Ave Madera,3
3,AutoSavvy Las Vegas,Dealer,NV,89104,2121 E Sahara Ave Las Vegas,4
4,Lexus of Henderson,Dealer,NV,89011,7737 Eastgate Rd Henderson,5


### Ratings

In [6]:
# Rating dimension: one row per distinct combination of the rating columns
# and the deal type.
rating_dim = (
    df.loc[:, [
        'ConsumerRating', 'SellerRating', 'ComfortRating', 'InteriorDesignRating',
        'PerformanceRating', 'ValueForMoneyRating', 'ExteriorStylingRating',
        'ReliabilityRating', 'DealType',
    ]]
    .drop_duplicates()
    .reset_index(drop=True)
    # Surrogate key: 1-based position of each row in the deduplicated frame.
    .assign(ID_Rating=lambda ratings: ratings.index + 1)
)

# Show the first rows of the rating dimension
rating_dim.head()

Unnamed: 0,ConsumerRating,SellerRating,ComfortRating,InteriorDesignRating,PerformanceRating,ValueForMoneyRating,ExteriorStylingRating,ReliabilityRating,DealType,ID_Rating
0,4.6,3.3,4.7,4.6,4.6,4.4,4.6,4.7,Great,1
1,4.8,4.8,4.9,4.8,4.8,4.6,4.8,4.7,Good,2
2,4.7,4.6,4.8,4.7,4.8,4.6,4.8,4.7,Good,3
3,5.0,4.6,4.9,5.0,4.9,5.0,5.0,5.0,,4
4,4.8,4.8,4.9,4.8,4.8,4.7,4.8,4.9,Good,5


## Sales fact table

In [24]:
# Build the sales fact table: one row per row of `df`, holding the measures
# plus the surrogate keys of the car, seller and rating dimensions.

# The join keys of each dimension are all of its columns except the surrogate
# ID, so they are derived from the dimension instead of being retyped here.
seller_keys = seller_dim.columns.drop('ID_Seller').tolist()
car_keys = car_dim.columns.drop('ID_Car').tolist()
rating_keys = rating_dim.columns.drop('ID_Rating').tolist()

# Merge the original df with each dimension to assign its ID.
# validate='many_to_one' makes pandas raise a MergeError if a dimension holds
# duplicated keys, which would otherwise fan rows out and misalign the IDs.
df_hechos_vendedor = pd.merge(df, seller_dim, on=seller_keys, how='left', validate='many_to_one')

df_hechos_vehiculo = pd.merge(df, car_dim, on=car_keys, how='left', validate='many_to_one')

df_hechos_ratings = pd.merge(df, rating_dim, on=rating_keys, how='left', validate='many_to_one')

# The ID columns below are copied across frames by index alignment, so every
# merge must have kept exactly one output row per row of df.
assert all(
    len(merged) == len(df)
    for merged in (df_hechos_vendedor, df_hechos_vehiculo, df_hechos_ratings)
), "A dimension merge changed the number of rows"

# Fact table columns (measures)
tabla_hechos = df_hechos_vendedor[['Price', 'Mileage', 'ConsumerReviews', 'SellerReviews']].copy()

# Add IDs from dimensions
tabla_hechos['ID_Car'] = df_hechos_vehiculo['ID_Car']
tabla_hechos['ID_Rating'] = df_hechos_ratings['ID_Rating']
tabla_hechos['ID_Seller'] = df_hechos_vendedor['ID_Seller']

# Every fact row must have found its row in each dimension.
assert tabla_hechos[['ID_Car', 'ID_Seller', 'ID_Rating']].notna().all().all(), \
    "Some fact rows have no matching dimension ID"

# Sale ID: 1-based row number of the original df
tabla_hechos['ID_Sell'] = df.index + 1

# Reorder the columns: keys first, measures last
tabla_hechos = tabla_hechos[['ID_Sell', 'ID_Car', 'ID_Seller', 'ID_Rating', 'Price',
                             'Mileage', 'ConsumerReviews', 'SellerReviews']]


In [25]:
# Preview the first three rows of the fact table.
tabla_hechos.head(3)

Unnamed: 0,ID_Sell,ID_Car,ID_Seller,ID_Rating,Price,Mileage,ConsumerReviews,SellerReviews
0,1,1,1,1,39998,29403,45,3
1,2,2,2,2,49985,32929,817,131
2,3,3,3,3,41860,23173,495,249


In [26]:
# Count how many fact rows reference each seller ID.
tabla_hechos['ID_Seller'].value_counts()

ID_Seller
717     31
19      31
183     27
179     23
479     17
        ..
3962     1
3961     1
3960     1
3959     1
3958     1
Name: count, Length: 3972, dtype: int64

In [27]:
# Count the occurrences of each seller ID in the seller dimension
# (a count of 1 for every ID means the key is unique).
seller_dim['ID_Seller'].value_counts()

ID_Seller
3972    1
1       1
2       1
3       1
3956    1
       ..
9       1
8       1
7       1
6       1
5       1
Name: count, Length: 3972, dtype: int64

# Dimensions for petroleum dataset (API)

In [12]:
# Print the column names, non-null counts and dtypes of the petroleum dataset.
apidf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 988 entries, 0 to 987
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   period              988 non-null    object 
 1   area-name           988 non-null    object 
 2   product             988 non-null    object 
 3   product-name        988 non-null    object 
 4   process             988 non-null    object 
 5   process-name        988 non-null    object 
 6   series-description  988 non-null    object 
 7   value($/GAL)        988 non-null    float64
 8   area                988 non-null    object 
dtypes: float64(1), object(8)
memory usage: 69.6+ KB


### Area

In [13]:
# Area dimension: one row per distinct (area, area-name) pair.
area_dim = (
    apidf.loc[:, ['area', 'area-name']]
    .drop_duplicates()
    .reset_index(drop=True)
    # Surrogate key: 1-based position of each row in the deduplicated frame.
    .assign(area_ID=lambda areas: areas.index + 1)
)

area_dim.head()

Unnamed: 0,area,area-name,area_ID
0,Region,Gulf Coast,1
1,Region,East Coast,2
2,Region,West Coast (except California),3
3,Region,Rocky Mountain,4
4,Region,East Coast (South),5


### Product

In [14]:
# Product dimension: one row per distinct (product, product-name) pair.
product_dim = (
    apidf.loc[:, ['product', 'product-name']]
    .drop_duplicates()
    .reset_index(drop=True)
    # Surrogate key: 1-based position of each row in the deduplicated frame.
    .assign(product_ID=lambda products: products.index + 1)
)

product_dim.head()

Unnamed: 0,product,product-name,product_ID
0,Gasoline,Total Gasoline,1
1,Diesel,No 2 Diesel,2
2,Gasoline,Conventional Regular Gasoline,3
3,Gasoline,Reformulated Regular Gasoline,4
4,Gasoline,Regular Gasoline,5


### Details

In [15]:
# Details dimension: one row per distinct combination of process,
# process name and series description.
details_dim = (
    apidf.loc[:, ['process', 'process-name', 'series-description']]
    .drop_duplicates()
    .reset_index(drop=True)
    # Surrogate key: 1-based position of each row in the deduplicated frame.
    .assign(details_ID=lambda details: details.index + 1)
)

details_dim.head()


Unnamed: 0,process,process-name,series-description,details_ID
0,PTE,Retail Sales,Gulf Coast All Grades All Formulations Retail ...,1
1,PTE,Retail Sales,East Coast No 2 Diesel Retail Prices (Dollars ...,2
2,PTE,Retail Sales,West Coast (PADD 5) Except California No 2 Die...,3
3,PTE,Retail Sales,Rocky Mountain Regular Conventional Retail Gas...,4
4,PTE,Retail Sales,New England (PADD 1A) Regular Conventional Ret...,5


### Fuel Fact

In [29]:
# Build the fuel fact table: one row per row of `apidf`, holding the period and
# price plus the surrogate keys of the area, product and details dimensions.

# The join keys of each dimension are all of its columns except the surrogate
# ID, so they are derived from the dimension instead of being retyped here.
area_keys = area_dim.columns.drop('area_ID').tolist()
product_keys = product_dim.columns.drop('product_ID').tolist()
details_keys = details_dim.columns.drop('details_ID').tolist()

# Merge the original df with each dimension to assign its ID.
# validate='many_to_one' makes pandas raise a MergeError if a dimension holds
# duplicated keys, which would otherwise fan rows out and misalign the IDs.
df_fuel_area = pd.merge(apidf, area_dim, on=area_keys, how='left', validate='many_to_one')
df_fuel_product = pd.merge(apidf, product_dim, on=product_keys, how='left', validate='many_to_one')
df_fuel_details = pd.merge(apidf, details_dim, on=details_keys, how='left', validate='many_to_one')

# The ID columns below are copied across frames by index alignment, so every
# merge must have kept exactly one output row per row of apidf.
assert all(
    len(merged) == len(apidf)
    for merged in (df_fuel_area, df_fuel_product, df_fuel_details)
), "A dimension merge changed the number of rows"

# Fact table columns
fuel_fact = df_fuel_area[['period', 'value($/GAL)']].copy()

# Add IDs from dimensions
fuel_fact['area_ID'] = df_fuel_area['area_ID']
fuel_fact['product_ID'] = df_fuel_product['product_ID']
fuel_fact['details_ID'] = df_fuel_details['details_ID']

# Every fact row must have found its row in each dimension.
assert fuel_fact[['area_ID', 'product_ID', 'details_ID']].notna().all().all(), \
    "Some fuel rows have no matching dimension ID"

# Fuel ID: 1-based row number
fuel_fact['fuel_ID'] = fuel_fact.index + 1

# Reorder the columns: keys first, measure last
fuel_fact = fuel_fact[['fuel_ID', 'period', 'area_ID', 'product_ID', 'details_ID', 'value($/GAL)']]


In [28]:
# Preview the first rows of the fuel fact table.
fuel_fact.head()

Unnamed: 0,fuel_ID,period,area_ID,product_ID,details_ID,value($/GAL)
0,1,2024-09-30,1,1,1,2.793
1,2,2024-09-30,2,2,2,3.571
2,3,2024-09-30,3,2,3,3.797
3,4,2024-09-30,4,3,4,3.421
4,5,2024-09-30,5,3,5,3.058


## Save

In [17]:
# Save the sales fact table and its dimensions as CSV files.
# Create the output folder first so the cell does not fail when it is missing.
os.makedirs('Data/Dimensional_model', exist_ok=True)
tabla_hechos.to_csv('Data/Dimensional_model/sells_fact.csv', index=False)
car_dim.to_csv('Data/Dimensional_model/car_dim.csv', index=False)
seller_dim.to_csv('Data/Dimensional_model/seller_dim.csv', index=False)
rating_dim.to_csv('Data/Dimensional_model/rating_dim.csv', index=False)

In [18]:
# Save the fuel fact table and its dimensions as CSV files.
# Create the output folder first so the cell does not fail when it is missing.
os.makedirs('Data/Dimensional_model', exist_ok=True)
fuel_fact.to_csv('Data/Dimensional_model/fuel_fact.csv', index=False)
area_dim.to_csv('Data/Dimensional_model/area_dim.csv', index=False)
product_dim.to_csv('Data/Dimensional_model/product_dim.csv', index=False)
details_dim.to_csv('Data/Dimensional_model/details_dim.csv', index=False)


## Upload to the database

In [None]:
# Upload every CSV file of the dimensional model to PostgreSQL, one table per
# file, with the table named after the file.
load_dotenv()

# Database connection settings, read from environment variables
localhost = os.getenv('LOCALHOST')
port = os.getenv('PORT')
nameDB = os.getenv('DB_NAME')
userDB = os.getenv('DB_USER')
passDB = os.getenv('DB_PASS')

# Fail early with a clear message instead of building a URL that contains
# the text "None" for a variable that is not set.
required_settings = {
    'LOCALHOST': localhost,
    'PORT': port,
    'DB_NAME': nameDB,
    'DB_USER': userDB,
    'DB_PASS': passDB,
}
missing_settings = [name for name, value in required_settings.items() if value is None]
if missing_settings:
    raise RuntimeError(
        f"Missing environment variables: {', '.join(missing_settings)}"
    )

# URL.create escapes each component, so a password containing characters such
# as '@', ':' or '/' does not corrupt the connection URL.
engine = create_engine(
    URL.create(
        'postgresql+psycopg2',
        username=userDB,
        password=passDB,
        host=localhost,
        port=int(port),
        database=nameDB,
    )
)

csv_directory = 'Data/Dimensional_model'

try:
    # sorted() makes the upload order deterministic (os.listdir order is arbitrary).
    for csv_file in sorted(os.listdir(csv_directory)):
        table_name, extension = os.path.splitext(csv_file)
        if extension != '.csv':
            continue

        location_file = os.path.join(csv_directory, csv_file)

        try:
            # Use a dedicated name so the cars DataFrame `df` used by the
            # earlier cells is not overwritten.
            table_df = pd.read_csv(location_file, sep=",")

            table_df.to_sql(table_name, engine, if_exists='replace', index=False)

            print(f"Tabla '{table_name}' creada y datos subidos exitosamente.")

        except Exception as e:
            # Best effort: report the failing file and continue with the next one.
            print(f"Error al subir los datos del archivo '{csv_file}': {e}")
finally:
    # Release the connection pool once, after all files have been processed.
    engine.dispose()