In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sqlalchemy import create_engine, inspect
from dotenv import load_dotenv
import os

In [2]:
# Load the two cleaned CSV files: `df` is the cars dataset that every table
# below is derived from; `apidf` is the petroleum dataset.
df = pd.read_csv(filepath_or_buffer='Data/Clean/cars_clean.csv')
apidf = pd.read_csv(filepath_or_buffer='Data/Clean/petroleum_clean.csv')

In [4]:
# Summarise the cars dataframe: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8452 entries, 0 to 8451
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Year                   8452 non-null   int64  
 1   Make                   8452 non-null   object 
 2   Model                  8452 non-null   object 
 3   Used                   8452 non-null   bool   
 4   Price                  8452 non-null   int64  
 5   ConsumerRating         8452 non-null   float64
 6   ConsumerReviews        8452 non-null   int64  
 7   SellerType             8452 non-null   object 
 8   SellerName             8452 non-null   object 
 9   SellerRating           8452 non-null   float64
 10  SellerReviews          8452 non-null   int64  
 11  StreetName             8452 non-null   object 
 12  State                  8452 non-null   object 
 13  Zipcode                8452 non-null   object 
 14  DealType               8247 non-null   object 
 15  Comf

## Dimensions

### Vehículo

In [17]:
# Vehicle dimension: one row per distinct combination of the descriptive vehicle
# attributes (the listing-level columns go to the fact table instead).
dimension_vehiculo = (
    df.loc[:, ['Year', 'Make', 'Model', 'Drivetrain', 'MinMPG', 'MaxMPG',
               'FuelType', 'Transmission', 'Engine']]
    # Keep the first occurrence of each combination and renumber rows from 0.
    .drop_duplicates(ignore_index=True)
    # Surrogate key: the 1-based row position within the deduplicated frame.
    .assign(ID_Vehiculo=lambda vehiculos: vehiculos.index + 1)
)

# Show the structure of the vehicle dimension.
dimension_vehiculo.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3221 entries, 0 to 3220
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Year          3221 non-null   int64 
 1   Make          3221 non-null   object
 2   Model         3221 non-null   object
 3   Drivetrain    3221 non-null   object
 4   MinMPG        3221 non-null   int64 
 5   MaxMPG        3221 non-null   int64 
 6   FuelType      3221 non-null   object
 7   Transmission  3221 non-null   object
 8   Engine        3221 non-null   object
 9   ID_Vehiculo   3221 non-null   int64 
dtypes: int64(4), object(6)
memory usage: 251.8+ KB


### Vendedor

In [16]:
# Seller dimension: one row per distinct seller name / type / address combination.
dimension_vendedor = (
    df.loc[:, ['SellerName', 'SellerType', 'State', 'Zipcode', 'StreetName']]
    # Keep the first occurrence of each combination and renumber rows from 0.
    .drop_duplicates(ignore_index=True)
    # Surrogate key: the 1-based row position within the deduplicated frame.
    .assign(ID_Seller=lambda vendedores: vendedores.index + 1)
)

# Show the structure of the seller dimension.
dimension_vendedor.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3972 entries, 0 to 3971
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   SellerName  3972 non-null   object
 1   SellerType  3972 non-null   object
 2   State       3972 non-null   object
 3   Zipcode     3972 non-null   object
 4   StreetName  3972 non-null   object
 5   ID_Seller   3972 non-null   int64 
dtypes: int64(1), object(5)
memory usage: 186.3+ KB


### Ratings

In [23]:
# Ratings dimension: one row per distinct combination of the rating scores and
# the deal type.
dimension_ratings = (
    df.loc[:, ['ConsumerRating', 'SellerRating', 'ComfortRating', 'InteriorDesignRating',
               'PerformanceRating', 'ValueForMoneyRating', 'ExteriorStylingRating',
               'ReliabilityRating', 'DealType']]
    # Keep the first occurrence of each combination and renumber rows from 0.
    .drop_duplicates(ignore_index=True)
    # Surrogate key: the 1-based row position within the deduplicated frame.
    .assign(ID_Rating=lambda ratings: ratings.index + 1)
)

# Show the structure of the ratings dimension.
dimension_ratings.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5285 entries, 0 to 5284
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   ConsumerRating         5285 non-null   float64
 1   SellerRating           5285 non-null   float64
 2   ComfortRating          5285 non-null   float64
 3   InteriorDesignRating   5285 non-null   float64
 4   PerformanceRating      5285 non-null   float64
 5   ValueForMoneyRating    5285 non-null   float64
 6   ExteriorStylingRating  5285 non-null   float64
 7   ReliabilityRating      5285 non-null   float64
 8   DealType               5093 non-null   object 
 9   ID_Rating              5285 non-null   int64  
dtypes: float64(8), int64(1), object(1)
memory usage: 413.0+ KB


## Fact table

In [24]:
# Build the fact table: one row per listing in `df`, carrying the surrogate key
# of each dimension plus the listing-level attributes.

# Natural-key columns of each dimension. They must stay in sync with the column
# lists used when the dimensions were built.
claves_vendedor = ['SellerName', 'SellerType', 'State', 'Zipcode', 'StreetName']
claves_vehiculo = ['Year', 'Make', 'Model', 'Drivetrain', 'MinMPG', 'MaxMPG',
                   'FuelType', 'Transmission', 'Engine']
claves_ratings = ['ConsumerRating', 'SellerRating', 'ComfortRating',
                  'InteriorDesignRating', 'PerformanceRating',
                  'ValueForMoneyRating', 'ExteriorStylingRating',
                  'ReliabilityRating', 'DealType']

# The three merge results are combined below by row position, which is only
# correct if every merge keeps exactly one output row per row of `df`.
# validate='many_to_one' makes pandas raise MergeError if a dimension ever
# contains a duplicated key, instead of silently multiplying rows and attaching
# IDs to the wrong listings.

# Attach ID_Seller to every listing.
df_hechos_vendedor = pd.merge(df, dimension_vendedor, on=claves_vendedor,
                              how='left', validate='many_to_one')

# Attach ID_Vehiculo to every listing.
df_hechos_vehiculo = pd.merge(df, dimension_vehiculo, on=claves_vehiculo,
                              how='left', validate='many_to_one')

# Attach ID_Rating to every listing.
df_hechos_ratings = pd.merge(df, dimension_ratings, on=claves_ratings,
                             how='left', validate='many_to_one')

# Start from the listing-level columns (everything that was not moved into a
# dimension) together with the seller key.
tabla_hechos = df_hechos_vendedor[['Price', 'Mileage', 'ExteriorColor', 'InteriorColor',
                                   'Used', 'VIN', 'Stock#', 'ID_Seller']].copy()

# Add the vehicle and rating keys from the other two merge results.
tabla_hechos['ID_Vehiculo'] = df_hechos_vehiculo['ID_Vehiculo']
tabla_hechos['ID_Rating'] = df_hechos_ratings['ID_Rating']

# Surrogate key of the fact row: 1-based position of the listing in `df`.
tabla_hechos['ID_Venta'] = df.index + 1

# Guard the invariants the star schema relies on: one fact row per listing and
# no listing left without a matching dimension row.
assert len(tabla_hechos) == len(df), 'fact table row count differs from source'
assert tabla_hechos[['ID_Vehiculo', 'ID_Seller', 'ID_Rating']].notna().all().all(), \
    'some listings have no matching dimension row'

# Put the columns in the order of the dimensional model: keys first, then measures
# and attributes.
tabla_hechos = tabla_hechos[['ID_Venta', 'ID_Vehiculo', 'ID_Seller', 'ID_Rating', 'Price',
                             'Mileage', 'ExteriorColor', 'InteriorColor', 'Used', 'VIN', 'Stock#']]

# Show the structure of the fact table.
tabla_hechos.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8452 entries, 0 to 8451
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID_Venta       8452 non-null   int64 
 1   ID_Vehiculo    8452 non-null   int64 
 2   ID_Seller      8452 non-null   int64 
 3   ID_Rating      8452 non-null   int64 
 4   Price          8452 non-null   int64 
 5   Mileage        8452 non-null   int64 
 6   ExteriorColor  8452 non-null   object
 7   InteriorColor  8452 non-null   object
 8   Used           8452 non-null   bool  
 9   VIN            8452 non-null   object
 10  Stock#         8452 non-null   object
dtypes: bool(1), int64(6), object(4)
memory usage: 668.7+ KB


In [25]:
# Count how many fact rows reference each rating combination, most frequent first.
tabla_hechos.loc[:, 'ID_Rating'].value_counts()

ID_Rating
256     55
705     37
501     33
202     22
209     22
        ..
5282     1
5283     1
5284     1
5285     1
6        1
Name: count, Length: 5285, dtype: int64

In [26]:
# Persist the star schema as CSV files, one file per table.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise this cell fails with OSError on a fresh checkout.
os.makedirs('Dimensional_en_csv', exist_ok=True)

# index=False: the row index carries no information, every table has its own ID column.
tabla_hechos.to_csv('Dimensional_en_csv/tabla_hechos.csv', index=False)
dimension_vehiculo.to_csv('Dimensional_en_csv/dimension_vehiculo.csv', index=False)
dimension_vendedor.to_csv('Dimensional_en_csv/dimension_vendedor.csv', index=False)
dimension_ratings.to_csv('Dimensional_en_csv/dimension_ratings.csv', index=False)
