# ETL

### Cargo el dataset correspondiente desde la carpeta /sources para posteriormente analizarlo, limpiarlo y guardarlo en la carpeta /Data 

In [40]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import warnings
warnings.filterwarnings("ignore")

# Cargar los datos
df = pd.read_csv('sources/costo_operacional_vehiculos.csv')

# Limpiar el dataset eliminando filas con valores vacíos
df_cleaned = df.dropna()

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4625 entries, 0 to 4624
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             4625 non-null   int64  
 1   Manuf                  4625 non-null   object 
 2   Model                  4625 non-null   object 
 3   Desc                   4625 non-null   object 
 4   Engine_Capacity        4625 non-null   int64  
 5   Fuel_Type              4625 non-null   object 
 6   Powertrain             4625 non-null   object 
 7   Euro_Standard          4625 non-null   object 
 8   Diesel_VED_Supplement  4625 non-null   bool   
 9   Fuel_Cost              4625 non-null   object 
 10  Electric_Cost          4625 non-null   object 
 11  Total_Cost             4625 non-null   object 
 12  Noise_Level            4625 non-null   float64
dtypes: bool(1), float64(1), int64(2), object(9)
memory usage: 438.2+ KB


In [27]:
df.head()

Unnamed: 0.1,Unnamed: 0,Manuf,Model,Desc,Engine_Capacity,Fuel_Type,Powertrain,Euro_Standard,Diesel_VED_Supplement,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
0,0,ABARTH,595,595 1.4 145 BHP Convertible,1368,Petrol,Internal Combustion Engine (ICE),Euro 6d-TEMP,False,"£1,935",£0,"£1,935",73.5
1,1,ABARTH,595,595 1.4 145 BHP Convertible,1368,Petrol,Internal Combustion Engine (ICE),Euro 6d-TEMP,False,"£2,043",£0,"£2,043",73.5
2,2,ABARTH,595,595 1.4 145 BHP Hatchback,1368,Petrol,Internal Combustion Engine (ICE),Euro 6d-TEMP,False,"£1,935",£0,"£1,935",73.5
3,3,ABARTH,595,595 1.4 145 BHP Hatchback,1368,Petrol,Internal Combustion Engine (ICE),Euro 6d-TEMP,False,"£2,043",£0,"£2,043",73.5
4,4,ABARTH,595,595 1.4 TJET 145bhp,1368,Petrol,Internal Combustion Engine (ICE),Euro 6d,False,"£1,828",£0,"£1,828",74.0


In [28]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Manuf,Model,Desc,Engine_Capacity,Fuel_Type,Powertrain,Euro_Standard,Diesel_VED_Supplement,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
4620,4620,VOLVO,"XC90, MY23",B5 Plus AWD,1969,Petrol Electric,Mild Hybrid Electric Vehicle (MHEV),Euro 6d,False,"£2,580",£0,"£2,580",68.0
4621,4621,VOLVO,"XC90, MY23",B5 Ultimate AWD,1969,Petrol Electric,Mild Hybrid Electric Vehicle (MHEV),Euro 6d,False,"£2,338",£0,"£2,338",68.0
4622,4622,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Core,1969,Electricity / Petrol,Plug-in Hybrid Electric Vehicle (PHEV),Euro 6d,False,£403,"£1,018","£1,422",67.0
4623,4623,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Plus,1969,Electricity / Petrol,Plug-in Hybrid Electric Vehicle (PHEV),Euro 6d,False,£403,"£1,018","£1,422",67.0
4624,4624,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Ultimate,1969,Electricity / Petrol,Plug-in Hybrid Electric Vehicle (PHEV),Euro 6d,False,£403,"£1,018","£1,422",67.0


In [29]:
summary_stats = df.describe()
print(summary_stats)

        Unnamed: 0  Engine_Capacity  Noise_Level
count  4625.000000      4625.000000  4625.000000
mean   2312.000000      1706.910270    65.141492
std    1335.266827       916.609954    15.319174
min       0.000000         0.000000     0.000000
25%    1156.000000      1199.000000    67.000000
50%    2312.000000      1499.000000    68.000000
75%    3468.000000      1997.000000    69.000000
max    4624.000000      6749.000000    89.200000


In [30]:
# Eliminar las columnas innecesarias
columns_to_keep = ['Manuf', 'Model', 'Desc', 'Fuel_Type', 'Fuel_Cost', 'Electric_Cost', 'Total_Cost', 'Noise_Level']
df_clean = df[columns_to_keep].copy()

# Eliminar el símbolo de libra y convertir las columnas de costos a numéricas
df_clean['Fuel_Cost'] = df_clean['Fuel_Cost'].replace('[£,]', '', regex=True).astype(float)
df_clean['Electric_Cost'] = df_clean['Electric_Cost'].replace('[£,]', '', regex=True).astype(float)
df_clean['Total_Cost'] = df_clean['Total_Cost'].replace('[£,]', '', regex=True).astype(float)

# Asumamos una tasa de conversión de 1 libra esterlina = 1.17 euros y 1 euro = 1.10 dólares
gbp_to_eur = 1.17
eur_to_usd = 1.10

# Convertir los costos de libras a euros y luego a dólares
df_clean['Fuel_Cost'] = df_clean['Fuel_Cost'] * gbp_to_eur * eur_to_usd
df_clean['Electric_Cost'] = df_clean['Electric_Cost'] * gbp_to_eur * eur_to_usd
df_clean['Total_Cost'] = df_clean['Total_Cost'] * gbp_to_eur * eur_to_usd

# Eliminar duplicados
df_clean = df_clean.drop_duplicates()

# Mostrar las primeras filas del dataframe limpio
df_clean.head()


Unnamed: 0,Manuf,Model,Desc,Fuel_Type,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
0,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,2490.345,0.0,2490.345,73.5
1,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,2629.341,0.0,2629.341,73.5
2,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,2490.345,0.0,2490.345,73.5
3,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,2629.341,0.0,2629.341,73.5
4,ABARTH,595,595 1.4 TJET 145bhp,Petrol,2352.636,0.0,2352.636,74.0


In [31]:
# Factor de conversión de millas a kilómetros
miles_to_km = 1.60934

# Ajustar los costos de 10,000 millas a 10,000 kilómetros
conversion_factor = 10000 / (10000 * miles_to_km)

df_clean['Fuel_Cost'] *= conversion_factor
df_clean['Electric_Cost'] *= conversion_factor
df_clean['Total_Cost'] *= conversion_factor

# Mostrar el resultado final
df_clean.head()


Unnamed: 0,Manuf,Model,Desc,Fuel_Type,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
0,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,1547.432488,0.0,1547.432488,73.5
1,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,1633.800813,0.0,1633.800813,73.5
2,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,1547.432488,0.0,1547.432488,73.5
3,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,1633.800813,0.0,1633.800813,73.5
4,ABARTH,595,595 1.4 TJET 145bhp,Petrol,1461.86387,0.0,1461.86387,74.0


In [32]:
df_clean.to_csv('Data/costo_operacional_vehiculos_clean.csv', index=False)

# Se analisa la informacion y estadisticas del nuevo dataset ya limpio con sus datos filtrados

In [33]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3813 entries, 0 to 4624
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Manuf          3813 non-null   object 
 1   Model          3813 non-null   object 
 2   Desc           3813 non-null   object 
 3   Fuel_Type      3813 non-null   object 
 4   Fuel_Cost      3813 non-null   float64
 5   Electric_Cost  3813 non-null   float64
 6   Total_Cost     3813 non-null   float64
 7   Noise_Level    3813 non-null   float64
dtypes: float64(4), object(4)
memory usage: 268.1+ KB


In [34]:
df_clean.head()

Unnamed: 0,Manuf,Model,Desc,Fuel_Type,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
0,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,1547.432488,0.0,1547.432488,73.5
1,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,1633.800813,0.0,1633.800813,73.5
2,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,1547.432488,0.0,1547.432488,73.5
3,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,1633.800813,0.0,1633.800813,73.5
4,ABARTH,595,595 1.4 TJET 145bhp,Petrol,1461.86387,0.0,1461.86387,74.0


In [35]:
df_clean.tail()

Unnamed: 0,Manuf,Model,Desc,Fuel_Type,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
4620,VOLVO,"XC90, MY23",B5 Plus AWD,Petrol Electric,2063.243317,0.0,2063.243317,68.0
4621,VOLVO,"XC90, MY23",B5 Ultimate AWD,Petrol Electric,1869.714293,0.0,1869.714293,68.0
4622,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Core,Electricity / Petrol,322.281805,814.101433,1137.182945,67.0
4623,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Plus,Electricity / Petrol,322.281805,814.101433,1137.182945,67.0
4624,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Ultimate,Electricity / Petrol,322.281805,814.101433,1137.182945,67.0


In [36]:
summary_stats = df_clean.describe()
print(summary_stats)

         Fuel_Cost  Electric_Cost   Total_Cost  Noise_Level
count  3813.000000    3813.000000  3813.000000  3813.000000
mean   1383.383551      60.556024  1443.936849    64.430370
std     593.925629     197.008280   494.235737    16.764981
min       0.000000       0.000000   468.628133     0.000000
25%    1138.782358       0.000000  1138.782358    67.000000
50%    1343.507276       0.000000  1353.903464    68.000000
75%    1654.593187       0.000000  1654.593187    69.000000
max    3589.083724    1397.887333  3589.083724    89.200000


In [41]:
import numpy as np
import re

# Cargar el dataset
df_crp = pd.read_csv('sources/car_resale_prices.csv')
df_crp.info()
df_crp.duplicated().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17446 entries, 0 to 17445
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         17446 non-null  int64  
 1   full_name          17446 non-null  object 
 2   resale_price       17446 non-null  object 
 3   registered_year    17377 non-null  object 
 4   engine_capacity    17432 non-null  object 
 5   insurance          17439 non-null  object 
 6   transmission_type  17446 non-null  object 
 7   kms_driven         17443 non-null  object 
 8   owner_type         17401 non-null  object 
 9   fuel_type          17446 non-null  object 
 10  max_power          17344 non-null  object 
 11  seats              17436 non-null  float64
 12  mileage            16938 non-null  object 
 13  body_type          17446 non-null  object 
 14  city               17446 non-null  object 
dtypes: float64(1), int64(1), object(13)
memory usage: 2.0+ MB


0

In [38]:
# Eliminar las columnas innecesarias
df_crp.drop(['Unnamed: 0', 'engine_capacity', 'insurance', 'kms_driven', 'owner_type', 
         'seats', 'mileage', 'body_type', 'city'], axis=1, inplace=True)

In [39]:
import numpy as np
import re

# 1. Asegurarnos de que no haya valores NaN y eliminar el símbolo '₹' y las comas
df_crp['resale_price'] = df_crp['resale_price'].str.replace('₹', '', regex=False)
df_crp['resale_price'] = df_crp['resale_price'].str.replace(',', '', regex=False)

# 2. Convertir los valores que contienen "Lakh" a su valor numérico
def convert_lakh_to_numeric(price_str):
    if 'Lakh' in price_str:
        number = re.findall(r'\d+\.\d+|\d+', price_str)[0]
        return float(number) * 100000
    return None

df_crp['resale_price'] = df_crp['resale_price'].apply(lambda x: convert_lakh_to_numeric(x) if isinstance(x, str) else x)

# Asignamos el tipo de cambio correcto de Rupias a Dólares
conversion_rate = 0.012
df_crp['resale_price'] = df_crp['resale_price'] * conversion_rate

# Eliminar las columnas 'resale_price' y 'resale_price_cleaned'
#df.drop(['resale_price', 'resale_price_cleaned'], axis=1, inplace=True)

# Renombrar la columna 'resale_price_usd' a 'resale_price'
#df.rename(columns={'resale_price_usd': 'resale_price'}, inplace=True)

# Eliminar las columnas innecesarias
#df_crp.drop(['Unnamed: 0', 'engine_capacity', 'insurance', 'kms_driven', 'owner_type', 
#         'seats', 'mileage', 'body_type', 'city'], axis=1, inplace=True)
#df_crp = df_crp.reset_index(drop=True)

# Asegurarse de que la columna 'resale_price' esté en formato numérico
df_crp['resale_price'] = pd.to_numeric(df_crp['resale_price'], errors='coerce')
df_crp['resale_price'] = df_crp['resale_price'].round(2)

# Función para extraer solo el año
def extract_year(value):
    if pd.isnull(value):
        return None
    match = re.search(r'\b(19|20)\d{2}\b', str(value))
    if match:
        return int(match.group(0))
    return None

# Función para extraer solo el año
def extract_year(value):
    if pd.isnull(value):
        return None
    match = re.search(r'\b(19|20)\d{2}\b', str(value))
    if match:
        return int(match.group(0))
    return None

# Limpiar la columna 'registered_year' para extraer solo los años
df_crp['registered_year'] = df_crp['registered_year'].apply(extract_year)

# Convertir 'registered_year' a enteros, manejando valores nulos
df_crp['registered_year'] = df_crp['registered_year'].astype('Int64')  # Usar tipo de datos Int64 para manejar NaN
df_crp['registered_year'] = df_crp['registered_year'].fillna(pd.NA)  # Mantener NaN si es necesario

# Verificar los valores únicos después de la conversión
print("Valores únicos en 'registered_year' después de la conversión:", df_crp['registered_year'].unique())

# Verificar el DataFrame después de la conversión
print(df_crp.head())

#Eliminar duplicados
df_crp = df_crp.drop_duplicates()

# Guardar el DataFrame limpio
df_crp.to_csv('Data/car_resale_prices_clean.csv', index=False)

Valores únicos en 'registered_year' después de la conversión: <IntegerArray>
[2017, 2018, 2015, 2009, 2010, 2016, 2014, 2020, 2021, 2019, 2011, 2012, 2013,
 2022, 2004, <NA>, 2008, 2006, 2023, 2003, 2007, 2002, 2005]
Length: 23, dtype: Int64
                      full_name  resale_price  registered_year  \
0  2017 Maruti Baleno 1.2 Alpha        6540.0             2017   
1            2018 Tata Hexa XTA       12000.0             2018   
2   2015 Maruti Swift Dzire VXI        5400.0             2015   
3   2015 Maruti Swift Dzire VXI        5400.0             2015   
4    2009 Hyundai i10 Magna 1.1        1920.0             2009   

  transmission_type fuel_type  max_power  
0            Manual    Petrol    83.1bhp  
1         Automatic    Diesel  153.86bhp  
2            Manual    Petrol   83.14bhp  
3            Manual    Petrol   83.14bhp  
4            Manual    Petrol   68.05bhp  


In [23]:
# Función para categorizar los vehículos
def categorize_vehicle(row):
    if row['fuel_type'] in ['Diesel', 'Petrol', 'Petrol/LPG']:
        return 'Convencional'
    elif row['fuel_type'] in [ 'Electricity', 'Electric']:
        return 'Eléctrico'
    else:
        return 'Híbrido'

# Aplicar la función al dataframe
df['Vehicle_Type'] = df.apply(categorize_vehicle, axis=1)

# Verificar la distribución
print(df['Vehicle_Type'].value_counts())


Vehicle_Type
Convencional    16852
Híbrido           533
Eléctrico          61
Name: count, dtype: int64
