# ETL

### Cargo el dataset correspondiente desde la carpeta `sources/` para posteriormente analizarlo, limpiarlo y guardarlo en la carpeta `Data/`

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import warnings
warnings.filterwarnings("ignore")

# Load the raw vehicle operating-cost dataset from the sources folder
df = pd.read_csv(filepath_or_buffer='sources/costo_operacional_vehiculos.csv')

# Derive a frame in which every row holding at least one missing value is removed
df_cleaned = df.dropna(axis=0, how='any')

In [2]:
# Print the raw frame's column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4625 entries, 0 to 4624
Data columns (total 13 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             4625 non-null   int64  
 1   Manuf                  4625 non-null   object 
 2   Model                  4625 non-null   object 
 3   Desc                   4625 non-null   object 
 4   Engine_Capacity        4625 non-null   int64  
 5   Fuel_Type              4625 non-null   object 
 6   Powertrain             4625 non-null   object 
 7   Euro_Standard          4625 non-null   object 
 8   Diesel_VED_Supplement  4625 non-null   bool   
 9   Fuel_Cost              4625 non-null   object 
 10  Electric_Cost          4625 non-null   object 
 11  Total_Cost             4625 non-null   object 
 12  Noise_Level            4625 non-null   float64
dtypes: bool(1), float64(1), int64(2), object(9)
memory usage: 438.2+ KB


In [3]:
# Preview the first five rows of the raw frame
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Manuf,Model,Desc,Engine_Capacity,Fuel_Type,Powertrain,Euro_Standard,Diesel_VED_Supplement,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
0,0,ABARTH,595,595 1.4 145 BHP Convertible,1368,Petrol,Internal Combustion Engine (ICE),Euro 6d-TEMP,False,"£1,935",£0,"£1,935",73.5
1,1,ABARTH,595,595 1.4 145 BHP Convertible,1368,Petrol,Internal Combustion Engine (ICE),Euro 6d-TEMP,False,"£2,043",£0,"£2,043",73.5
2,2,ABARTH,595,595 1.4 145 BHP Hatchback,1368,Petrol,Internal Combustion Engine (ICE),Euro 6d-TEMP,False,"£1,935",£0,"£1,935",73.5
3,3,ABARTH,595,595 1.4 145 BHP Hatchback,1368,Petrol,Internal Combustion Engine (ICE),Euro 6d-TEMP,False,"£2,043",£0,"£2,043",73.5
4,4,ABARTH,595,595 1.4 TJET 145bhp,1368,Petrol,Internal Combustion Engine (ICE),Euro 6d,False,"£1,828",£0,"£1,828",74.0


In [4]:
# Preview the last five rows of the raw frame
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,Manuf,Model,Desc,Engine_Capacity,Fuel_Type,Powertrain,Euro_Standard,Diesel_VED_Supplement,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
4620,4620,VOLVO,"XC90, MY23",B5 Plus AWD,1969,Petrol Electric,Mild Hybrid Electric Vehicle (MHEV),Euro 6d,False,"£2,580",£0,"£2,580",68.0
4621,4621,VOLVO,"XC90, MY23",B5 Ultimate AWD,1969,Petrol Electric,Mild Hybrid Electric Vehicle (MHEV),Euro 6d,False,"£2,338",£0,"£2,338",68.0
4622,4622,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Core,1969,Electricity / Petrol,Plug-in Hybrid Electric Vehicle (PHEV),Euro 6d,False,£403,"£1,018","£1,422",67.0
4623,4623,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Plus,1969,Electricity / Petrol,Plug-in Hybrid Electric Vehicle (PHEV),Euro 6d,False,£403,"£1,018","£1,422",67.0
4624,4624,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Ultimate,1969,Electricity / Petrol,Plug-in Hybrid Electric Vehicle (PHEV),Euro 6d,False,£403,"£1,018","£1,422",67.0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the raw frame.
# The cost columns are still text at this point, so they do not appear in the table.
summary_stats = df.describe()
# Emit the table as plain text
print(summary_stats)

        Unnamed: 0  Engine_Capacity  Noise_Level
count  4625.000000      4625.000000  4625.000000
mean   2312.000000      1706.910270    65.141492
std    1335.266827       916.609954    15.319174
min       0.000000         0.000000     0.000000
25%    1156.000000      1199.000000    67.000000
50%    2312.000000      1499.000000    68.000000
75%    3468.000000      1997.000000    69.000000
max    4624.000000      6749.000000    89.200000


In [6]:
# Columns retained in the cleaned dataset; every other column is discarded
columns_to_keep = ['Manuf', 'Model', 'Desc', 'Fuel_Type', 'Fuel_Cost', 'Electric_Cost', 'Total_Cost', 'Noise_Level']

# Build from `df_cleaned` (rows with missing values already dropped) instead of
# the raw `df`, so the dropna step actually reaches the exported data.
# reset_index restores a contiguous 0..n-1 index in case rows were dropped.
df_clean = df_cleaned[columns_to_keep].reset_index(drop=True)

# Columns holding monetary amounts formatted as text such as "£1,935"
cost_columns = ['Fuel_Cost', 'Electric_Cost', 'Total_Cost']

# Assumed exchange rates: 1 pound sterling = 1.17 euros and 1 euro = 1.10 dollars
gbp_to_eur = 1.17
eur_to_usd = 1.10

for column in cost_columns:
    # Strip the pound sign and thousands separators, then parse as float (GBP)
    amount_gbp = df_clean[column].replace('[£,]', '', regex=True).astype(float)
    # Convert pounds -> euros -> dollars (same multiplication order for every column)
    df_clean[column] = amount_gbp * gbp_to_eur * eur_to_usd

# Show the first rows of the cleaned frame
df_clean.head()


Unnamed: 0,Manuf,Model,Desc,Fuel_Type,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
0,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,2490.345,0.0,2490.345,73.5
1,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,2629.341,0.0,2629.341,73.5
2,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,2490.345,0.0,2490.345,73.5
3,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,2629.341,0.0,2629.341,73.5
4,ABARTH,595,595 1.4 TJET 145bhp,Petrol,2352.636,0.0,2352.636,74.0


In [7]:
# Kilometres in one mile
miles_to_km = 1.60934

# Rescale costs quoted per 10,000 miles so they apply to 10,000 kilometres
conversion_factor = 10000 / (10000 * miles_to_km)

# NOTE: this updates df_clean in place, so running the cell a second time
# without rebuilding df_clean applies the factor again.
for cost_column in ['Fuel_Cost', 'Electric_Cost', 'Total_Cost']:
    df_clean[cost_column] = df_clean[cost_column] * conversion_factor

# Display the final result
df_clean.head()


Unnamed: 0,Manuf,Model,Desc,Fuel_Type,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
0,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,1547.432488,0.0,1547.432488,73.5
1,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,1633.800813,0.0,1633.800813,73.5
2,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,1547.432488,0.0,1547.432488,73.5
3,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,1633.800813,0.0,1633.800813,73.5
4,ABARTH,595,595 1.4 TJET 145bhp,Petrol,1461.86387,0.0,1461.86387,74.0


In [8]:
# Write the cleaned frame to the Data folder as CSV, without the index column.
# NOTE(review): presumably the Data/ directory must already exist — confirm.
df_clean.to_csv('Data/costo_operacional_vehiculos_clean.csv', index=False)

# Se analizan la información y las estadísticas del nuevo dataset ya limpio, con sus datos filtrados

In [9]:
# Print the cleaned frame's column names, non-null counts, dtypes and memory usage
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4625 entries, 0 to 4624
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Manuf          4625 non-null   object 
 1   Model          4625 non-null   object 
 2   Desc           4625 non-null   object 
 3   Fuel_Type      4625 non-null   object 
 4   Fuel_Cost      4625 non-null   float64
 5   Electric_Cost  4625 non-null   float64
 6   Total_Cost     4625 non-null   float64
 7   Noise_Level    4625 non-null   float64
dtypes: float64(4), object(4)
memory usage: 289.2+ KB


In [10]:
# Preview the first five rows of the cleaned frame
df_clean.head(n=5)

Unnamed: 0,Manuf,Model,Desc,Fuel_Type,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
0,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,1547.432488,0.0,1547.432488,73.5
1,ABARTH,595,595 1.4 145 BHP Convertible,Petrol,1633.800813,0.0,1633.800813,73.5
2,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,1547.432488,0.0,1547.432488,73.5
3,ABARTH,595,595 1.4 145 BHP Hatchback,Petrol,1633.800813,0.0,1633.800813,73.5
4,ABARTH,595,595 1.4 TJET 145bhp,Petrol,1461.86387,0.0,1461.86387,74.0


In [11]:
# Preview the last five rows of the cleaned frame
df_clean.tail(n=5)

Unnamed: 0,Manuf,Model,Desc,Fuel_Type,Fuel_Cost,Electric_Cost,Total_Cost,Noise_Level
4620,VOLVO,"XC90, MY23",B5 Plus AWD,Petrol Electric,2063.243317,0.0,2063.243317,68.0
4621,VOLVO,"XC90, MY23",B5 Ultimate AWD,Petrol Electric,1869.714293,0.0,1869.714293,68.0
4622,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Core,Electricity / Petrol,322.281805,814.101433,1137.182945,67.0
4623,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Plus,Electricity / Petrol,322.281805,814.101433,1137.182945,67.0
4624,VOLVO,"XC90, MY23",Recharge Plug-in Hybrid T8 Ultimate,Electricity / Petrol,322.281805,814.101433,1137.182945,67.0


In [12]:
# Summary statistics of the cleaned frame; the three cost columns are now
# numeric, so they are included alongside Noise_Level.
# Reuses the name `summary_stats`, replacing the raw-frame summary.
summary_stats = df_clean.describe()
# Emit the table as plain text
print(summary_stats)

         Fuel_Cost  Electric_Cost   Total_Cost  Noise_Level
count  4625.000000    4625.000000  4625.000000  4625.000000
mean   1376.548391      53.606459  1430.152775    65.141492
std     568.935941     186.980553   478.923137    15.319174
min       0.000000       0.000000   468.628133     0.000000
25%    1138.782358       0.000000  1138.782358    67.000000
50%    1332.311382       0.000000  1343.507276    68.000000
75%    1590.616650       0.000000  1598.613717    69.000000
max    3589.083724    1397.887333  3589.083724    89.200000
