# Preparación del dataset de ventas de vehículos

## Librerías y configuraciones previas

In [52]:
# Tratamiento de datos
# ============================================================================================================
import pandas as pd 
import numpy as np

# Gestión de librerias
# ============================================================================================================
from importlib import reload

# Matemáticas y estadísticas
# ============================================================================================================
import math

# Preparación de datos
# ============================================================================================================
from sklearn.impute import KNNImputer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import LocalOutlierFactor

# Gráficos
# =============================================================================================================
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

# Warnings configuration
# =============================================================================================================
# NOTE: 'ignore' with no category/module filter suppresses every warning for the rest of the session,
# including deprecation notices raised by the libraries imported above.

import warnings
warnings.filterwarnings('ignore')


## Funciones 

In [53]:
# Funciones externas
# =============================================================================================================
from utils.funciones import multiple_plot

Este dataset, extraído de *Kaggle* (archivo **car details v4.csv**), contiene información sobre **vehículos usados**, la cual se utilizará para predecir su precio de venta a partir de las diferentes variables.

A continuación una breve descripción de las variables:

****
    
| Campo | Descripción | Ejemplo |
| :--- | :--- |:--- |
| Make | Marca  | Honda |
| Model | Modelo  | Amaze 1.2 VX i-VTEC |
| Price | Precio  | 505000 |
| Year | Año  | 2017 |
| Kilometer | Kilometraje  | 87150 |
| Fuel Type | Tipo de combustible  | Petrol |
| Transmission | Tipo de transmisión  | Manual |
| Location | Ubicación  | Pune |
| Color | Color  | Grey |
| Owner | Número de propietarios previos  | First |
| Seller Type | Tipo de vendedor  | Corporate |
| Engine | Cilindrada del motor (cc)  | 1198 cc |
| Max Power | Potencia máxima del motor  | 87 bhp @ 6000 rpm |
| Max Torque | Torque máximo del motor  | 109 Nm @ 4500 rpm |
| Drivetrain | Tipo de tracción  | FWD |
| Length | Longitud  | 3990 |
| Width | Anchura  | 1680 |
| Height | Altura  | 1505 |
| Seating Capacity | Capacidad de asientos  | 5 |
| Fuel Tank Capacity | Capacidad del tanque de combustible  | 35 |
    
****

    
Url origen datos: https://www.kaggle.com/datasets/nehalbirla/vehicle-dataset-from-cardekho?select=car+details+v4.csv

In [54]:
# Load the raw used-car listings (Kaggle file "car details v4.csv") into the working DataFrame `d`.
d = pd.read_csv(filepath_or_buffer='./datasets/01_car details v4.csv')

In [55]:
# Summarize column dtypes and non-null counts to spot missing values
# (e.g. Engine, Max Power, Max Torque and Drivetrain are not fully populated).
d.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2059 entries, 0 to 2058
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Make                2059 non-null   object 
 1   Model               2059 non-null   object 
 2   Price               2059 non-null   int64  
 3   Year                2059 non-null   int64  
 4   Kilometer           2059 non-null   int64  
 5   Fuel Type           2059 non-null   object 
 6   Transmission        2059 non-null   object 
 7   Location            2059 non-null   object 
 8   Color               2059 non-null   object 
 9   Owner               2059 non-null   object 
 10  Seller Type         2059 non-null   object 
 11  Engine              1979 non-null   object 
 12  Max Power           1979 non-null   object 
 13  Max Torque          1979 non-null   object 
 14  Drivetrain          1923 non-null   object 
 15  Length              1995 non-null   float64
 16  Width 

In [56]:
# Preview the first two rows to check the raw value formats
# (e.g. Engine is stored as text such as '1198 cc').
d.head(2)

Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,Amaze 1.2 VX i-VTEC,505000,2017,87150,Petrol,Manual,Pune,Grey,First,Corporate,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,Swift DZire VDI,450000,2014,75000,Diesel,Manual,Ludhiana,White,Second,Individual,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,FWD,3995.0,1695.0,1555.0,5.0,42.0


## Limpieza de datos (Data Cleaning)

In [57]:
# Inspect the distinct values of the Model column.
# NOTE(review): nothing is dropped in this cell; if Model is meant to be removed, confirm where that happens.

d["Model"].unique()


array(['Amaze 1.2 VX i-VTEC', 'Swift DZire VDI', 'i10 Magna 1.2 Kappa2',
       ..., 'Ritz Vxi (ABS) BS-IV', 'XUV500 W8 [2015-2017]',
       'Figo Duratec Petrol ZXI 1.2'], dtype=object)

In [58]:
# Clean the Engine column so it becomes numeric: raw values are text such as "1198 cc".
# The conversion is guarded so that re-running this cell after Engine is already numeric
# does not fail (the .str accessor is only available on string data).
if not pd.api.types.is_numeric_dtype(d['Engine']):
    # Literal (non-regex) removal of the ' cc' unit suffix; missing values stay NaN.
    d['Engine'] = d['Engine'].str.replace(' cc', '', regex=False).astype('float64')

# Size of the column's underlying buffer in MB (bytes * 10**-6), shown as the cell output.
# nbytes does not count the contents of Python strings, so it is not a deep memory measurement.
d['Engine'].nbytes*10**(-6)

16472000000.0

In [59]:
d['Engine']

0       1198.0
1       1248.0
2       1197.0
3       1197.0
4       2393.0
         ...  
2054    2179.0
2055     814.0
2056    1196.0
2057    1995.0
2058    1493.0
Name: Engine, Length: 2059, dtype: float64