#### Parte 1: Análisis Exploratorio de Datos (EDA)

 1. Carga y Exploración de Datos:

- Cargar el dataset y revisar la estructura básica.

In [1]:
import pandas as pd
# Read the automobile dataset from the CSV file next to the notebook and preview
# its first rows.
csv_path = 'Automobile_data.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


- Descripción de las variables y su distribución.
- Detección y tratamiento de valores nulos.

In [2]:
# The raw file marks missing entries with a literal '?'; turn those into real
# missing values so pandas can detect them.
df = df.replace({'?': None})

# Count missing values per column BEFORE any row is dropped. The result is kept in
# a variable and shown as the cell's final expression; as a bare statement in the
# middle of the cell it would be computed and silently discarded.
missing_counts = df.isna().sum()

# Drop every row that has at least one missing value in any column.
# NOTE(review): this also discards rows whose only gap is in a column that is not
# used as a model feature later on — consider dropna(subset=[...]) to keep more data.
df = df.dropna()

missing_counts

In [3]:
# Count the number of distinct values in each column.
df.nunique()

symboling              6
normalized-losses     51
make                  18
fuel-type              2
aspiration             2
num-of-doors           2
body-style             5
drive-wheels           3
engine-location        1
wheel-base            40
length                56
width                 33
height                39
curb-weight          136
engine-type            5
num-of-cylinders       5
engine-size           32
fuel-system            6
bore                  33
stroke                31
compression-ratio     29
horsepower            48
peak-rpm              20
city-mpg              25
highway-mpg           28
price                145
dtype: int64

In [4]:
# Print the DataFrame summary: index range, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 159 entries, 3 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          159 non-null    int64  
 1   normalized-losses  159 non-null    object 
 2   make               159 non-null    object 
 3   fuel-type          159 non-null    object 
 4   aspiration         159 non-null    object 
 5   num-of-doors       159 non-null    object 
 6   body-style         159 non-null    object 
 7   drive-wheels       159 non-null    object 
 8   engine-location    159 non-null    object 
 9   wheel-base         159 non-null    float64
 10  length             159 non-null    float64
 11  width              159 non-null    float64
 12  height             159 non-null    float64
 13  curb-weight        159 non-null    int64  
 14  engine-type        159 non-null    object 
 15  num-of-cylinders   159 non-null    object 
 16  engine-size        159 non-null

In [5]:
# Target dtype for every column, grouped by dtype so the intent is visible at a glance.
# The float64 columns must not contain the '?' placeholder or missing values when
# this cell runs, otherwise the cast cannot succeed.
columns_by_dtype = {
    'int64': [
        'symboling', 'curb-weight', 'engine-size', 'city-mpg', 'highway-mpg',
    ],
    'float64': [
        'normalized-losses', 'wheel-base', 'length', 'width', 'height',
        'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'price',
    ],
    'category': [
        'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
        'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders',
        'fuel-system',
    ],
}

# Flatten the grouping into the {column: dtype} mapping that DataFrame.astype expects.
data_types = {
    column: dtype
    for dtype, columns in columns_by_dtype.items()
    for column in columns
}

# Cast all columns in one pass.
df = df.astype(data_types)

In [6]:
# Pairwise correlations between the numeric columns only, then the 'price' column
# of that matrix ranked from the most positive to the most negative correlation.
correlation_matrix = df.corr(numeric_only=True)
correlation_matrix['price'].sort_values(ascending=False)

price                1.000000
curb-weight          0.893639
width                0.843371
engine-size          0.841496
length               0.760952
horsepower           0.759874
wheel-base           0.734419
bore                 0.533890
height               0.244836
compression-ratio    0.209361
normalized-losses    0.202761
stroke               0.160664
symboling           -0.162794
peak-rpm            -0.171916
city-mpg            -0.692273
highway-mpg         -0.720090
Name: price, dtype: float64

#### Parte 2: Preparación de Datos

 2. Preprocesamiento:

- Selección de características importantes.
- Transformación de variables categóricas.
- División del conjunto de datos en entrenamiento y prueba.
- Escalado de características.

In [7]:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score


# Features kept for modelling, split by kind so the categorical ones can be
# one-hot encoded by name.
numeric_features = ['curb-weight', 'engine-size', 'horsepower', 'width']
categorical_features = ['make', 'fuel-type']
important_features = numeric_features + categorical_features

# One-hot encode the categorical features; numeric features pass through unchanged.
# NOTE(review): every dummy level is kept (drop_first is not set), so the dummy
# columns of each feature are linearly dependent — relevant for the linear model.
df_final = pd.get_dummies(df[important_features], columns=categorical_features)


4. K-Nearest Neighbors (KNN):

In [8]:

# Target is the price column; predictors are the encoded feature matrix.
y = df['price']
X = df_final

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Learn the scaling statistics from the training split only, then apply the same
# transformation to both splits (the right-hand side is evaluated before assignment,
# so both transforms see the unscaled data).
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Fit a 5-nearest-neighbours regressor and predict the test split.
knn = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Report mean squared error and R^2 on the test split.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'Error Cuadrático Medio: {mse}')
print(f'R-cuadrado: {r2}')

Error Cuadrático Medio: 6425201.785
R-cuadrado: 0.8302411456449115


[WinError 2] The system cannot find the file specified
  File "c:\Users\Diego\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\Diego\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Diego\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\Diego\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


 5. Árbol de Decisión:

In [9]:
from sklearn.tree import DecisionTreeRegressor


# Fit a decision tree regressor with a fixed seed and predict the test split.
tree = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)
y_pred = tree.predict(X_test)

# Report mean squared error and R^2 on the test split.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'Error Cuadrático Medio: {mse}')
print(f'R-cuadrado: {r2}')

Error Cuadrático Medio: 5342580.75
R-cuadrado: 0.8588448397781254


In [10]:
# Depth-limited variant of the decision tree: growth is capped at max_depth=3.
tree_pruned = DecisionTreeRegressor(random_state=0, max_depth=3).fit(X_train, y_train)
y_pred_pruned = tree_pruned.predict(X_test)

# Report mean squared error and R^2 of the depth-limited tree on the test split.
r2_pruned = r2_score(y_test, y_pred_pruned)
mse_pruned = mean_squared_error(y_test, y_pred_pruned)
print(f'Error Cuadrático Medio (Podado): {mse_pruned}')
print(f'R-cuadrado (Podado): {r2_pruned}')

Error Cuadrático Medio (Podado): 4563097.906419595
R-cuadrado (Podado): 0.8794393859018867


 3. Regresión Lineal:

In [11]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary linear regression on the training split and predict the test split.
modelo = LinearRegression().fit(X_train, y_train)
y_pred = modelo.predict(X_test)

# Report mean squared error and R^2 on the test split.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f'Error Cuadrático Medio: {mse}')
print(f'R-cuadrado: {r2}')

Error Cuadrático Medio: 4533703.296415317
R-cuadrado: 0.8802160144787807


Los 3 modelos tuvieron un rendimiento similar (R² entre 0.83 y 0.88), y cualquiera de los 3 nos da buenos resultados. Podríamos intentar reducir algunas columnas redundantes para evitar problemas de multicolinealidad.