In [None]:
import pandas as pd
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
column_names = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration","num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",	"length", "width", "height", "curb_weight", "engine_type", "num_cylinders","engine_size", "fuel_system", "bore", "stroke", "compression_ratio","horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"]
# Cargar dataset con nombres de columnas
df = pd.read_csv(url, names=column_names, na_values="?")  # "?" indica valores faltantes
df.head()


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [None]:
print('Los datos de entrenamiento tienen {} filas y {} columnas'.format(df.shape[0],df.shape[1]))

Los datos de entrenamiento tienen 159 filas y 26 columnas


In [None]:
df.isnull().sum()

Unnamed: 0,0
symboling,0
normalized_losses,41
make,0
fuel_type,0
aspiration,0
num_doors,2
body_style,0
drive_wheels,0
engine_location,0
wheel_base,0


Eliminamos los datos nulos

In [None]:
df = df.dropna().reset_index(drop=True)

In [None]:
print('Los datos de entrenamiento tienen {} filas y {} columnas'.format(df.shape[0],df.shape[1]))

Los datos de entrenamiento tienen 159 filas y 26 columnas


Escalamos las columnas numericas

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
#df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

#df

Codificamos las columnas categoricas

In [None]:
import pandas as pd

# Aplicar One-Hot Encoding a todas las columnas categóricas automáticamente
df_encoded = pd.get_dummies(df, drop_first=True)  # drop_first=True evita multicolinealidad

# Convertir los valores booleanos a enteros (0 y 1)
df_encoded = df_encoded.astype(int)

df_encoded.head() # Ver las primeras filas


Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,...,engine_type_ohcv,num_cylinders_five,num_cylinders_four,num_cylinders_six,num_cylinders_three,fuel_system_2bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi
0,2,164,99,176,66,54,2337,109,3,3,...,0,0,1,0,0,0,0,0,1,0
1,2,164,99,176,66,54,2824,136,3,3,...,0,1,0,0,0,0,0,0,1,0
2,1,158,105,192,71,55,2844,136,3,3,...,0,1,0,0,0,0,0,0,1,0
3,1,158,105,192,71,55,3086,131,3,3,...,0,1,0,0,0,0,0,0,1,0
4,2,192,101,176,64,54,2395,108,3,2,...,0,0,1,0,0,0,0,0,1,0


Separacion de los datos de entrenamiento y de validacion

In [None]:
from sklearn.model_selection import train_test_split

# Definir variables independientes (X) y dependientes (y)
X = df_encoded.drop(columns=['price'])  # Características
y = df_encoded['price']  # Variable a predecir

# Dividir el dataset en entrenamiento (80%) y validación (20%)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Tamaño de X_train: {X_train.shape}")
print(f"Tamaño de X_val: {X_val.shape}")


Tamaño de X_train: (127, 54)
Tamaño de X_val: (32, 54)


Entrenamiento del modelo con la regresión Lineal Múltiple

In [None]:
from sklearn.linear_model import LinearRegression

# Crear el modelo de regresión lineal
model = LinearRegression()

# Entrenar el modelo con los datos de entrenamiento
model.fit(X_train, y_train)

# Obtener los coeficientes (β) y el intercepto (β0)
beta_0 = model.intercept_
betas = model.coef_

# Mostrar la ecuación de la regresión
equation = f"y = {beta_0:.4f} + " + " + ".join([f"{b:.4f}*{col}" for b, col in zip(betas, X.columns)])
print(f"Ecuación de la regresión lineal: {equation}")


Ecuación de la regresión lineal: y = 17357.1171 + -99.7643*symboling + 8.9517*normalized_losses + 364.0524*wheel_base + -97.6996*length + 132.6654*width + -375.0752*height + 4.3381*curb_weight + -10.6009*engine_size + -371.4591*bore + -1493.0179*stroke + -41.0736*compression_ratio + 11.1299*horsepower + -0.3254*peak_rpm + -264.2219*city_mpg + 179.1391*highway_mpg + -608.2590*make_bmw + -4266.1559*make_chevrolet + -6257.4419*make_dodge + -2672.0628*make_honda + 4824.4806*make_jaguar + -3269.4480*make_mazda + 1805.2783*make_mercedes-benz + -5918.7635*make_mitsubishi + -3576.1632*make_nissan + -1760.1892*make_peugot + -5869.6259*make_plymouth + 3783.9360*make_porsche + -912.0260*make_saab + -2821.5579*make_subaru + -4674.7754*make_toyota + -3384.9703*make_volkswagen + -2871.6853*make_volvo + -822.2459*fuel_type_gas + 2399.1307*aspiration_turbo + -555.4046*num_doors_two + -5120.9894*body_style_hardtop + -5272.8144*body_style_hatchback + -4900.0458*body_style_sedan + -4628.9160*body_style_w

Probar el modelo

In [None]:
# Hacer predicciones en los datos de validación
y_pred = model.predict(X_val)

# Mostrar las primeras predicciones
print(y_pred[:5])


[16854.92813587 21124.28983325 10931.88774584  8692.12894244
 13118.73437763]


In [None]:
print(y_val[:5])

78     13200
155    19045
128     9989
55      6989
94     15510
Name: price, dtype: int64


Evaluacion del modelo

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Calcular RMSE (Root Mean Squared Error)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))

# Calcular R² (Coeficiente de Determinación)
r2 = r2_score(y_val, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R²: {r2:.4f}")


RMSE: 1705.0454
R²: 0.8366


Creacion y entrenamiento del modelo con la Formula matemática del algoritmo de regresión lineal

In [None]:
from sklearn.preprocessing import add_dummy_feature

X_train = add_dummy_feature(X)


theta_best = np.linalg.inv(X_train.T @ X_train) @ X_train.T @ y

theta_best

array([ 2.73672714e+08,  4.41864465e+06, -4.55643324e+04,  1.67891205e+06,
       -1.65113751e+04, -5.07001427e+06, -1.67837190e+06,  3.83729629e+03,
        7.51496401e+04,  4.93782101e+06, -6.71757950e+06,  1.11663944e+05,
       -1.36545993e+04,  1.75643441e+03,  1.29295064e+05, -1.68430816e+05,
       -7.69808289e+06, -4.37116188e+06,  2.78092277e+06, -8.65774453e+05,
       -4.48421681e+07, -6.59869005e+05,  5.06015173e+06,  7.55578081e+05,
       -8.50385586e+05, -9.37816467e+07,  2.26469442e+06,  5.99842464e+04,
       -2.74852297e+06,  7.98800552e+08, -1.64738632e+04, -1.24081181e+05,
       -3.25864122e+05, -3.17120426e+07, -2.47372624e+04, -1.05158873e+04,
       -1.49017561e+05, -3.33100767e+05, -2.57244167e+05, -1.52748062e+05,
        7.28489496e+04,  3.02256491e+05,  1.03561726e+08, -1.37605521e+03,
       -7.99062748e+08, -1.26713700e+03, -6.08280204e+03, -7.87411500e+03,
       -5.25005366e+03, -1.02250080e+08,  2.36220624e+03, -3.17675010e+07,
        4.41241787e+03,  