In [1]:
import pandas as pd
import os
import pickle
from sklearn.preprocessing import FunctionTransformer
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


In [2]:
# Load the cleaned listings dataset from the working directory.
input_csv = "datos_limpios.csv"
df = pd.read_csv(input_csv)

In [3]:
# Display the freshly loaded frame as a quick visual sanity check.
df

Unnamed: 0,id_extraccion,timestamp_extraccion,marca,modelo,precio_contado,precio_financiado,año,mes,combustible,kilometraje,ubicacion,latitud,longitud,garantia,transmision,potencia_cv,puertas,asientos,tipo_carroceria
0,https://www.autocasion.com/coches-segunda-mano...,2025-06-20 21:10:20.086439,ABARTH,500 C 695C 1.4 16v T-Jet 132kW (180 CV),22990.0,20900.0,2022,12.0,Gasolina,23213.0,Madrid,40.4168,-3.7038,12,Manual,179.0,2.0,,Descapotable
1,https://www.autocasion.com/coches-segunda-mano...,2025-06-20 21:10:20.086694,ABARTH,124 SPIDER Spider Turbo Multiair 125kW Auto,23990.0,20990.0,2018,12.0,Gasolina,97814.0,Barcelona,41.3851,2.1734,12,Automático,170.0,2.0,2.0,Descapotable
2,https://www.autocasion.com/coches-segunda-mano...,2025-06-20 21:10:20.086996,ABARTH,595 1.4T JET 121KW,22500.0,20500.0,2022,3.0,Gasolina,33602.0,Malaga,36.7213,-4.4214,12,Manual,165.0,3.0,4.0,Berlina
3,https://www.autocasion.com/coches-segunda-mano...,2025-06-20 21:10:20.087274,ABARTH,695 Pequeño Manual de 3 Puertas,22950.0,,2018,2.0,Gasolina,48500.0,Islas Baleares,39.5696,2.6502,12,Manual,180.0,3.0,4.0,Pequeño
4,https://www.autocasion.com/coches-segunda-mano...,2025-06-20 21:10:20.087595,ABARTH,500 1.4 Turbo 595,16999.0,15713.0,2020,7.0,Gasolina,32922.0,Barcelona,41.3851,2.1734,12,Manual,145.0,3.0,,Berlina
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88419,https://www.autocasion.com/coches-km0/km-0/yud...,2025-06-21 00:33:12.156646,YOOUDOOO,Y2 1.5L Comfort,17590.0,15590.0,2025,2.0,Gasolina,2.0,La Rioja,42.4627,-2.4450,36,Manual,109.0,5.0,5.0,Todo Terreno
88420,https://www.autocasion.com/coches-km0/km-0/yud...,2025-06-21 00:33:12.156777,YOOUDOOO,K3 YUDO Pequeño Automático de 5 Puertas,26838.0,25838.0,2024,8.0,Eléctrico,10.0,Barcelona,41.3851,2.1734,12,Automático,95.0,5.0,5.0,Pequeño
88421,https://www.autocasion.com/coches-km0/km-0/yud...,2025-06-21 00:33:12.156928,YOOUDOOO,K3 70kW,21900.0,21900.0,2024,7.0,Eléctrico,3000.0,Sevilla,37.3891,-5.9845,96,Automático,95.0,5.0,5.0,Todo Terreno
88422,https://www.autocasion.com/coches-km0/km-0/yud...,2025-06-21 00:33:12.157050,YOOUDOOO,K3 70kW,22990.0,19990.0,2025,1.0,Eléctrico,4585.0,La Rioja,42.4627,-2.4450,84,Automático,95.0,5.0,5.0,Todo Terreno


In [4]:
def binary_encoding_nan(X):
    """Flag which rows have a value in the first column of ``X``.

    Parameters
    ----------
    X : 2-D numeric array
        Only column 0 is inspected.

    Returns
    -------
    numpy.ndarray of shape (n_rows, 1)
        Integer flags: 1 where column 0 is not NaN, 0 where it is NaN.
    """
    first_column = X[:, 0]
    has_value = np.logical_not(np.isnan(first_column))
    # Reshape to a single column so the result can be used as a 2-D feature.
    return has_value.astype(int).reshape(-1, 1)

# Wrap the helper in a transformer so it can be persisted with the other encoders.
binary_nan_encoder = FunctionTransformer(binary_encoding_nan)

# Build the "has financing" flag from the financed price: 1 when a value is
# present, 0 when it is missing. The raw column is discarded afterwards.
X_precio_financiado = df[['precio_financiado']].to_numpy()
binary_encoded = binary_nan_encoder.fit_transform(X_precio_financiado)
df['tiene_financiacion'] = binary_encoded
df.drop(columns=['precio_financiado'], inplace=True)

# Every fitted artefact of this notebook is stored under ./encoders.
os.makedirs('encoders', exist_ok=True)

# NOTE(review): pickle stores plain functions by reference, so loading this file
# presumably requires `binary_encoding_nan` to be defined in the loading
# process as well - confirm in the consumer code.
with open('encoders/binary_nan_encoder.pkl', 'wb') as f:
    pickle.dump(binary_nan_encoder, f)


In [5]:
# Categorical features whose gaps are filled with the most frequent value.
categorical_cols = ['combustible', 'transmision', 'tipo_carroceria']

# Fitted imputers, keyed by column name, kept in memory next to the pickled copies.
imputers = {}

for col in categorical_cols:
    column_frame = df[[col]]

    # One independent imputer per column so each can be reloaded on its own.
    imputer = SimpleImputer(strategy='most_frequent').fit(column_frame)
    df[[col]] = imputer.transform(column_frame)
    imputers[col] = imputer

    with open(f'encoders/imputer_{col}.pkl', 'wb') as f:
        pickle.dump(imputer, f)


In [6]:
# Numeric features with missing values; they are imputed jointly so each
# column's gaps are filled from rows that are close on the other columns.
num_cols_to_impute = ['mes', 'potencia_cv', 'puertas', 'asientos']

knn_imputer = KNNImputer(n_neighbors=3)
imputed_values = knn_imputer.fit_transform(df[num_cols_to_impute])
df[num_cols_to_impute] = imputed_values

# Persist the fitted imputer alongside the other encoders.
with open('encoders/knn_imputer_num_cols.pkl', 'wb') as f:
    pickle.dump(knn_imputer, f)


In [7]:
# The KNN-imputed columns can hold fractional values; these features are
# whole numbers, so round them and store them with pandas' nullable integer dtype.
cols_int = ['mes', 'potencia_cv', 'puertas', 'asientos']

for col in cols_int:
    rounded = df[col].round()
    df[col] = rounded.astype('Int64')


In [8]:
def target_encode(train_df, col, target):
    """Mean-target-encode a categorical column and persist the mapping.

    Each category of ``col`` is replaced by the mean of ``target`` over the rows
    of ``train_df`` that share that category. The result is stored in a new
    column named ``col + '_te'``; the original column is kept.

    The means are computed from every row of ``train_df``, so pass only training
    rows if the encoding must not see held-out targets.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Frame containing both ``col`` and ``target``. It is not modified.
    col : str
        Name of the categorical column to encode.
    target : str
        Name of the numeric column whose per-category mean is used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``train_df`` with the extra ``col + '_te'`` column.

    Side effects
    ------------
    Writes the category -> mean mapping (a pandas Series) to
    ``encoders/target_encoding_{col}.pkl``, creating ``encoders`` if needed.
    """
    encoding_map = train_df.groupby(col)[target].mean()

    # Work on a copy so the caller's frame is not altered as a hidden side effect.
    encoded_df = train_df.copy()
    encoded_df[col + '_te'] = encoded_df[col].map(encoding_map)

    # Do not rely on another cell having created the output directory.
    os.makedirs('encoders', exist_ok=True)
    with open(f'encoders/target_encoding_{col}.pkl', 'wb') as f:
        pickle.dump(encoding_map, f)

    return encoded_df

# Replace the brand and model columns with the mean price of their category.
# NOTE(review): the means are computed over every row of `df`, before the
# train/test split performed later in the notebook, so test-row prices
# contribute to the encoded features - consider fitting on training rows only.
for encoded_col in ('marca', 'modelo'):
    df = target_encode(df, encoded_col, 'precio_contado')

# The raw text columns are no longer needed once their encoded versions exist.
df = df.drop(columns=['marca', 'modelo'])


In [9]:
# Binary encoding of the gearbox type.
transmision_map = {'Manual': 0, 'Automático': 1}

# Persist the mapping so the same encoding can be applied outside this notebook.
with open('encoders/bin_encoder_transmision.pkl', 'wb') as f:
    pickle.dump(transmision_map, f)

# `pop` removes the text column and returns it, so the encoded column replaces
# it in one step. Values absent from the mapping become NaN.
df['transmision_bin'] = df.pop('transmision').map(transmision_map)


In [10]:

# One-hot encode the fuel type. Categories unseen at fit time are encoded as
# all zeros on later transforms instead of raising (handle_unknown='ignore').
ohe_combustible = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
combustible_encoded = ohe_combustible.fit_transform(df[['combustible']])

# One indicator column per fitted category, named after the category.
fuel_categories = ohe_combustible.categories_[0]
combustible_cols = [f"combustible_{cat}" for cat in fuel_categories]
df_combustible = pd.DataFrame(
    combustible_encoded,
    index=df.index,
    columns=combustible_cols,
)

# Swap the text column for its indicator columns, appended at the end.
df = pd.concat([df.drop(columns=['combustible']), df_combustible], axis=1)

with open('encoders/ohe_combustible.pkl', 'wb') as f:
    pickle.dump(ohe_combustible, f)


In [11]:
# One-hot encode the body type. Categories unseen at fit time are encoded as
# all zeros on later transforms instead of raising (handle_unknown='ignore').
ohe_carroceria = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
carroceria_encoded = ohe_carroceria.fit_transform(df[['tipo_carroceria']])

# One indicator column per fitted category, named after the category.
body_categories = ohe_carroceria.categories_[0]
carroceria_cols = [f"tipo_carroceria_{cat}" for cat in body_categories]
df_carroceria = pd.DataFrame(
    carroceria_encoded,
    index=df.index,
    columns=carroceria_cols,
)

# Swap the text column for its indicator columns, appended at the end.
df = pd.concat([df.drop(columns=['tipo_carroceria']), df_carroceria], axis=1)

with open('encoders/ohe_tipo_carroceria.pkl', 'wb') as f:
    pickle.dump(ohe_carroceria, f)

In [12]:
# Columns that are not used as model features: the listing URL, the scrape
# timestamp and the location name.
unused_cols = ['id_extraccion', 'timestamp_extraccion', 'ubicacion']
df.drop(columns=unused_cols, inplace=True)


In [13]:
# Separate the features from the regression target.
X = df.drop(columns=['precio_contado'])
y = df['precio_contado']

# Fit the scaler on the training rows only, so the mean/variance it learns
# (and that get pickled for inference) carry no information from the test set.
# The split parameters mirror the train_test_split call later in the notebook:
# the shuffled indices depend only on the number of rows and the random_state,
# so the rows selected here are the ones that end up in X_train.
train_positions, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)

scaler = StandardScaler()
scaler.fit(X.iloc[train_positions])

# Transform every row with the train-fitted statistics; the later split then
# simply partitions this frame.
X_scaled = scaler.transform(X)

with open("encoders/standard_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=df.index)


In [14]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the partition
# reproducible across runs.
split_parts = train_test_split(X_scaled_df, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the resulting shapes as a quick sanity check.
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}, y_test: {y_test.shape}")


X_train: (70739, 28), X_test: (17685, 28)
y_train: (70739,), y_test: (17685,)


In [15]:
# Save the encoded (unscaled) frame, target column included, without the index.
output_csv = 'datos_limpios_modelo.csv'
df.to_csv(output_csv, index=False)
