In [None]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error

## Pipeline de la data de Autos

In [None]:
!wget -q http://archive.ics.uci.edu/static/public/10/automobile.zip
!unzip -q automobile.zip

In [None]:
# The raw file ships without a header row, so the column names are supplied
# by hand, in file order.
headers = [
    "symboling", "normalized_losses", "make",
    "fuel_type", "aspiration", "num_doors",
    "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width",
    "height", "curb_weight", "engine_type",
    "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio",
    "horsepower", "peak_rpm", "city_mpg",
    "highway_mpg", "price",
]

file_path = "imports-85.data"

# "?" entries are parsed as NaN so that missing values can be imputed later.
data_df = pd.read_csv(
    file_path,
    names=headers,
    header=None,
    na_values="?",
)
data_df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [None]:
# Feature matrix: every column except the target.
X = data_df.drop(columns='price')
# Target vector: the price column.
# NOTE(review): price appears to contain missing values (it is mean-filled
# before fitting further down) - confirm that imputing is preferred over
# dropping those rows.
y = data_df.loc[:, 'price']

In [None]:
SEED = 42
np.random.seed(SEED)

# Split straight from data_df rather than from X / y. This cell rebinds X and y
# to their training subsets, so feeding those names back into train_test_split
# would shrink the training data a little more on every re-run of the cell.
X, X_test, y, y_test = train_test_split(
    data_df.drop('price', axis=1),
    data_df['price'],
    train_size=0.8,
    random_state=SEED,
)

# From here on X / y hold only the training rows (80 %), X_test / y_test the
# held-out rows (20 %).
print('Tamaño de la data de entrenamiento: ', X.shape)
print('Tamaño de la data de validacion: ', X_test.shape)
print('Tamaño del target de validacion: ', y_test.shape)

Tamaño original del dataset:  (164, 25)
Tamaño de la data de validacion:  (41, 25)
Tamaño del target de validacion:  (41,)


## Pipeline

In [None]:
# Names of the int/float feature columns; these are routed through the
# numeric branch of the preprocessing.
variables_numericas = list(X.select_dtypes(include=['int', 'float']).columns)
print(variables_numericas)

['symboling', 'normalized_losses', 'wheel_base', 'length', 'width', 'height', 'curb_weight', 'engine_size', 'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm', 'city_mpg', 'highway_mpg']


In [None]:
# Names of the object-dtype (string) feature columns; these are routed through
# the categorical branch of the preprocessing.
variables_categoricas = list(X.select_dtypes(include=['object']).columns)
print(variables_categoricas)

['make', 'fuel_type', 'aspiration', 'num_doors', 'body_style', 'drive_wheels', 'engine_location', 'engine_type', 'num_cylinders', 'fuel_system']


In [None]:
# Numeric branch: replace missing values with the column mean, then
# standardize each column. Pipeline, SimpleImputer and StandardScaler all come
# from the notebook's import cell, so nothing is re-imported here.
numeric_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

In [None]:
# Numeric columns of the training features only.
X_numeric = X.loc[:, variables_numericas]

In [None]:
# Learn the imputation means and scaling statistics from the training columns,
# then apply them to those same columns.
numeric_preprocessor.fit(X_numeric)
X_numeric_transf = numeric_preprocessor.transform(X_numeric)
X_numeric_transf

array([[-0.72738032,  0.        ,  1.03708546, ..., -1.91903274,
         0.96431609,  1.3135007 ],
       [-0.72738032,  1.13091929,  1.54123134, ..., -0.29403017,
        -0.98812636, -1.02456857],
       [-0.72738032, -1.43287055, -0.50896191, ..., -0.70028081,
         0.96431609,  1.00175813],
       ...,
       [ 0.07863571, -0.05941171, -0.71062026, ...,  0.11222047,
         0.96431609,  1.00175813],
       [ 1.69066776,  2.22968637,  0.70098821, ...,  0.11222047,
        -0.98812636, -1.02456857],
       [-0.72738032, -0.48671001,  0.28086664, ...,  0.11222047,
        -1.31353343, -1.33631113]])

#### Ejemplo en data de prueba


In [None]:
# Held-out rows are only transformed: the means and scaling statistics are the
# ones learned from the training columns, never re-estimated on test data.
X_test_numeric = X_test.loc[:, variables_numericas]
X_test_numeric_transf = numeric_preprocessor.transform(X=X_test_numeric)

## Variables Categóricas

In [None]:
# Categorical branch: fill gaps with each column's most frequent value, then
# one-hot encode. handle_unknown='ignore' makes a category that was not seen
# during fit encode as all zeros instead of raising at transform time.
categorical_preprocessor = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

## Ejemplo de transformación

In [None]:
# Categorical columns of the training features only.
X_train_categ = X.loc[:, variables_categoricas]

# Learn the fill values and category vocabulary from the training columns, then
# encode them; .toarray() converts the encoder's sparse output to a dense array
# so it can be inspected.
categorical_preprocessor.fit(X_train_categ)
X_train_cat_tran = categorical_preprocessor.transform(X_train_categ).toarray()
X_train_cat_tran

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [None]:
# Encode the held-out categorical columns with the vocabulary learned from the
# training data (transform only, no re-fit), densified for display.
X_test_cat = X_test.loc[:, variables_categoricas]
X_test_cat_transf = categorical_preprocessor.transform(X=X_test_cat).toarray()
X_test_cat_transf

array([[0., 0., 1., ..., 1., 0., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## Column Transformer

In [None]:
from sklearn.compose import ColumnTransformer

In [None]:
# Route each group of columns through its own branch and concatenate the
# results. The numeric branch is the full impute-then-scale pipeline rather
# than a bare StandardScaler, so missing numeric values are filled first.
preprocessor = ColumnTransformer([
    ('numeric-transformer', numeric_preprocessor, variables_numericas),
    ('categ-transformer', categorical_preprocessor, variables_categoricas),
])

In [None]:
# Fit both preprocessing branches on the training features (X holds only the
# training rows after the split) and return the transformed training matrix.
X_train_transformed = preprocessor.fit_transform(X)

In [None]:
# Show the preprocessed training matrix: the numeric-branch columns come first,
# followed by the one-hot columns from the categorical branch.
X_train_transformed

array([[-0.72738032,  0.        ,  1.03708546, ...,  0.        ,
         0.        ,  0.        ],
       [-0.72738032,  1.13091929,  1.54123134, ...,  1.        ,
         0.        ,  0.        ],
       [-0.72738032, -1.43287055, -0.50896191, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.07863571, -0.05941171, -0.71062026, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.69066776,  2.22968637,  0.70098821, ...,  1.        ,
         0.        ,  0.        ],
       [-0.72738032, -0.48671001,  0.28086664, ...,  1.        ,
         0.        ,  0.        ]])

In [None]:
# Only transform the held-out rows. Calling fit_transform here would re-fit the
# imputers, the scaler and the one-hot encoder on the test split, which leaks
# test statistics into the preprocessing and can produce a different set of
# one-hot columns than the training matrix has. transform() reuses what was
# learned from the training data when the preprocessor was fitted.
X_test_transformed = preprocessor.transform(X_test)
X_test_transformed

array([[-0.4633482 ,  0.        ,  0.74388711, ...,  0.        ,
         1.        ,  0.        ],
       [-0.4633482 ,  0.        ,  0.10145696, ...,  0.        ,
         1.        ,  0.        ],
       [-0.4633482 , -0.32704225, -0.26794038, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.4633482 ,  0.        , -0.70158074, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.01455741,  0.        , -0.47673018, ...,  0.        ,
         0.        ,  1.        ],
       [-0.4633482 , -0.99909611, -0.30006189, ...,  0.        ,
         0.        ,  0.        ]])

## Pipeline de Regresión

In [None]:
# End-to-end model: preprocessing followed by an ordinary least-squares fit.
# The transformers in a Pipeline act on X only; y is passed through unchanged
# to the final estimator, so any cleaning of the target has to happen outside
# the pipeline.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [None]:
# Mean of the target, ignoring missing entries. y holds only the training rows
# at this point, so no held-out prices enter the statistic.
media = y.mean(skipna=True)

In [None]:
# Replace missing target values with the training mean so the regressor can be
# fitted on every training row.
# NOTE(review): mean-filling a missing target is a simplification; dropping the
# rows that have no price is the usual alternative - confirm intent.
y_reemplazado = y.fillna(value=media)



In [None]:
# Fit preprocessing and regression together on the raw training features and
# the mean-filled training target.
model_pipeline.fit(X=X, y=y_reemplazado)