# Import libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn import model_selection # train_test_split
from sklearn import pipeline
from sklearn import ensemble   # RandomForestRegressor
from sklearn import impute
from sklearn import compose
from sklearn import metrics    # mean_squared_log_error
from sklearn import inspection # permutation_importance, plot_partial_dependence

# Get data

In [10]:
# Folder containing train.csv and test.csv, relative to this notebook.
# Keep the trailing slash: file names are appended by plain string concatenation.
path = "../../Datasets/Tabular/car-challenge/"

In [11]:
# Download the dataset (only needed on Colab): uncomment the two lines below.
#!wget -P $path https://raw.githubusercontent.com/CenticMurcia/curso-ciencia-datos/master/Datasets/Tabular/car-challenge/train.csv
#!wget -P $path https://raw.githubusercontent.com/CenticMurcia/curso-ciencia-datos/master/Datasets/Tabular/car-challenge/test.csv

In [12]:
# Read both CSV files from `path`, using the "Id" column as the row index.
train, test = (
    pd.read_csv(f"{path}{csv_name}", index_col="Id")
    for csv_name in ("train.csv", "test.csv")
)

In [14]:
# Peek at the first row to see which columns are available.
train.head(1)

Unnamed: 0_level_0,Marca,Modelo,Tiempo,Provincia,Localidad,Año,Kms,Cambio,Cv,Combust,Puertas,Vendedor,Precio
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,OPEL,VECTRA,33 días,la_rioja,Arnedo,2005.0,200000.0,manual,120.0,diesel,5.0,Profesional,2200


# Preprocessing

In [15]:
# Feature groups used by the model.
# Not used as features for now: 'Modelo', 'Localidad', 'Puertas'.
cat_vars = ['Marca', 'Provincia', 'Cambio', 'Combust', 'Vendedor']
num_vars = ['Año', 'Kms', 'Cv']
target_var = 'Precio'

# Feature matrix (categorical columns first, then numeric ones) and target vector.
x = train[[*cat_vars, *num_vars]]
y = train[target_var]

In [16]:
# Inspect dtypes and non-null counts to see which columns need imputation.
x.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 160435 entries, 0 to 160434
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   Marca      160435 non-null  object 
 1   Provincia  160435 non-null  object 
 2   Cambio     160375 non-null  object 
 3   Combust    160413 non-null  object 
 4   Vendedor   160409 non-null  object 
 5   Año        157104 non-null  float64
 6   Kms        150278 non-null  float64
 7   Cv         145378 non-null  float64
dtypes: float64(3), object(5)
memory usage: 11.0+ MB


In [17]:
# Numeric columns: fill missing values with the column median, then standardise.
# Scaling is not needed by tree ensembles, but it is harmless and keeps the
# preprocessor reusable with other estimators.
num_preprocessing = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='median')),
    ('scaler', preprocessing.StandardScaler())
])

# Categorical columns: replace missing values with the literal 'missing' category,
# then map every category to an integer code. Categories that were not seen during
# fit (possible in the validation split or in test.csv) are encoded as -1 instead
# of raising an error at predict time.
cat_preprocessing = pipeline.Pipeline(steps=[
    ('imputer', impute.SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', preprocessing.OrdinalEncoder(handle_unknown='use_encoded_value',
                                             unknown_value=-1))
])

# Route each column group through its own sub-pipeline.
preprocessor = compose.ColumnTransformer(transformers=[
    ('num', num_preprocessing, num_vars),
    ('cat', cat_preprocessing, cat_vars)
])

# Split into train (80%) and validation (20%)

In [18]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Train some ML model

In [19]:
# Random forest regressor. The fixed seed makes the fitted trees (and therefore the
# validation score) reproducible across "Restart & Run All".
model = ensemble.RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    random_state=0,
)

# Full pipeline: preprocessing followed by the regressor defined above.
# Pipeline does not clone its steps, so fitting `prep_model` also fits `model`.
prep_model = pipeline.Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', model),
])

In [20]:
# Fit the preprocessing steps and the random forest on the training split.
# The trailing ';' suppresses the fitted-pipeline repr in the cell output.
prep_model.fit(x_train, y_train);

# Evaluate model

In [21]:
# Predict on the held-out validation rows; the bare `preds` displays the array.
preds = prep_model.predict(x_valid)
preds

array([ 2561.78, 27537.7 ,  2902.38, ..., 25961.  ,  9164.77,  5215.5 ])

In [22]:
# Mean squared logarithmic error of the validation predictions (lower is better).
metrics.mean_squared_log_error(y_true=y_valid, y_pred=preds)

0.201460949426968