In [22]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import random
import statsmodels.api as sm
import statsmodels.stats.diagnostic as smd
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as stats
import statsmodels.stats.diagnostic as diag
from sklearn.naive_bayes import GaussianNB
#Metrics
from sklearn.metrics import make_scorer, accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import root_mean_squared_error
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Global plotting defaults: large figures rendered with the ggplot theme.
plt.rc('figure', figsize=(15, 9))
plt.style.use('ggplot')

# Seed Python's built-in RNG. This does not seed NumPy; the scikit-learn
# calls in this notebook pass their own `random_state` where needed.
random.seed(123)

In [2]:
# Load the cars dataset; the path is relative to the notebook's working directory.
carros = pd.read_csv("Datos/cars.csv")

***Número de filas:*** 205  
***Número de atributos o columnas:*** 26 

### Información de los atributos:  

|Atributo:|Rango del atributo:|
|--------:|:-------------------:|
|1. symboling:|-3, -2, -1, 0, 1, 2, 3.|
|2. normalized-losses:|numérico desde 65 hasta 256.|
|3. make:|alfa-romero, audi, bmw, chevrolet, dodge, honda, isuzu, jaguar, mazda, mercedes-benz, mercury, mitsubishi, nissan, peugot, plymouth, porsche, renault, saab, subaru, toyota, volkswagen, volvo|
|4. fuel-type:|diesel, gas.|
|5. aspiration:|std, turbo.|
|6. num-of-doors:|four, two.|
|7. body-style:|hardtop, wagon, sedan, hatchback, convertible.|
|8. drive-wheels:|4wd, fwd, rwd.|
|9. engine-location:|front, rear.|
|10. wheel-base:|numérico desde 86.6 hasta 120.9.|
|11. length:|numérico desde 141.1 hasta 208.1.|
|12. width:|numérico desde 60.3 hasta 72.3.|
|13. height:|numérico desde 47.8 hasta 59.8.|
|14. curb-weight:|numérico desde 1488 hasta 4066.|
|15. engine-type:|dohc, dohcv, l, ohc, ohcf, ohcv, rotor.|
|16. num-of-cylinders:|eight, five, four, six, three, twelve, two.|
|17. engine-size:|numérico desde 61 hasta 326.|
|18. fuel-system:|1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi.|
|19. bore:|numérico desde 2.54 hasta 3.94.|
|20. stroke:|numérico desde 2.07 hasta 4.17.|
|21. compression-ratio:|numérico desde 7 hasta 23.|
|22. horsepower:|numérico desde 48 hasta 288.|
|23. peak-rpm:|numérico desde 4150 hasta 6600.|
|24. city-mpg:|numérico desde 13 hasta 49.|
|25. highway-mpg:|numérico desde 16 hasta 54.|
|26. price:|numérico desde 5118 hasta 45400.|



## Variable Respuesta:

La variable que se pretende predecir es el rendimiento de combustible en ciudad, medido en millas por galón (city_mpg).
Separemos en conjuntos de entrenamiento y prueba.

## Conjuntos de entrenamiento y prueba.   

Se separarán en 70% de los datos en el conjunto de entrenamiento y 30% en el conjunto de prueba, usando un muestreo aleatorio simple.   

In [3]:

# Impute missing values in these numeric columns with each column's median.
# The filled column is assigned back to the DataFrame instead of calling
# `fillna(inplace=True)` on a column selection: that form is chained
# assignment, which pandas deprecates and which leaves the DataFrame
# unchanged when Copy-on-Write is enabled.
# NOTE(review): the medians are computed on the full dataset, before the
# train/test split further down, so test rows influence the imputed values.
for columna in ["bore", "stroke", "horsepower", "peak_rpm", "price"]:
    carros[columna] = carros[columna].fillna(carros[columna].median())

# Remove the column from `carros` in place; pop() returns the removed column,
# which the notebook displays as the cell output. Re-running this cell without
# reloading the data raises KeyError because the column is already gone.
carros.pop("normalized_losses")

0        NaN
1        NaN
2        NaN
3      164.0
4      164.0
       ...  
200     95.0
201     95.0
202     95.0
203     95.0
204     95.0
Name: normalized_losses, Length: 205, dtype: float64

In [4]:
# X is the same DataFrame object as `carros`; popping the target removes it
# from that object in place, leaving X with the predictor columns only.
X = carros
y = X.pop("city_mpg")

In [10]:
# 70/30 simple random split; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    test_size=0.3,
    random_state=0,
)

# Report the shape of every partition, one per line.
for particion in (X_train, X_test, y_train, y_test):
    print(particion.shape)

(143, 24)
(62, 24)
(143,)
(62,)


## Modelos

### Naive Bayes


Separando variables en numéricas vs categóricas

In [6]:
# Columns with object dtype are treated as categorical; every other dtype is
# treated as numerical.
categorical_columns_selector = selector(dtype_include=object)
numerical_columns_selector = selector(dtype_exclude=object)

# Resolve the selectors against the feature frame to get concrete column names.
categorical_columns = categorical_columns_selector(carros)
numerical_columns = numerical_columns_selector(carros)

print(categorical_columns, numerical_columns, sep="\n")

['make', 'fuel_type', 'aspiration', 'num_of_doors', 'body_style', 'drive_wheels', 'engine_location', 'engine_type', 'num_of_cylinders', 'fuel_system']
['symboling', 'wheel_base', 'length', 'width', 'height', 'curb_weight', 'engine_size', 'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm', 'highway_mpg', 'price']


Vamos a codificar las variables cualitativas con un onehot encoder y vamos a normalizar las variables cuantitativas

In [7]:


# One-hot encode the categorical columns (categories unseen during fit are
# encoded as all zeros) and standardise the numerical columns.
categorical_preprocessor = OneHotEncoder(handle_unknown="ignore")
numerical_preprocessor = StandardScaler()
preprocessor = ColumnTransformer(
    [
        ('one-hot-encoder', categorical_preprocessor, categorical_columns),
        ('standar-scaler', numerical_preprocessor, numerical_columns),
    ],
    # Always emit a dense array. With the default threshold the stacked output
    # switches between sparse and dense depending on its overall density, and
    # the Gaussian Naive Bayes estimator fitted later requires dense input.
    sparse_threshold=0,
)
    

In [23]:
# Chain the preprocessing with a Gaussian Naive Bayes estimator.
# NOTE(review): GaussianNB is a classifier, so every distinct city_mpg value
# is treated as a class label even though the step is named 'regressor'.
pasos = [
    ('preprocessor', preprocessor),
    ('regressor', GaussianNB()),
]
pipeline = Pipeline(steps=pasos)

# Show every tunable parameter exposed by the pipeline.
pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('one-hot-encoder',
                                    OneHotEncoder(handle_unknown='ignore'),
                                    ['make', 'fuel_type', 'aspiration',
                                     'num_of_doors', 'body_style', 'drive_wheels',
                                     'engine_location', 'engine_type',
                                     'num_of_cylinders', 'fuel_system']),
                                   ('standar-scaler', StandardScaler(),
                                    ['symboling', 'wheel_base', 'length', 'width',
                                     'height', 'curb_weight', 'engine_size',
                                     'bore', 'stroke', 'compression_ratio',
                                     'horsepower', 'peak_rpm', 'highway_mpg',
                                     'price'])])),
  ('regressor', GaussianNB())],
 'verbose': False,
 'preprocessor': ColumnTransformer(t

El RMSE del modelo base, calculado en la siguiente celda, es de aproximadamente 5.5 millas por galón.

In [21]:
# Baseline: fit the untuned pipeline on the training partition.
pipeline.fit(X_train, y_train)
modelo = pipeline  # Pipeline.fit returns the pipeline itself

# Root mean squared error of the baseline on the held-out partition.
y_pred = modelo.predict(X_test)
rmse = root_mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse

5.497800146568889

Vamos a ajustar los hiperparámetros del modelo (tuning).

In [None]:
# Search 100 log-spaced values of GaussianNB's variance smoothing, from 1 down
# to 1e-9. The 'regressor__' prefix targets the pipeline step of that name.
params = [{
    "regressor__var_smoothing": np.logspace(0, -9, num=100)
}]

# 5-fold cross-validated grid search scored by negative RMSE (higher is
# better), run on two worker processes. The original call was garbled into
# `param_grid=parag_jobs=2`, which is a syntax error.
modelo = GridSearchCV(
    pipeline,
    param_grid=params,
    n_jobs=2,
    cv=5,
    scoring="neg_root_mean_squared_error",
)
modelo.fit(X_train, y_train)
modelo.best_params_



{'regressor__var_smoothing': 0.04328761281083057}

El mejor valor es 0.043. Vamos a predecir con el mejor modelo.

In [29]:
# Best pipeline found by the grid search (refit on the whole training set).
mejor_modelo = modelo.best_estimator_
print(mejor_modelo)

# Mean cross-validated score of the best candidate (negative RMSE).
print('best score:', modelo.best_score_, sep='\n')

# Predict the held-out partition with the tuned pipeline.
y_mejor_pred = mejor_modelo.predict(X_test)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('one-hot-encoder',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['make', 'fuel_type',
                                                   'aspiration', 'num_of_doors',
                                                   'body_style', 'drive_wheels',
                                                   'engine_location',
                                                   'engine_type',
                                                   'num_of_cylinders',
                                                   'fuel_system']),
                                                 ('standar-scaler',
                                                  StandardScaler(),
                                                  ['symboling', 'wheel_base',
                                                   'length', 'width', 'height',
     

In [31]:
# Root mean squared error of the tuned pipeline on the held-out partition.
rmse_tunned = root_mean_squared_error(y_true=y_test, y_pred=y_mejor_pred)
rmse_tunned

2.884888324556306

Como se puede observar, el modelo ajustado tiene una raíz del error cuadrático medio (RMSE) menor que la del primer modelo (2.88 frente a 5.50 millas por galón), por lo que sí mejoró.