In [229]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

from feature_engine.imputation import AddMissingIndicator, MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import RareLabelEncoder, OrdinalEncoder
from feature_engine.transformation import LogTransformer
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

import myPreprocessors as mypp #nuestra librerías de transformaciones.

import joblib

In [230]:
# Load the raw training data from a CSV file in the working directory.
dataTrain = pd.read_csv('train.csv')
# Display the loaded frame (last expression of the cell).
dataTrain

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,9744,-121.65,36.77,15,2191,358.0,1150,330,4.7969,227500.0,<1H OCEAN
1,13893,-116.27,34.13,37,452,109.0,184,59,3.7292,65800.0,INLAND
2,18277,-122.07,37.34,35,1172,184.0,512,175,7.3561,500001.0,<1H OCEAN
3,16176,-122.39,37.74,52,126,24.0,37,27,10.2264,225000.0,NEAR BAY
4,8843,-118.38,34.09,28,4001,1352.0,1799,1220,2.5784,272900.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...,...
14442,557,-122.25,37.77,43,4329,1110.0,2086,1053,2.9750,243400.0,NEAR BAY
14443,16090,-122.48,37.71,43,3850,1018.0,1497,829,3.5296,400000.0,NEAR OCEAN
14444,9083,-118.17,34.69,12,4881,803.0,2188,724,4.1667,171900.0,INLAND
14445,13138,-121.44,38.47,5,5666,1178.0,3139,1131,3.3608,108900.0,INLAND


#### Convertimos ocean_proximity a variable categórica, ya que más adelante es un requisito para que `.fit` funcione en nuestro pipeline

In [231]:
# Store ocean_proximity with the pandas 'category' dtype; all other columns keep their dtypes.
dataTrain = dataTrain.astype({'ocean_proximity': 'category'})

In [232]:
# Check the column dtypes after the cast of ocean_proximity to 'category'.
dataTrain.dtypes

id                       int64
longitude              float64
latitude               float64
housing_median_age       int64
total_rooms              int64
total_bedrooms         float64
population               int64
households               int64
median_income          float64
median_house_value     float64
ocean_proximity       category
dtype: object

### Hacemos la división en Train y Test, excluyendo 'id' y 'median_house_value' de las variables predictoras ('median_house_value' es la variable objetivo)

In [233]:
# Hold out 30% of the rows for testing with a fixed seed.
# 'id' and the target column are excluded from the features;
# 'median_house_value' is the target.
X_train, X_test, y_train, y_test = train_test_split(
    dataTrain.drop(columns=['id', 'median_house_value']),
    dataTrain['median_house_value'],
    test_size=0.3,
    random_state=2022,
)

#### Configuración del Pipeline

In [234]:
# Numeric columns with missing values that get imputed
NUMERICAL_VARS_WITH_NA = ['total_bedrooms']

# Numeric columns that receive the logarithmic transformation
NUMERICAL_LOG_VARS = [
    'total_bedrooms',
    'total_rooms',
    'population',
    'households',
    'housing_median_age',
    'median_income',
]

# Columns passed to the custom value mapper
QUAL_VARS = ['ocean_proximity']

# Columns passed to the categorical encoders
CATEGORICAL_VARS = ['ocean_proximity']

# Integer code assigned to each ocean_proximity label by the mapper
QUAL_MAPPINGS = dict(zip(
    ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND'],
    range(1, 6),
))

# Columns used for training: the log-transformed numeric columns followed by
# the categorical column (a new list, so the source lists are not aliased)
FEATURES = [*NUMERICAL_LOG_VARS, *CATEGORICAL_VARS]

In [235]:
# Keep only the columns listed in FEATURES (re-running this cell is harmless).
X_train = X_train[FEATURES]

In [236]:
# Model the target on the log scale.
# The values are read from the untouched `median_house_value` column of
# dataTrain, aligned through the row labels kept by the train/test split,
# rather than from y_train / y_test themselves. This keeps the cell
# idempotent: re-running it cannot apply the logarithm a second time.
y_train = np.log(dataTrain.loc[y_train.index, 'median_house_value'])
y_test = np.log(dataTrain.loc[y_test.index, 'median_house_value'])

#### Construcción del Pipeline

In [237]:
# Preprocessing + Lasso regression pipeline; the steps run in the order listed.
tarea1_pipeline_v1 = Pipeline(steps=[

    # ---------- Imputation ----------
    # 1. Missing-value indicator for the numeric columns that will be imputed
    ('missing_indicator_numeric', AddMissingIndicator(variables=NUMERICAL_VARS_WITH_NA)),

    # 2. Mean imputation of those numeric columns
    ('mean_imputation', MeanMedianImputer(imputation_method='mean', variables=NUMERICAL_VARS_WITH_NA)),

    # ---------- Ordinal categorical encoding ----------
    # Custom mapper from the project-local myPreprocessors module.
    # NOTE(review): QUAL_VARS and CATEGORICAL_VARS both name 'ocean_proximity',
    # so this column is also re-encoded by the rare-label and ordinal encoders
    # below; confirm that this mapping step is still needed.
    ('quality_mapper', mypp.Mapper(variables=QUAL_VARS, mappings=QUAL_MAPPINGS)),

    # ---------- Continuous variable transformation ----------
    ('log_transformer', LogTransformer(variables=NUMERICAL_LOG_VARS)),

    # ---------- Nominal categorical encoding ----------
    ('rare_label_encoder', RareLabelEncoder(n_categories=1, tol=0.01, variables=CATEGORICAL_VARS)),
    ('categorical_encoder', OrdinalEncoder(encoding_method='ordered', variables=CATEGORICAL_VARS)),

    # ---------- Scaling ----------
    ('scaler', MinMaxScaler()),

    # ---------- Model ----------
    ('modelo_lasso', Lasso(alpha=0.01, random_state=2022)),
])

In [238]:
# Fit all pipeline steps on the training features; y_train is on the log scale at this point.
tarea1_pipeline_v1.fit(X_train, y_train)

Pipeline(steps=[('missing_indicator_numeric',
                 AddMissingIndicator(variables=['total_bedrooms'])),
                ('mean_imputation',
                 MeanMedianImputer(imputation_method='mean',
                                   variables=['total_bedrooms'])),
                ('quality_mapper',
                 Mapper(mappings={'<1H OCEAN': 1, 'INLAND': 2, 'ISLAND': 5,
                                  'NEAR BAY': 4, 'NEAR OCEAN': 3},
                        variables=['ocean_proximity'])),
                ('log_transformer',
                 LogTransforme...iables=['total_bedrooms', 'total_rooms',
                                           'population', 'households',
                                           'housing_median_age',
                                           'median_income'])),
                ('rare_label_encoder',
                 RareLabelEncoder(n_categories=1, tol=0.01,
                                  variables=['ocean_proximity'])),
           

In [239]:
# Keep the same FEATURES columns for the test set as were used for training.
X_test = X_test[FEATURES]

In [240]:
# Predictions for the test set; the pipeline was fitted on log(y), so these are on the log scale.
preds = tarea1_pipeline_v1.predict(X_test)

### RMSE obtenido en el conjunto de test

In [241]:
# Test RMSE on the original scale: both the log-scale target and the
# log-scale predictions are exponentiated before computing the error.
rmseTest = np.sqrt(
    mean_squared_error(y_true=np.exp(y_test), y_pred=np.exp(preds))
)
rmseTest

82821.45318033535

### Mejorar el RMSE: RMSE relativo al rango de la variable objetivo

In [242]:
# Smallest and largest training target after exponentiating the log-scale values.
np.exp(y_train).min(), np.exp(y_train).max()

(14999.00000000001, 500000.99999999977)

In [243]:
# Test RMSE expressed as a fraction of the training target's range (max - min).
rmseTest / (np.exp(y_train).max() - np.exp(y_train).min())

0.17076517865974858

### Comparación final: dataset original

In [248]:
# Display the full loaded dataset (all columns, including 'id' and the target) for comparison.
dataTrain

Unnamed: 0,id,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,9744,-121.65,36.77,15,2191,358.0,1150,330,4.7969,227500.0,<1H OCEAN
1,13893,-116.27,34.13,37,452,109.0,184,59,3.7292,65800.0,INLAND
2,18277,-122.07,37.34,35,1172,184.0,512,175,7.3561,500001.0,<1H OCEAN
3,16176,-122.39,37.74,52,126,24.0,37,27,10.2264,225000.0,NEAR BAY
4,8843,-118.38,34.09,28,4001,1352.0,1799,1220,2.5784,272900.0,<1H OCEAN
...,...,...,...,...,...,...,...,...,...,...,...
14442,557,-122.25,37.77,43,4329,1110.0,2086,1053,2.9750,243400.0,NEAR BAY
14443,16090,-122.48,37.71,43,3850,1018.0,1497,829,3.5296,400000.0,NEAR OCEAN
14444,9083,-118.17,34.69,12,4881,803.0,2188,724,4.1667,171900.0,INLAND
14445,13138,-121.44,38.47,5,5666,1178.0,3139,1131,3.3608,108900.0,INLAND


### Dataset Nuevo

In [251]:
X_train  # TRAIN: training features restricted to the FEATURES columns

Unnamed: 0,total_bedrooms,total_rooms,population,households,housing_median_age,median_income,ocean_proximity
6344,312.0,2349,809,282,7,5.5520,NEAR OCEAN
11737,125.0,797,385,133,33,6.7974,NEAR OCEAN
8476,686.0,3418,970,453,18,3.7738,NEAR OCEAN
8932,598.0,1632,3356,659,13,1.5054,<1H OCEAN
4507,411.0,2010,1501,422,34,2.0417,INLAND
...,...,...,...,...,...,...,...
14001,491.0,2695,1059,451,31,4.7841,NEAR OCEAN
4720,652.0,2338,3289,631,31,2.6734,<1H OCEAN
173,376.0,1640,939,340,29,2.8321,NEAR BAY
1244,213.0,1279,444,204,52,5.2269,INLAND


In [249]:
X_test  # TEST: held-out features restricted to the FEATURES columns

Unnamed: 0,total_bedrooms,total_rooms,population,households,housing_median_age,median_income,ocean_proximity
6157,245.0,1323,705,261,39,3.1968,NEAR BAY
3405,283.0,988,475,242,52,1.3684,<1H OCEAN
4093,682.0,4289,1981,705,4,5.3366,INLAND
3001,895.0,5406,2337,882,17,6.0137,<1H OCEAN
13021,197.0,1185,588,196,29,5.0832,<1H OCEAN
...,...,...,...,...,...,...,...
2595,429.0,3182,1663,428,16,7.0592,<1H OCEAN
3375,298.0,1496,778,284,16,3.8589,NEAR OCEAN
1877,524.0,3007,1152,486,47,4.0000,NEAR BAY
10644,1233.0,10018,4253,1120,5,8.9063,<1H OCEAN
