# Preparação

In [1]:
from util.data import *
from util.model_func import *
import joblib

# Início de um aprendizado

In [2]:
print('Hello World!')

Hello World!


Deste trecho de código até o ".describe()" será o acesso e a demonstração da base de dados (dados que serão usados para 
prever o preço das casas de qualquer região baseado em outras características regionais e populacionais que a base
forneceu).

In [3]:
fetch_housing_data()

In [4]:
housing = load_housing_data()
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [5]:
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [6]:
housing["ocean_proximity"].value_counts()

ocean_proximity
<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: count, dtype: int64

In [7]:
housing.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,20640.0,20640.0,20640.0,20640.0,20433.0,20640.0,20640.0,20640.0,20640.0
mean,-119.569704,35.631861,28.639486,2635.763081,537.870553,1425.476744,499.53968,3.870671,206855.816909
std,2.003532,2.135952,12.585558,2181.615252,421.38507,1132.462122,382.329753,1.899822,115395.615874
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.8,33.93,18.0,1447.75,296.0,787.0,280.0,2.5634,119600.0
50%,-118.49,34.26,29.0,2127.0,435.0,1166.0,409.0,3.5348,179700.0
75%,-118.01,37.71,37.0,3148.0,647.0,1725.0,605.0,4.74325,264725.0
max,-114.31,41.95,52.0,39320.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [8]:
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), len(test_set))

16512 4128


In [9]:
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, "index")

Agora nós iremos preparar a renda média para uma amostragem estratificada: iremos separar em categorias de renda
para podermos formar nossos estratos e, posteriormente, fazer a divisão justa dos dados para os conjuntos.

In [10]:
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins = [0., 1.5, 3., 4.5, 6., np.inf],
                               labels = [1, 2, 3, 4, 5])

Aqui está nossa tabela nova com as categorias separadas:

In [11]:
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,income_cat
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,5
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,5
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,5
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,4
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,3


# Estratificação e plotagens de comparação

## Aqui nós temos novamente a separação do conjunto de testes e de treinamento. Mas dessa vez feito de forma estratificada com a função autonôma criada em "housing_plots.ipynb"

In [12]:
strat_train_set, strat_test_set = stratified_shuffle_split(housing, n_splits=1, test_size=0.2, random_state=42)

print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))

income_cat
3    0.350533
2    0.318798
4    0.176357
5    0.114341
1    0.039971
Name: count, dtype: float64


In [13]:
for set_ in (strat_train_set, strat_test_set):
    set_.drop("income_cat", axis=1, inplace=True)
    
housing = strat_train_set.copy()

## Detecção de correlações crescentes ou decrescentes em relação ao "median_house_value"

In [14]:
corr_matrix = housing.corr(numeric_only=True)

print(corr_matrix["median_house_value"].sort_values(ascending=False))

median_house_value    1.000000
median_income         0.687151
total_rooms           0.135140
housing_median_age    0.114146
households            0.064590
total_bedrooms        0.047781
population           -0.026882
longitude            -0.047466
latitude             -0.142673
Name: median_house_value, dtype: float64


# Extração de característica

## Abaixo está novas colunas criadas para enxergar relações mais específicas e assim melhorar a regressão linear

In [15]:
housing["rooms_per_household"] = housing["total_rooms"]/housing["households"]
housing["bedrooms_per_rooms"] = housing["total_bedrooms"]/housing["total_rooms"]
housing["population_per_household"] = housing["population"]/housing["households"]

corr_matrix = housing.corr(numeric_only=True)
corr_matrix["median_house_value"].sort_values(ascending=False)

median_house_value          1.000000
median_income               0.687151
rooms_per_household         0.146255
total_rooms                 0.135140
housing_median_age          0.114146
households                  0.064590
total_bedrooms              0.047781
population_per_household   -0.021991
population                 -0.026882
longitude                  -0.047466
latitude                   -0.142673
bedrooms_per_rooms         -0.259952
Name: median_house_value, dtype: float64

# Prepare os Dados Para o Aprendizado de Máquina

In [16]:
housing = strat_train_set.drop("median_house_value", axis=1) # drop labels for training set
housing_labels = strat_train_set["median_house_value"].copy()

Neste, e no próximo trecho, será usado da SimpleImputer para processar a mediana dos dados númericos, e assim preencher todos os valores nulos presentes  em cada coluna.

In [17]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

housing_num = housing.drop("ocean_proximity", axis=1)

imputer.fit(housing_num)

print(imputer.strategy)
print(imputer.statistics_)
print(housing_num.median().values)

median
[-118.51      34.26      29.      2119.       433.      1164.
  408.         3.54155]
[-118.51      34.26      29.      2119.       433.      1164.
  408.         3.54155]


In [18]:
X = imputer.transform(housing_num)

housing_tr = pd.DataFrame(X, columns=housing_num.columns,
                          index=housing_num.index)
housing_tr.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964


# Categorizando

## Agora nós iremos tratar os dados de texto antes desprezados como dados categóricos. Isso permitirá lidarmos com eles como componentes de predição completamente normais.

In [19]:
housing_cat = housing[["ocean_proximity"]]
housing_cat.head(10)

Unnamed: 0,ocean_proximity
12655,INLAND
15502,NEAR OCEAN
2908,INLAND
14053,NEAR OCEAN
20496,<1H OCEAN
1481,NEAR BAY
18125,<1H OCEAN
5830,<1H OCEAN
17989,<1H OCEAN
4861,<1H OCEAN


In [20]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
housing_cat_encoded[:10]

array([[1.],
       [4.],
       [1.],
       [4.],
       [0.],
       [3.],
       [0.],
       [0.],
       [0.],
       [0.]])

Já que os algorítmos de máquina podem pensar que esses números possuem alguma relação entre si. Nessa situação é melhor utilizar a OneHotEncoder, que trata de maneira indiferente cada categoria.

In [21]:
from sklearn.preprocessing import OneHotEncoder
hot_encoder = OneHotEncoder()
housing_cat_1hot = hot_encoder.fit_transform(housing_cat)
housing_cat_1hot

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 16512 stored elements and shape (16512, 5)>

In [22]:
housing_cat_1hot.toarray()

array([[0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       ...,
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [23]:
hot_encoder.categories_

[array(['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN'],
       dtype=object)]

## Um estimador customizado que gera novas colunas para o housing dataset

In [24]:
from util.custom_estimators import CombinedAtribbutesAdder

attr_adder = CombinedAtribbutesAdder(False)
housing_extra_attribs = attr_adder.transform(housing.values)
housing.values

array([[-121.46, 38.52, 29.0, ..., 706.0, 2.1736, 'INLAND'],
       [-117.23, 33.09, 7.0, ..., 768.0, 6.3373, 'NEAR OCEAN'],
       [-119.04, 35.37, 44.0, ..., 300.0, 2.875, 'INLAND'],
       ...,
       [-122.72, 38.44, 48.0, ..., 172.0, 3.1797, '<1H OCEAN'],
       [-122.7, 38.31, 14.0, ..., 501.0, 4.1964, '<1H OCEAN'],
       [-122.14, 39.97, 27.0, ..., 197.0, 3.1319, 'INLAND']], dtype=object)

In [25]:
housing_extra_attribs

array([[-121.46, 38.52, 29.0, ..., 'INLAND', 5.485835694050992,
        3.168555240793201],
       [-117.23, 33.09, 7.0, ..., 'NEAR OCEAN', 6.927083333333333,
        2.6236979166666665],
       [-119.04, 35.37, 44.0, ..., 'INLAND', 5.3933333333333335,
        2.223333333333333],
       ...,
       [-122.72, 38.44, 48.0, ..., '<1H OCEAN', 4.1104651162790695,
        2.6627906976744184],
       [-122.7, 38.31, 14.0, ..., '<1H OCEAN', 6.297405189620759,
        2.411177644710579],
       [-122.14, 39.97, 27.0, ..., 'INLAND', 5.477157360406092,
        3.1725888324873095]], dtype=object)

In [26]:
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns)+["rooms_per_household", "bedrooms_per_room"],
    index=housing.index
)

housing_extra_attribs.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,bedrooms_per_room
12655,-121.46,38.52,29.0,3873.0,797.0,2237.0,706.0,2.1736,INLAND,5.485836,3.168555
15502,-117.23,33.09,7.0,5320.0,855.0,2015.0,768.0,6.3373,NEAR OCEAN,6.927083,2.623698
2908,-119.04,35.37,44.0,1618.0,310.0,667.0,300.0,2.875,INLAND,5.393333,2.223333
14053,-117.13,32.75,24.0,1877.0,519.0,898.0,483.0,2.2264,NEAR OCEAN,3.886128,1.859213
20496,-118.7,34.28,27.0,3536.0,646.0,1837.0,580.0,4.4964,<1H OCEAN,6.096552,3.167241



# Transformações de Pipelines

In [27]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('attr_adder', CombinedAtribbutesAdder()),
    ('std_scaler', StandardScaler())
])

housing_num_tr = num_pipeline.fit_transform(housing_num)

In [28]:
housing_num_tr

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.01739526,
         0.00622264, -0.12112176],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.56925554,
        -0.04081077, -0.81086696],
       [ 0.26758118, -0.1259716 ,  1.22045984, ..., -0.01802432,
        -0.07537122, -0.33827252],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ..., -0.5092404 ,
        -0.03743619,  0.32286937],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.32814891,
        -0.05915604, -0.45702273],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.01407228,
         0.00657083, -0.12169672]])

In [29]:
from sklearn.compose import ColumnTransformer

num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

In [30]:
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs)
])

housing_prepared = full_pipeline.fit_transform(housing)

In [31]:
housing_prepared

array([[-0.94135046,  1.34743822,  0.02756357, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.17178212, -1.19243966, -1.72201763, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.26758118, -0.1259716 ,  1.22045984, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-1.5707942 ,  1.31001828,  1.53856552, ...,  0.        ,
         0.        ,  0.        ],
       [-1.56080303,  1.2492109 , -1.1653327 , ...,  0.        ,
         0.        ,  0.        ],
       [-1.28105026,  2.02567448, -0.13148926, ...,  0.        ,
         0.        ,  0.        ]])

## Treinando e avaliando o conjunto de treinamento

In [32]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [33]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

print("Predicted:", lin_reg.predict(some_data_prepared))

Predicted: [ 85657.90192014 305492.60737488 152056.46122456 186095.70946094
 244550.67966089]


In [34]:
print("labels:", list(some_labels))
print("The predict precision:\n", (1 - some_labels / lin_reg.predict(some_data_prepared)))

labels: [72100.0, 279600.0, 82700.0, 112500.0, 238300.0]
The predict precision:
 12655    0.158280
15502    0.084757
2908     0.456123
14053    0.395472
20496    0.025560
Name: median_house_value, dtype: float64


## A função "model_score_rmse" abrevia o cálculo do rmse, do treinamento e das predições do modelo

In [35]:
from sklearn.metrics import mean_squared_error

housing_predictions =lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

np.float64(68627.87390018745)

In [36]:
from sklearn.metrics import mean_absolute_error

mae = mean_absolute_error(housing_labels, housing_predictions)
mae

49438.66860915801

## "model_score_rmse": Uma função que realiza a validação cruzada k-fold e retorna seu RMSE

In [37]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
model_score_rmse(tree_reg, housing_prepared, housing_labels)

np.float64(0.0)

In [38]:
tree_rmse_scores = model_cross_rmse(tree_reg, housing_prepared, housing_labels)

## Função "display_score" para mostrar um resumo da avaliação

In [39]:
display_scores(tree_rmse_scores)

Scores: [72831.45749112 69973.18438322 69528.56551415 72517.78229792
 69145.50006909 79094.74123727 68960.045444   73344.50225684
 69826.02473916 71077.09753998]
Mean: 71629.89009727491
Standard deviation: 2914.035468468928


In [40]:
lin_rsme_scores = model_cross_rmse(lin_reg, housing_prepared, housing_labels)
display_scores(lin_rsme_scores)

Scores: [71762.76364394 64114.99166359 67771.17124356 68635.19072082
 66846.14089488 72528.03725385 73997.08050233 68802.33629334
 66443.28836884 70139.79923956]
Mean: 69104.07998247063
Standard deviation: 2880.3282098180653


## Todo uso da RandomForestRegressor será feito por seu modelo salvo, por questões de otimização

In [41]:
from sklearn.ensemble import RandomForestRegressor

# RMSE direto no treino normal
forest_reg = RandomForestRegressor()
forest_rmse = model_score_rmse(forest_reg, housing_prepared, housing_labels, "models/forest_regression.pkl")
forest_rmse

np.float64(18680.33964258878)

In [42]:
# Também irei utilizar de dados já salvos aqui
# Por causa desse maldito RandomForestRegressor...

# RMSE do treino de validação cruzada k-fold
forest_rmse_scores = model_cross_rmse(forest_reg, housing_prepared, housing_labels, path_cross="results/forest_cross_kfold.pkl")

# joblib.dump(forest_rmse_scores, "results/forest_cross_kfold.pkl")

display_scores(forest_rmse_scores)

Scores: [51799.25867018 49136.66064727 46992.63122309 51580.38696479
 47371.95469146 51717.05137057 52353.3471104  50077.96789751
 48264.47612001 53688.80812534]
Mean: 50298.25428206059
Standard deviation: 2159.943935449968


# Hora de salvar

## "Agora iremos salvar o modelo linear e a floresta aleátoria... nossos melhores candidatos"
## Já foi salvo :)

In [43]:
# Código passado de salvar o RandomForestRegressor

# forest_reg = RandomForestRegressor()
# forest_reg.fit(housing_prepared, housing_labels)
# joblib.dump(forest_reg, "models/forest_regression.pkl")

In [44]:
forest_reg_loaded = joblib.load("models/forest_regression.pkl")

housing_predictions = forest_reg_loaded.predict(housing_prepared)
forest_loaded_rmse = mean_squared_error(housing_labels, housing_predictions)
forest_loaded_rmse = np.sqrt(forest_loaded_rmse)

display_scores(forest_loaded_rmse)

Scores: 18680.33964258878
Mean: 18680.33964258878
Standard deviation: 0.0


In [45]:
# Esse também já foi salvo

# lin_reg = LinearRegression()
# lin_reg.fit(housing_prepared, housing_labels)
# joblib.dump(lin_reg, "models/linear_regression.pkl")

In [46]:
lin_reg_loaded = joblib.load("models/linear_regression.pkl")
model_score_rmse(lin_reg_loaded, housing_prepared, housing_labels)

np.float64(68627.87390018745)

In [47]:
from sklearn.model_selection import GridSearchCV

param_grids = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]}
]

forest_reg = RandomForestRegressor()

filename = "models/grid_housing_forest.pkl"

try:
    grid_search = joblib.load(filename)
except FileNotFoundError:
    grid_search = GridSearchCV(forest_reg, param_grid=param_grids,
                          cv=5, scoring='neg_mean_squared_error',
                          return_train_score=True)
    grid_search.fit(housing_prepared, housing_labels)

joblib.dump(grid_search, filename)

['models/grid_housing_forest.pkl']

In [48]:
grid_search.best_params_

{'max_features': 6, 'n_estimators': 30}

In [49]:
grid_search.best_estimator_

0,1,2
,"n_estimators  n_estimators: int, default=100 The number of trees in the forest. .. versionchanged:: 0.22  The default value of ``n_estimators`` changed from 10 to 100  in 0.22.",30
,"criterion  criterion: {""squared_error"", ""absolute_error"", ""friedman_mse"", ""poisson""}, default=""squared_error"" The function to measure the quality of a split. Supported criteria are ""squared_error"" for the mean squared error, which is equal to variance reduction as feature selection criterion and minimizes the L2 loss using the mean of each terminal node, ""friedman_mse"", which uses mean squared error with Friedman's improvement score for potential splits, ""absolute_error"" for the mean absolute error, which minimizes the L1 loss using the median of each terminal node, and ""poisson"" which uses reduction in Poisson deviance to find splits. Training using ""absolute_error"" is significantly slower than when using ""squared_error"". .. versionadded:: 0.18  Mean Absolute Error (MAE) criterion. .. versionadded:: 1.0  Poisson criterion.",'squared_error'
,"max_depth  max_depth: int, default=None The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.",
,"min_samples_split  min_samples_split: int or float, default=2 The minimum number of samples required to split an internal node: - If int, then consider `min_samples_split` as the minimum number. - If float, then `min_samples_split` is a fraction and  `ceil(min_samples_split * n_samples)` are the minimum  number of samples for each split. .. versionchanged:: 0.18  Added float values for fractions.",2
,"min_samples_leaf  min_samples_leaf: int or float, default=1 The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least ``min_samples_leaf`` training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression. - If int, then consider `min_samples_leaf` as the minimum number. - If float, then `min_samples_leaf` is a fraction and  `ceil(min_samples_leaf * n_samples)` are the minimum  number of samples for each node. .. versionchanged:: 0.18  Added float values for fractions.",1
,"min_weight_fraction_leaf  min_weight_fraction_leaf: float, default=0.0 The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided.",0.0
,"max_features  max_features: {""sqrt"", ""log2"", None}, int or float, default=1.0 The number of features to consider when looking for the best split: - If int, then consider `max_features` features at each split. - If float, then `max_features` is a fraction and  `max(1, int(max_features * n_features_in_))` features are considered at each  split. - If ""sqrt"", then `max_features=sqrt(n_features)`. - If ""log2"", then `max_features=log2(n_features)`. - If None or 1.0, then `max_features=n_features`. .. note::  The default of 1.0 is equivalent to bagged trees and more  randomness can be achieved by setting smaller values, e.g. 0.3. .. versionchanged:: 1.1  The default of `max_features` changed from `""auto""` to 1.0. Note: the search for a split does not stop until at least one valid partition of the node samples is found, even if it requires to effectively inspect more than ``max_features`` features.",6
,"max_leaf_nodes  max_leaf_nodes: int, default=None Grow trees with ``max_leaf_nodes`` in best-first fashion. Best nodes are defined as relative reduction in impurity. If None then unlimited number of leaf nodes.",
,"min_impurity_decrease  min_impurity_decrease: float, default=0.0 A node will be split if this split induces a decrease of the impurity greater than or equal to this value. The weighted impurity decrease equation is the following::  N_t / N * (impurity - N_t_R / N_t * right_impurity  - N_t_L / N_t * left_impurity) where ``N`` is the total number of samples, ``N_t`` is the number of samples at the current node, ``N_t_L`` is the number of samples in the left child, and ``N_t_R`` is the number of samples in the right child. ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, if ``sample_weight`` is passed. .. versionadded:: 0.19",0.0
,"bootstrap  bootstrap: bool, default=True Whether bootstrap samples are used when building trees. If False, the whole dataset is used to build each tree.",True


In [50]:
np.sqrt(-grid_search.best_score_)

np.float64(49874.17698711616)

In [51]:
cvres = grid_search.cv_results_
cvres

{'mean_fit_time': array([0.08172231, 0.26810436, 0.80119925, 0.135358  , 0.44567251,
        1.32130203, 0.18636875, 0.61789708, 1.84325933, 0.23832793,
        0.79041147, 2.37426572, 0.12835002, 0.42435188, 0.17369285,
        0.56328583, 0.2156045 , 0.70943236]),
 'std_fit_time': array([0.00390183, 0.00357423, 0.01205945, 0.00165186, 0.00419778,
        0.01050677, 0.00261354, 0.00617738, 0.01516393, 0.00392711,
        0.00767123, 0.0157531 , 0.00268816, 0.00739241, 0.00660419,
        0.00676704, 0.00315064, 0.00848125]),
 'mean_score_time': array([0.0043653 , 0.01168051, 0.03248324, 0.00434909, 0.01182752,
        0.03236094, 0.00434217, 0.01159964, 0.03279476, 0.00436525,
        0.01166897, 0.03252783, 0.00475898, 0.0132679 , 0.00487003,
        0.01325579, 0.00483193, 0.01329155]),
 'std_score_time': array([7.42257347e-05, 1.18844676e-04, 1.00207485e-04, 4.16125054e-05,
        1.48041651e-04, 8.50784398e-05, 8.03125432e-05, 6.20245097e-05,
        1.66120745e-04, 1.05959332e-

In [52]:
pd.DataFrame(cvres).head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_features,param_n_estimators,param_bootstrap,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,0.081722,0.003902,0.004365,7.4e-05,2,3,,"{'max_features': 2, 'n_estimators': 3}",-4108951000.0,-3788545000.0,...,-4024736000.0,128129000.0,18,-1168883000.0,-1046681000.0,-1120012000.0,-1148970000.0,-1054076000.0,-1107725000.0,49390100.0
1,0.268104,0.003574,0.011681,0.000119,2,10,,"{'max_features': 2, 'n_estimators': 10}",-3224043000.0,-2988747000.0,...,-3092557000.0,80750490.0,11,-576306800.0,-578340000.0,-606242800.0,-574753500.0,-564477000.0,-580024100.0,13953970.0
2,0.801199,0.012059,0.032483,0.0001,2,30,,"{'max_features': 2, 'n_estimators': 30}",-2807360000.0,-2631974000.0,...,-2787387000.0,95712790.0,8,-418930100.0,-443196900.0,-445694100.0,-426924600.0,-441664300.0,-435282000.0,10478750.0
3,0.135358,0.001652,0.004349,4.2e-05,4,3,,"{'max_features': 4, 'n_estimators': 3}",-3599115000.0,-3748019000.0,...,-3721229000.0,86531290.0,16,-937030600.0,-1012839000.0,-989311900.0,-1036741000.0,-943691700.0,-983922800.0,38658310.0
4,0.445673,0.004198,0.011828,0.000148,4,10,,"{'max_features': 4, 'n_estimators': 10}",-2905712000.0,-2689205000.0,...,-2853599000.0,123550000.0,9,-531387700.0,-525135700.0,-535520300.0,-532278100.0,-526786600.0,-530221700.0,3777049.0


In [53]:
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

63440.804992299294 {'max_features': 2, 'n_estimators': 3}
55610.76286783186 {'max_features': 2, 'n_estimators': 10}
52795.70644331073 {'max_features': 2, 'n_estimators': 30}
61001.87324885665 {'max_features': 4, 'n_estimators': 3}
53419.08990568453 {'max_features': 4, 'n_estimators': 10}
50470.8254829937 {'max_features': 4, 'n_estimators': 30}
59733.10918290811 {'max_features': 6, 'n_estimators': 3}
52450.84404180255 {'max_features': 6, 'n_estimators': 10}
49874.17698711616 {'max_features': 6, 'n_estimators': 30}
58741.769879046355 {'max_features': 8, 'n_estimators': 3}
51931.94568361105 {'max_features': 8, 'n_estimators': 10}
50313.49943067421 {'max_features': 8, 'n_estimators': 30}
62823.71268328564 {'bootstrap': False, 'max_features': 2, 'n_estimators': 3}
53759.73032972733 {'bootstrap': False, 'max_features': 2, 'n_estimators': 10}
59449.155156378176 {'bootstrap': False, 'max_features': 3, 'n_estimators': 3}
52346.109355711786 {'bootstrap': False, 'max_features': 3, 'n_estimators':

In [58]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([7.49007555e-02, 6.82801541e-02, 4.47855508e-02, 1.77532994e-02,
       1.56190953e-02, 1.70456661e-02, 1.57680777e-02, 2.94616971e-01,
       5.47075277e-02, 1.04115678e-01, 1.12815045e-01, 1.40391397e-02,
       1.51097839e-01, 8.24498202e-05, 6.98449692e-03, 7.38825451e-03])

In [59]:
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
cat_one_hot_attribs

['<1H OCEAN', 'INLAND', 'ISLAND', 'NEAR BAY', 'NEAR OCEAN']

In [60]:
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
sorted(zip(feature_importances, attributes), reverse=True)

[(np.float64(0.2946169708703344), 'median_income'),
 (np.float64(0.15109783924674994), 'INLAND'),
 (np.float64(0.11281504477273246), 'bedrooms_per_room'),
 (np.float64(0.10411567754630108), 'pop_per_hhold'),
 (np.float64(0.07490075546417783), 'longitude'),
 (np.float64(0.06828015408010946), 'latitude'),
 (np.float64(0.05470752773920955), 'rooms_per_hhold'),
 (np.float64(0.044785550819989264), 'housing_median_age'),
 (np.float64(0.01775329940244729), 'total_rooms'),
 (np.float64(0.01704566606998501), 'population'),
 (np.float64(0.01576807766842703), 'households'),
 (np.float64(0.015619095340095281), 'total_bedrooms'),
 (np.float64(0.014039139724058104), '<1H OCEAN'),
 (np.float64(0.007388254511861209), 'NEAR OCEAN'),
 (np.float64(0.006984496923327307), 'NEAR BAY'),
 (np.float64(8.244982019480363e-05), 'ISLAND')]

# Avalie seu modelo no conjunto de teste!

In [62]:
final_model = grid_search.best_estimator_

X_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)
final_rmse

np.float64(48204.6815991176)

In [63]:
from scipy import stats
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1,
                        loc=squared_errors.mean(),
                        scale=stats.sem(squared_errors)))

array([46194.77068465, 50134.07840519])