In [3]:
# imports
%matplotlib inline
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor

# Single seed for the notebook; later cells pass it as `random_state` to the
# splitters and to the tree-based models.
RANDOM_SEED = 42
# Also seed NumPy's global random state.
np.random.seed(RANDOM_SEED)

In [4]:
# load data
# Default dataset location, resolved relative to the working directory.
HOUSING_FILE = 'housing.csv'


def load_housing_data(housing_file=HOUSING_FILE):
    """Read the housing CSV into a DataFrame.

    Parameters
    ----------
    housing_file : str or path-like, optional
        Location handed to ``pandas.read_csv``. Defaults to ``HOUSING_FILE``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    """
    frame = pd.read_csv(housing_file)
    return frame


# Load the full dataset from the default file.
# NOTE: the name `housing` is rebound further down to the stratified training
# features, so after that cell it no longer refers to this raw table.
housing = load_housing_data()

In [5]:
# analyse data
#
# Exploratory inspection, kept disabled. Written as comments rather than as a
# triple-quoted string so the cell does not echo the text as its output.
# Uncomment the lines you want to run.
#
# type(housing)
# housing.info()
# housing.head(n=10)
# housing.describe()
# housing["ocean_proximity"].value_counts()
# type(housing["ocean_proximity"])
# housing.hist(bins=50, figsize=(20, 15))
# plt.show()

'\ntype(housing)\nhousing.info()\nhousing.head(n=10)\nhousing.describe()\nhousing["ocean_proximity"].value_counts()\ntype(housing["ocean_proximity"])\nhousing.hist(bins=50, figsize=(20, 15))\nplt.show()\n'

In [6]:
# train_test split

# Purely random 80/20 split of the raw table, seeded so it is repeatable.
train_set, test_set = train_test_split(
    housing,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [7]:
# analyse correlation
#
# Disabled exploration, kept as comments instead of a string literal so the
# cell produces no output. Uncomment to run.
# NOTE(review): train_set still contains the text column "ocean_proximity";
# depending on the pandas version, corr() may need numeric_only=True — confirm.
#
# train_set.corr()
# print("{} train + {} test".format(len(train_set), len(test_set)))
# print("{} total".format(len(train_set)+len(test_set)))

'\ntrain_set.corr()\nprint("{} train + {} test".format(len(train_set), len(test_set)))\nprint("{} total".format(len(train_set)+len(test_set)))\n'

In [8]:
# Separate the dependent variable (the label we want to predict) from the
# features of the random split.
# NOTE: X_train and y_train are rebound by the hold-out split in the
# cross-validation section further down.
X_train = train_set.drop(columns=["median_house_value"])
y_train = train_set["median_house_value"]


In [9]:

# Stratified train/test split.
# A temporary categorical column, "income_cat", buckets the median income so
# that both partitions keep the income distribution of the full dataset.

# Build the income buckets: ceil(income / 1.5), with every bucket from 5
# upwards merged into 5.0. The capped Series is assigned back explicitly;
# calling where(..., inplace=True) on a column selection depends on pandas
# propagating the mutation to the parent frame, which is not guaranteed.
income_cat = np.ceil(housing["median_income"] / 1.5)
housing["income_cat"] = income_cat.where(income_cat < 5, 5.0)

# Split the dataset in a stratified way.
split = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.2,
    random_state=RANDOM_SEED)
# split() yields positional row numbers, so rows are selected with iloc.
# loc only returned the same rows while the index was the default 0..n-1.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]

# Share of each income bucket in both partitions. These are printed because a
# bare expression in the middle of a cell displays nothing.
print(strat_train_set["income_cat"].value_counts() / len(strat_train_set))
print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))
strat_train_set.info()

# Remove the helper column, which was only needed for stratification.
# drop() returns new frames, so nothing is mutated through a slice of `housing`.
strat_train_set = strat_train_set.drop(columns=["income_cat"])
strat_test_set = strat_test_set.drop(columns=["income_cat"])


strat_train_set.info()
strat_test_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           16512 non-null  float64
 1   latitude            16512 non-null  float64
 2   housing_median_age  16512 non-null  float64
 3   total_rooms         16512 non-null  float64
 4   total_bedrooms      16354 non-null  float64
 5   population          16512 non-null  float64
 6   households          16512 non-null  float64
 7   median_income       16512 non-null  float64
 8   median_house_value  16512 non-null  float64
 9   ocean_proximity     16512 non-null  object 
 10  income_cat          16512 non-null  float64
dtypes: float64(10), object(1)
memory usage: 1.5+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              ----------

In [10]:
# Correlation analysis: see the examples in lessons 02 and 03.


In [11]:
# Preparing the data for the model.

# Independent variables: the stratified training set minus the target column.
housing = strat_train_set.drop("median_house_value", axis=1)

# Dependent variable, also called the label.
housing_labels = strat_train_set["median_house_value"].copy()

# Features and label side by side in one frame.
df = housing.copy()
df['median_house_value'] = housing_labels


# Handling missing values with an imputer.

# Drop the categorical column before fitting the SimpleImputer, so the
# resulting frame holds only the numeric independent variables.
housing_num = housing.drop("ocean_proximity", axis=1)

# Imputer that replaces invalid cells (NaN) with the median of their column.
imputer = SimpleImputer(strategy="median")

# Fitting computes the median of every column and stores it in the imputer
# for later use.
imputer.fit(housing_num)

# The imputer now holds the statistics we asked for.
print("Estatísticas do Imputer:")
print(imputer.statistics_)

# Compare them with the medians computed by the DataFrame itself.
print("Medianas")
print(housing_num.median().values)


# Apply the imputer to the data. The return value is a NumPy ndarray.
temp = imputer.transform(housing_num)
#print(type(temp))

# Back to a DataFrame so columns can be referenced by name. The original
# index is passed along: without it the frame gets a fresh 0..n-1 index and
# its rows no longer align by label with `housing` and `housing_labels`.
housing_tr = pd.DataFrame(temp,
                          columns=housing_num.columns,
                          index=housing_num.index)




Estatísticas do Imputer:
[-118.51     34.26     29.     2119.5     433.     1164.      408.
    3.5409]
Medianas
[-118.51     34.26     29.     2119.5     433.     1164.      408.
    3.5409]


In [12]:
# Encoding categorical variables.

# Keep only the categorical variables (there is just one in this dataset).
housing_cat = housing[["ocean_proximity"]]
#print(type(housing_cat))
#print(housing_cat.head())
# Ordinal encoding of the single categorical column.
ordinal_encoder = OrdinalEncoder()
housing_cat_encoded = ordinal_encoder.fit_transform(housing_cat)
#housing_cat_encoded[:10]
#ordinal_encoder.categories_

# Creating the one-hot encoder.

encoder = OneHotEncoder(categories="auto")

# Learn the encoding and apply it to the given dataset in one go. Every
# scikit-learn transformer has fit() to learn the transformation and
# transform() to apply it; fit_transform() performs both in sequence.
housing_cat_1hot = encoder.fit_transform(housing_cat)

# print(housing_cat_1hot.toarray()[:5])
# encoder.categories_

In [13]:
# criando um Transformer 

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append ratio features computed from fixed column positions.

    The transformer indexes its input as ``X[:, i]``, so it expects a 2-D
    array-like that supports that form of indexing. It always adds rooms per
    household and population per household, and optionally bedrooms per room.

    Parameters
    ----------
    add_bedrooms_per_room : bool, default True
        Whether to also append the bedrooms-per-room ratio.
    """

    # Positions of the source columns inside the incoming array.
    rooms_ix, bedrooms_ix, population_ix, household_ix = 3, 4, 5, 6

    def __init__(self, add_bedrooms_per_room=True):  # no *args or **kwargs
        self.add_bedrooms_per_room = add_bedrooms_per_room

    def fit(self, X, y=None):
        """Learn nothing and return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the ratio columns appended on the right.

        Column order of the additions: rooms per household, population per
        household, then bedrooms per room when enabled.
        """
        cls = CombinedAttributesAdder
        rooms = X[:, cls.rooms_ix]
        households = X[:, cls.household_ix]

        derived = [
            rooms / households,
            X[:, cls.population_ix] / households,
        ]
        if self.add_bedrooms_per_room:
            derived.append(X[:, cls.bedrooms_ix] / rooms)

        # np.c_ turns each 1-D ratio into a column and stacks it next to X.
        return np.c_[(X, *derived)]


# Demonstrate the transformer on the raw feature matrix.
attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
housing_extra_attribs = attr_adder.transform(housing.values)

# Back to a DataFrame, because DataFrames are friendlier to work with.
# The column names follow the transformer's flag. With
# add_bedrooms_per_room=True the transformer returns one more column than a
# fixed two-name list provides, and the DataFrame constructor rejects the
# mismatch. That is why True appeared not to work.
columns_housing_extra_attribs = list(housing.columns) + [
    "rooms_per_household",
    "population_per_household",
]
if attr_adder.add_bedrooms_per_room:
    columns_housing_extra_attribs.append("bedrooms_per_room")

housing_extra_attribs = pd.DataFrame(housing_extra_attribs,
                                     columns=columns_housing_extra_attribs)
housing_extra_attribs.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,rooms_per_household,population_per_household
0,-121.89,37.29,38,1568,351,710,339,2.7042,<1H OCEAN,4.62537,2.0944
1,-121.93,37.05,14,679,108,306,113,6.4214,<1H OCEAN,6.00885,2.70796
2,-117.2,32.77,31,1952,471,936,462,2.8621,NEAR OCEAN,4.22511,2.02597
3,-119.61,36.31,25,1847,371,1460,353,1.8839,INLAND,5.23229,4.13598
4,-118.59,34.23,17,6592,1525,4459,1463,3.0347,<1H OCEAN,4.50581,3.04785


In [14]:
# Building the preprocessing pipelines.

# Individual steps of the numeric pipeline. The adder is created with its
# default add_bedrooms_per_room=True, so it appends three ratio columns.
meu_imputer = SimpleImputer(strategy="median")
meu_adder = CombinedAttributesAdder()
meu_scaler = StandardScaler()


# Numerical pipeline: impute missing values, add ratio features, standardize.
num_pipeline = Pipeline([
    ("imputer", meu_imputer),
    ("attribs_adder", meu_adder),
    ("std_scaler", meu_scaler),
])


# The string below is an inert expression statement; it only records an
# alternative way of building the same pipeline with inline step objects.
"""
# outra forma de fazer...
num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("attribs_adder", CombinedAttributesAdder()),
    ("std_scaler", StandardScaler()),
])

housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr"""

# Fit the numeric pipeline on the numeric columns and transform them.
housing_num_tr = num_pipeline.fit_transform(housing_num)
#housing_num_tr


# Categorical pipeline: one-hot encode into a dense array.
# NOTE(review): newer scikit-learn releases rename the `sparse` argument to
# `sparse_output`. Confirm which spelling the installed version accepts.
cat_pipeline = Pipeline([
    ("cat_encoder", OneHotEncoder(sparse=False)),
])

housing_cat_tr = cat_pipeline.fit_transform(housing_cat)
#housing_cat_tr


# Full pipeline: route numeric and categorical columns to their transformers.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# The "cat" entry uses its own OneHotEncoder instance, not cat_pipeline.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(sparse=False), cat_attribs),
])

# Feature matrix that every model below is trained on.
housing_prepared = full_pipeline.fit_transform(housing)
#housing_prepared[:5]

In [15]:
# criando modelos 

# Linear regression baseline.

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)


# Spot check on the first five training rows, pushed through the
# already-fitted preprocessing pipeline.
some_data = housing.head(5)
some_labels = housing_labels.head(5)
some_data_prepared = full_pipeline.transform(some_data)


# Predictions for the five rows, followed by their true labels.
predicted_labels = lin_reg.predict(some_data_prepared)
print("Predição: {}".format(np.round(predicted_labels, decimals=2)))
print("Original: {}".format(np.round(some_labels.values, decimals=2)))


# RMSE measured on the same data the model was trained on.
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)
print("Regressão linear: RMSE = {:.2f}".format(lin_rmse))


# Residuals (true minus predicted) and their summary statistics.
residuo = housing_labels - housing_predictions
#plt.hist(residuo, bins=50)
residuo.describe()

Predição: [210644.6  317768.81 210956.43  59218.99 189747.56]
Original: [286600. 340600. 196900.  46300. 254500.]
Regressão linear: RMSE = 68628.20


count    1.651200e+04
mean    -2.253114e-11
std      6.863028e+04
min     -6.184707e+05
25%     -4.232504e+04
50%     -1.068880e+04
75%      2.824534e+04
max      8.157162e+05
Name: median_house_value, dtype: float64

In [16]:
# Decision tree regressor.

tree_reg = DecisionTreeRegressor(random_state=RANDOM_SEED)
tree_reg.fit(housing_prepared, housing_labels)

# Same five-row spot check that was used for the linear model.
predicted_labels = tree_reg.predict(some_data_prepared)
print("Predição: {}".format(predicted_labels))
print("Original: {}".format(some_labels.values))

# RMSE measured on the training data itself.
housing_predictions = tree_reg.predict(housing_prepared)
tree_mse = mean_squared_error(housing_labels, housing_predictions)
tree_rmse = np.sqrt(tree_mse)
# The label names the decision tree; it previously read "Regressão linear",
# which mislabelled this score as the linear model's.
print("Regressão árvore de decisão: RMSE = {:.2f}".format(tree_rmse))

# Classic over-fitting: an RMSE of 0.00 on the training data means the tree
# reproduced every training label exactly, which says nothing about unseen data.

Predição: [286600. 340600. 196900.  46300. 254500.]
Original: [286600. 340600. 196900.  46300. 254500.]
Regressão linear: RMSE = 0.00


'\n\nClássico over-fitting\n\n'

In [17]:
# Improving the evaluation, as a first step towards cross-validation.


# For simplicity, this example does not use the stratified split.
# NOTE(review): housing_prepared came from a pipeline fitted on all of these
# rows, so the held-out 20% already influenced the imputation and scaling
# statistics. Confirm that this is acceptable for the comparison.
# X_train and y_train are rebound here and no longer refer to the raw split.
X_train, X_test, y_train, y_test = train_test_split(housing_prepared,
                                                    housing_labels,
                                                    test_size=0.2,
                                                    random_state=RANDOM_SEED)

In [18]:
# Refit both models on the 80% partition and score them on the held-out 20%.
lin_reg.fit(X_train, y_train)
lin_rmse = np.sqrt(mean_squared_error(y_test, lin_reg.predict(X_test)))
print("Regressão linear: RMSE = {:.2f}".format(lin_rmse))

tree_reg.fit(X_train, y_train)
# y_pred is left bound to the decision tree's predictions.
y_pred = tree_reg.predict(X_test)
tree_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Regressão árvore de decisão: RMSE = {:.2f}".format(tree_rmse))

Regressão linear: RMSE = 69392.52
Regressão árvore de decisão: RMSE = 71023.94


In [19]:
# Random forest regressor, scored on the same hold-out partition.

# fit() returns the estimator, so construction and training are chained.
forest_reg = RandomForestRegressor(
    n_estimators=10,
    random_state=RANDOM_SEED,
).fit(X_train, y_train)

y_pred = forest_reg.predict(X_test)
forest_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Regressão random forest: RMSE = {:.2f}".format(forest_rmse))






Regressão random forest: RMSE = 52601.87


In [20]:
# How do we know the hold-out numbers are not pure luck?
# scikit-learn's cross_val_score repeats the evaluation over several folds.

# Settings shared by all three runs: 10 folds, negated MSE, all CPU cores.
cv_settings = {
    "scoring": "neg_mean_squared_error",
    "cv": 10,
    "n_jobs": -1,
}

# The scores are negated MSE values, so flip the sign before the square root.

# Linear regression.
lin_scores = cross_val_score(
    lin_reg, housing_prepared, housing_labels, **cv_settings)
lin_rmse_scores = np.sqrt(-lin_scores)

# Decision tree.
tree_scores = cross_val_score(
    tree_reg, housing_prepared, housing_labels, **cv_settings)
tree_rmse_scores = np.sqrt(-tree_scores)

# Random forest.
forest_scores = cross_val_score(
    forest_reg, housing_prepared, housing_labels, **cv_settings)
forest_rmse_scores = np.sqrt(-forest_scores)


def display_scores(scores):
    """Print the per-fold scores, their mean and their standard deviation.

    Parameters
    ----------
    scores : numpy.ndarray
        Scores to summarize; must offer ``round``, ``mean`` and ``std``.
    """
    summary = (
        ("Scores:", scores.round(decimals=2)),
        ("Mean:", scores.mean()),
        ("Standard deviation:", scores.std()),
    )
    for label, value in summary:
        print(label, value)

# Print a heading and the score summary for each model, in a fixed order.
for heading, rmse_scores in (
    ('linear', lin_rmse_scores),
    ('arvore', tree_rmse_scores),
    ('random forest', forest_rmse_scores),
):
    print(heading)
    display_scores(rmse_scores)

linear
Scores: [66782.74 66960.12 70347.95 74739.57 68031.13 71193.84 64969.63 68281.61
 71552.92 67665.1 ]
Mean: 69052.46136345083
Standard deviation: 2731.674001798348
arvore
Scores: [70194.34 66855.16 72432.58 70758.74 71115.88 75585.14 70262.86 70273.63
 75366.88 71231.66]
Mean: 71407.68766037929
Standard deviation: 2439.4345041191004
random forest
Scores: [51646.45 48940.6  53050.86 54408.99 50922.15 56482.51 51864.52 49760.85
 55434.22 53326.1 ]
Mean: 52583.72407377466
Standard deviation: 2298.353351147122


In [21]:
# Finding the best set of hyper-parameters.


import time

from sklearn.model_selection import GridSearchCV



param_grid = [
    # First grid: 5 x 5 = 25 combinations of n_estimators and max_features.
    {
        "n_estimators": [5, 10, 15, 30, 40],
        "max_features": [4, 6, 8, 10, 12],
    },
    # Second grid: 1 x 4 x 4 = 16 combinations with bootstrap set to False.
    {
        "bootstrap": [False],
        "n_estimators": [3, 10, 18, 27],
        "max_features": [3, 4, 8, 15],
    },
]

# Fresh, unfitted forest; this rebinds the forest_reg name used earlier.
forest_reg = RandomForestRegressor(random_state=RANDOM_SEED)

# 5-fold cross-validation over both grids: (25 + 16) * 5 = 205 training runs.
grid_search = GridSearchCV(
    forest_reg,
    param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
    return_train_score=True,
    n_jobs=-1,
)

# Time the whole search with a monotonic high-resolution clock.
t1 = time.perf_counter()
grid_search.fit(housing_prepared, housing_labels)
t2 = time.perf_counter()

print(f"Tempo gasto: {t2 - t1:.2f} s")

Tempo gasto: 57.78 s


In [22]:
# Hyper-parameter combination with the best mean cross-validated score.
grid_search.best_params_


{'bootstrap': False, 'max_features': 8, 'n_estimators': 27}

In [23]:
# Best trained model found by the grid search.

grid_search.best_estimator_

RandomForestRegressor(bootstrap=False, max_features=8, n_estimators=27,
                      random_state=42)

In [24]:
# One line per tried combination: RMSE first, then the parameters.
# mean_test_score holds negated MSE, hence the sign flip before the root.
cvres = grid_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(np.sqrt(-mean_score), params)

56405.55344386644 {'max_features': 4, 'n_estimators': 5}
52741.04704299915 {'max_features': 4, 'n_estimators': 10}
51775.16663319472 {'max_features': 4, 'n_estimators': 15}
50377.40461678399 {'max_features': 4, 'n_estimators': 30}
50143.14410447875 {'max_features': 4, 'n_estimators': 40}
54906.23833596068 {'max_features': 6, 'n_estimators': 5}
52006.19873526564 {'max_features': 6, 'n_estimators': 10}
51077.825559965466 {'max_features': 6, 'n_estimators': 15}
50146.51167415009 {'max_features': 6, 'n_estimators': 30}
49892.295944853344 {'max_features': 6, 'n_estimators': 40}
54242.75738147613 {'max_features': 8, 'n_estimators': 5}
51711.127883959234 {'max_features': 8, 'n_estimators': 10}
50710.992838563994 {'max_features': 8, 'n_estimators': 15}
49682.273345071546 {'max_features': 8, 'n_estimators': 30}
49504.61171066941 {'max_features': 8, 'n_estimators': 40}
55422.14720483451 {'max_features': 10, 'n_estimators': 5}
52243.03120763529 {'max_features': 10, 'n_estimators': 10}
51181.11588

In [25]:
# Feature-importance analysis.

feature_importances = grid_search.best_estimator_.feature_importances_
#feature_importances
# Names for the three ratio columns appended by CombinedAttributesAdder.
extra_attribs = ["rooms_per_hhold", "pop_per_hhold", "bedrooms_per_room"]
# Take the category names from the encoder fitted inside full_pipeline, the
# one that produced housing_prepared. cat_pipeline is a separately fitted
# object, so its categories only matched by coincidence of equal input.
cat_encoder = full_pipeline.named_transformers_["cat"]
cat_one_hot_attribs = list(cat_encoder.categories_[0])
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# zip() silently truncates to the shorter input, which would mislabel
# features, so check that names and importances line up first.
assert len(attributes) == len(feature_importances), (
    "feature names and importances differ in length")
# The extra attributes paid off: all three rank among the seven most
# important features.
sorted(zip(feature_importances, attributes), reverse=True)

[(0.36646090386578456, 'median_income'),
 (0.16499671600370852, 'INLAND'),
 (0.10732516127770518, 'pop_per_hhold'),
 (0.07775302201262438, 'longitude'),
 (0.06821700672673928, 'latitude'),
 (0.053368610739487794, 'rooms_per_hhold'),
 (0.051785354880582044, 'bedrooms_per_room'),
 (0.04245236014079516, 'housing_median_age'),
 (0.015069172233995457, 'total_rooms'),
 (0.014042557999651956, 'households'),
 (0.013854304754456974, 'population'),
 (0.013552068991095461, 'total_bedrooms'),
 (0.006569084170173832, '<1H OCEAN'),
 (0.002696539911105351, 'NEAR OCEAN'),
 (0.0018048974941087441, 'NEAR BAY'),
 (5.223879798543149e-05, 'ISLAND')]

In [30]:
# Debug aid: names of the numeric columns fed to the numeric pipeline.
print(num_attribs)

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']


In [29]:
# Debug aid: raw importances, in the column order of the prepared matrix.
feature_importances

array([7.77530220e-02, 6.82170067e-02, 4.24523601e-02, 1.50691722e-02,
       1.35520690e-02, 1.38543048e-02, 1.40425580e-02, 3.66460904e-01,
       5.33686107e-02, 1.07325161e-01, 5.17853549e-02, 6.56908417e-03,
       1.64996716e-01, 5.22387980e-05, 1.80489749e-03, 2.69653991e-03])

In [26]:
# Final evaluation on the stratified test set.

final_model = grid_search.best_estimator_

# Label and features of the test partition; X_test and y_test are rebound here.
y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns=["median_house_value"])

# transform() only: the pipeline keeps the statistics learned during training.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

print("RMSE = {}".format(final_rmse))

RMSE = 47220.87908393705
