In [1]:
# Seed passed as random_state to the train/test split and to the tree-based models in this notebook.
RANDOM_SEED = 42

In [None]:
import pickle
import pathlib

import numpy as np
import pandas as pd

In [None]:
# The data folder is a sibling of the directory the notebook is launched from.
_project_root = pathlib.Path.cwd().parent
DATA_DIR = _project_root.joinpath('data')
print(DATA_DIR)

In [None]:
# Location of the pickled, already-cleaned dataset inside DATA_DIR.
clean_data_path = DATA_DIR.joinpath('processed', 'ames_clean.pkl')
clean_data_path

In [None]:
# SECURITY NOTE: pickle.load can execute arbitrary code; only load files produced by a trusted step.
with clean_data_path.open('rb') as handle:
    data = pickle.load(handle)

In [None]:
# Summary of the loaded object (presumably a pandas DataFrame: columns, dtypes, non-null counts).
data.info()

In [None]:
# Work on a copy: the encoding steps write into model_data while `data` is still read unchanged later on.
model_data = data.copy()
model_data.head()

In [None]:
# Partition the 'category'-dtype columns by whether their categories are ordered.
categorical_columns = []
ordinal_columns = []
category_frame = model_data.select_dtypes('category')
for col in category_frame.columns:
    is_ordered = model_data[col].cat.ordered
    # Ordered categoricals go to the ordinal list, all others to the nominal list.
    (ordinal_columns if is_ordered else categorical_columns).append(col)

In [None]:
# Correlation of every numeric feature with the target 'SalePrice', sorted ascending.
numerical_data = data.select_dtypes('number').drop(columns='SalePrice').copy()
target = data['SalePrice'].copy()
# Only the last expression of a cell is echoed; this one is followed by another
# expression, so print it explicitly or the correlations are computed and discarded.
print(numerical_data.corrwith(target).sort_values())

# Columns detected as ordered categoricals.
ordinal_columns

In [None]:
# Columns detected as unordered (nominal) categoricals.
categorical_columns


In [None]:
# Replace every ordered categorical in model_data by the integer codes of the
# corresponding column in the untouched `data` frame.
for col in ordinal_columns:
    # Index the (codes, uniques) result instead of unpacking into `_`: binding `_`
    # at notebook top level overrides IPython's "last output" variable.
    # NOTE(review): sort=True numbers the sorted unique values; confirm this follows the
    # declared category order and that missing values (pandas' default sentinel is -1)
    # are acceptable downstream.
    model_data[col] = pd.factorize(data[col], sort=True)[0]

In [None]:
# Inspect the ordinal columns after their values were replaced by integer codes.
model_data[ordinal_columns].info()

In [None]:
# Value frequencies of 'Lot.Shape' in the original (un-encoded) frame.
data['Lot.Shape'].value_counts()

In [None]:
# Same column in model_data, to compare the encoded values against the counts from `data`.
model_data['Lot.Shape'].value_counts()

In [None]:
# Value frequencies of 'Exterior', the column used for the one-hot encoding example.
model_data['Exterior'].value_counts()

In [None]:
# One-hot encode 'Exterior' (one dummy per level) and show the dummies next to the labels.
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data)

# assign() returns a new frame. Plain assignment would only alias encoded_data, so adding
# the label column would also modify encoded_data itself.
aux_dataframe = encoded_data.assign(Exterior=original_data)

aux_dataframe.head().transpose()

In [None]:
# Same example with drop_first=True: the first level gets no dummy column.
original_data = model_data['Exterior']
encoded_data = pd.get_dummies(original_data, drop_first=True)

# assign() returns a new frame. Plain assignment would only alias encoded_data, so adding
# the label column would also modify encoded_data itself.
aux_dataframe = encoded_data.assign(Exterior=original_data)

aux_dataframe.head().transpose()

In [None]:
# One-hot encode the remaining non-numeric columns, dropping the first level of each.
# NOTE: this rebinds model_data, so the earlier cells that read model_data['Exterior']
# cannot be re-run after this one without restoring model_data first.
model_data = pd.get_dummies(model_data, drop_first=True)

In [None]:
# Check the column list and dtypes after the one-hot encoding.
model_data.info()

In [None]:
# For each nominal column, list the dummy columns whose name starts with "<column>_".
for cat in categorical_columns:
    prefix = cat + "_"
    dummies_str = ', '.join(
        f'"{col}"' for col in model_data.columns if col.startswith(prefix)
    )
    print(f'From column "{cat}" we made {dummies_str}\n')

In [None]:
# Feature matrix: every column except the target. Target vector: 'SalePrice'.
X = model_data.drop(columns=['SalePrice']).copy()
y = model_data['SalePrice'].copy()

In [None]:
# Peek at the underlying array representations of the features and the target.
X.values, y.values

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing; the seed keeps the split reproducible.
split_options = {'test_size': 0.25, 'random_state': RANDOM_SEED}
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, **split_options)


In [None]:
# Sanity check: row counts of the train and test features should add up to those of X.
X.shape, Xtrain.shape, Xtest.shape

In [None]:
# Same check for the target vectors.
y.shape, ytrain.shape, ytest.shape

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV



# Grid search over the number of trees of a random forest, scored by negative MSE with 4-fold CV.
# The n_estimators=100 on the base estimator is overridden by every grid candidate.
forest_base = RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1)
forest_param_grid = {'n_estimators': [300, 1000, 3000]}
grid_search_forest = GridSearchCV(
    estimator=forest_base,
    param_grid=forest_param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=4,
    return_train_score=True,
)

# Grid search over the maximum depth of a single decision tree, same scoring and CV setup.
tree_base = DecisionTreeRegressor(random_state=RANDOM_SEED)
tree_param_grid = {'max_depth': [3, 5, 7]}
grid_search_tree = GridSearchCV(
    estimator=tree_base,
    param_grid=tree_param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=4,
    return_train_score=True,
)

In [None]:
# Tune the forest, then score the selected configuration with 8-fold CV on the training split.
grid_search_forest.fit(Xtrain, ytrain)
forest_reg = grid_search_forest.best_estimator_
forest_reg_scores = cross_val_score(
    estimator=forest_reg,
    X=Xtrain,
    y=ytrain,
    scoring="neg_mean_squared_error",
    cv=8,
    n_jobs=-1,
)
# The scorer returns negated MSE values; flip the sign before taking the square root.
forest_reg_rmse_scores = np.sqrt(np.negative(forest_reg_scores))

In [None]:
# Tune the decision tree, then score the selected configuration with 8-fold CV on the training split.
grid_search_tree.fit(Xtrain, ytrain)
tree_reg = grid_search_tree.best_estimator_
tree_reg_scores = cross_val_score(
    estimator=tree_reg,
    X=Xtrain,
    y=ytrain,
    scoring="neg_mean_squared_error",
    cv=8,
    n_jobs=-1,
)
# The scorer returns negated MSE values; flip the sign before taking the square root.
tree_reg_rmse_scores = np.sqrt(np.negative(tree_reg_scores))

In [None]:
# Per-fold RMSE followed by its mean: first for the forest, then for the tree.
for rmse_scores in (forest_reg_rmse_scores, tree_reg_rmse_scores):
    print(rmse_scores)
    print(rmse_scores.mean())

In [None]:
# testando um teste parametrico
from scipy.stats import ttest_ind

def compara_scores(scores_1, scores_2):
    """Run a two-sample t-test without assuming equal variances and print the result.

    Args:
        scores_1: First sample of scores.
        scores_2: Second sample of scores.

    Returns:
        None. The t statistic (two decimals) and the p-value are written to stdout.
    """
    resultado = ttest_ind(scores_1, scores_2, equal_var=False)
    relatorio = (
        ("Valor da estatística t: {:.2f}", resultado[0]),
        ("Valor-p: {}", resultado[1]),
    )
    for modelo, valor in relatorio:
        print(modelo.format(valor))

In [None]:
from scipy.stats import mannwhitneyu

# Non-parametric comparison of the two sets of cross-validation RMSE scores.
U, p_value = mannwhitneyu(x=forest_reg_rmse_scores, y=tree_reg_rmse_scores)
for label, value in (('U =', U), ('p-value =', p_value)):
    print(label, value)

In [None]:
# Parametric counterpart: t-test with equal_var=False on the same two sets of RMSE scores.
compara_scores(forest_reg_rmse_scores, tree_reg_rmse_scores)

In [None]:
from sklearn.linear_model import ElasticNet, Lasso, Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_squared_error

In [None]:
def outro_experimento(msg, pipeline, X_train, y_train, X_test, y_test):
    """Fit ``pipeline`` on the training data and print its RMSE on the test data.

    Args:
        msg: Label printed as the header line of the report.
        pipeline: Object exposing ``fit(X, y)`` and ``predict(X)``. No particular
            step name is required.
        X_train: Features passed to ``pipeline.fit``.
        y_train: Targets passed to ``pipeline.fit``.
        X_test: Features passed to ``pipeline.predict``.
        y_test: True targets the predictions are compared against.

    Returns:
        None. The report is written to stdout.
    """
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # scikit-learn's signature is (y_true, y_pred). MSE is symmetric, so the value is
    # unchanged, but the documented order keeps the call correct if the metric is swapped.
    RMSE = np.sqrt(mean_squared_error(y_test, y_pred))

    # The former unused lookup of named_steps['reg'] was removed: it raised KeyError
    # for any pipeline whose final step has a different name.
    print(f'{msg}:')
    print(f'RMSE: {RMSE}')
    print()

In [65]:
# Pipeline: degree-2 polynomial features -> standardization -> random forest regressor.
poly_steps = [
    ('poly_features', PolynomialFeatures(degree=2, include_bias=False)),
    ('std_scaler', StandardScaler()),
    ('reg', RandomForestRegressor(n_estimators=100, random_state=RANDOM_SEED, n_jobs=-1)),
]
poly_reg = Pipeline(steps=poly_steps)

  ('lasso_reg', Lasso(alpha=0.1, random_state=RANDOM_SEED))


TypeError: 'tuple' object is not callable

In [64]:
# Fit the polynomial-features pipeline on the training split and report its RMSE on the held-out test split.
outro_experimento('Random Forest com PolynomialFeatures', poly_reg, Xtrain, ytrain, Xtest, ytest)

Random Forest com PolynomialFeatures:
RMSE: 0.06142623923277575



In [None]:
# teste 2