In [None]:
import os
import tarfile
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Get the Data

#### Load the Data

In [None]:
DATASET_NAME = "diamonds" 
DATASET_DIR = "data"
UNPACKED_DIR = os.path.join("data", "unpacked")

In [None]:
os.makedirs(UNPACKED_DIR, exist_ok=True)
tgz_path = os.path.join(DATASET_DIR, DATASET_NAME + ".tgz")
# The context manager closes the archive even when extraction raises.
# NOTE: extractall() trusts the member paths stored in the archive; only use it
# on archives from a trusted source.
with tarfile.open(tgz_path) as dataset_tgz:
    dataset_tgz.extractall(path=UNPACKED_DIR)

In [None]:
csv_path = os.path.join(UNPACKED_DIR, DATASET_NAME + ".csv")
housing = pd.read_csv(csv_path)

#### Take a Quick Look at the Data Structure

In [None]:
housing.head()

In [None]:
housing.info()

In [None]:
housing["cut"].value_counts()

In [None]:
housing["color"].value_counts()

In [None]:
housing["clarity"].value_counts()

In [None]:
housing.describe()

In [None]:
housing[["x", "y", "z"]][housing[["x", "y", "z"]] == 0].count()

In [None]:
housing.drop("Unnamed: 0", axis=1, inplace=True)

In [None]:
housing.hist(bins=50, figsize=(20, 15))
plt.show()

#### Create a Test Set

In [None]:
# Bucket carat into five ordinal categories (labels 1-5) using the bin edges
# 0, 1, 2, 3, 4 and +inf; the last bucket collects everything above 4.
# The column is used further down as the stratification key for the split.
housing["carat_cat"] = pd.cut(housing["carat"], bins=[0., 1., 2., 3., 4., np.inf], labels=[1, 2, 3, 4, 5])
# Number of rows per bucket.
housing["carat_cat"].value_counts()

In [None]:
housing["carat_cat"].hist();

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Stratify on the carat bucket so both sets follow the same carat distribution.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_index, test_index = next(split.split(housing, housing["carat_cat"]))
# split() yields positional row numbers, so select rows with .iloc; label-based
# .loc only gives the same rows while the frame keeps its default 0..n-1 index.
# The explicit .copy() makes both sets independent frames, which matters because
# they are modified in place in the next cell.
strat_train_set = housing.iloc[train_index].copy()
strat_test_set = housing.iloc[test_index].copy()
# Share of each carat bucket in the test set.
strat_test_set["carat_cat"].value_counts() / len(strat_test_set)

In [None]:
# The bucket column was only needed as the stratification key; remove it from
# both the training and the test set.
strat_train_set.drop(columns="carat_cat", inplace=True)
strat_test_set.drop(columns="carat_cat", inplace=True)

## Discover and Visualize the Data to Gain Insights

In [None]:
housing = strat_train_set.copy()

#### Looking for Correlations

In [None]:
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["price"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

attributes = ["price", "carat", "x", "y", "z"]
scatter_matrix(housing[attributes], figsize=(20, 12));

In [None]:
housing.plot(kind="scatter", x="carat", y="price", alpha=0.1);

#### Experimenting with Attribute Combinations

In [None]:
comb_attributes = ["carat", "depth", "table", "x", "y", "z"]
combinations_n = 3

# Add one ratio column named "<a>_per_<b>" for every pair of attributes, where
# a always precedes b in comb_attributes.
attributes_combinations = []
for position, numerator in enumerate(comb_attributes):
    for denominator in comb_attributes[position + 1:]:
        ratio_name = f"{numerator}_per_{denominator}"
        attributes_combinations.append(ratio_name)
        housing[ratio_name] = housing[numerator] / housing[denominator]

# Correlation of every numeric column with price, highest first, restricted to
# the ratio columns created above.
corr_price = housing.corr(numeric_only=True)["price"].sort_values(ascending=False)
is_ratio = corr_price.index.isin(attributes_combinations)
corr_price_combinations = corr_price[is_ratio]

# Keep only the combinations_n top-ranked ratio columns in the frame.
weaker_ratios = list(corr_price_combinations.iloc[combinations_n:].index)
housing.drop(columns=weaker_ratios, inplace=True)

In [None]:
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["price"].sort_values(ascending=False)

## Prepare the Data for Machine Learning Algorithms

In [None]:
housing = strat_train_set.drop("price", axis=1)
housing_labels = strat_train_set["price"].copy()

#### Data Cleaning

In [None]:
from sklearn.impute import SimpleImputer

# Fit a median imputer on every column except cut, color and clarity.
imputer = SimpleImputer(strategy="median")
housing_num = housing.drop(["cut", "color", "clarity"], axis=1)
imputer.fit(housing_num)
# Per-column fill values learned by fit(); the next cell compares them with the
# medians computed by pandas.
imputer.statistics_

In [None]:
housing_num.median().values

In [None]:
X = imputer.transform(housing_num)
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

#### Handling Text and Categorical Attributes

In [None]:
housing_cat = housing[["cut", "color", "clarity"]]
housing_cat.head(10)

In [None]:
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)
housing_cat_1hot

In [None]:
housing_cat_1hot.toarray()

In [None]:
cat_encoder.categories_

#### Custom Transformers

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    """Append the top-ranked ratio attributes to a numeric feature array.

    The ratios to build are read from the notebook-level ``corr_price_combinations``
    series (index entries of the form ``"<a>_per_<b>"``) and the column positions
    are looked up in the notebook-level ``housing_num`` frame, so both must exist
    before ``transform`` is called.
    """

    def __init__(self, combinations_n): # no *args or **kwargs
        self.combinations_n = combinations_n

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``combinations_n`` ratio columns appended.

        Where the denominator equals 0 the ratio is set to NaN instead of dividing.
        """
        blocks = [X.copy()]
        top_ratio_names = corr_price_combinations.iloc[:self.combinations_n].index
        for ratio_name in top_ratio_names:
            numerator_name, denominator_name = ratio_name.split("_per_")
            numerator = X[:, housing_num.columns.get_loc(numerator_name)]
            denominator = X[:, housing_num.columns.get_loc(denominator_name)]
            ratio = np.divide(
                numerator,
                denominator,
                out=np.full_like(numerator, np.nan),
                where=(denominator != 0.0),
            )
            blocks.append(ratio)
        # With no ratio requested the plain copy of X is returned untouched.
        if len(blocks) == 1:
            return blocks[0]
        return np.column_stack(blocks)

attr_adder = CombinedAttributesAdder(combinations_n=combinations_n)
housing_extra_attribs = attr_adder.transform(housing_num.values)

#### Transformation Pipelines (with Feature Scaling)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Numeric branch: fill missing values, append the ratio attributes, fill the NaNs
# the ratio step writes for zero denominators, then standardize.
num_pipeline = Pipeline([
    ("imputer_1", SimpleImputer(strategy="median")),
    # Use the notebook-wide combinations_n rather than repeating the literal 3, so
    # the number of appended columns always matches the feature names that are
    # rebuilt from combinations_n later on.
    ("attribs_adder", CombinedAttributesAdder(combinations_n=combinations_n)),
    ("imputer_2", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler())
], verbose=True)

housing_num_tr = num_pipeline.fit_transform(housing_num)
housing_num_tr[0]

In [None]:
from sklearn.compose import ColumnTransformer

# Column names routed to each branch of the preparation pipeline.
num_attribs = list(housing_num)
cat_attribs = ["cut", "color", "clarity"]
# Numeric columns go through num_pipeline; the three categorical columns are
# one-hot encoded with the category lists taken from the previously fitted cat_encoder.
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(categories=cat_encoder.categories_), cat_attribs)
], verbose=True)

# Fit the whole preparation pipeline on the training features and show the first prepared row.
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared[0]

In [None]:
housing_prepared.shape

## Select and Train a Model

#### Training and Evaluating on the Training Set

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

In [None]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)

In [None]:
print("Predictions:", lin_reg.predict(some_data_prepared))

In [None]:
print("Labels:", list(some_labels))

In [None]:
from sklearn.metrics import root_mean_squared_error

housing_predictions = lin_reg.predict(housing_prepared)
lin_rmse = root_mean_squared_error(housing_labels, housing_predictions)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing_prepared)
tree_rmse = root_mean_squared_error(housing_labels, housing_predictions)
tree_rmse

#### Better Evaluation Using Cross-Validation

In [None]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(tree_reg, housing_prepared, housing_labels, cv=10, scoring="neg_root_mean_squared_error",
                         n_jobs=-1, pre_dispatch="2*n_jobs")
tree_rmse_scores = -scores

In [None]:
def display_scores(scores):
    """Print the scores, then their mean and their standard deviation."""
    print("Scores:", scores)
    for label, statistic in (("Mean:", "mean"), ("Standard deviation:", "std")):
        print(label, getattr(scores, statistic)())

display_scores(tree_rmse_scores)

In [None]:
lin_scores = cross_val_score(lin_reg, housing_prepared, housing_labels, cv=10, scoring="neg_root_mean_squared_error",
                             n_jobs=-1, pre_dispatch="2*n_jobs")
lin_rmse_scores = -lin_scores
display_scores(lin_rmse_scores)

In [None]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = RandomForestRegressor(random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

In [None]:
housing_predictions = forest_reg.predict(housing_prepared)
forest_rmse = root_mean_squared_error(housing_labels, housing_predictions)
forest_rmse

In [None]:
forest_scores = cross_val_score(forest_reg, housing_prepared, housing_labels, cv=10, scoring="neg_root_mean_squared_error",
                                n_jobs=-1, pre_dispatch="2*n_jobs")
forest_rmse_scores = -forest_scores
display_scores(forest_rmse_scores)

## Fine-Tune Your Model

#### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    # try 12 (3×4) combinations of hyperparameters
    {"n_estimators": [3, 10, 30], "max_features": [2, 4, 6, 8]},
    # then try 6 (2×3) combinations with bootstrap set as False
    {"bootstrap": [False], "n_estimators": [3, 10], "max_features": [2, 3, 4]}
]

forest_reg = RandomForestRegressor(random_state=42)
# train across 5 folds, that's a total of (12+6)*5=90 rounds of training 
grid_search = GridSearchCV(forest_reg, param_grid, cv=5, scoring="neg_root_mean_squared_error", return_train_score=True,
                           n_jobs=-1, verbose=2, pre_dispatch="2*n_jobs")
grid_search.fit(housing_prepared, housing_labels)

In [None]:
grid_search.best_params_

In [None]:
grid_search.best_estimator_

In [None]:
grid_search_rmse = -grid_search.best_score_
grid_search_rmse

In [None]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(-mean_score, params)

#### Analyze the Best Models and Their Errors

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
# Rebuild the column names of housing_prepared in the order the pipeline emits
# them: numeric inputs, the appended ratio attributes, then the one-hot columns.
extra_attribs = list(corr_price_combinations.iloc[:combinations_n].index)
cat_encoder = full_pipeline.named_transformers_["cat"]
# categories_ holds one array per categorical column (cut, color, clarity); every
# one of them contributes one-hot columns, not only the first.
cat_one_hot_attribs = [category for categories in cat_encoder.categories_ for category in categories]
attributes = num_attribs + extra_attribs + cat_one_hot_attribs
# zip() silently truncates to the shorter input, so fail loudly on a mismatch
# between the number of names and the number of importances.
assert len(attributes) == len(feature_importances), (len(attributes), len(feature_importances))
feature_importance_tuples = sorted(zip(feature_importances, attributes), reverse=True)
feature_importance_tuples

#### Evaluate Your System on the Test Set

In [None]:
# Evaluate the best grid-search model once on the held-out test set.
final_model = grid_search.best_estimator_
X_test = strat_test_set.drop("price", axis=1)
y_test = strat_test_set["price"].copy()
# transform() only: the preparation pipeline is not refitted on the test data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)
final_rmse = root_mean_squared_error(y_test, final_predictions)
final_rmse

In [None]:
from scipy import stats

confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
# t-interval around the mean squared error (standard error from stats.sem);
# the square root turns both bounds into bounds on the RMSE.
np.sqrt(stats.t.interval(confidence, len(squared_errors) - 1, loc=squared_errors.mean(), scale=stats.sem(squared_errors)))

## Exercises

#### 1.
Try a Support Vector Machine regressor (```sklearn.svm.SVR```) with various hyperparameters, such as ```kernel="linear"``` (with various values for the ```C``` hyperparameter) or ```kernel="rbf"``` (with various values for the ```C``` and ```gamma``` hyperparameters). Don’t worry about what these hyperparameters mean for now. How does the best ```SVR``` predictor perform?

In [None]:
from sklearn.svm import SVR

param_grid = [
    {"kernel": ["linear"], "C": [128, 256, 512, 1024]},
    {"kernel": ["rbf"], "C": [128, 256, 512, 1024], "gamma": [0.1, 0.5, 1, 2]}
]

svr_reg = SVR()
grid_search_svr = GridSearchCV(svr_reg, param_grid, cv=5, scoring="neg_root_mean_squared_error", return_train_score=True, 
                               n_jobs=-1, verbose=2, pre_dispatch="2*n_jobs")
grid_search_svr.fit(housing_prepared, housing_labels)

In [None]:
grid_search_svr.best_params_

In [None]:
grid_search_svr.best_estimator_

In [None]:
grid_search_svr_rmse = -grid_search_svr.best_score_
grid_search_svr_rmse

#### 2.
Try replacing ```GridSearchCV``` with ```RandomizedSearchCV```.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_rand = {
    "kernel": ["linear", "rbf"],
    "C": uniform(0, 1000),
    "gamma": uniform(0, 10)
}

svr_reg = SVR()
n_iter = 20
rand_search = RandomizedSearchCV(svr_reg, param_rand, cv=5, scoring="neg_root_mean_squared_error", return_train_score=True, 
                                 n_jobs=-1, verbose=2, pre_dispatch="2*n_jobs", n_iter=n_iter, random_state=42)
rand_search.fit(housing_prepared, housing_labels)

In [None]:
rand_search.best_params_

In [None]:
rand_search.best_estimator_

In [None]:
rand_search_rmse = -rand_search.best_score_
rand_search_rmse

#### 3. 
Try adding a transformer in the preparation pipeline to select only the most important attributes.

In [None]:
class AttributesSelector(BaseEstimator, TransformerMixin):
    """Keep only the ``attribs_n`` columns with the largest feature importances.

    NOTE(review): ``fit`` does not compute importances from ``X``; it copies the
    notebook-level ``feature_importances`` array, which must already exist.
    """
    def __init__(self, attribs_n):
        self.attribs_n = attribs_n
        self.feature_importances = None  # set by fit()
        self.attribs_list = None  # sorted column indices to keep, set by fit()
    def fit(self, X, y=None):
        """Record the indices of the ``attribs_n`` most important columns and return self."""
        self.feature_importances = feature_importances # placeholder: read from the notebook global instead of being calculated from X
        # argsort is ascending, so the last attribs_n entries are the most important
        # ones; sorting them puts the kept columns back in their original order.
        self.attribs_list = sorted(np.argsort(self.feature_importances)[-self.attribs_n:])
        return self
    def transform(self, X, y=None):
        """Return the columns of ``X`` selected by ``fit``."""
        return X[:, self.attribs_list]

In [None]:
attribs_n = 5
full_select_pipeline = Pipeline([
    ("full", full_pipeline),
    ("select", AttributesSelector(attribs_n))
], verbose=True)

housing_selector_prepared = full_select_pipeline.fit_transform(housing)
housing_selector_prepared[0]

In [None]:
[feature for feature in feature_importance_tuples[:attribs_n]]

In [None]:
important_features_indices = sorted(np.argsort(feature_importances)[-attribs_n:])
important_features_indices

In [None]:
np.all(housing_selector_prepared == housing_prepared[:, important_features_indices])

#### 4. 
Try creating a single pipeline that does the full data preparation plus the final prediction.

In [None]:
from sklearn.base import clone

# Pipeline.fit() trains its steps in place. Passing final_model itself would
# retrain grid_search.best_estimator_ (the very same object) on the reduced
# feature set, breaking any later use of it on the full housing_prepared columns.
# An unfitted copy with identical hyperparameters is trained instead.
full_select_predict_pipeline = Pipeline([
    ("full_select", full_select_pipeline),
    ("predict", clone(final_model))
], verbose=True)

full_select_predict_pipeline.fit(housing, housing_labels)

In [None]:
some_data = housing.iloc[:5]
some_labels = housing_labels.iloc[:5]

In [None]:
print("Predictions:", full_select_predict_pipeline.predict(some_data))

In [None]:
print("Labels:", list(some_labels))

#### 5. 
Automatically explore some preparation options using GridSearchCV.

In [None]:
# Keys follow the nesting of the pipeline: "full_select" -> "full" -> "num" -> step.
param_grid = [{
    # The numeric sub-pipeline has no step called "imputer" (its imputers are
    # "imputer_1" and "imputer_2"), so the first-stage imputer is addressed here.
    "full_select__full__num__imputer_1__strategy": ["mean", "median", "most_frequent"],
    "full_select__select__attribs_n": list(range(1, len(attributes) + 1))
}]

grid_search_prep = GridSearchCV(full_select_predict_pipeline, param_grid, cv=5, scoring="neg_root_mean_squared_error",
                                return_train_score=True, n_jobs=-1, verbose=2, pre_dispatch="2*n_jobs")
grid_search_prep.fit(housing, housing_labels)

In [None]:
grid_search_prep.best_params_

In [None]:
grid_search_prep.best_estimator_

In [None]:
grid_search_prep_rmse = -grid_search_prep.best_score_
grid_search_prep_rmse