# Data description and plotting

In [None]:
from sklearn.datasets import fetch_california_housing
import plotly.graph_objects as go
import numpy as np

In [None]:
# Load the California housing dataset with pandas containers (as_frame=True).
california_housing = fetch_california_housing(as_frame=True)
# `df` is the same object as `california_housing.frame` (no copy is made),
# so in-place edits to `df` are visible through both names.
df = california_housing.frame
# Bare expression at the end of the cell: the notebook renders the frame.
df

In [None]:
# Print the free-text description that ships with the dataset
print(california_housing.DESCR)

In [None]:
# Summary statistics per column, using pandas' built-in describe();
# as the last expression of the cell, the resulting table is displayed.
df.describe()

In [None]:
# One 30-bin histogram per column on a large canvas. The result is bound to a
# name so the notebook shows only the plots, not the returned value's repr.
fig = df.hist(figsize=(20, 15), bins=30)

In [None]:
# Interactive histogram of the target column via the plotly plotting backend.
# The bin count is given as plotly's own `nbins` keyword: plotly's pandas
# backend does not forward pandas' `bins` argument to the figure, so
# `bins=30` would be silently ignored. Extra keywords such as `nbins` are
# passed through to the backend unchanged.
fig = df["MedHouseVal"].hist(nbins=30, backend="plotly")
# Last expression of the cell: applies the dark theme and renders the figure.
fig.update_layout(template="plotly_dark")

# Principal component analysis

In [None]:
import plotly.graph_objects as go
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA

In [None]:
# Iris data as a single frame; split it into the feature columns and the
# class-label column.
iris_df = load_iris(as_frame=True).frame
X = iris_df.drop(columns="target")
y = iris_df["target"]
iris_df

In [None]:

# Fit a two-component PCA on the iris features and project the samples.
pca = PCA(n_components=2).fit(X)
t = pca.transform(X)

# Scatter plot of the two projected coordinates, coloured by class label.
fig = go.Figure(
    data=[
        go.Scatter(
            mode="markers",
            x=t[:, 0],
            y=t[:, 1],
            marker={"color": y},
        )
    ]
)
fig.update_layout(template="plotly_dark")
fig.show()

# Preprocessing missing values

In [None]:
# Display the housing frame again before missing values are introduced below.
df

In [None]:
# Boolean selector for the rows whose median income exceeds 10.
mask = df["MedInc"].gt(10)

In [None]:
# Simulate missing data: blank out `MedInc` on the masked rows.
# NOTE: this edits `df` in place (no copy is made), so every later use of
# `df` sees these NaN values unless they are filled or dropped first.
df.loc[mask, "MedInc"] = np.nan

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Option 1: preview nearest-value interpolation on the rows that were blanked.
# The result is only displayed, not stored, so `df` keeps its NaN values.
df.interpolate(method="nearest").loc[mask]

In [None]:
# Option 2: preview filling every gap with the constant 10 on the blanked rows.
# The result is only displayed, not stored, so `df` keeps its NaN values.
df.fillna(10).loc[mask]

In [None]:
# Option 3: preview the frame with incomplete rows removed.
# dropna() returns a new frame; since it is not assigned, `df` is unchanged.
df.dropna()

# Scaling and encoding

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder

In [None]:
# Toy matrix for the scaler demos: four rows, two columns on different scales.
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]

In [None]:
# Rescale the toy data into the feature range [0, 1]
# (fit and transform in one call; the resulting array is displayed).
MinMaxScaler(feature_range=(0, 1)).fit_transform(data)

In [None]:
# Standardise the toy data by removing the mean and scaling to unit variance
# (fit and transform in one call; the resulting array is displayed).
StandardScaler().fit_transform(data)

In [None]:
# Toy categorical labels with three distinct classes, for the encoder demos.
label_data = [
    "cat", "dog", "cat", "fish",
    "dog", "cat", "dog", "fish",
]


In [None]:
# Encode each string label as an integer class code.
labels = LabelEncoder().fit_transform(label_data)
# Display the encoded labels.
labels

In [None]:
# Encode the same labels as a binary indicator matrix instead of integer codes.
LabelBinarizer().fit_transform(label_data)

# Data pipeline & scoring

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_validate

In [None]:
# The three stages of the regression pipeline, kept as separate names so they
# can be inspected or swapped individually.
scaler = StandardScaler()
dim_reducer = PCA(n_components=6)
estimator = RandomForestRegressor(max_depth=10, n_estimators=30)

# Chain them: standardise -> reduce to 6 components -> random forest.
pipeline = Pipeline(
    steps=[
        ("scaler", scaler),
        ("dim_reducer", dim_reducer),
        ("rf", estimator),
    ]
)

In [None]:
# Build the features/target from a freshly loaded frame rather than `df`:
# the missing-value cells set `MedInc` to NaN in `df` for incomes above 10
# and never stored a repaired frame, and the PCA step of the pipeline cannot
# be fitted on input that contains NaN.
housing_df = fetch_california_housing(as_frame=True).frame
X, y = housing_df.drop("MedHouseVal", axis=1), housing_df["MedHouseVal"]

In [None]:
# Names of the scorers to evaluate; the available names are listed at
# https://scikit-learn.org/stable/modules/model_evaluation.html
scoring_metrics = (
    "r2",
    "neg_mean_absolute_error",
)

In [None]:
# 5-fold cross-validation of the pipeline, scoring both train and test folds
# with every metric in `scoring_metrics`. `scores` maps "train_<metric>" and
# "test_<metric>" to one value per fold.
scores = cross_validate(
    pipeline, X, y, cv=5, scoring=scoring_metrics, return_train_score=True
)

# (printed label, scorer name, sign). The "neg_mean_absolute_error" scorer
# yields the *negated* error, so its sign is flipped to report a true,
# non-negative MAE under the "MAE" label.
for label, metric, sign in (
    ("r2", "r2", 1),
    ("MAE", "neg_mean_absolute_error", -1),
):
    train_values = sign * scores[f"train_{metric}"]
    test_values = sign * scores[f"test_{metric}"]
    mean_train_score, std_train_score = train_values.mean(), train_values.std()
    mean_test_score, std_test_score = test_values.mean(), test_values.std()
    print(f"Train {label}: {mean_train_score:0.3f} (+/- {std_train_score:0.2f})")
    print(f"Test {label}: {mean_test_score:0.3f} (+/- {std_test_score:0.2f})")


# Task
Play around with the parameters to get a better score for the pipeline.
What happens if we remove the PCA step, for instance?
Do the scores change?