In [None]:
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request

In [None]:
def load_housing_data():
    """Download (if needed), unpack and load the housing dataset.

    The tarball is fetched into ``datasets/housing.tgz`` only when that file
    does not exist yet. The archive is extracted into ``datasets/`` on every
    call, and ``datasets/housing/housing.csv`` is then read.

    Returns:
        pandas.DataFrame: the contents of ``datasets/housing/housing.csv``.
    """
    tarball_path = Path("datasets/housing.tgz")
    if not tarball_path.is_file():
        Path("datasets").mkdir(parents=True, exist_ok=True)
        url = "https://github.com/ageron/data/raw/main/housing.tgz"
        urllib.request.urlretrieve(url, tarball_path)
    with tarfile.open(tarball_path) as housing_tarball:
        # The archive comes from the network, so extract it with the "data"
        # filter (rejects absolute paths, links escaping the destination,
        # device files, ...) whenever this Python version supports it.
        if hasattr(tarfile, "data_filter"):
            housing_tarball.extractall(path="datasets", filter="data")
        else:
            housing_tarball.extractall(path="datasets")
    return pd.read_csv(Path("datasets/housing/housing.csv"))

In [None]:
housing = load_housing_data()

In [None]:
#lectura rapida
housing.head()

In [None]:
#info rapida
housing.info()

In [None]:
housing["ocean_proximity"].value_counts()

In [None]:
#metodos numericos
housing.describe()

In [None]:
import matplotlib.pyplot as plt
housing.hist(bins=50, figsize=(12,8))
plt.show()


In [None]:
from sklearn.model_selection import train_test_split
import numpy as np


In [None]:
def shuffle_and_split_data(data, test_ratio, random_state=None):
    """Randomly split ``data`` into a training part and a test part.

    Args:
        data: object supporting ``len()`` and positional ``.iloc`` indexing
            (e.g. a pandas DataFrame).
        test_ratio: fraction of rows assigned to the test part; the test size
            is ``int(len(data) * test_ratio)``.
        random_state: optional integer seed. When given, the shuffle is drawn
            from a dedicated ``RandomState`` so the split is reproducible.
            When ``None`` (default) NumPy's global random state is used,
            exactly as before.

    Returns:
        tuple: ``(train, test)`` selections of ``data``.
    """
    if random_state is None:
        shuffle_indices = np.random.permutation(len(data))
    else:
        shuffle_indices = np.random.RandomState(random_state).permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffle_indices[:test_set_size]
    train_indices = shuffle_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]

In [None]:
train_set, test_set = shuffle_and_split_data(housing,0.2)
len(train_set)


In [None]:
len(test_set)

In [None]:
# housing.info() reports: total_bedrooms  20433 non-null  float64 -> some values are missing.
# Fill the gaps with the column median. The result is assigned back to the column
# instead of calling fillna(inplace=True) on the selection: that form is chained
# assignment, which pandas 2.x warns about and which has no effect on `housing`
# once Copy-on-Write is enabled.
# NOTE(review): this median is computed before any train/test split is made in the
# cells below, so test rows contribute to it - confirm this is acceptable here.
median = housing["total_bedrooms"].median()
housing["total_bedrooms"] = housing["total_bedrooms"].fillna(median)

In [None]:
housing.info()

In [None]:
from zlib import crc32

def is_id_in_set(identifier, test_ratio):
    """Tell whether ``identifier`` hashes into the test portion.

    The identifier is converted to ``np.int64``, its CRC32 checksum is
    computed, and the checksum is compared with ``test_ratio * 2**32``.

    Returns:
        bool: ``True`` when the checksum is strictly below the threshold.
    """
    checksum = crc32(np.int64(identifier))
    threshold = test_ratio * 2**32
    return checksum < threshold

def split_data_with_id_hash(data, test_ratio, id_column):
    """Split ``data`` into train/test parts based on a hash of ``id_column``.

    Each value of ``data[id_column]`` is passed to ``is_id_in_set``; rows for
    which it returns ``True`` form the test part, the others the train part.

    Returns:
        tuple: ``(train, test)`` selections of ``data``.
    """
    test_mask = data[id_column].apply(lambda id_: is_id_in_set(id_, test_ratio))
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part


In [None]:
# Use the row index as the identifier for the hash-based split.
housing_with_id = housing.reset_index() # adds an "index" column
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, "index")


In [None]:
housing_with_id["id"] = housing["longitude"]* 1000+housing["latitude"]
train_set, test_set = split_data_with_id_hash(housing_with_id, 0.2, "id")

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(housing, test_size=0.2,random_state=42)

In [None]:
# Bucket median_income into five labelled categories (1-5) using the bin edges
# 0, 1.5, 3.0, 4.5, 6.0 and +inf; the category is used below for stratified sampling.
housing["income_cat"] =pd.cut(housing["median_income"],
                              bins=[0., 1.5,3.0,4.5,6.0, np.inf],
                              labels=[1,2,3,4,5])

In [None]:
housing["income_cat"].value_counts().sort_index().plot.bar(rot=0, grid=True)
plt.xlabel("Income Category")
plt.ylabel("Number of districts")


In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Generate 10 different stratified train/test splits (20% test each), stratifying
# on income_cat, and collect them as [train, test] pairs in strats_split.
splitter = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=42)
strats_split = []
for train_index, test_index in splitter.split(housing, housing["income_cat"]):
    # The splitter yields positional indices, hence .iloc.
    strat_train_n = housing.iloc[train_index]
    strat_test_n = housing.iloc[test_index]
    strats_split.append([strat_train_n, strat_test_n])

In [None]:
strat_train_set, strat_test_set = strats_split[0]

In [None]:
strat_train_set, strat_test_set =train_test_split(housing, test_size=0.2, random_state=42, stratify=housing["income_cat"])

In [None]:
strat_test_set["income_cat"].value_counts()/ len(strat_test_set)

In [None]:
# income_cat is dropped from both stratified sets here.
# Rebinding (instead of drop(..., inplace=True)) avoids mutating frames handed
# back by the splitter, and errors="ignore" keeps the cell safe to re-run after
# the column has already been removed.
strat_train_set = strat_train_set.drop(columns="income_cat", errors="ignore")
strat_test_set = strat_test_set.drop(columns="income_cat", errors="ignore")

In [None]:
housing = strat_train_set.copy()

In [None]:
import matplotlib.pyplot as plt
# Geographic scatter plot of the districts (longitude vs. latitude).
housing.plot(kind="scatter", x="longitude", y="latitude", grid=True, alpha=0.2,
              s=housing["population"]/100,label="population",c="median_house_value", cmap="jet", colorbar=True,
              legend=True, sharex=False,figsize=(10,7))
# "s" sets the marker size from the population, and "c" colors each point by the house value
plt.legend()

In [None]:
corr_matrix = housing.corr(numeric_only=True)

In [None]:
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
from pandas.plotting import scatter_matrix

attributes = ["median_house_value", "median_income", "total_rooms",
              "housing_median_age"]
scatter_matrix(housing[attributes], figsize=(12,8))

In [None]:
housing.plot(kind="scatter", x="median_income", y="median_house_value",alpha=0.1, grid=True)

In [None]:
# New derived attributes: ratios built from the raw count columns.
housing["rooms_per_house"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_ratio"] = housing["total_bedrooms"]/ housing["total_rooms"]
housing["people_per_house"] = housing["population"]/ housing["households"]

In [None]:
corr_matrix = housing.corr(numeric_only=True)
corr_matrix["median_house_value"].sort_values(ascending=False)

In [None]:
housing = strat_train_set.drop("median_house_value", axis=1)
housing_labels = strat_train_set["median_house_value"].copy()

In [None]:
# housing_labels.head()
# housing.head

In [None]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median")

In [None]:
# Build a frame holding only the numeric attributes
housing_num = housing.select_dtypes(include=[np.number])

In [None]:
imputer.fit(housing_num)

In [None]:
imputer.statistics_

In [None]:
housing_num.median().values

In [None]:
# Use the fitted imputer to transform the training data, replacing the
# missing values with the learned medians.
# transform() returns the imputed array; fit() only returns the imputer itself.
X = imputer.transform(housing_num)

In [None]:
# Wrap X in a DataFrame again, restoring the column names and the index of housing_num
housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)

In [None]:
housing_tr.head()

In [None]:
housing_cat = housing[["ocean_proximity"]]

housing_cat.head(8)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ordinal_encoder = OrdinalEncoder()

housing_cat_encoder = ordinal_encoder.fit_transform(housing_cat)

housing_cat_encoder[:8]

In [None]:
ordinal_encoder.categories_

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
cat_encoder = OneHotEncoder()
housing_cat_1hot = cat_encoder.fit_transform(housing_cat)

In [None]:
# housing_cat_1hot.toarray()

In [None]:
cat_encoder.categories_

In [None]:
df_test = pd.DataFrame({"ocean_proximity":["INLAND", "NEAR BAY"]})

pd.get_dummies(df_test)

In [None]:
cat_encoder.transform(df_test)

In [None]:
df_test_unknow = pd.DataFrame({"ocean_proximity": ["<2H OCEAN", "ISLAND"]})
pd.get_dummies(df_test_unknow)

In [None]:
cat_encoder.handle_unknown = "ignore"
cat_encoder.transform(df_test_unknow)



In [None]:
cat_encoder.feature_names_in_

In [None]:

cat_encoder.get_feature_names_out()

In [None]:
# df_output = pd.DataFrame(cat_encoder.transform(df_test_unknow),
#                          columns=cat_encoder.get_feature_names_out(),
#                          index=df_test_unknow.index)


In [None]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler(feature_range=(-1,1))
housing_num_min_max_scaler = min_max_scaler.fit_transform(housing_num)

In [None]:
from sklearn.preprocessing import StandardScaler

std_scaler = StandardScaler()

housing_num_std_scaler = std_scaler.fit_transform(housing_num)

In [None]:
from sklearn.metrics.pairwise import rbf_kernel

age_simil_35 = rbf_kernel(housing[["housing_median_age"]], [[35]],gamma=0.1)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
# Manual target scaling: standardize the labels, fit on the scaled labels, then
# map the predictions back to the original scale with inverse_transform.
target_scaler = StandardScaler()
# to_frame() turns the label Series into a single-column DataFrame before scaling.
scaled_labels = target_scaler.fit_transform(housing_labels.to_frame())

model = LinearRegression()
model.fit(housing[["median_income"]], scaled_labels)

some_new_data = housing[["median_income"]].iloc[:5] # pretend these are new data

scaled_predictions = model.predict(some_new_data)
predictions = target_scaler.inverse_transform(scaled_predictions)


In [None]:
from sklearn.compose import TransformedTargetRegressor

# Same regression on median_income, but with the label scaling delegated to
# TransformedTargetRegressor (StandardScaler passed as the target transformer).
model = TransformedTargetRegressor(LinearRegression(),
                                   transformer=StandardScaler())


# Fit directly on the unscaled labels.
model.fit(housing[["median_income"]],housing_labels)
predictions = model.predict(some_new_data)


In [None]:
# Logarithm transformer:
# it is often useful to replace heavy-tailed features
# with their logarithm.
from sklearn.preprocessing import FunctionTransformer
# np.exp is registered as the inverse of np.log.
log_transformer = FunctionTransformer(np.log, inverse_func=np.exp)
log_pop = log_transformer.transform(housing[["population"]])
log_pop

In [None]:
# Build a transformer that computes the same Gaussian RBF similarity measure as before
# (rbf_kernel against the single reference value 35, gamma=0.1).
rbr_transformer = FunctionTransformer(rbf_kernel,
                                      kw_args=dict(Y=[[35.]],gamma=0.1))

age_simil_35 = rbr_transformer.transform(housing[["housing_median_age"]])

In [None]:
# Reference point as a (latitude, longitude) tuple, matching the column order used below.
# NOTE(review): presumably the coordinates of San Francisco, judging by the name - confirm.
sf_coords = 37.7749,-122.41
# RBF similarity between each district's coordinates and the reference point.
sf_transform = FunctionTransformer(rbf_kernel,
                                   kw_args=dict(Y=[sf_coords],gamma=0.1))
sf_simil = sf_transform.transform(housing[["latitude", "longitude"]])

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.validation import check_array, check_is_fitted

class StandardScalerClone(BaseEstimator, TransformerMixin):
    """Minimal standard scaler: optionally center, then divide by the std.

    Parameters:
        with_mean: when truthy (default ``True``), the per-column mean learned
            in ``fit`` is subtracted before scaling.

    Learned attributes (set by ``fit``; the trailing underscore follows the
    scikit-learn convention for fitted state):
        mean_: per-column mean of the training data.
        scale_: per-column standard deviation of the training data.
        n_features_in_: number of columns seen during ``fit``.
    """

    def __init__(self, with_mean=True):
        self.with_mean = with_mean

    def fit(self, X, y=None):  # y is part of the estimator API even though it is unused
        """Learn the per-column mean and standard deviation of ``X``."""
        X = check_array(X)  # validates X and converts it to an array
        self.mean_ = X.mean(axis=0)
        self.scale_ = X.std(axis=0)
        self.n_features_in_ = X.shape[1]
        return self

    def transform(self, X):
        """Return ``X`` scaled (and centered when ``with_mean`` is truthy)."""
        check_is_fitted(self)  # relies on the learned attributes ending in "_"
        X = check_array(X)
        assert self.n_features_in_ == X.shape[1]
        if self.with_mean:
            X = X - self.mean_
        # Scaling applies whether or not the data was centered.
        return X / self.scale_


In [None]:
from sklearn.cluster import KMeans
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer producing RBF similarities to K-Means cluster centers.

    ``fit`` trains a ``KMeans`` model on ``X``; ``transform`` returns, for each
    row, ``rbf_kernel`` values against every learned cluster center.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit the underlying KMeans model, forwarding ``sample_weight``."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            n_init=10,
            random_state=self.random_state,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Return the RBF similarity of each row of ``X`` to each center."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, input_features=None):
        """Return one name per cluster: "Cluster_0_similarity", "Cluster_1_similarity", ..."""
        names = []
        for cluster_idx in range(self.n_clusters):
            names.append(f"Cluster_{cluster_idx}_similarity")
        return names


In [None]:
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
similarities = cluster_simil.fit_transform(housing[["latitude", "longitude"]],
                                           sample_weight=housing_labels)


In [None]:
similarities[:3].round(2)

In [None]:
housing_renamed = housing.rename(columns={
    "latitude": "Latitude", "longitude": "Longitude",
    "population": "Population",
    "median_house_value": "Median house value (ᴜsᴅ)"})
housing_renamed["Max cluster similarity"] = similarities.max(axis=1)

housing_renamed.plot(kind="scatter", x="Longitude", y="Latitude", grid=True,
                     s=housing_renamed["Population"] / 100, label="Population",
                     c="Max cluster similarity",
                     cmap="jet", colorbar=True,
                     legend=True, sharex=False, figsize=(10, 7))
plt.plot(cluster_simil.kmeans_.cluster_centers_[:, 1],
         cluster_simil.kmeans_.cluster_centers_[:, 0],
         linestyle="", color="black", marker="X", markersize=20,
         label="Cluster centers")
plt.legend(loc="upper right")
# save_fig("district_cluster_plot")
plt.show()

In [None]:
from sklearn.pipeline import Pipeline

num_pipeline = Pipeline([
    ("impute", SimpleImputer(strategy="median")),
    ("standarize", StandardScaler())
]
)

In [None]:
housing_num_prepared = num_pipeline.fit_transform(housing_num)
housing_num_prepared[:2].round(2)

In [None]:
df_housing_num_prepared = pd.DataFrame(
    housing_num_prepared, columns=num_pipeline.get_feature_names_out(),
      index=housing_num.index
)

In [None]:
df_housing_num_prepared[:2].round(2)

In [None]:
# Column names of the feature frame as a plain Python list.
# to_list must be called; without the parentheses the bound method itself is stored.
lista = housing.columns.to_list()
lista



In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

# Explicit column lists: numeric attributes go through num_pipeline,
# the categorical attribute through cat_pipeline.
num_attribs = ['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       ]
cat_attribs = ['ocean_proximity']

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode (handle_unknown="ignore" is set for categories unseen at fit time).
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

# Apply each pipeline to its own set of columns.
preprocesing = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", cat_pipeline, cat_attribs)])

In [None]:
from sklearn.compose import make_column_selector, make_column_transformer

# Same preprocessing, but the columns are selected by dtype instead of by name:
# numeric columns go to num_pipeline, object columns to cat_pipeline.
preprocesing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
     (cat_pipeline, make_column_selector(dtype_include=object))
)

In [None]:
housing_prepared = preprocesing.fit_transform(housing)

In [None]:
preprocesing.get_feature_names_out()

In [None]:
def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, element-wise.

    ``X`` is indexed as ``X[:, [k]]``, so each operand keeps its column axis.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

def ratio_name(function_transformer, feature_names_in):
    """Return the output feature names: always the single name "ratio".

    Both arguments are ignored.
    """
    output_names = ["ratio"]
    return output_names

def ratio_pipeline():
    """Build a fresh pipeline: median imputation -> column ratio -> standard scaling.

    A new pipeline object is created on every call.
    """
    imputer_step = SimpleImputer(strategy="median")
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler_step = StandardScaler()
    return make_pipeline(imputer_step, ratio_step, scaler_step)
# Log branch: median imputation, element-wise log, then standard scaling.
log_pipeline = make_pipeline(
        SimpleImputer(strategy="median"),
        FunctionTransformer(np.log, feature_names_out="one-to-one"),
        StandardScaler())
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)

# Default treatment for numeric columns not listed explicitly below.
defaul_num_pipeline = make_pipeline(SimpleImputer(strategy="median"),
                                    StandardScaler())


# Full preprocessing: three ratio features, log-transformed counts, geographic
# cluster similarities and the categorical branch.
preprocesing = ColumnTransformer([
    ("bedrooms", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
    ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
    ("people_per_house", ratio_pipeline(),["population", "households"]),
    ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                            "households", "median_income"]),
    ("geo", cluster_simil, ["latitude", "longitude"]),
    ("cat", cat_pipeline,make_column_selector(dtype_include=object))

],
remainder=defaul_num_pipeline)# one remaining column: housing_median_age


In [None]:
housing_prepared = preprocesing.fit_transform(housing)
# Only the last expression of a cell is displayed, so print the shape explicitly.
print(housing_prepared.shape)
# get_feature_names_out must be called to obtain the names (not just referenced).
preprocesing.get_feature_names_out()

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = make_pipeline(preprocesing, LinearRegression())
lin_reg.fit(housing, housing_labels)

In [None]:
housing_predictions = lin_reg.predict(housing)

housing_predictions[:5].round(-2)

In [None]:
housing_labels.iloc[:5].values

In [None]:
# Prefer sklearn's own root_mean_squared_error; when the import fails, define
# an equivalent on top of mean_squared_error.
try:
    from sklearn.metrics import root_mean_squared_error
except ImportError:
    from sklearn.metrics import mean_squared_error

    def root_mean_squared_error(labels, predictions):
        """Fallback RMSE computed as mean_squared_error(..., squared=False)."""
        return mean_squared_error(labels, predictions, squared=False)

# RMSE of the linear model, computed on the same data it was fitted on.
lin_rmse = root_mean_squared_error(housing_labels, housing_predictions)
lin_rmse

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = make_pipeline(preprocesing, DecisionTreeRegressor(random_state=42))

tree_reg.fit(housing, housing_labels)

In [None]:
housing_predictions = tree_reg.predict(housing)

In [None]:
tree_rmse = root_mean_squared_error(housing_labels, housing_predictions)

In [None]:
tree_rmse

In [None]:
from sklearn.model_selection import cross_val_score

tree_rmses = -cross_val_score(tree_reg, housing, housing_labels,
                              scoring="neg_root_mean_squared_error", cv=10)

In [None]:
pd.Series(tree_rmses).describe()

In [None]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = make_pipeline(preprocesing,
                           RandomForestRegressor(random_state=42))

forest_rmeses = -cross_val_score(forest_reg, housing, housing_labels,
                                  scoring="neg_root_mean_squared_error",cv=10)

In [None]:

from sklearn.model_selection import GridSearchCV
# Named steps so hyperparameters can be addressed as "<step>__<param>".
full_pipeline = Pipeline([
    ("preprocessing", preprocesing),
    ("random_forest", RandomForestRegressor(random_state=42)),
])
# Two grids: 3x3 + 2x3 = 15 combinations.
# NOTE(review): both grids contain n_clusters=10 with max_features 6 and 8, so
# those two combinations are evaluated twice.
param_grid = [
    {'preprocessing__geo__n_clusters':[5,8,10],
     'random_forest__max_features': [4,6,8]},
    {'preprocessing__geo__n_clusters': [10,15],
     'random_forest__max_features':[6,8,10]}
]

# 3-fold cross-validation scored by negative RMSE, using all available cores.
grid_search = GridSearchCV(full_pipeline, param_grid,cv=3,
                           scoring='neg_root_mean_squared_error',n_jobs=-1)

In [None]:
grid_search.fit(housing,housing_labels)

In [None]:
grid_search.best_params_

{'preprocessing__geo__n_clusters': 15, 'random_forest__max_features': 6}

In [None]:
grid_search.cv_results_

{'mean_fit_time': array([18.63110542, 33.12425741, 39.50145014, 20.19996516, 34.01026066,
        44.01632229, 24.23766605, 33.16129708, 39.4776117 , 33.56318164,
        37.27506193, 57.73467048, 37.42364542, 40.89844561, 38.6618255 ]),
 'std_fit_time': array([2.89514289, 6.26864851, 3.14385128, 1.14015037, 4.55407141,
        7.53477791, 2.91586585, 4.90682359, 5.58765733, 6.70157835,
        2.53348336, 1.98282506, 2.08723199, 2.42905653, 1.54014842]),
 'mean_score_time': array([0.6752522 , 0.65860661, 1.89721068, 1.41724626, 0.83586137,
        1.30377992, 0.91812499, 0.80304599, 0.65589762, 2.37694089,
        0.94585196, 0.55637081, 0.82405535, 0.91389863, 1.00278147]),
 'std_score_time': array([0.17751513, 0.09555495, 1.64288398, 0.93522503, 0.20893914,
        0.60460276, 0.18295685, 0.25879689, 0.11545179, 0.81295311,
        0.53140124, 0.01839934, 0.31358449, 0.49295903, 0.36758901]),
 'param_preprocessing__geo__n_clusters': masked_array(data=[5, 5, 5, 8, 8, 8, 10, 10, 10, 1

In [None]:
cv_res = pd.DataFrame(grid_search.cv_results_)
cv_res.sort_values(by="mean_test_score",ascending=False,inplace=True)
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_preprocessing__geo__n_clusters,param_random_forest__max_features,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
12,37.423645,2.087232,0.824055,0.313584,15,6,"{'preprocessing__geo__n_clusters': 15, 'random...",-43006.650208,-43669.481935,-44192.812758,-43622.981634,485.363855,1
13,40.898446,2.429057,0.913899,0.492959,15,8,"{'preprocessing__geo__n_clusters': 15, 'random...",-43696.807883,-44045.517131,-44797.314782,-44179.879932,459.215912,2
7,33.161297,4.906824,0.803046,0.258797,10,6,"{'preprocessing__geo__n_clusters': 10, 'random...",-43709.66105,-44111.823218,-45041.163417,-44287.549228,557.604616,3
9,33.563182,6.701578,2.376941,0.812953,10,6,"{'preprocessing__geo__n_clusters': 10, 'random...",-43709.66105,-44111.823218,-45041.163417,-44287.549228,557.604616,3
6,24.237666,2.915866,0.918125,0.182957,10,4,"{'preprocessing__geo__n_clusters': 10, 'random...",-43797.854175,-44218.235341,-44972.030745,-44329.37342,485.754677,5
14,38.661825,1.540148,1.002781,0.367589,15,10,"{'preprocessing__geo__n_clusters': 15, 'random...",-44209.438092,-44583.303303,-45134.516273,-44642.419222,379.967919,6
3,20.199965,1.14015,1.417246,0.935225,8,4,"{'preprocessing__geo__n_clusters': 8, 'random_...",-44386.450165,-44512.240149,-45280.444382,-44726.378232,395.135212,7
4,34.010261,4.554071,0.835861,0.208939,8,6,"{'preprocessing__geo__n_clusters': 8, 'random_...",-44578.615017,-44732.042571,-45455.954971,-44922.204186,382.581084,8
8,39.477612,5.587657,0.655898,0.115452,10,8,"{'preprocessing__geo__n_clusters': 10, 'random...",-44498.988402,-44978.63769,-45679.14741,-45052.257834,484.602081,9
10,37.275062,2.533483,0.945852,0.531401,10,8,"{'preprocessing__geo__n_clusters': 10, 'random...",-44498.988402,-44978.63769,-45679.14741,-45052.257834,484.602081,9


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Integer distributions to sample each hyperparameter from.
param_distribs = {'preprocessing__geo__n_clusters': randint(low=3, high=50),
                  'random_forest__max_features': randint(low=2, high=20)}

# Randomized search: 10 sampled combinations, 3-fold CV, scored by negative RMSE;
# random_state=42 is passed to the search.
rnd_serarch = RandomizedSearchCV(full_pipeline, param_distributions=param_distribs, n_iter=10,
                                 cv=3,n_jobs=-1,scoring='neg_root_mean_squared_error',random_state=42)

rnd_serarch.fit(housing,housing_labels)

In [None]:
final_model = rnd_serarch.best_estimator_
feature_importances = final_model["random_forest"].feature_importances_

In [None]:
feature_importances.round(2)

array([0.06, 0.05, 0.05, 0.01, 0.01, 0.01, 0.01, 0.19, 0.01, 0.01, 0.02,
       0.04, 0.01, 0.  , 0.02, 0.01, 0.01, 0.  , 0.02, 0.01, 0.  , 0.01,
       0.01, 0.01, 0.01, 0.02, 0.02, 0.01, 0.01, 0.01, 0.02, 0.01, 0.01,
       0.01, 0.01, 0.01, 0.01, 0.  , 0.01, 0.01, 0.02, 0.01, 0.01, 0.02,
       0.01, 0.01, 0.01, 0.01, 0.02, 0.02, 0.01, 0.01, 0.01, 0.  , 0.08,
       0.  , 0.  , 0.  , 0.01])

In [None]:
sorted(zip(feature_importances,
           final_model["preprocessing"].get_feature_names_out()),reverse=True)

[(np.float64(0.1918683750201251), 'log__median_income'),
 (np.float64(0.07578507877721054), 'cat__ocean_proximity_INLAND'),
 (np.float64(0.06386172808342915), 'bedrooms__ratio'),
 (np.float64(0.051952550969981644), 'rooms_per_house__ratio'),
 (np.float64(0.047894693052643524), 'people_per_house__ratio'),
 (np.float64(0.04194493610673221), 'geo__Cluster_3_similarity'),
 (np.float64(0.022594096329412425), 'geo__Cluster_18_similarity'),
 (np.float64(0.022225321712153245), 'geo__Cluster_17_similarity'),
 (np.float64(0.02210833076731358), 'geo__Cluster_22_similarity'),
 (np.float64(0.018499605133320433), 'geo__Cluster_40_similarity'),
 (np.float64(0.017685021452087165), 'geo__Cluster_10_similarity'),
 (np.float64(0.0174618712350783), 'geo__Cluster_35_similarity'),
 (np.float64(0.017367410300855365), 'geo__Cluster_41_similarity'),
 (np.float64(0.01642428932110295), 'geo__Cluster_2_similarity'),
 (np.float64(0.016317284791765075), 'geo__Cluster_6_similarity'),
 (np.float64(0.01516491475663121

In [None]:
x_test = strat_test_set.drop("median_house_value", axis=1)
y_test = strat_test_set["median_house_value"].copy()

In [None]:
final_predictions = final_model.predict(x_test)

In [None]:
final_rmse = root_mean_squared_error(y_test, final_predictions)
print(final_rmse)

41605.466105927866


In [None]:
from scipy import stats
# NOTE(review): 0.5 yields a 50% interval; 0.95 is the conventional level - confirm the intent.
condifence = 0.5
# Squared error of each test prediction. The parentheses must wrap the
# difference: `final_predictions - y_test ** 2` squares the labels instead,
# which makes the mean negative and the final square root NaN.
squared_errors = (final_predictions - y_test) ** 2

# t-interval around the mean squared error; the square root turns its bounds
# into an interval for the RMSE.
np.sqrt(stats.t.interval(condifence, len(squared_errors) - 1,
                         loc=squared_errors.mean(),
                         scale=stats.sem(squared_errors)))

  np.sqrt(stats.t.interval(condifence,len(squared_errors)-1,


array([nan, nan])

In [None]:
import joblib
joblib.dump(final_model, "My_california_housing_model.pkl")

['My_california_housing_model.pkl']

In [None]:
def column_ratio(X):
    """Divide column 0 of ``X`` by column 1, element-wise.

    NOTE(review): this repeats an earlier definition in the notebook;
    presumably it is kept so the saved model can resolve ``column_ratio``
    in a fresh session - confirm before removing.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator

In [None]:
# @title 🏠 Predice el precio de una casa
# @markdown Ingresa los datos de la vivienda y presiona el botón de "Ejecutar" (el ícono de Play a la izquierda) para ver el precio estimado.

import pandas as pd
import numpy as np

# --- ENTRADAS DEL USUARIO (Formulario) ---

median_income = 8.32 # @param {type:"slider", min:0.5, max:15.0, step:0.1}
housing_median_age = 25 # @param {type:"slider", min:1, max:52, step:1}
total_rooms = 2000 # @param {type:"integer"}
ocean_proximity = "INLAND" # @param ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
latitude = 34.05 # @param {type:"number"}
longitude = -118.24 # @param {type:"number"}

# --- LÓGICA DE PREDICCIÓN ---

def predecir_precio():
    """Print a price estimate for the dwelling described by the form values.

    Reads the module-level form variables (``longitude``, ``latitude``,
    ``housing_median_age``, ``total_rooms``, ``median_income``,
    ``ocean_proximity``) and the module-level ``final_model``. Nothing is
    returned: the estimate, or an error message if prediction fails, is printed.
    """
    # One-row frame with the columns passed to the model. Fields the form does
    # not ask for are filled with fixed placeholder values.
    X_new = pd.DataFrame({
        'longitude': [longitude],
        'latitude': [latitude],
        'housing_median_age': [housing_median_age],
        'total_rooms': [total_rooms],
        'total_bedrooms': [total_rooms * 0.2],  # estimate: 20% of the rooms are bedrooms
        'population': [1000],  # generic placeholder value
        'households': [350],   # generic placeholder value
        'median_income': [median_income],
        'ocean_proximity': [ocean_proximity]
    })

    try:
        # Run the full pipeline and take the single prediction.
        precio_final = final_model.predict(X_new)[0]

        separador = "------------------------------------------------------"
        print(separador)
        print(f"💰 PRECIO ESTIMADO: ${precio_final:,.2f} USD")
        print(separador)

        # Extra hint for the user about the price range.
        if precio_final > 400000:
            print("⚠️ Esta propiedad está clasificada como de ALTO valor.")
        elif precio_final < 100000:
            print("✅ Esta es una propiedad económica.")

    except Exception as e:
        print(f"Ocurrió un error: {e}")
        print("Asegúrate de que 'final_model' esté cargado en memoria.")

predecir_precio()