In [1]:
import numpy as np
import pandas as pd
import sklearn as sk

In [2]:
data_path = 'housing/housing.csv'
data = pd.read_csv(data_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [3]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [4]:
train_set, test_set = sk.model_selection.train_test_split(data, test_size=0.4, random_state=42)

In [5]:
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin

class ClusterSimilarity(BaseEstimator, TransformerMixin):
    kmeans_index = None
    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        self.kmeans_index = KMeans(self.n_clusters, n_init=10,
                              random_state=self.random_state)

        self.kmeans_index.fit(X, sample_weight=sample_weight)
        return self 

    def transform(self, X):
        return rbf_kernel(X, self.kmeans_index.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]

In [6]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer

num_attribs = ["longitude", "latitude", "housing_median_age", "total_rooms",
               "total_bedrooms", "population", "households", "median_income"]
cat_attribs = ["ocean_proximity"]

num_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), StandardScaler())
str_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), OneHotEncoder(handle_unknown="ignore"))

def columns_ratio(X):
    return X[:, [0]] / X[:, [1]]

def ratio_name(function_transformer, feature_names_in):
    return [f"ratio:{feature_names_in}"]

ratio_pipeline = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        FunctionTransformer(columns_ratio, feature_names_out=ratio_name),
        StandardScaler())

log_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    FunctionTransformer(np.log, feature_names_out="one-to-one"),
    StandardScaler())

cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=42)
means_num_pipeline = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

preprocessing = ColumnTransformer([
        ("bedrooms", ratio_pipeline, ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline, ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline, ["population", "households"]),
        ("log", log_pipeline, ["total_bedrooms", "total_rooms", "population",
                               "households", "median_income"]),
        ("geog", cluster_simil, ["latitude", "longitude"]),
        ("ocean", str_pipeline, make_column_selector(dtype_include=object))
    ],
    remainder=num_pipeline)

In [7]:
from sklearn.svm import SVR

svr = SVR(
    kernel='rbf',      
    C=0.5,             
    epsilon=0.1,       
    gamma='scale'     
)
X = train_set.drop(columns=['median_house_value'])
Y = train_set['median_house_value']
X_test = test_set.drop(columns=['median_house_value'])
Y_test = test_set['median_house_value']

In [8]:
from sklearn.model_selection import cross_val_score
train_reg = make_pipeline(preprocessing, svr)
train_reg.fit(X, Y)

Y_pred = train_reg.predict(X_test)
error_ratios = Y_pred[:5].round(-2) / Y_test.iloc[:5].values - 1
print(", ".join([f"{100 * ratio:.1f}%" for ratio in error_ratios]))
rmse1 = cross_val_score(train_reg, X, Y, scoring="neg_root_mean_squared_error", cv=5)
rmse1

172.0%, -49.3%, -61.6%, -64.2%, -37.4%


array([-120731.57820146, -120435.63429728, -115179.09771935,
       -117072.19060613, -115345.92646744])

In [9]:
svr1 = SVR(
    kernel='linear',     
    C=0.5,            
    epsilon=0.1,       
    gamma='scale'      
)
train_reg = make_pipeline(preprocessing, svr1)
train_reg.fit(X, Y)

Y_pred = train_reg.predict(X_test)
error_ratios = Y_pred[:5].round(-2) / Y_test.iloc[:5].values - 1
print(", ".join([f"{100 * ratio:.1f}%" for ratio in error_ratios]))
rmse2 = cross_val_score(train_reg, X, Y, scoring="neg_root_mean_squared_error", cv=5)
rmse2

165.1%, -48.5%, -60.0%, -63.5%, -36.3%


array([-118236.53413968, -118049.38847764, -112780.73221683,
       -114638.86177326, -112904.36344397])

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectFromModel

train_set, test_set = sk.model_selection.train_test_split(data.iloc[:1000], test_size=0.2, random_state=42)
X1 = train_set.drop(columns=['median_house_value'])
Y1 = train_set['median_house_value']

tree_reg = Pipeline([("first",preprocessing), ("reg",LinearRegression())])
tree_reg.fit(X1, Y1)
linear_reg = tree_reg.named_steps['reg']
selector = SelectFromModel(linear_reg, prefit = True)

print("选择的特征索引:", selector.get_support(indices=True))

选择的特征索引: [ 3  4  5  6  7  9 13 14 16]




In [11]:
from sklearn.neighbors import KNeighborsRegressor

class KNNMedianHouseValuePredictor(BaseEstimator, TransformerMixin):
    use_cols = [0, 1]
    def __init__(self, n_neighbors=5):
        self.Knn_model = None
        self.n_neighbors = n_neighbors
    
    def fit(self, X_, y=None):
        self.Knn_model = KNeighborsRegressor(n_neighbors=self.n_neighbors)
        self.Knn_model.fit(X_[:, self.use_cols], y)
        return self
    def transform(self, X_):
        predict_X_ = self.Knn_model.predict(X_[:, self.use_cols])
        X_new = X.copy()
        X_new['KNN_Predicted_MedHouseVal'] = predict_X_
        return X_new

In [12]:
from sklearn.metrics import mean_squared_error
train_set, test_set = sk.model_selection.train_test_split(data, test_size=0.5, random_state=42)

X = train_set.loc[:, ["longitude", "latitude"]]
Y = train_set.loc[:, ['median_house_value']]
X_test = test_set.loc[:, ["longitude", "latitude"]]
Y_test = test_set.loc[:, ['median_house_value']]
local_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"), StandardScaler())
preprocessing1 = make_column_transformer(
    (local_pipeline, ["longitude", "latitude"])  # 直接传递元组，不加外层括号
)
pipeline = Pipeline([("preprocessor1", preprocessing1),('knn_feature', KNNMedianHouseValuePredictor(n_neighbors=5))])
train_knn = make_pipeline(pipeline, LinearRegression())
train_knn.fit(X, Y)
y_pred1 = train_knn.predict(X_test)
print(f"模型RMSE: {np.sqrt(mean_squared_error(Y_test, y_pred1)):.4f}")

模型RMSE: 54616.2560




In [13]:
class StandardScalerClone:
    def __init__(self, with_mean=True, with_std=True):
        self.with_mean = with_mean
        self.with_std = with_std
        self.feature_names_in_ = None
        self.n_features_in_ = None
        self.mean_ = None
        self.scale_ = None
    def fit(self, X, y=None):
        # 处理DataFrame或array-like输入
        if isinstance(X, pd.DataFrame):
            self.feature_names_in_ = np.array(X.columns, dtype=object)
            X = X.values
        self.n_features_in_ = X.shape[1]
        
        if self.with_mean:
            self.mean_ = np.mean(X, axis=0)
        else:
            self.mean_ = np.zeros(self.n_features_in_)
            
        if self.with_std:
            self.scale_ = np.std(X, axis=0)
            # 避免除以零
            self.scale_[self.scale_ == 0.0] = 1.0
        else:
            self.scale_ = np.ones(self.n_features_in_)
        return self
    def transform(self, X):
        if isinstance(X, pd.DataFrame):
            if self.feature_names_in_ is not None:
                # 检查列名是否匹配
                if not np.array_equal(X.columns, self.feature_names_in_):
                    raise ValueError("X的列名与fit时的不匹配")
            X = X.values
        X_transformed = (X - self.mean_) / self.scale_
        return X_transformed
    def inverse_transform(self, X_transformed):
        if isinstance(X_transformed, pd.DataFrame):
            X_transformed = X_transformed.values
        X = X_transformed * self.scale_ + self.mean_
        return X
    def fit_transform(self, X, y=None):
        return self.fit(X).transform(X)
    def get_feature_names_out(self, input_features=None):
        if input_features is not None:
            input_features = np.asarray(input_features, dtype=object)
            if len(input_features) != self.n_features_in_:
                raise ValueError(
                    f"input_features的长度为{len(input_features)}，但应有{self.n_features_in_}个特征"
                )
            return input_features
        if self.feature_names_in_ is not None:
            return self.feature_names_in_
        return np.array([f"x{i}" for i in range(self.n_features_in_)], dtype=object)