In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
import numpy as np

In [2]:
# Read the house-price table from disk, then preview its first five rows.
source = pd.read_csv(
    filepath_or_buffer=r"../data/HousePrices_HalfMil.csv",
)
source.head()

Unnamed: 0,Area,Garage,FirePlace,Baths,White Marble,Black Marble,Indian Marble,Floors,City,Solar,Electric,Fiber,Glass Doors,Swiming Pool,Garden,Prices
0,164,2,0,2,0,1,0,0,3,1,1,1,1,0,0,43800
1,84,2,0,4,0,0,1,1,2,0,0,0,1,1,1,37550
2,190,2,4,4,1,0,0,0,2,0,0,1,0,0,0,49500
3,75,2,4,4,0,0,1,1,1,1,1,1,1,1,1,50075
4,148,1,4,2,1,0,0,1,2,1,0,0,1,1,1,52400


In [3]:
# Count the missing values in each column.
source.isna().sum()

Area             0
Garage           0
FirePlace        0
Baths            0
White Marble     0
Black Marble     0
Indian Marble    0
Floors           0
City             0
Solar            0
Electric         0
Fiber            0
Glass Doors      0
Swiming Pool     0
Garden           0
Prices           0
dtype: int64

In [4]:
# Separate the predictors from the regression target ("Prices").
features = source.drop(columns="Prices")
target = source["Prices"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Two-step pipeline: standardise the features, then fit a regressor.
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),  # feature standardisation
        ('reg', SVR()),  # support-vector regression is the default regressor
    ],
)

In [10]:
# Define the parameter grid.
#
# Each dict swaps a different estimator into the pipeline's 'reg' step and
# lists the hyper-parameter values to search for that estimator.
param_grid = [
    {
        # Fixed seed so the forest's random sampling is repeatable between runs.
        'reg': [RandomForestRegressor(random_state=42)],
        'reg__n_estimators': [20, 50, 100],
        'reg__max_depth': [2, 5, 8, 10, None],
        'reg__min_samples_split': [2, 5, 8],
    },
    {
        # Fixed seed: the "random" splitter is stochastic, so without it the
        # cross-validation scores change from run to run.
        'reg': [DecisionTreeRegressor(random_state=42)],
        'reg__criterion': ["absolute_error", "squared_error"],
        'reg__splitter': ["best", "random"],
        'reg__max_depth': [1, 5, 8, 10, None],
        'reg__min_samples_split': [2, 5, 8, 10],
    },
    {
        # Polynomial kernel: the only SVR kernel for which `degree` matters.
        'reg': [SVR()],
        'reg__kernel': ["poly"],
        'reg__degree': [1, 2, 3],
        'reg__gamma': [0.2, 0.5, 1],
        'reg__C': [0.1, 1, 10, 100, 1000],
    },
    {
        # rbf / sigmoid ignore `degree`, so it is not searched here; crossing
        # it with these kernels would only refit identical models.
        'reg': [SVR()],
        'reg__kernel': ["rbf", "sigmoid"],
        'reg__gamma': [0.2, 0.5, 1],
        'reg__C': [0.1, 1, 10, 100, 1000],
    },
]

In [11]:
# Search every grid candidate with 3-fold cross-validation on all available
# cores, then refit the best candidate on the whole training split.
model = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=3,
    refit=True,
    n_jobs=-1,
)
model.fit(xtrain, ytrain)

  _data = np.array(data, dtype=dtype, copy=copy,


In [12]:
# Show the winning pipeline, its cross-validation score and its parameters.
display(
    model.best_estimator_,
    model.best_score_,
    model.best_params_,
)

np.float64(0.9999999999408757)

{'reg': SVR(),
 'reg__C': 1000,
 'reg__degree': 1,
 'reg__gamma': 0.5,
 'reg__kernel': 'poly'}

In [14]:
# Predict on the held-out features and preview the first five predictions.
test_predictions = model.predict(xtest)
test_predictions[:5]

array([57049.90032806, 18625.09938344, 27825.10033017, 34350.09973636,
       49999.90016554])

In [15]:
# First five true prices from the held-out set, for comparison with the predictions.
ytest.head()

82     57050
15     18625
111    27825
177    34350
76     50000
Name: Prices, dtype: int64