In [110]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from joblib import dump

In [2]:
credit_train = pd.read_csv("https://cs307.org/lab-02/data/credit-train.csv")
# create X and y for train
X_train = credit_train.drop("Rating", axis=1)  # delete the rating column
y_train = credit_train["Rating"]

In [11]:
# create validation train and validation 
X_vtrain, X_validation, y_vtrain, y_validation = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [65]:
# split the data into numerical and categorical
numeric_features=X_train.select_dtypes(include=["float64"]).columns
categorical_features=X_train.select_dtypes(include=["object"]).columns
# define how to handle missing data and scale features for numeric and categorical features
numeric_transformer = Pipeline(
    steps=[
        ("Iterative Imputer", IterativeImputer(max_iter=10, random_state=0)),
        ("Standardization", StandardScaler()),
    ]
)
categorical_transformer = Pipeline(
    steps=[
        ("Modal Imputer", SimpleImputer(strategy="most_frequent")),
        ("One-Hot Encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)
# create general preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ("Numeric Transformer", numeric_transformer, numeric_features),
        ("Categorical Transformer", categorical_transformer, categorical_features),
    ],
    remainder="drop",
)

In [13]:
# 创建包含预处理和回归器的Pipeline
model_pipeline = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("Regressor", KNeighborsRegressor())
])

In [106]:
# 定义参数网格
param_grid = {
    "Regressor__n_neighbors": range(1, 20),  # 例如，测试1到20的邻居数
    "Regressor__weights": ["uniform", "distance"],  # 测试两种权重计算方式
    "Regressor__metric": ["euclidean", "manhattan", "chebyshev"]  # 添加距离度量参数
}

# 使用GridSearchCV进行参数优化
grid_search = GridSearchCV(model_pipeline, param_grid,cv=230,scoring="neg_mean_squared_error")

# 对训练数据进行拟合
grid_search.fit(X_train, y_train)

In [107]:
print("Best parameters found:", grid_search.best_params_)

Best parameters found: {'Regressor__metric': 'chebyshev', 'Regressor__n_neighbors': 13, 'Regressor__weights': 'uniform'}


In [108]:
print("Lowest RMSE found:", np.sqrt(-grid_search.best_score_))

Lowest RMSE found: 107.11345806857459
