In [22]:
import pandas as pd
from joblib import dump
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import numpy as np

# suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [23]:
# Download genes-train.csv from the course server into a DataFrame.
genes_train = pd.read_csv("https://cs307.org/lab-05/data/genes-train.csv")

# Split the frame: column "y" is the target, every other column is a feature.
y_train = genes_train["y"]
X_train = genes_train.drop(columns="y")

In [35]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures

# Chain standardization, polynomial feature expansion and Lasso regression in
# one pipeline, so that each cross-validation fold re-fits all three steps on
# its own training portion.
lasso_pipeline = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(),
    Lasso(max_iter=10000)
)

# NOTE: the pipeline is deliberately not fitted on its own here. GridSearchCV
# clones the estimator and fits the clones itself, so a preliminary
# lasso_pipeline.fit(X_train, y_train) would only add run time without
# influencing the search.

# Hyper-parameter grid: second-degree polynomial features only, crossed with
# 100 log-spaced Lasso penalties between 1e-6 and 1e1 (100 candidates total).
param_grid = {
    'polynomialfeatures__degree': [2],
    'lasso__alpha': np.logspace(-6, 1, 100)
}

# 5-fold cross-validated grid search. The scorer is negative MSE, so larger
# (closer to zero) is better.
grid_search = GridSearchCV(lasso_pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1)
grid_search.fit(X_train, y_train)

# Report the winning setting. best_score_ is the mean negative MSE across
# folds; negate it and take the square root to express it as an RMSE.
best_params = grid_search.best_params_
best_rmse = np.sqrt(-grid_search.best_score_)
print(f"Best parameters: {best_params}")
print(f"Best RMSE: {best_rmse}")


Fitting 5 folds for each of 200 candidates, totalling 1000 fits


KeyboardInterrupt: 

In [None]:
# Persist the model chosen by the most recently fitted grid search.
# `best_estimator_` is the pipeline refit on the full training data with the
# best parameters (GridSearchCV refits by default). The name `lasso_cv` that
# was passed here before is not defined anywhere in this notebook, so the cell
# failed with NameError on a fresh kernel.
dump(grid_search.best_estimator_, "gene-expression.joblib")

['gene-expression.joblib']

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
import numpy as np

# Three named steps: standardize, expand to polynomial features, fit ridge
# regression. The step names are what the grid keys below refer to.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('poly', PolynomialFeatures()),
    ('ridge', Ridge()),
])

# Candidate settings: polynomial degree 1-3 crossed with six regularization
# strengths (18 combinations).
param_grid = dict(
    poly__degree=[1, 2, 3],
    ridge__alpha=[0.001, 0.01, 0.1, 1, 10, 100],
)

# Exhaustive 5-fold search scored by negative mean squared error.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Pull out the results: the refit best pipeline, its parameters, and the
# cross-validated score with the sign flipped back to a positive MSE.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

# Report the best setting and its cross-validated error as an RMSE.
print(f"Best parameters: {best_params}")
print(f"Best RMSE: {np.sqrt(best_score)}")

# In-sample check: predict on the training data and compute the RMSE of the
# residuals.
y_pred = best_model.predict(X_train)
residuals = y_train - y_pred
train_rmse = np.sqrt(np.mean(residuals ** 2))

print(f"Training RMSE: {train_rmse}")


Fitting 5 folds for each of 18 candidates, totalling 90 fits
Best parameters: {'poly__degree': 1, 'ridge__alpha': 1}
Best RMSE: 2.0526987279134095
Training RMSE: 0.12717589498725707
