In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression

# Load the training and test data
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Baseline (mean prediction): predict the mean SalePrice for every row
y = train["SalePrice"]
# Force a float dtype: np.full_like otherwise inherits y's dtype, and when y
# is integer-valued the mean would be silently truncated to an integer.
y_pred = np.full_like(y, y.mean(), dtype=np.float64)
# Use a dedicated name so this score does not share the name `rmsle` with the
# scoring function defined further down in this file.
baseline_rmsle = np.sqrt(mean_squared_log_error(y, y_pred))
print("ベースライン（平均予測）のRMSLE:", baseline_rmsle)

# Baseline (linear regression)
X = train[["OverallQual"]]  # use OverallQual as the only explanatory variable

def rmsle(y_true, y_pred):
    """Return the Root Mean Squared Logarithmic Error (RMSLE).

    Predictions below zero are replaced with 0 before the error is computed.

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.

    Returns:
        The square root of ``mean_squared_log_error(y_true, clipped y_pred)``.
    """
    non_negative_pred = np.maximum(y_pred, 0)
    msle = mean_squared_log_error(y_true, non_negative_pred)
    return np.sqrt(msle)

# Evaluation function (linear regression)
def run_linear_cv(X, y, n_splits=5, random_state=42):
    """Evaluate a log-target linear regression with shuffled K-fold CV.

    For every fold a ``LinearRegression`` is fitted on ``log1p`` of the
    training target. Validation predictions are mapped back to the original
    scale with ``expm1`` and scored with ``rmsle``. Each fold score and the
    average score are printed.

    Args:
        X: Feature table; rows are selected positionally through ``.iloc``.
        y: Target values aligned with ``X``; also indexed through ``.iloc``.
        n_splits: Number of folds passed to ``KFold``.
        random_state: Seed passed to ``KFold`` for the shuffled split.

    Returns:
        A tuple ``(cv_rmsle, oof_preds)``: the mean of the per-fold RMSLE
        scores, and an array of length ``len(X)`` holding the out-of-fold
        prediction (original scale) for each row.
    """
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof_preds = np.zeros(len(X))
    cv_scores = []

    for fold, (train_idx, val_idx) in enumerate(folds.split(X, y), start=1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Fit on the log1p-transformed target. The validation target is
        # scored on the original scale, so it needs no transform here.
        linearmodel = LinearRegression()
        linearmodel.fit(X_train, np.log1p(y_train))

        # Undo the log1p transform before storing and scoring predictions.
        val_preds = np.expm1(linearmodel.predict(X_val))
        oof_preds[val_idx] = val_preds

        score = rmsle(y_val, val_preds)
        cv_scores.append(score)
        print(f"Fold {fold} RMSLE: {score:.4f}")

    cv_rmsle = np.mean(cv_scores)
    print(f"\nAverage CV Linear_RMSLE: {cv_rmsle:.4f}")
    return cv_rmsle, oof_preds

# Train and evaluate
cv_rmsle, oof_preds = run_linear_cv(X, y, n_splits=5)
# The only feature in X is OverallQual (not an area column), so the label
# names that feature.
print("ベースライン（OverallQualのみLinear）のRMSLE:", cv_rmsle)


ベースライン（平均予測）のRMSLE: 0.4075975552878524
Fold 1 RMSLE: 0.2311
Fold 2 RMSLE: 0.2272
Fold 3 RMSLE: 0.2351
Fold 4 RMSLE: 0.2406
Fold 5 RMSLE: 0.2202

Average CV Linear_RMSLE: 0.2308
ベースライン（面積のみLinear）のRMSLE: 0.23083110949997612
