#### 該文件包含1,338個案例，即目前已經登記過的保險計劃受益者、病人特點和計劃計入的總醫療費用特徵:

* age:表示主要受益者年齡
* sex:性別
* bmi:身體質量指數,理想BMI在18.5~24.9之間
* children:表示保險計劃中所包括的孩子/受撫養者的數量
* smoker:表示被保險人是否經常吸煙
* region:受益者在美國的居住地-東北(northeast), 東南(southeast), 西南(southwest)和西北(northwest)
* charges:醫療費用

#### 目的:檢測與醫療費用相關之變量，找出在醫療費用上具高風險的人

In [None]:
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from collections import Counter

# Disabled leftover import; the data for this notebook is read from insurance.csv.
# from sklearn.datasets import load_boston
sns.set_style('darkgrid')  # seaborn theme applied to the plots drawn afterwards

import warnings
# NOTE(review): this silences every warning for the whole session, including
# library warnings that could point at real problems.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd

In [None]:
# Load the insurance records; the path is relative to the working directory.
df = pd.read_csv('insurance.csv')

In [None]:
# Preview the first rows of the raw table.
df.head()

In [None]:
# Encode the categorical columns as 0/1 indicators and move the target
# ("charges") to the last position.
is_female = df['sex'].eq('female').astype(int)
df['sex_female'] = is_female
df['smoker'] = df['smoker'].eq('yes').astype(int)

# One indicator column per region value; the original "region" column is replaced.
df = pd.get_dummies(df, columns=["region"], dtype=int)

# Take the target out, drop the now-redundant text column, then re-append the
# target so that it ends up as the final column.
target = df.pop("charges")
df = df.drop(columns=['sex'])
df["charges"] = target

df.head()

In [None]:
# List the columns after encoding to confirm their order.
df.columns

In [None]:
from sklearn import preprocessing

# Min-max scale the three numeric columns; the fitted scaler is kept so that
# new rows can be transformed the same way later.
# NOTE(review): the scaler is fitted on every row, before the train/test split.
numeric_frame = df[['age', 'bmi', 'children']]
minmax = preprocessing.MinMaxScaler()
x_minmax = minmax.fit_transform(numeric_frame)

In [None]:
# Wrap the scaled array back into a DataFrame with the original column names.
x_minmax= pd.DataFrame(x_minmax, columns=['age',  'bmi', 'children'])

In [None]:
# Join the scaled numeric columns with the untouched 0/1 indicator columns.
indicator_cols = [
    'sex_female',
    'smoker',
    'region_northeast',
    'region_northwest',
    'region_southeast',
    'region_southwest',
]
scaled_part = x_minmax.reset_index(drop=True)
res_minmax = pd.concat([scaled_part, df[indicator_cols]], axis=1)
res_minmax

In [None]:
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line after each report.
df.info()
res_minmax.info()

In [None]:
# Sanity check of the positional indices used by the next cell: index 8 of
# res_minmax bounds the feature slice and index 9 of df is taken as the target.
print(res_minmax.columns[8])
print(df.columns[9])

In [None]:
# Select features and target by name instead of by position, so the matrices
# stay correct even if columns are later added to or reordered in the frames.
baseline_features = [
    'age', 'bmi', 'children', 'sex_female', 'smoker',
    'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest',
]
X = res_minmax[baseline_features].values
# Double brackets keep the target two-dimensional: one column, one row per record.
y = df[['charges']].values

In [None]:
# Confirm that X and y have the same number of rows.
print(X.shape)
print(y.shape)

In [None]:
# Split the data into training and test sets.
from sklearn.model_selection import train_test_split  # splitting helper
# test_size = 0.3 holds out 30% of the rows; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)

In [None]:
# Shapes of the four pieces produced by the split.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Summary statistics for every numeric column.
df.describe() 

In [None]:
# Show the rows ordered from the most children to the fewest (df itself is not modified).
df.sort_values(by=['children'], ascending=False)

In [None]:
# Column names, for reference when choosing what to plot.
df.columns

In [None]:
# Pairwise scatter plots of the numeric columns against each other and the target.
sns.pairplot(df[['age', 'bmi', 'children','charges']])

In [None]:
import matplotlib.pyplot as plt

# Box plot of charges, one box per value of the smoker flag.
charges_by_smoker = [df.loc[df["smoker"] == flag, "charges"] for flag in (0, 1)]

plt.figure()
plt.boxplot(charges_by_smoker, labels=["smoker=0", "smoker=1"])
plt.ylabel("charges")
plt.title("charges vs smoker")
plt.show()

In [None]:
# Rebuild a single "region" label from the indicator columns, then draw one
# box of charges per region. The temporary column is removed in the next cell.
region_names = ["northeast", "northwest", "southeast", "southwest"]
region_cols = ["region_" + name for name in region_names]

# idxmax picks the indicator column holding the 1; the prefix is then stripped.
df["region"] = df[region_cols].idxmax(axis=1).str.replace("region_", "")

data = [df.loc[df["region"] == name, "charges"] for name in region_names]

plt.figure()
plt.boxplot(data, labels=region_names)
plt.ylabel("charges")
plt.title("charges vs region")
plt.show()

In [None]:
df.info()
# Remove the temporary plotting column. errors="ignore" keeps the cell
# re-runnable: without it a second run raises KeyError because the column is
# already gone.
df = df.drop(columns=["region"], errors="ignore")

In [None]:
# Pairwise correlation of every column in df, drawn as an annotated heatmap.
corrmat = df.corr()
fig, ax = plt.subplots(figsize = (18, 10))
sns.heatmap(corrmat, annot = True, annot_kws={'size': 12})
# Correlation ranges from 1 to -1.
# Positive correlation (> 0.7): the two variables rise and fall together.

# Negative correlation (< -0.7): the two variables move in opposite directions.

# Near-zero correlation (-0.1 ~ 0.1): the two variables are linearly unrelated,
# so the feature is a candidate for removal when predicting.

In [None]:
from sklearn.linear_model import LinearRegression
regressor_linear = LinearRegression() # linear regression model
regressor_linear.fit(X_train, y_train) # fit on the training split

In [None]:
from sklearn.metrics import r2_score

# Predictions of the baseline model on both splits.
y_pred_linear_train = regressor_linear.predict(X_train)
y_pred_linear_test = regressor_linear.predict(X_test)

# R2 (coefficient of determination): the share of the variance of y that the
# regression on X explains.
r2_score_linear_train = r2_score(y_train, y_pred_linear_train)
r2_score_linear_test = r2_score(y_test, y_pred_linear_test)

# RMSE (root mean squared error): typical distance between predicted and
# actual values, in the units of y. Lower is better.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_linear_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_linear_test))

print('R2_score (train): ', r2_score_linear_train)
print('R2_score (test): ', r2_score_linear_test)
# The two RMSE lines used to share one label and could not be told apart.
print("RMSE (train): ", rmse_train)
print("RMSE (test): ", rmse_test)

In [None]:
# Copy of df without the four region indicator columns, for a smaller heatmap.
not_region = df.drop(columns=['region_northeast', 'region_northwest', 'region_southeast',
       'region_southwest'])

In [None]:
# Correlation heatmap of the remaining columns (region indicators excluded).
corrmat = not_region.corr()
fig, ax = plt.subplots(figsize = (18, 10))
sns.heatmap(corrmat, annot = True, annot_kws={'size': 12})

### <span style="color:#3498DB">Point: 請找出顯著與不顯著的變數</span>

In [None]:
import statsmodels.api as sm
from scipy import stats

In [None]:
# Column list and dtypes of the scaled feature table.
res_minmax.info()

In [None]:
# Fit OLS on every feature plus an explicit intercept column to read p-values.
# NOTE(review): X holds all four region indicators and add_constant adds an
# intercept, so the design matrix is perfectly collinear (dummy-variable trap).
X2 = sm.add_constant(X)
est = sm.OLS(y, X2).fit()
print(est.summary())
# P < .05: commonly treated as significant
# P < .01: more significant
# P < .001: highly significant
# coef is the average change in y when x increases by 1.
# NOTE(review): age, bmi and children were min-max scaled, so "+1" spans their full range.
# Author's reading of the summary: smoker has the largest effect on y, and it is positive.

### <span style="color:#3498DB">Point: 預測結果</span>

In [None]:
# Feature names, in the order a new row has to follow.
res_minmax.columns

In [None]:
# Columns of the training feature table (the target y is not among them).
feature_cols = res_minmax.columns

In [None]:
# One hypothetical policy holder to score, using the same column names as the
# training features.
new_row = dict(
    age=30,
    bmi=27.5,
    children=1,
    sex_female=1,        # 0/1 flag
    smoker=1,            # 0/1 flag
    # One-hot region block: exactly one of the four is set.
    region_northeast=0,
    region_northwest=1,
    region_southeast=0,
    region_southwest=0,
)
new_df = pd.DataFrame([new_row])

In [None]:
# The scaler was fitted during training; at prediction time it is only applied
# (transform), never re-fitted.
numeric_names = ['age', 'bmi', 'children']
flag_names = ['sex_female', 'smoker', 'region_northeast', 'region_northwest',
              'region_southeast', 'region_southwest']

scaled_new = pd.DataFrame(minmax.transform(new_df[numeric_names]), columns=numeric_names)
# Re-attach the 0/1 columns, which were never scaled.
predicted_minmax = pd.concat([scaled_new.reset_index(drop=True), new_df[flag_names]], axis=1)

In [None]:
# Put the columns in exactly the training order; any column missing from the
# new row is filled with 0.
to_be_predicted = predicted_minmax.reindex(columns=feature_cols, fill_value=0)
# The model was fitted on a bare ndarray, so hand it an ndarray as well instead
# of a DataFrame (which triggers scikit-learn's feature-name mismatch warning).
predicted_price = regressor_linear.predict(to_be_predicted.to_numpy())


In [None]:
# Show the prediction next to summary statistics of the observed target values.
print(predicted_price)
print("min:", np.min(y))
print("median:", np.median(y))
print("mean:", np.mean(y))
print("max:", np.max(y))

In [None]:
import numpy as np

# Collapse predicted_price to one Python float. Flattening works for a scalar
# as well as for an array of any rank; the first element is the prediction.
pred = float(np.asarray(predicted_price).ravel()[0])

# Percentile of pred within the observed targets: comparing gives booleans,
# and their mean is the fraction of y values that are smaller than pred
# (e.g. 0.828 means 82.8% of y lies below pred).
share_below = (y < pred).mean()
percentile = share_below * 100
print("pred:", pred, "percentile:", percentile)

### <span style="color:#3498DB">Test: 請將不顯著的變數與您覺得可能具有共線性的變數進行處理，再次跑迴歸，並觀察各類檢測數值是否有優化</span>

## 依數據判斷：地區虛擬變數之間具共線性；地區、小孩數量及性別為不顯著變數
*  嘗試作法 1：刪除其中一個地區類別（作為基準）
*  嘗試作法 2：刪除性別、小孩數量、地區

## 作法1

In [None]:
# Column overview of df before building the reduced feature set.
df.info()

In [None]:
# Column overview of the scaled feature table.
res_minmax.info()

In [None]:

# Drop region_northeast so that it becomes the reference category.
tree_region = res_minmax.drop(columns=['region_northeast'])



In [None]:
# Confirm which columns remain after dropping the reference region.
tree_region.info()

In [None]:
# Feature matrix: the first eight columns of the reduced table.
tree_region_X = tree_region.iloc[:, :8].to_numpy()
# Target: column index 9 of df, reshaped into a single-column 2-D array.
y = df.iloc[:, 9].to_numpy().reshape(-1, 1)

In [None]:
# Split into training and test sets, with the same test_size and random_state
# as the baseline split.
from sklearn.model_selection import train_test_split  # splitting helper
X_tree_region_train, X_tree_region_test, y_train, y_test = train_test_split(tree_region_X, y, test_size = 0.3, random_state = 25)

In [None]:
# Shapes of the four pieces produced by the split.
print(X_tree_region_train.shape,X_tree_region_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression
# NOTE(review): this rebinds regressor_linear, so the baseline model fitted
# earlier is no longer reachable under that name.
regressor_linear = LinearRegression() # linear regression model
regressor_linear.fit(X_tree_region_train, y_train) # fit on the training split

### <span style="color:#3498DB">Point: 請觀察模型是否穩定</span>

## 把一個區域當作基準值後的穩定性

In [None]:
from sklearn.metrics import r2_score

# Predictions of the reference-category model on both splits.
y_pred_linear_treeregion_train = regressor_linear.predict(X_tree_region_train)
y_pred_linear_treeregion_test = regressor_linear.predict(X_tree_region_test)

# R2 (coefficient of determination): the share of the variance of y that the
# regression on X explains.
r2_score_linear_treeregion_train = r2_score(y_train, y_pred_linear_treeregion_train)
r2_score_linear_treeregion_test = r2_score(y_test, y_pred_linear_treeregion_test)

# RMSE (root mean squared error): typical distance between predicted and
# actual values, in the units of y. Lower is better.
rmse_treeregion_train = np.sqrt(mean_squared_error(y_train, y_pred_linear_treeregion_train))
rmse_treeregion_test = np.sqrt(mean_squared_error(y_test, y_pred_linear_treeregion_test))

print('R2_score_tree_region (train): ', r2_score_linear_treeregion_train)
print('R2_score_tree_region (test): ', r2_score_linear_treeregion_test)
# The two RMSE lines used to share one label and could not be told apart.
print("RMSE_tree_region (train): ", rmse_treeregion_train)
print("RMSE_tree_region (test): ", rmse_treeregion_test)

In [None]:
# Baseline metrics again, for side-by-side comparison with the cell above.
print('R2_score (train): ', r2_score_linear_train)
print('R2_score (test): ', r2_score_linear_test)
# Distinct labels so the train and test RMSE can be told apart (lower is better).
print("RMSE (train): ", rmse_train)
print("RMSE (test): ", rmse_test)

In [None]:
# Re-fit the baseline OLS (all features plus intercept) for comparison with
# the reduced model in the next cell.
X2 = sm.add_constant(X)
est = sm.OLS(y, X2).fit()
print(est.summary())

In [None]:
# OLS on the feature set with region_northeast removed, plus an intercept.
X_tree_region2 = sm.add_constant(tree_region_X)
est_tree_region = sm.OLS(y, X_tree_region2).fit()
print(est_tree_region.summary())
# Significance thresholds to read off the summary: P < .05, P < .01, P < .001

## 把其中一個區域當作基準點後的解讀:
* Cond. No : 數據穩定性：如果 Cond. No. 很小，代表模型很穩定。如果很大，代表模型非常敏感，係數可能會因為數據的一點雜訊而完全反轉。 --> Cond NO 未刪除前較大 (代表具共線性、高度相關)，減少後下降。  

$< 10$：優良。變數之間幾乎沒有共線性問題。「減少一個區域」後的數值是 9.61  
$10 \sim 30$：中度相關。可能存在共線性，但通常還可以接受。  
$> 30$：嚴重共線性。模型的係數估計已經開始變得不可靠。  
$10^{15}$ 以上（結果1中的 $8.47 \times 10^{15}$）：代表變數之間存在「完全共線性」，矩陣幾乎無法求逆，算出來的結果在統計上是無意義的。  
* $R^2$ 不變，兩者的 $R^2$ 都是 0.751。  
表示：模型對於目標變數（charges）的預測能力完全沒有因為「少放一個變數」而變弱。減少變數只是重新排列了資訊的呈現方式，讓它符合統計前提，而沒有丟失任何地理資訊。

## 作法 2
* 刪除變數: 孩子數量、性別、地域

In [None]:
# Column overview before removing the non-significant features.
res_minmax.info()

In [None]:
# Remove children, sex_female and all four region indicators from the feature table.
del_minmax= res_minmax.drop(columns=['children','sex_female','region_northeast','region_northwest','region_southeast','region_southwest'])
del_minmax.info()

In [None]:
# Feature matrix: the first three columns of the reduced table.
del_minmax_X = del_minmax.iloc[:, :3].to_numpy()
# Target: column index 9 of df, reshaped into a single-column 2-D array.
y = df.iloc[:, 9].to_numpy().reshape(-1, 1)

In [None]:
# Confirm that the reduced feature matrix and y have the same number of rows.
print(del_minmax_X.shape,y.shape)

In [None]:
# Split into training and test sets, with the same test_size and random_state
# as the earlier splits.
from sklearn.model_selection import train_test_split  # splitting helper
X_del_train, X_del_test, y_train, y_test = train_test_split(del_minmax_X, y, test_size = 0.3, random_state = 25)

In [None]:
# Shapes of the four pieces produced by the split.
print(X_del_train.shape,X_del_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression
# NOTE(review): regressor_linear is rebound again; the previous model is replaced.
regressor_linear = LinearRegression() # linear regression model
regressor_linear.fit(X_del_train, y_train) # fit on the training split

### <span style="color:#3498DB">Point: 請觀察模型是否穩定</span>
* 留下三個變數後的穩定性

In [None]:
from sklearn.metrics import r2_score

# Predictions of the reduced model on both splits.
y_pred_linear_del_train = regressor_linear.predict(X_del_train)
y_pred_linear_del_test = regressor_linear.predict(X_del_test)

# R2 (coefficient of determination): the share of the variance of y that the
# regression on X explains.
r2_score_linear_del_train = r2_score(y_train, y_pred_linear_del_train)
r2_score_linear_del_test = r2_score(y_test, y_pred_linear_del_test)

# RMSE (root mean squared error): typical distance between predicted and
# actual values, in the units of y. Lower is better.
rmse_del_train = np.sqrt(mean_squared_error(y_train, y_pred_linear_del_train))
rmse_del_test = np.sqrt(mean_squared_error(y_test, y_pred_linear_del_test))

print('R2_score_del (train): ', r2_score_linear_del_train)
print('R2_score_del(test): ', r2_score_linear_del_test)
# The two RMSE lines used to share one label and could not be told apart.
print("RMSE_del (train): ", rmse_del_train)
print("RMSE_del (test): ", rmse_del_test)

In [None]:
# OLS on the reduced feature set, plus an intercept.
X_del2 = sm.add_constant(del_minmax_X)
est_del = sm.OLS(y, X_del2).fit()
print(est_del.summary())

## 結論:
### 三種作法比較表

| 評估指標 | **1. 原始全放 (Baseline)** | **2. 基準點法 (移除 1 區域)** | **3. 僅留 3 變數** |
| :--- | :--- | :--- | :--- |
| **策略內容** | 包含所有原始變數與所有區域虛擬變數 | 移除一個區域變數作為基準點，解決共線性 | 僅保留 age, bmi, smoker 三個核心顯著變數 |
| **R² (解釋力)** | **0.751** | **0.751** | **0.747** |
| **RMSE (預測誤差)** | 約 5,936 | 約 5,936 | 約 6,005 |
| **Cond. No. (穩定度)** | **8.47e+15 (數學崩潰)** | **9.61 (優良狀態)** | **7.87 (極度穩定)** |
| **變數數量** | 10 個 (含截距) | 9 個 (含截距) | 4 個 (含截距) |
| **模型結論** | 存在嚴重的虛擬變數陷阱，係數不可靠。 | 保留完整資訊且數學結構正確，適合解釋區域差異。 | 以極小誤差代價換取極簡結構，泛化能力最強。 |

---

### 最終分析結論：

1. **資訊守恆性**：從策略 1 到策略 2 可以看到，$R^2$ 與 RMSE 完全沒有變動，證明「移除一個區域作為基準點」並不會丟失資訊，而是讓數學運算回歸正常。
2. **邊際效益遞減**：從策略 2 到策略 3，變數從 8 個減至 3 個，但 $R^2$ 僅下降了 **0.004 (0.4%)**，RMSE 僅增加約 **69 元**。這顯示「性別」、「小孩人數」與「地理位置」對保費的解釋力微乎其微。
3. **奧卡姆剃刀原則 (Occam's Razor)**：在預測能力幾乎相同的情況下，應選擇最簡單的模型。**精簡模型 (策略 3)** 避開了不顯著的噪音，是針對此數據集進行預測時的最優選擇。

## 作法4 : 
* 進行 bmi 與 smoker 的交互作用（Interaction Effect）特徵工程

In [None]:
# Work on a copy: a plain assignment would only alias res_minmax, and adding
# the interaction column would then silently change the shared feature table
# that other cells read.
bmi_smokrer = res_minmax.copy()
# Interaction term: (scaled) bmi for smokers, 0 for non-smokers.
bmi_smokrer['bmi_smoker'] = bmi_smokrer['bmi'] * bmi_smokrer['smoker']
features = bmi_smokrer[['age', 'bmi', 'smoker', 'bmi_smoker']]
# info() prints its own report and returns None, so it is not wrapped in print().
features.info()

In [None]:
# Feature matrix: the first four columns of the interaction table.
features_X = features.iloc[:, :4].to_numpy()
# Target: column index 9 of df, reshaped into a single-column 2-D array.
y = df.iloc[:, 9].to_numpy().reshape(-1, 1)

In [None]:
# Confirm that the interaction feature matrix and y have the same number of rows.
print(features_X.shape,y.shape)

In [None]:
# Split into training and test sets, with the same test_size and random_state
# as the earlier splits.
from sklearn.model_selection import train_test_split  # splitting helper
X_features_train, X_features_test, y_train, y_test = train_test_split(features_X, y, test_size = 0.3, random_state = 25)

In [None]:
# Shapes of the four pieces produced by the split.
print(X_features_train.shape,X_features_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.linear_model import LinearRegression
# NOTE(review): regressor_linear is rebound again; the previous model is replaced.
regressor_linear = LinearRegression() # linear regression model
regressor_linear.fit(X_features_train, y_train) # fit on the training split

In [None]:
from sklearn.metrics import r2_score

# Predictions of the interaction model on both splits.
y_pred_linear_features_train = regressor_linear.predict(X_features_train)
y_pred_linear_features_test = regressor_linear.predict(X_features_test)

# R2 (coefficient of determination): the share of the variance of y that the
# regression on X explains.
r2_score_linear_features_train = r2_score(y_train, y_pred_linear_features_train)
r2_score_linear_features_test = r2_score(y_test, y_pred_linear_features_test)

# RMSE (root mean squared error): typical distance between predicted and
# actual values, in the units of y. Lower is better.
rmse_features_train = np.sqrt(mean_squared_error(y_train, y_pred_linear_features_train))
rmse_features_test = np.sqrt(mean_squared_error(y_test, y_pred_linear_features_test))

print('R2_score_features (train): ', r2_score_linear_features_train)
print('R2_score_features(test): ', r2_score_linear_features_test)
# The two RMSE lines used to share one label and could not be told apart.
print("RMSE_features (train): ", rmse_features_train)
print("RMSE_features (test): ", rmse_features_test)

In [None]:
# OLS on the interaction feature set, plus an intercept.
X_features2 = sm.add_constant(features_X)
est_feartures = sm.OLS(y, X_features2).fit()
print(est_feartures.summary())

In [None]:
# Side-by-side metrics of the four linear models: R2 on the training split,
# R2 on the test split, then train/test RMSE per model (lower RMSE is better).
divider = '=================='

r2_train_rows = [
    ('R2_score_train:', r2_score_linear_train),
    ('r2_score_tree_region_train:', r2_score_linear_treeregion_train),
    ('r2_score_del_train:', r2_score_linear_del_train),
    ('R2_score_features (train): ', r2_score_linear_features_train),
]
r2_test_rows = [
    ('R2_score_test', r2_score_linear_test),
    ('R2_score_tree_region_(test): ', r2_score_linear_treeregion_test),
    ('R2_score_del_(test): ', r2_score_linear_del_test),
    ('R2_score_features_(test): ', r2_score_linear_features_test),
]
rmse_groups = [
    [("RMSE_train: ", rmse_train), ("RMSE_test: ", rmse_test)],
    [("RMSE_treeregion_train: ", rmse_treeregion_train),
     ("RMSE_treeregion_test: ", rmse_treeregion_test)],
    [("RMSE_del_train: ", rmse_del_train), ("RMSE_del_test: ", rmse_del_test)],
    [("RMSE_features_train: ", rmse_features_train),
     ("RMSE_features_test: ", rmse_features_test)],
]

for label, value in r2_train_rows:
    print(label, value)
print(divider)
for label, value in r2_test_rows:
    print(label, value)
# Each RMSE pair is preceded by a divider line.
for group in rmse_groups:
    print(divider)
    for label, value in group:
        print(label, value)

In [None]:
# Print the four OLS summaries one after another for comparison.
print(f"未調整任何特徵:\n{est.summary()}")
print(f'設一區為基準點:\n{est_tree_region.summary()}')
# est_del keeps three features (age, bmi, smoker); the label used to say "one".
print(f'僅留下三個特徵:\n{est_del.summary()}')
print(f'進行特徵工程(bmi_smoker):\n{est_feartures.summary()}')

## 四種作法結論:四種作法對比表


| 建模階段 | 策略名稱 | 核心變數數量 | $R^2$ (Test) | RMSE (Test) | Cond. No. | 結論與實務評價 |
| :--- | :--- | :---: | :---: | :---: | :---: | :--- |
| **階段 1** | **原始全放** | 9 | 0.7299 | 6336.3 | **8.47e+15** |存在嚴重「虛擬變數陷阱」，模型數學結構崩潰，係數估計不可靠。 |
| **階段 2** | **基準點法** | 8 | 0.7299 | 6336.3 | **9.61** | 解決共線性問題。證明減少一個變數不損失資訊，僅重新定義基準。 |
| **階段 3** | **精簡核心模型** | 3 | 0.7336 | 6292.6 | **7.87** | 剔除不顯著變數（地區、性別、孩童數量），測試集誤差反而下降，泛化能力提升。 |
| **階段 4** | **特徵交互作用** | 4 | **0.8226** | **5135.8** | **19.6** | 成功捕捉「肥胖且抽菸」的非線性加乘效應，解釋力大幅提升。 |

---

### 最終分析結論

#### 1. 數學穩定度的轉變 (階段 1 → 2)
* 透過手動剔除一個區域變數，將 **Cond. No.** 從($8.47 \times 10^{15}$) 降至 **9.61**。
* 這一步雖然沒有改變預測準確度 ($R^2$ 相同)，但確保了模型係數在統計學上的意義與穩定性。

#### 2. 奧卡姆剃刀原則的實踐 (階段 2 → 3)
* 將變數從 8 個縮減為 3 個（年齡、BMI、抽菸）後，**$R^2$ (Test)** 從 0.7299 提升至 **0.7336**，**RMSE** 從 6336.3 降至 **6292.6**。
* 這證明「地區」與「性別」、「孩童數量」在該數據集中主要是噪音，移除它們能讓預測更精準且模型更易於解釋。

#### 3. 非線性特徵的力量 (階段 3 → 4)
* 加入 `bmi * smoker` 交互作用項後。**$R^2$ 突破 0.82**，**RMSE 降低了約 18%** (由 6292 降至 5135)。
* 觀察報表發現，單純 BMI 的係數變得不顯著 ($p=0.777$)，但交互作用項的係數極大 ($5.319 \times 10^4$) 且極顯著。
* 這揭示了保費的核心邏輯：**高 BMI 對保費的威脅，主要集中在抽菸族群身上**。

#### 4. 關於 Cond. No. 19.6 的評價
* 雖然加入交互作用項後 Cond. No. 從 7.87 微升至 **19.6**，但仍遠低於預警線 30。
* 最終獲得 9% 的解釋力提升與 1100 元以上的誤差縮減。

## 使用 Lasso regression 及 Ridge regression 模型

In [None]:
# Column names, for reference when reading the coefficients printed below.
df.columns

In [None]:
# Lasso regression on the full scaled feature set (baseline split).
lasso = Lasso(alpha=0.05).fit(X_train, y_train)

# Predictions for both splits.
y_pred_lasso_train = lasso.predict(X_train)
y_pred_lasso_test = lasso.predict(X_test)

# The estimator's own score(), stored as the R2 of each split.
r2_score_lasso_train = lasso.score(X_train, y_train)
r2_score_lasso_test = lasso.score(X_test, y_test)

# Root mean squared error of each split.
rmse_lasso_train = np.sqrt(mean_squared_error(y_train, y_pred_lasso_train))
rmse_lasso_test = np.sqrt(mean_squared_error(y_test, y_pred_lasso_test))


In [None]:
# Lasso metrics on the training and test splits.
print("R2_score (train): ",r2_score_lasso_train)
print("R2_score (test):", r2_score_lasso_test)
print("train_RMSE: ", rmse_lasso_train)
print("test_RMSE: ", rmse_lasso_test)

In [None]:
print("coef：")
# Pair each feature name with its fitted Lasso weight. A weight of 0 marks a
# feature the penalty removed; the author observed "region_southwest -0.0".
for name, weight in zip(res_minmax.columns, lasso.coef_):
    print(name, weight)

In [None]:
# Ridge regression on the full scaled feature set (baseline split).
ridgeReg = Ridge(alpha=0.05)
ridgeReg.fit(X_train, y_train)

# Predictions for both splits.
y_pred_ridge_train = ridgeReg.predict(X_train)
y_pred_ridge_test = ridgeReg.predict(X_test)

# The estimator's own score(), stored as the R2 of each split.
r2_score_ridge_train = ridgeReg.score(X_train, y_train)
r2_score_ridge_test = ridgeReg.score(X_test, y_test)

# Root mean squared error of each split.
rmse_ridge_train = np.sqrt(mean_squared_error(y_train, y_pred_ridge_train))
rmse_ridge_test = np.sqrt(mean_squared_error(y_test, y_pred_ridge_test))

print("coef：")
# y_train is two-dimensional, so Ridge stores coef_ with shape
# (1, n_features). Zipping that directly with the column names yields a single
# pair ("age", whole array); flattening first gives one weight per feature.
for i, j in zip(res_minmax.columns, np.ravel(ridgeReg.coef_)):
    print(i, j)



### 最終結果：加上 Lasso 及 Ridge

In [None]:
# R2 of all six models: training split first, then the test split.
r2_train_rows = [
    ('R2_score_train:', r2_score_linear_train),
    ('r2_score_tree_region_train:', r2_score_linear_treeregion_train),
    ('r2_score_del_train:', r2_score_linear_del_train),
    ('R2_score_features (train): ', r2_score_linear_features_train),
    ("R2_score_ridge (train): ", r2_score_ridge_train),
    ("R2_score_lasso (train): ", r2_score_lasso_train),
]
r2_test_rows = [
    ('R2_score_test', r2_score_linear_test),
    ('R2_score_tree_region_(test): ', r2_score_linear_treeregion_test),
    ('R2_score_del_(test): ', r2_score_linear_del_test),
    ('R2_score_features_(test): ', r2_score_linear_features_test),
    ("R2_score_ridege_(test):", r2_score_ridge_test),
    ("R2_score_lasso_(test):", r2_score_lasso_test),
]

for label, value in r2_train_rows:
    print(label, value)
print('==================')
for label, value in r2_test_rows:
    print(label, value)


In [None]:
# Train/test RMSE of all six models, one pair per model, each pair preceded by
# a divider line. Lower is better.
print('==================')
print("RMSE_train: ", rmse_train) # baseline linear model
print("RMSE_test: ", rmse_test)
print('==================')
print("RMSE_treeregion_train: ", rmse_treeregion_train) # one region dropped as reference
print("RMSE_treeregion_test: ", rmse_treeregion_test)
print('==================')
print("RMSE_del_train: ", rmse_del_train) # age, bmi, smoker only
print("RMSE_del_test: ", rmse_del_test)
print('==================')
print("RMSE_features_train: ", rmse_features_train) # with the bmi_smoker interaction
print("RMSE_features_test: ", rmse_features_test)
print('=================')
print("RMSE_train_ridge: ", rmse_ridge_train)
print("RMSE_test_ridge: ", rmse_ridge_test)
print('=================')
print("RMSE_train_lasso: ", rmse_lasso_train)
print("RMSE_test_lasso: ", rmse_lasso_test)