In [15]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [16]:
# 載入資料集
data = pd.read_csv('insurance.csv')

In [17]:
# 檢查是否有缺失值或異常值
print(data.isnull().sum()) # 檢查是否有缺失值
print(data.describe()) # 檢查是否有異常值

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64
               age          bmi     children       charges
count  1338.000000  1338.000000  1338.000000   1338.000000
mean     39.207025    30.663397     1.094918  13270.422265
std      14.049960     6.098187     1.205493  12110.011237
min      18.000000    15.960000     0.000000   1121.873900
25%      27.000000    26.296250     0.000000   4740.287150
50%      39.000000    30.400000     1.000000   9382.033000
75%      51.000000    34.693750     2.000000  16639.912515
max      64.000000    53.130000     5.000000  63770.428010


In [19]:
# 將類別型特徵轉換為二元變量
data['sex'] = pd.get_dummies(data['sex'], drop_first=True)
data['smoker'] = pd.get_dummies(data['smoker'], drop_first=True)

# 將居住區域特徵進行編碼
encoder = OneHotEncoder()
region_encoded = encoder.fit_transform(data['region'].values.reshape(-1, 1))
region_encoded_df = pd.DataFrame(region_encoded.toarray(), columns=encoder.get_feature_names_out(['region']))
data = pd.concat([data, region_encoded_df], axis=1)

# 將年紀轉換為出生年份
data['birth_year'] = 2023 - data['age']

# 將BMI進行標準化處理
scaler = StandardScaler()
data['bmi'] = scaler.fit_transform(data[['bmi']])

# 選擇特徵並切分資料集
X = data[['sex', 'bmi', 'children', 'smoker', 'region_northeast', 'region_northwest', 'region_southeast', 'region_southwest', 'birth_year']]
y = data['charges']

In [20]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
##Linear Regression 模型：



from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# 建立 Linear Regression 模型
lr_model = LinearRegression()

# 訓練模型
lr_model.fit(X_train, y_train)

# 預測測試集
lr_y_pred = lr_model.predict(X_test)

# 計算模型評估指標
lr_mse = mean_squared_error(y_test, lr_y_pred)
lr_r2 = r2_score(y_test, lr_y_pred)

print("Linear Regression 模型的 MSE 為：", lr_mse)
print("Linear Regression 模型的 R2 score 為：", lr_r2)

Linear Regression 模型的 MSE 為： 33596915.85136145
Linear Regression 模型的 R2 score 為： 0.7835929767120724


In [22]:
##SVM (SVR) 模型

from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# 建立 SVM (SVR) 模型
svm_model = SVR(kernel='linear')

# 訓練模型
svm_model.fit(X_train, y_train)

# 預測測試集
svm_y_pred = svm_model.predict(X_test)

# 計算模型評估指標
svm_mse = mean_squared_error(y_test, svm_y_pred)
svm_r2 = r2_score(y_test, svm_y_pred)

print("SVM (SVR) 模型的 MSE 為：", svm_mse)
print("SVM (SVR) 模型的 R2 score 為：", svm_r2)

SVM (SVR) 模型的 MSE 為： 166061861.6390888
SVM (SVR) 模型的 R2 score 為： -0.06965036070448227


In [23]:
##Decision Tree 模型：

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 建立 Decision Tree 模型
dt_model = DecisionTreeRegressor(random_state=42)

# 訓練模型
dt_model.fit(X_train, y_train)

# 預測測試集
dt_y_pred = dt_model.predict(X_test)

# 計算模型評估指標
dt_mse = mean_squared_error(y_test, dt_y_pred)
dt_r2 = r2_score(y_test, dt_y_pred)

print("Decision Tree 模型的 MSE 為：", dt_mse)
print("Decision Tree 模型的 R2 score 為：", dt_r2)

Decision Tree 模型的 MSE 為： 42187803.87714855
Decision Tree 模型的 R2 score 為： 0.7282566918790956


In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

