In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error
from scipy.stats import ttest_ind

%matplotlib inline

In [95]:
## Hyperparameters ##

# Integer seed; presumably meant to be fed to the random number generators
# and estimators used in this notebook so that runs are repeatable.
seed = 222

In [100]:
## Create synthetic data, optionally with feature interactions ##

# Seed NumPy's global RNG so the generated data is identical on every
# Restart & Run All (the `seed` hyperparameter was defined but never applied).
np.random.seed(seed)

n_samples = 10000

# True  -> label contains interaction terms (case 1 in the conclusion).
# False -> label is purely additive in x1 and x2 (case 2 in the conclusion).
use_interaction = False

# x1, x2: integers drawn from {1, ..., 9} (upper bound of randint is exclusive).
# x3: uniform on [3, 4).
# Every feature is strictly positive, so y is never 0 and MAPE stays well defined.
x1 = np.random.randint(1, 10, size=(n_samples, 1))
x2 = np.random.randint(1, 10, size=(n_samples, 1))
x3 = np.random.rand(n_samples, 1) + 3

# Ground-truth label
y = 3*x1 + 4*x2
if use_interaction:
    y = y + (x1*x2) * (x2**3) + (x1*x3) + (x2*x3)
y = y.flatten()  # (n_samples, 1) -> (n_samples,)

y.shape

(10000,)

In [101]:
## Prepare the data ##

# Stack the three feature columns side by side into one (n_samples, 3) matrix.
x = np.concatenate((x1, x2, x3), axis=1)
print(f'x.shape: {x.shape}')

# Train/test split (75% / 25%). `random_state=seed` makes the shuffle
# repeatable, so the metrics reported later do not drift between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=seed
)

# Standardization: skipped. Per the original author's note, the features are
# drawn from uniform distributions, so no rescaling is applied.

x.shape: (10000, 3)


In [102]:
## Train the models ##

# Every estimator with internal randomness gets `random_state=seed`, so
# refitting on the same data yields the same model (and the same metrics).
# LinearRegression is deterministic and takes no random_state.
lr = LinearRegression()
rf = RandomForestRegressor(random_state=seed)
dt = DecisionTreeRegressor(max_depth=5, random_state=seed)
gdbt = GradientBoostingRegressor(random_state=seed)

lr.fit(x_train, y_train)
rf.fit(x_train, y_train)
dt.fit(x_train, y_train)
gdbt.fit(x_train, y_train)

GradientBoostingRegressor()

In [103]:
## Evaluate the models


def _score_models(models, features, targets, split_label):
    """Predict with every model on one data split and report MSE / MAPE.

    Parameters
    ----------
    models : dict
        Maps a display name (e.g. 'LR') to an already fitted estimator
        exposing ``predict``.
    features : array-like
        Feature matrix of the split to evaluate.
    targets : array-like
        True labels of the same split.
    split_label : str
        Text inserted in the printed line to identify the split.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``(predictions, mse, mape)``, each keyed by the model display name.
        MAPE is the raw value returned by scikit-learn (a fraction, not a
        percentage in [0, 100]).
    """
    predictions = {name: model.predict(features) for name, model in models.items()}
    mse = {name: mean_squared_error(targets, pred) for name, pred in predictions.items()}
    mape = {
        name: mean_absolute_percentage_error(targets, pred)
        for name, pred in predictions.items()
    }
    for name in models:
        print(f'{name} {split_label} mse: {mse[name]}, mape: {mape[name]}')
    return predictions, mse, mape


# Display name -> fitted estimator; the order fixes the order of the report.
model_names = ('LR', 'RF', 'DT', 'GDBT')
models = dict(zip(model_names, (lr, rf, dt, gdbt)))

# Training set: shows how well each model can fit the data.
pred_train, mse_train, mape_train = _score_models(models, x_train, y_train, '訓練')

print()

# Test set: shows how well each model generalizes.
pred_test, mse_test, mape_test = _score_models(models, x_test, y_test, '測試')

# Keep the original flat variable names available for any later cell.
y_train_lr, y_train_rf, y_train_dt, y_train_gdbt = (pred_train[k] for k in model_names)
mse_train_lr, mse_train_rf, mse_train_dt, mse_train_gdbt = (mse_train[k] for k in model_names)
mape_train_lr, mape_train_rf, mape_train_dt, mape_train_gdbt = (mape_train[k] for k in model_names)

y_test_lr, y_test_rf, y_test_dt, y_test_gdbt = (pred_test[k] for k in model_names)
mse_test_lr, mse_test_rf, mse_test_dt, mse_test_gdbt = (mse_test[k] for k in model_names)
mape_test_lr, mape_test_rf, mape_test_dt, mape_test_gdbt = (mape_test[k] for k in model_names)

LR 訓練 mse: 2.9415541890344107e-27, mape: 1.9095484356949923e-15
RF 訓練 mse: 0.0, mape: 0.0
DT 訓練 mse: 4.151310218572137, mape: 0.06256624775097677
GDBT 訓練 mse: 0.01075708011551011, mape: 0.002460783066353675

LR 測試 mse: 2.936447313896918e-27, mape: 1.965514712040778e-15
RF 測試 mse: 0.0, mape: 0.0
DT 測試 mse: 4.302734115385284, mape: 0.06482421694092047
GDBT 測試 mse: 0.011191071820831917, mape: 0.0026016823722905622


## 結論
- 將上面的 `y` 改成下列兩種形式，分別重新執行整份筆記本並比較結果：
    1. `y = 3*x1 + 4*x2 + (x1*x2) * (x2**3) + (x1*x3) + (x2*x3)`：標籤與特徵的關係**需要**特徵之間的交互作用。
    2. `y = 3*x1 + 4*x2`：標籤與特徵的關係**不需要**特徵之間的交互作用。
- 可以從 MSE、MAPE 等指標觀察到：
    - 在 `1.` 的狀況下，LR 的表現非常差，而其他（樹系）模型都具備一定的特徵交互建模能力。
    - 在 `2.` 的狀況下，LR 的模型假設正好符合資料的生成方式，幾乎完美地解決問題（上方的輸出即為此狀況：LR 的 MSE 約為 3e-27）。
    
