# 複雜度比較

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Hold-out evaluation of a plain linear regression on the "best" dataset.
bike = pd.read_csv("gongguan_best.csv")
print("Best")

# Every column except "lent" is used as a predictor; "lent" is the target.
X = bike.drop(["lent"], axis=1)
y = bike["lent"]

# Keep 30% of the rows for validation. The fixed random_state makes the split,
# and therefore the scores printed below, reproducible on Restart & Run All.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)

lm = LinearRegression()
lm.fit(train_X, train_y)
print("R Square")
print("Train:", lm.score(train_X, train_y))

predicted_y = lm.predict(valid_X)  # predictions for the held-out rows

# Validation R^2 computed by hand: 1 - (mean squared residual) / (mean squared
# deviation of valid_y from its own mean).
rss = ((predicted_y - valid_y) ** 2).mean()
tss = ((valid_y.mean() - valid_y) ** 2).mean()
print("Val:  ", 1 - rss/tss)  # validation R^2 (a score, not an error)

Best
R Square
Train: 0.800273443236809
Val:   0.7534535669144474


In [2]:
# Hold-out evaluation of a plain linear regression on the "underfitting" dataset.
bike = pd.read_csv("gongguan_underfitting.csv")
print("Underfitting")

# Every column except "lent" is used as a predictor; "lent" is the target.
X = bike.drop(["lent"], axis=1)
y = bike["lent"]

# Keep 30% of the rows for validation. The fixed random_state makes the split,
# and therefore the scores printed below, reproducible on Restart & Run All.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)

lm = LinearRegression()
lm.fit(train_X, train_y)
print("R Square")
print("Train:", lm.score(train_X, train_y))

predicted_y = lm.predict(valid_X)  # predictions for the held-out rows

# Validation R^2 computed by hand: 1 - (mean squared residual) / (mean squared
# deviation of valid_y from its own mean).
rss = ((predicted_y - valid_y) ** 2).mean()
tss = ((valid_y.mean() - valid_y) ** 2).mean()
print("Val:  ", 1 - rss/tss)  # validation R^2 (a score, not an error)

Underfitting
R Square
Train: 0.7422284511430886
Val:   0.6461010872035304


In [3]:
# Hold-out evaluation of a plain linear regression on the "overfitting" dataset.
bike = pd.read_csv("gongguan_overfitting.csv")
print("Overfitting")

# Every column except "lent" is used as a predictor; "lent" is the target.
X = bike.drop(["lent"], axis=1)
y = bike["lent"]

# Keep 30% of the rows for validation. The fixed random_state makes the split,
# and therefore the scores printed below, reproducible on Restart & Run All.
train_X, valid_X, train_y, valid_y = train_test_split(
    X, y, test_size=0.3, random_state=42
)

lm = LinearRegression()
lm.fit(train_X, train_y)
print("R Square")
print("Train:", lm.score(train_X, train_y))

predicted_y = lm.predict(valid_X)  # predictions for the held-out rows

# Validation R^2 computed by hand: 1 - (mean squared residual) / (mean squared
# deviation of valid_y from its own mean).
rss = ((predicted_y - valid_y) ** 2).mean()
tss = ((valid_y.mean() - valid_y) ** 2).mean()
print("Val:  ", 1 - rss/tss)  # validation R^2 (a score, not an error)

Overfitting
R Square
Train: 0.8931718850012087
Val:   0.45045128313723004


# K-Fold交叉驗證

In [4]:
from sklearn.model_selection import cross_val_score

# Report the mean of the four cross-validation fold scores of a plain linear
# regression for each dataset, in the order listed here.
datasets = [
    ("Best", "gongguan_best.csv"),
    ("Underfitting", "gongguan_underfitting.csv"),
    ("Overfitting", "gongguan_overfitting.csv"),
]

for position, (label, csv_path) in enumerate(datasets):
    if position > 0:
        # Separator goes between reports only, never after the last one.
        print("---------")

    bike = pd.read_csv(csv_path)

    # "lent" is the target; all remaining columns are predictors.
    X = bike.drop(columns=["lent"])
    y = bike["lent"]

    lm = LinearRegression()
    print(label)
    print(cross_val_score(lm, X, y, cv=4).mean())

Best
0.7511195859500663
---------
Underfitting
0.7017800502203373
---------
Overfitting
0.21382132935774767


# 懲罰模型複雜度

In [7]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso

# Compare unpenalised linear regression with Lasso at several penalty
# strengths, using the mean 4-fold cross-validation score on the "best" data.
bike = pd.read_csv("gongguan_best.csv")
print("27自變數")

# "lent" is the target; all remaining columns are predictors.
X = bike.drop(columns=["lent"])
y = bike["lent"]

# Baseline without any penalty.
lm_basic = LinearRegression()
baseline_scores = cross_val_score(lm_basic, X, y, cv=4)
print(baseline_scores.mean())

# One line of output per Lasso penalty, smallest alpha first.
for alpha in (0.001, 0.005, 0.01):
    lm_lasso = Lasso(alpha=alpha, max_iter=1000000)
    lasso_scores = cross_val_score(lm_lasso, X, y, cv=4)
    print(lasso_scores.mean())

27自變數
0.7511195859500663
0.7511427036130731
0.7511577441276536
0.7509168574390377


In [8]:
# Compare unpenalised linear regression with Lasso at several penalty
# strengths, using the mean 4-fold cross-validation score on the
# "overfitting" data.
bike = pd.read_csv("gongguan_overfitting.csv")
print("144自變數")

# "lent" is the target; all remaining columns are predictors.
X = bike.drop(columns=["lent"])
y = bike["lent"]

# Baseline without any penalty.
lm_basic = LinearRegression()
baseline_scores = cross_val_score(lm_basic, X, y, cv=4)
print(baseline_scores.mean())

# One line of output per Lasso penalty, smallest alpha first.
for alpha in (1, 2, 5):
    lm_lasso = Lasso(alpha=alpha, max_iter=1000000)
    lasso_scores = cross_val_score(lm_lasso, X, y, cv=4)
    print(lasso_scores.mean())

144自變數
0.21382132935774767
0.7864888874448437
0.7894659619266512
0.7884804368690677
