In [None]:
# Dependency installation for uv-managed projects. This is a shell command,
# not Python, so leaving it bare in a code cell raises a SyntaxError and stops
# "Run All". Run it in a terminal (or prefix it with "!" in this cell) instead.
# uv add scikit-learn seaborn statsmodels

In [None]:
# Alternative installation through the system pip3 (disabled; the %pip cell
# below is the active installer).
# !pip3 install scikit-learn
# !pip3 install seaborn
# !pip3 install statsmodels --upgrade

In [None]:
# Install the packages this notebook imports, using the %pip magic.
%pip install scikit-learn
%pip install seaborn
# --upgrade also replaces an already-installed statsmodels with the newest release.
%pip install statsmodels --upgrade

In [None]:
# List the installed packages to confirm the installs above succeeded.
%pip list

In [None]:
# Conda-based alternative (disabled).
# NOTE(review): these lines look like pip syntax carried over to conda — the
# package is normally named "scikit-learn", a git+https URL is a pip feature,
# and "--upgrade" is a pip flag. Verify the commands before enabling them.
#!conda install sklearn
#!conda install git+https://github.com/statsmodels/statsmodels
#!conda install statsmodels --upgrade

In [None]:
# !conda list

In [None]:
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression, Ridge, Lasso

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from collections import Counter

# from sklearn.datasets import load_boston
# Use seaborn's 'darkgrid' style for the plots drawn in this notebook.
sns.set_style('darkgrid')

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and convergence
# warnings from the libraries used below — consider a narrower filter.
warnings.filterwarnings("ignore")

## Purpose
(1) 請問每個地區的預測房價是多少？
(2) 哪些因素會影響房價？

# 1. import data

### <span style="color:#3498DB">Point: 了解資料形式</span>

該數據收集於 1978 年，共有 506 個樣本，每個樣本代表波士頓的一個郊區；資料包含 14 個欄位（13 個特徵加上目標變數 MEDV）。

In [None]:
# Load the Boston housing CSV, then relabel its 14 columns with the upper-case
# names used throughout the notebook (MEDV, the last one, is the target).
dataset = pd.read_csv('BostonHousing.csv', encoding='UTF-8', sep=',')
dataset.columns = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

In [None]:
# Former loading path through sklearn's load_boston (disabled; the CSV is used instead).
# boston_dataset = load_boston()
# dataset = pd.DataFrame(boston_dataset.data, columns = boston_dataset.feature_names)
# Show each column's dtype and non-null count.
dataset.info()

In [None]:
# Preview the first rows to see what the data looks like.
dataset.head()

### <span style="color:#3498DB">Point: 請了解變數定義</span>

### <span style="color:#3498DB">Point: 確認預測目標Y </span>

In [None]:
# dataset['MEDV'] = boston_dataset.target

# 2. Data preprocessing

In [None]:
# Preview the raw rows again before preprocessing.
dataset.head()

In [None]:
# List the column names (13 features followed by MEDV).
dataset.columns

## 標準化（此處採用 Min-Max 縮放，將各特徵轉換到 0～1）

* 當數據集中的不同特徵之間的尺度差異很大時，擬合的線性迴歸模型可能會受到影響，因為較大的特徵尺度可能會支配著較小的特徵尺度。
* 在這種情況下，標準化可以幫助線性迴歸模型更好地捕捉特徵之間的關係，提高模型的準確性和穩定性。
* 如果特徵之間的尺度差異較小，則標準化可能不是必要的。在這種情況下，可以直接使用原始數據進行線性迴歸建模。
* 在某些情況下，標準化可能會導致特徵之間的相關性變弱，因此需要根據具體的數據集和問題來決定是否需要進行標準化。

In [None]:
from sklearn import preprocessing  # feature-scaling utilities

# Continuous columns to rescale. CHAS is a 0/1 dummy variable and is left out,
# as is the target MEDV.
continuous_features = dataset[
    ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE',
     'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
]

# Min-max scaling (default range 0..1). The fitted scaler is kept in `minmax`
# so the same transformation can be applied to new observations later.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test
# split made further down, so test rows influence the scaling ranges. Consider
# fitting on the training rows only.
minmax = preprocessing.MinMaxScaler()
x_minmax = minmax.fit_transform(continuous_features)

In [None]:
# fit_transform returned a bare ndarray; wrap it in a DataFrame again so the
# scaled columns keep their names.
x_minmax = pd.DataFrame(
    data=x_minmax,
    columns=[
        'CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE',
        'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
    ],
)

In [None]:
# Append the unscaled CHAS dummy (categorical, so it was not rescaled) to the
# right of the scaled columns. CHAS therefore ends up as the LAST feature.
scaled_part = x_minmax.reset_index(drop=True)
chas_part = dataset['CHAS']
res_minmax = pd.concat([scaled_part, chas_part], axis=1)
res_minmax

## 設定特徵和標籤

In [None]:
# Feature matrix: the first 13 columns of res_minmax, i.e. the 12 scaled
# columns followed by CHAS (note: not the column order of `dataset`).
X = res_minmax.iloc[:, :13].to_numpy()
# X = dataset.iloc[:, 0:13].values  # unscaled alternative
# Target: column 13 of `dataset` (MEDV), reshaped into a single column.
y = dataset.iloc[:, 13].to_numpy().reshape(-1, 1)

In [None]:
# Split the data into training and test sets.
from sklearn.model_selection import train_test_split  # splitting helper
# 30% of the rows are held out for testing; random_state pins the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)

In [None]:
# Report the shape of every split. Per the original notes the training set
# holds 354 rows and the test set 152 rows, each with 13 feature columns.
print(f"Shape of X_train:{X_train.shape}")
print(f"Shape of X_test:{X_test.shape}")
print(f"Shape of y_train:{y_train.shape}")
# The y_test label was missing its colon, unlike the three lines above.
print(f"Shape of y_test:{y_test.shape}")

# 3. Descriptive statistics

### <span style="color:#3498DB">Point: 請確認是否有NA</span>

In [None]:
# Count missing values per column.
dataset.isnull().sum()

### <span style="color:#3498DB">Point: 請確認各變數的基礎統計量</span>

In [None]:
# Summary statistics for every column.
dataset.describe()

In [None]:
# Order the rows by crime rate, highest first, to inspect the extreme values.
dataset.sort_values(by=['CRIM'], ascending=False)

### <span style="color:#3498DB">Point: 請觀察各X與Y的關係</span>

In [None]:
# Column names, as a reference for choosing the variables plotted below.
dataset.columns

In [None]:
# Pairwise scatter plots of the first group of features against MEDV.
sns.pairplot(dataset[['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'MEDV']])

# CRIM: per-capita crime rate by town
# ZN: proportion of residential land zoned for lots over 25,000 sq. ft.
# INDUS: proportion of non-retail business acres per town
# CHAS: Charles River dummy variable (1 if the tract bounds the river, else 0)
# NOX: nitric oxide concentration (the original note says parts per million —
#      TODO confirm the unit against the dataset documentation)
# RM: average number of rooms per dwelling
# MEDV: median value of owner-occupied homes (in $1000s)

In [None]:
# Pairwise scatter plots of the remaining features against MEDV.
sns.pairplot(dataset[['AGE', 'DIS', 'RAD', 'TAX','PTRATIO', 'B', 'LSTAT', 'MEDV']])
# AGE: proportion of owner-occupied units built before 1940
# DIS: weighted distance to five Boston employment centres
# RAD: index of accessibility to radial highways
# TAX: full-value property-tax rate per $10,000
# PTRATIO: pupil-teacher ratio by town
# B: computed as 1000(Bk - 0.63)^2, where Bk is the proportion of African
#    American residents by town
# LSTAT: percentage of lower-status population
# MEDV: median value of owner-occupied homes (in $1000s)

# 4. Correlation
### <span style="color:#3498DB">Point1: 請觀察各變數之間的關係</span>
### <span style="color:#3498DB">Point2: 請判斷各變數之間是否有共線關係</span>

In [None]:
# Pairwise correlation matrix of every column, drawn as an annotated heatmap.
corrmat = dataset.corr()
fig, ax = plt.subplots(figsize=(18, 10))
sns.heatmap(data=corrmat, annot=True, annot_kws={'size': 12}, ax=ax)
# Reading guide (correlation ranges from -1 to 1):
#   strong positive (> 0.7):  the two variables rise and fall together.
#   strong negative (< -0.7): the two variables move in opposite directions.
#   near zero (-0.1 to 0.1):  little linear relationship; such a variable may
#                             be a candidate to drop from the prediction.

# 5. Modeling

## 5-1 Linear Regression (PPT 1.1 & 1.2)

In [None]:
from sklearn.linear_model import LinearRegression
# Linear regression model.
regressor_linear = LinearRegression()
# Fit it on the training split.
regressor_linear.fit(X_train, y_train)

### <span style="color:#3498DB">Point: 請觀察模型是否穩定</span>

In [None]:
from sklearn.metrics import r2_score

# R2 (coefficient of determination): the share of the variance of y that the
# regression on X explains. Similar train and test values suggest a stable model.
y_pred_linear_train = regressor_linear.predict(X_train)
r2_score_linear_train = r2_score(y_train, y_pred_linear_train)

y_pred_linear_test = regressor_linear.predict(X_test)
r2_score_linear_test = r2_score(y_test, y_pred_linear_test)

# RMSE (root mean squared error): typical size of the gap between predicted
# and actual values, in the units of y. Lower is better.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_linear_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_linear_test))

print('R2_score (train): ', r2_score_linear_train)
print('R2_score (test): ', r2_score_linear_test)
# Both RMSE lines used to print the same "RMSE: " label, so the train and test
# values could not be told apart in the output.
print("RMSE (train): ", rmse_train)
print("RMSE (test): ", rmse_test)

### <span style="color:#3498DB">Point: 請找出顯著與不顯著的變數</span>

In [None]:
import statsmodels.api as sm
from scipy import stats

In [None]:
# Fit ordinary least squares with statsmodels to get per-coefficient p-values.
# This uses the whole dataset (X, y), not only the training split.
#
# X is a bare ndarray, so passing it directly makes the summary label the rows
# x1..x13. The columns of X follow res_minmax (CHAS last), not `dataset`, so
# those anonymous labels are easy to misread. Wrapping X in a DataFrame puts
# the real feature names in the summary table.
X2 = sm.add_constant(pd.DataFrame(X, columns=res_minmax.columns[:13]))
est = sm.OLS(y, X2).fit()
print(est.summary())
# Common significance thresholds for the P>|t| column: p < .05, p < .01, p < .001

In [None]:
# Column overview again, as a reference when reading the regression summary.
dataset.info()

### <span style="color:#3498DB">Test: 請將不顯著的變數與您覺得可能具有共線性的變數進行處理，再次跑迴歸，並觀察各類檢測數值是否有優化</span>

### <span style="color:#3498DB">Point: 預測結果</span>

In [None]:
# Raw (unscaled) values of one new observation. They are keyed by column name
# so their order cannot drift from the training matrix. The original positional
# array appeared to follow the `dataset` column order (CHAS in fourth position).
new_observation = pd.DataFrame([{
    'CRIM': 2.9850e-02, 'ZN': 0.0000e+00, 'INDUS': 2.1800e+00, 'CHAS': 0.0000e+00,
    'NOX': 4.5800e-01, 'RM': 6.4300e+00, 'AGE': 5.8700e+01, 'DIS': 6.0622e+00,
    'RAD': 3.0000e+00, 'TAX': 2.2200e+02, 'PTRATIO': 1.8700e+01, 'B': 3.9412e+02,
    'LSTAT': 5.2100e+00,
}])

# The model was trained on res_minmax: 12 min-max-scaled columns followed by
# the unscaled CHAS dummy. Feeding raw values in `dataset` order (as before)
# gives a meaningless prediction, so apply the same fitted scaler and the same
# column order first.
feature_order = list(res_minmax.columns[:13])
scaled_columns = [name for name in feature_order if name != 'CHAS']
new_scaled = pd.DataFrame(
    minmax.transform(new_observation[scaled_columns]),
    columns=scaled_columns,
)
new_scaled['CHAS'] = new_observation['CHAS'].to_numpy()

# 1 x 13 array, laid out exactly like the rows of X_train.
to_be_predicted = new_scaled[feature_order].to_numpy()
predicted_price = regressor_linear.predict(to_be_predicted)

In [None]:
# Display the predicted MEDV for the new observation.
predicted_price

---

## 5-2 Polynomial regression (PPT 1.3)

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features into all polynomial terms up to degree 2, then
# fit a plain linear regression on the expanded matrix.
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)
# The former `poly_reg.fit(X_poly, y_train)` was removed: it refitted the
# expander on the already-expanded matrix, leaving it expecting X_poly's
# column count instead of the 13 original features.
regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_poly, y_train)

### <span style="color:#3498DB">Point: 請比較Linear Regression和Polynomial regression的差異，如兩個模型之間的穩定度及誤差 </span>
### <span style="color:#3498DB">Question: 你會選擇哪個模型？</span>

In [None]:
# Fit the polynomial expander on the training features only, then reuse it
# unchanged for both splits. The previous code called fit_transform on each
# split, refitting the transformer on the test data.
poly_reg.fit(X_train)

y_pred_poly2_train = regressor_poly2.predict(poly_reg.transform(X_train))
r2_score_poly2_train = r2_score(y_train, y_pred_poly2_train)

y_pred_poly2_test = regressor_poly2.predict(poly_reg.transform(X_test))
r2_score_poly2_test = r2_score(y_test, y_pred_poly2_test)

# NOTE: rmse_train / rmse_test reuse the names from the linear-regression cell
# and therefore overwrite those values.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_poly2_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_poly2_test))

print('R2_score (train): ', r2_score_poly2_train)
print('R2_score (test): ', r2_score_poly2_test)
# Label the two RMSE values so train and test can be told apart.
print("RMSE (train): ", rmse_train)
print("RMSE (test): ", rmse_test)

### <span style="color:#3498DB">Point: 預測結果</span>

In [None]:
# Raw (unscaled) values of one new observation. They are keyed by column name
# so their order cannot drift from the training matrix. The original positional
# array appeared to follow the `dataset` column order (CHAS in fourth position).
new_observation = pd.DataFrame([{
    'CRIM': 2.9850e-02, 'ZN': 0.0000e+00, 'INDUS': 2.1800e+00, 'CHAS': 0.0000e+00,
    'NOX': 4.5800e-01, 'RM': 6.4300e+00, 'AGE': 5.8700e+01, 'DIS': 6.0622e+00,
    'RAD': 3.0000e+00, 'TAX': 2.2200e+02, 'PTRATIO': 1.8700e+01, 'B': 3.9412e+02,
    'LSTAT': 5.2100e+00,
}])

# The polynomial model was trained on res_minmax: 12 min-max-scaled columns
# followed by the unscaled CHAS dummy. Apply the same fitted scaler and column
# order before expanding the row.
feature_order = list(res_minmax.columns[:13])
scaled_columns = [name for name in feature_order if name != 'CHAS']
new_scaled = pd.DataFrame(
    minmax.transform(new_observation[scaled_columns]),
    columns=scaled_columns,
)
new_scaled['CHAS'] = new_observation['CHAS'].to_numpy()

# 1 x 13 array, laid out exactly like the rows of X_train.
to_be_predicted = new_scaled[feature_order].to_numpy()
# transform (not fit_transform): new data must not refit the expander. This
# relies on the evaluation cell above having run, which leaves poly_reg fitted
# on 13 input features.
predicted_price = regressor_poly2.predict(poly_reg.transform(to_be_predicted))
predicted_price

## 5-3 Lasso regression (PPT 1.4)

alpha：懲罰強度（λ），通常介於 1～10 之間；數值越大，係數被壓縮得越多（下方範例使用 alpha = 0.05）。

In [None]:
# Column names of the raw dataset.
# NOTE(review): the model's feature order is that of res_minmax (CHAS last),
# not this one.
dataset.columns

In [None]:
# Lasso (L1-penalised) linear regression; alpha sets the penalty strength.
lasso = Lasso(alpha=0.05)  # optional: positive=True
lasso.fit(X_train, y_train)

# Predictions on both splits.
y_pred_lasso_train = lasso.predict(X_train)
y_pred_lasso_test = lasso.predict(X_test)

# The estimator's own score() method gives the R2 for each split.
r2_score_lasso_train = lasso.score(X_train, y_train)
r2_score_lasso_test = lasso.score(X_test, y_test)

# Root mean squared error for each split.
rmse_lasso_train = np.sqrt(mean_squared_error(y_train, y_pred_lasso_train))
rmse_lasso_test = np.sqrt(mean_squared_error(y_test, y_pred_lasso_test))



Lasso Regression，使用L1正則化，一些參數的值可能會被壓縮到0，因此coef向量中相應的元素也會為0，代表這些特徵在模型中被認為是不重要的。     
可以進行特徵選擇，提高模型的性能和穩定性。

In [None]:
# Report the Lasso R2 and RMSE for the training and test splits.
print("R2_score (train): ",r2_score_lasso_train)
print("R2_score (test):", r2_score_lasso_test)
print("train_RMSE: ", rmse_lasso_train)
print("test_RMSE: ", rmse_lasso_test)

In [None]:
print("coef：")
# Pair each coefficient with the column it was actually trained on. The model
# saw the columns of res_minmax (12 scaled features, then CHAS), so labelling
# with dataset.columns shifted every name from CHAS onward.
# np.ravel flattens coef_ in case the estimator stored it as a 2-D array.
for feature_name, coefficient in zip(res_minmax.columns, np.ravel(lasso.coef_)):
    print(feature_name, coefficient)
# Coefficients printed as 0 are features Lasso considers unimportant.

## 5-4 Ridge regression (PPT 1.4)

In [None]:
# Ridge (L2-penalised) linear regression; alpha sets the penalty strength.
ridgeReg = Ridge(alpha=0.05)

ridgeReg.fit(X_train, y_train)

# Predictions and R2 (via the estimator's score method) for both splits.
y_pred_ridge_train = ridgeReg.predict(X_train)
r2_score_ridge_train = ridgeReg.score(X_train, y_train)

y_pred_ridge_test = ridgeReg.predict(X_test)
r2_score_ridge_test = ridgeReg.score(X_test, y_test)

# Root mean squared error for both splits.
rmse_ridge_train = np.sqrt(mean_squared_error(y_train, y_pred_ridge_train))
rmse_ridge_test = np.sqrt(mean_squared_error(y_test, y_pred_ridge_test))

print("coef：")
# y_train is a column vector (n, 1), so Ridge stores coef_ with shape
# (1, n_features). Zipping that directly produced a single pair: the first
# column name next to the whole coefficient array. Flatten it first, and label
# with the columns the model was trained on (res_minmax, CHAS last) rather
# than dataset.columns.
for feature_name, coefficient in zip(res_minmax.columns, np.ravel(ridgeReg.coef_)):
    print(feature_name, coefficient)

# NOTE(review): the original note here said the most important variable is
# nitric oxide (NOX). That reading was made with the mislabelled output —
# re-check it against the corrected listing.

Ridge Regression使用L2正則化來限制模型參數的大小，從而降低模型的複雜度並避免過度擬合。
L2正則化不會將參數壓縮到0，因此coef向量中的每個元素都對模型的預測有貢獻。

In [None]:
# Report the Ridge R2 and RMSE for the training and test splits.
print("R2_score (train): ",r2_score_ridge_train)
print("R2_score (test):", r2_score_ridge_test)
print("train_RMSE: ", rmse_ridge_train)
print("test_RMSE: ", rmse_ridge_test)