# 特徵工程（Feature Engineering）
特徵工程是構建更好模型的關鍵部分。以下是使用 scikit-learn 進行特徵工程的一些常用技術，例如主成分分析（PCA）、特徵選擇和抽樣等。

In [95]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, chi2, RFE, SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np


In [96]:
# 加載乳腺癌資料集
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

In [97]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [98]:
# 分離特徵和目標欄位
target_column = 'target'
features = df.drop(columns=[target_column])
target = df[target_column]

In [99]:
# 原始資料集
datasets = {
    "原始資料集": (features, target)
}

## 主成分分析（PCA）

In [100]:
# 進行 PCA
pca = PCA(n_components=2)  # 將維度降到2維
principal_components = pca.fit_transform(features)
pca_df = pd.DataFrame(data=principal_components, columns=['principal_component_1', 'principal_component_2'])
datasets["PCA 資料集"] = (pca_df, target)

# 將目標欄位添加回去
pca_df = pd.concat([pca_df, target.reset_index(drop=True)], axis=1)
pca_df.head()

Unnamed: 0,principal_component_1,principal_component_2,target
0,1160.142574,-293.917544,0
1,1269.122443,15.630182,0
2,995.793889,39.156743,0
3,-407.180803,-67.38032,0
4,930.34118,189.340742,0


## 特徵選擇（Feature Selection）
特徵選擇是從數據中選擇最重要的特徵，以提高模型的性能。

- 使用卡方檢驗（Chi-Square Test）

卡方檢驗用於測試兩個分類變量之間的獨立性。這裡我們用它來選擇與目標變量最相關的特徵。

In [101]:
# 使用卡方檢驗進行特徵選擇
select_k_best = SelectKBest(chi2, k=2)
selected_features = select_k_best.fit_transform(features, target)
selected_df = pd.DataFrame(data=selected_features, columns=['selected_feature_1', 'selected_feature_2'])
datasets["卡方檢驗資料集"] = (selected_df, target)

# 將目標欄位添加回去
selected_df = pd.concat([selected_df, target.reset_index(drop=True)], axis=1)
selected_df.head()

Unnamed: 0,selected_feature_1,selected_feature_2,target
0,1001.0,2019.0,0
1,1326.0,1956.0,0
2,1203.0,1709.0,0
3,386.1,567.7,0
4,1297.0,1575.0,0


## 使用遞歸特徵消除（RFE）
遞歸特徵消除（RFE）是通過遞歸地訓練模型，並每次消除表現最差的特徵來選擇特徵。

In [102]:
from sklearn.linear_model import LogisticRegression

# 使用遞歸特徵消除進行特徵選擇
model = LogisticRegression(max_iter=10000)
rfe = RFE(model, n_features_to_select=2)
fit = rfe.fit(features, target)

selected_features_rfe = features.loc[:, fit.support_]
datasets["RFE 資料集"] = (selected_features_rfe, target)

# 將目標欄位添加回去
selected_df_rfe = pd.concat([selected_features_rfe, target.reset_index(drop=True)], axis=1)
selected_df_rfe.head()


Unnamed: 0,worst compactness,worst concavity,target
0,0.6656,0.7119,0
1,0.1866,0.2416,0
2,0.4245,0.4504,0
3,0.8663,0.6869,0
4,0.205,0.4,0


In [103]:
from sklearn.ensemble import RandomForestClassifier

# 使用隨機森林進行特徵選擇
selector = SelectFromModel(RandomForestClassifier(n_estimators=100), threshold="mean")
selector.fit(features, target)
selected_features_sfm = selector.transform(features)
selected_df_sfm = pd.DataFrame(data=selected_features_sfm)

datasets["SelectFromModel 資料集"] = (selected_df_sfm, target)

# 將目標欄位添加回去
selected_df_sfm = pd.concat([selected_df_sfm, target.reset_index(drop=True)], axis=1)
selected_df_sfm.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,target
0,17.99,122.8,1001.0,0.3001,0.1471,25.38,184.6,2019.0,0.2654,0
1,20.57,132.9,1326.0,0.0869,0.07017,24.99,158.8,1956.0,0.186,0
2,19.69,130.0,1203.0,0.1974,0.1279,23.57,152.5,1709.0,0.243,0
3,11.42,77.58,386.1,0.2414,0.1052,14.91,98.87,567.7,0.2575,0
4,20.29,135.1,1297.0,0.198,0.1043,22.54,152.2,1575.0,0.1625,0


## 數據抽樣（Sampling）
處理不平衡數據集的一種方法是通過過採樣或下採樣來平衡類別分佈。

### 上採樣（Over-sampling）
上採樣是通過增加少數類別的樣本數量來平衡數據集。

In [104]:
# 原始資料集分佈
df['target'].value_counts()

target
1    357
0    212
Name: count, dtype: int64

In [105]:
# 將數據集分為多數類別和少數類別
majority = df[df[target_column] == df[target_column].value_counts().idxmax()]
minority = df[df[target_column] == df[target_column].value_counts().idxmin()]

# 上採樣少數類別
minority_upsampled = resample(minority,
                              replace=True,     # 允許放回抽樣
                              n_samples=len(majority),    # 使兩個類別的數量相同
                              random_state=42)  # 設定隨機種子

# 合併數據集
upsampled_df = pd.concat([majority, minority_upsampled])
upsampled_features = upsampled_df.drop(columns=[target_column])
upsampled_target = upsampled_df[target_column]
datasets["上採樣資料集"] = (upsampled_features, upsampled_target)

In [106]:
print(upsampled_target.value_counts())

target
1    357
0    357
Name: count, dtype: int64


## 下採樣（Under-sampling）
下採樣是通過減少多數類別的樣本數量來平衡數據集。

In [107]:
# 將數據集分為多數類別和少數類別
majority = df[df[target_column] == df[target_column].value_counts().idxmax()]
minority = df[df[target_column] == df[target_column].value_counts().idxmin()]

# 下採樣多數類別
majority_downsampled = resample(majority,
                                replace=False,    # 不允許放回抽樣
                                n_samples=len(minority),  # 使兩個類別的數量相同
                                random_state=42)  # 設定隨機種子

# 合併數據集
downsampled_df = pd.concat([majority_downsampled, minority])
downsampled_features = downsampled_df.drop(columns=[target_column])
downsampled_target = downsampled_df[target_column]
datasets["下採樣資料集"] = (downsampled_features, downsampled_target)

In [108]:
print(downsampled_target.value_counts())

target
1    212
0    212
Name: count, dtype: int64


## Model Traning Use Another Feature Engineering

In [109]:
datasets['原始資料集'][0].head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [110]:
datasets['原始資料集'][1].head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [111]:
datasets['PCA 資料集'][0].head()

Unnamed: 0,principal_component_1,principal_component_2
0,1160.142574,-293.917544
1,1269.122443,15.630182
2,995.793889,39.156743
3,-407.180803,-67.38032
4,930.34118,189.340742


In [112]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# 遍歷每個資料集並進行模型訓練和評估
for name, (X, y) in datasets.items():
    # 分割資料集
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 訓練模型
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)

    # 預測
    y_pred = gnb.predict(X_test)

    # 評估
    print(f"{name}分類報告:")
    print(classification_report(y_test, y_pred, digits=5))
    print("="*60)


原始資料集分類報告:
              precision    recall  f1-score   support

           0    1.00000   0.93023   0.96386        43
           1    0.95946   1.00000   0.97931        71

    accuracy                        0.97368       114
   macro avg    0.97973   0.96512   0.97158       114
weighted avg    0.97475   0.97368   0.97348       114

PCA 資料集分類報告:
              precision    recall  f1-score   support

           0    1.00000   0.83721   0.91139        43
           1    0.91026   1.00000   0.95302        71

    accuracy                        0.93860       114
   macro avg    0.95513   0.91860   0.93221       114
weighted avg    0.94411   0.93860   0.93732       114

卡方檢驗資料集分類報告:
              precision    recall  f1-score   support

           0    1.00000   0.86047   0.92500        43
           1    0.92208   1.00000   0.95946        71

    accuracy                        0.94737       114
   macro avg    0.96104   0.93023   0.94223       114
weighted avg    0.95147   0.94737   0