# Machine Learning

# Install Package

In [None]:
# Uncomment to install every third-party package imported by this notebook.
# %pip install scikit-learn pandas numpy xgboost lightgbm tqdm matplotlib shap

# Loading Dataset

In [None]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer

In [None]:
# Load the breast-cancer dataset and combine the feature matrix with the
# label vector into a single DataFrame (label stored in the 'target' column).
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

In [None]:
# Show the DataFrame summary (column dtypes, non-null counts, memory usage)
df.info()

In [None]:
# Preview the first five rows
df.head(5)

In [None]:
# Preview the last five rows
df.tail(5)

In [None]:
# Descriptive statistics for the columns
df.describe()

# Check Target Distribution

In [None]:
# Count how many samples fall into each class of the 'target' column
# (load_breast_cancer encodes 0 = malignant, 1 = benign).
target_distribution = df['target'].value_counts()
print(target_distribution)

# Data Preprocessing

In [None]:
# Count the missing values in every column
missing_values = df.isnull().sum()
print(missing_values)

## Create Missing Values

In [None]:
# Fix the random seed so the injected missing values are reproducible
np.random.seed(0)

# Only feature columns may be blanked out; the label column is never touched
columns_to_select = df.columns[:-1]  # every column except 'target'

# Number of cells to blank out: 10% of the *feature* cells
# (df.size would also count the 'target' column and overstate the total).
nan_fraction = 0.1
num_feature_cells = len(df) * len(columns_to_select)
num_nans = int(np.floor(nan_fraction * num_feature_cells))

# Draw distinct cells (replace=False) so exactly num_nans values become NaN;
# drawing row/column pairs independently can pick the same cell repeatedly.
flat_positions = np.random.choice(num_feature_cells, size=num_nans, replace=False)
row_positions, col_positions = np.unravel_index(
    flat_positions, (len(df), len(columns_to_select))
)

# Blank the chosen cells in one vectorised write instead of a per-cell loop
feature_values = df[columns_to_select].to_numpy(dtype=float, copy=True)
feature_values[row_positions, col_positions] = np.nan
df[columns_to_select] = feature_values

# Count the missing values per column after the injection
missing_values = df.isnull().sum()
print(missing_values)

In [None]:
# Preview the first five rows after injecting missing values
df.head(5)

## Processing the Missing Values

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

### 簡單插補法 `SimpleImputer`

In [None]:
# 1. Mean imputation: replace NaN in the selected columns with the column mean
mean_fill_columns = ['mean radius', 'mean texture', 'mean perimeter']
imputer_mean = SimpleImputer(strategy='mean').fit(df[mean_fill_columns])
df[mean_fill_columns] = imputer_mean.transform(df[mean_fill_columns])

In [None]:
# 2. Median imputation: replace NaN in the selected columns with the column median
median_fill_columns = ['mean area', 'mean smoothness']
imputer_median = SimpleImputer(strategy='median')
df[median_fill_columns] = imputer_median.fit_transform(df[median_fill_columns])

In [None]:
# 3. Mode imputation: replace NaN in the selected columns with the most frequent value
mode_fill_columns = ['mean compactness', 'mean concavity']
imputer_mode = SimpleImputer(strategy='most_frequent')
df[mode_fill_columns] = imputer_mode.fit_transform(df[mode_fill_columns])

### 線性插補法 `interpolate`
- 一種用於填充缺失值的方法
- 使用已知資料點之間的直線來估計缺失值
- 適用於時間序列數據或具有連續性特徵的數據

``` python
Before
   mean concave points  mean symmetry
0                  0.1            0.2
1                  NaN            0.3
2                  0.2            NaN
3                  NaN            0.5
4                  0.4            0.6

After
   mean concave points  mean symmetry
0                  0.1            0.2
1                  0.15           0.3
2                  0.2            0.4
3                  0.3            0.5
4                  0.4            0.6

```

線性插補會在 Index 0 和 Index 2 的已知資料之間畫一條直線。這條直線的公式是 y = mx + b，其中 m 是斜率，b 是截距。斜率 m 是 (0.2 - 0.1) / (2 - 0) = 0.1 / 2 = 0.05，因此 Index 1 的估計值為 0.1 + 0.05 × 1 = 0.15。

In [None]:
# 4. Linear interpolation: estimate each NaN from the surrounding known values.
# limit_direction='both' also fills NaNs at the very start of a column; the
# default forward-only direction would leave those leading NaNs in place.
interpolate_fill_columns = ['mean concave points', 'mean symmetry']
df[interpolate_fill_columns] = df[interpolate_fill_columns].interpolate(
    method='linear', limit_direction='both'
)

### KNN插補法 `KNNImputer`
- 使用最近鄰居算法（K-Nearest Neighbors, KNN）來填補缺失值的一種方法
- 基本思想是用資料集中與缺失值最近的K個數據點的值來估計該缺失值
- 適用於資料間具有局部相似性或聚類特性的情況
- `KNNImputer` 中的 `n_neighbors` 表示指定用於填補缺失值的最近鄰居數量，不包含本身的缺失值
- 在填補缺失值中，`KNNImputer` 會找到資料集中與缺失值最近的 `n_neighbors` 個有效資料點，然後使用這些鄰居的值（通常是均值）來填補缺失值


``` python
Before
   mean fractal dimension  radius error
0                     0.10           1.1
1                     0.15           NaN
2                      NaN           1.3
3                     0.20           1.4
4                     0.25           NaN

After
   mean fractal dimension  radius error
0                    0.10      1.100000
1                    0.15      1.266667
2                    0.175     1.300000
3                    0.20      1.400000
4                    0.25      1.266667
```
使用 KNN 算法計算出的鄰居值如下：

對於 mean fractal dimension 欄位的索引 2 的缺失值：

最近的鄰居是索引 0, 1, 3, 4，平均值是 (0.1 + 0.15 + 0.2 + 0.25) / 4 = 0.175

對於 radius error 欄位的索引 1 的缺失值：

最近的鄰居是索引 0, 2, 3，平均值是 (1.1 + 1.3 + 1.4) / 3 = 1.2667

對於 radius error 欄位的索引 4 的缺失值：

索引 1 本身的 radius error 也是缺失值，無法作為鄰居值；有效的鄰居是索引 0, 2, 3，平均值是 (1.1 + 1.3 + 1.4) / 3 = 1.2667


In [None]:
# 5. KNN imputation: fill each NaN from the 5 nearest rows (n_neighbors=5)
knn_fill_columns = ['mean fractal dimension', 'radius error']
knn_imputer = KNNImputer(n_neighbors=5).fit(df[knn_fill_columns])
df[knn_fill_columns] = knn_imputer.transform(df[knn_fill_columns])

### 插補特定數值

In [None]:
# 6. Constant fill: replace NaN in the selected columns with the fixed value 0
specific_value_fill_columns = ['texture error', 'perimeter error']
df[specific_value_fill_columns] = df[specific_value_fill_columns].fillna(value=0)

### Re-check Missing Values

In [None]:
# Re-count the missing values per column after the imputations so far
missing_values_after = df.isna().sum()
print(missing_values_after)

### 多重插補法 `IterativeImputer`
- 用於填補資料集中的缺失值
- 迭代地使用其他特徵來估計缺失值
- 優點在於可充分利用數據集中所有可用的信息來進行插補，通常比簡單的插補方法（如均值插補或中位數插補）更加精確


常用參數

`max_iter`默認值：10

說明：最大迭代次數。插補過程將重複這麼多次，直到結果收斂或達到最大迭代次數。

`n_nearest_features`默認值：None

說明：用於插補的最近鄰居特徵的數量。如果設置為 None，則使用所有特徵。

`initial_strategy`默認值：'mean'

說明：初始插補策略，用於第一次填補缺失值。選項有 'mean'、'median'、'most_frequent'、'constant'。

`imputation_order`默認值：'ascending'

說明：插補順序。選項有 'ascending'（從缺失值最少的特徵開始）、'descending'（從缺失值最多的特徵開始）、'roman'（依特徵欄位由左至右）、'arabic'（依特徵欄位由右至左）和 'random'（隨機順序）。

`random_state`默認值：None

說明：控制隨機數生成器的種子，以保證結果的可重現性。

In [None]:
# Columns not covered by the earlier imputation steps
remaining_columns = [
    'area error', 'smoothness error', 'compactness error',
    'concavity error', 'concave points error', 'symmetry error',
    'fractal dimension error',
    'worst radius', 'worst texture', 'worst perimeter', 'worst area',
    'worst smoothness', 'worst compactness', 'worst concavity',
    'worst concave points', 'worst symmetry', 'worst fractal dimension',
]

# 7. Iterative imputation: start from a median fill, then refine the columns
#    in ascending order, using all features, for at most 50 rounds.
iterative_imputer = IterativeImputer(
    initial_strategy='median',
    imputation_order='ascending',
    n_nearest_features=None,
    max_iter=50,
    random_state=0,
)

df[remaining_columns] = iterative_imputer.fit(df[remaining_columns]).transform(df[remaining_columns])

In [None]:
# Final count of missing values per column after all imputation steps
missing_values_final = df.isna().sum()
print(missing_values_final)

# Data Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, MaxAbsScaler

In [None]:
# Preview the first five rows before scaling
df.head(5)

## Min-Max Scaling（最小-最大縮放）
- 將資料縮放到 [0, 1] 的範圍內。

In [None]:
# 1. Min-Max scaling applied to the whole frame
#    (the 'target' column is passed through the scaler as well).
scaler = MinMaxScaler()
normalized_df_min_max = pd.DataFrame(
    data=scaler.fit(df).transform(df),
    columns=df.columns,
)
normalized_df_min_max

In [None]:
# Name of the label column, which is kept out of the scaling
target_column = 'target'

# Split the frame into the label column and the feature columns
target = df[target_column]
features = df.drop(columns=[target_column])

# Fit Min-Max scaling on the feature columns only
scaler = MinMaxScaler()
normalized_min_max_features = pd.DataFrame(
    scaler.fit_transform(features),
    columns=features.columns,
)

# Re-attach the unscaled label column as the last column
normalized_df_min_max = normalized_min_max_features.assign(
    **{target_column: target.to_numpy()}
)
normalized_df_min_max

## Z-Score Standardization（標準化）
- 將資料縮放為均值為0，標準差為1。

In [None]:
# 2. Z-score standardisation applied to the whole frame
#    (the 'target' column is passed through the scaler as well).
scaler = StandardScaler()
normalized_df_z_score = pd.DataFrame(
    data=scaler.fit(df).transform(df),
    columns=df.columns,
)
normalized_df_z_score

In [None]:
# Z-score standardisation on the feature columns only.
# Create the scaler here rather than reusing whatever `scaler` the previously
# executed cell left behind, so this cell always applies a StandardScaler.
scaler = StandardScaler()
normalized_z_score_features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

# Re-attach the unscaled label column
normalized_df_z_score = pd.concat([normalized_z_score_features, target.reset_index(drop=True)], axis=1)
normalized_df_z_score

## Robust Scaler（穩健縮放器）
- 基於中位數和四分位數範圍進行縮放，對於含有離群值的資料集更為穩健。

In [None]:
# 3. Robust scaling on the feature columns only (median / IQR based)
scaler = RobustScaler()
normalized_robust_features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

# Re-attach the unscaled label column
normalized_df_robust = pd.concat([normalized_robust_features, target.reset_index(drop=True)], axis=1)
normalized_df_robust


## Max Abs Scaler（最大絕對值縮放器）
- 將資料縮放到 [-1, 1] 的範圍內，適合稀疏資料集。

In [None]:
# 4. Max-Abs scaling on the feature columns only (divide by each column's max |value|)
scaler = MaxAbsScaler()
normalized_max_abs_features = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

# Re-attach the unscaled label column
normalized_df_max_abs = pd.concat([normalized_max_abs_features, target.reset_index(drop=True)], axis=1)
normalized_df_max_abs


# Split Train & Test Dataset

In [None]:
from sklearn.model_selection import train_test_split

# Separate features from the label, then hold out 20% of the rows for testing
y = df['target']
X = df.drop(columns='target')
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

# Model Training & Evaluate Model Performance

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

### 樸素貝葉斯（Naive Bayes, NB）

In [None]:
from sklearn.naive_bayes import GaussianNB

# Build the Gaussian Naive Bayes model and train it on the training split
GNB_model = GaussianNB().fit(X_train, y_train)

# Predict labels for the held-out test split
predictions = GNB_model.predict(X_test)

In [None]:
# Display the predicted labels for the test split
predictions

In [None]:
# Report each headline metric for the current predictions on the test split
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, predictions))

In [None]:
# Per-class precision / recall / F1 report, printed with 5 decimal places
print(classification_report(y_test, predictions, digits=5))

### 邏輯回歸（Logistic Regression, LR）

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the logistic-regression model (max_iter=10000) and train it
LR_model = LogisticRegression(max_iter=10000).fit(X_train, y_train)

# Predict on the test split and print the per-class report
predictions = LR_model.predict(X_test)
lr_report = classification_report(y_test, predictions, digits=5)
print(lr_report)

### 決策樹（Decision Tree, DT）

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Build the decision-tree model (fixed random_state for reproducible trees) and train it
DT_model = DecisionTreeClassifier(random_state=42)
DT_model.fit(X_train, y_train)

# Predict on the test split and print the per-class report
predictions = DT_model.predict(X_test)
print(classification_report(y_test, predictions, digits=5))

### 隨機森林（Random Forest, RF）

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Build the random-forest model (fixed random_state for reproducible forests) and train it
RF_model = RandomForestClassifier(random_state=42)
RF_model.fit(X_train, y_train)

# Predict on the test split and print the per-class report
predictions = RF_model.predict(X_test)
print(classification_report(y_test, predictions, digits=5))

### 支持向量機（Support Vector Machine, SVM）

In [None]:
from sklearn import svm

# Build the support-vector classifier with default settings and train it
SVM_model = svm.SVC().fit(X_train, y_train)

# Predict on the test split and print the per-class report
predictions = SVM_model.predict(X_test)
svm_report = classification_report(y_test, predictions, digits=5)
print(svm_report)

### 多層感知機（Multilayer Perceptron, MLP）

In [None]:
from sklearn.neural_network import MLPClassifier

# Build the multilayer perceptron (two hidden layers: 50 and 30 units).
# random_state is fixed so weight initialisation, and therefore the reported
# metrics, are reproducible across runs.
mlp = MLPClassifier(hidden_layer_sizes=(50, 30), max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)

# Predict on the test split and print the per-class report
predictions = mlp.predict(X_test)
print(classification_report(y_test, predictions, digits=5))

### XGBoost

In [None]:
import xgboost as xgb

# Build the XGBoost classifier (fixed random_state for reproducibility) and train it
XGB_model = xgb.XGBClassifier(random_state=42)
XGB_model.fit(X_train, y_train)

# Predict on the test split and print the per-class report
predictions = XGB_model.predict(X_test)
print(classification_report(y_test, predictions, digits=5))

### LightGBM

In [None]:
import lightgbm as lgb

# Build the LightGBM classifier (verbose=-1 silences its logging) and train it
LGBM_model = lgb.LGBMClassifier(verbose=-1).fit(X_train, y_train)

# Predict on the test split and print the per-class report
predictions = LGBM_model.predict(X_test)
lgbm_report = classification_report(y_test, predictions, digits=5)
print(lgbm_report)

# K Fold Cross Validation

In [None]:
# Dictionary of classifiers for baseline comparison.
# Each key is a short model label; the cross-validation cell reuses it as the
# file-name prefix for saved results and models, and the plotting cells list
# the same labels, so a key must not be renamed in isolation.
# NOTE(review): 'CNB' labels a GaussianNB model — presumably meant 'GNB'; confirm
# before renaming, because the saved file names depend on it.
classifiers = {
    'CNB': GaussianNB(),
    'LR': LogisticRegression(random_state=42, max_iter=10000),
    'DT': DecisionTreeClassifier(random_state=42),
    'RF': RandomForestClassifier(random_state=42),
    'SVM': svm.SVC(probability=True, random_state=42),  # probability=True enables predict_proba
    'MLP': MLPClassifier(random_state=42, max_iter=1000),  # higher max_iter to avoid convergence problems
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1),
}

In [None]:
from sklearn.model_selection import KFold
from tqdm import tqdm
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
import pickle

# Features and label used for cross-validation
X = df.drop('target', axis=1)  # drop the label column, keep the features
y = df['target']  # label column

# 10-fold cross-validation with shuffling; fixed seed so the folds are reproducible
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Evaluate every classifier in the dictionary on the same folds
for name, classifier in classifiers.items():
    y_trues = []  # true labels collected across folds
    y_preds = []  # predicted labels collected across folds
    y_probs = []  # predicted positive-class probabilities collected across folds

    # kf.split() is a generator without a length, so pass the fold count
    # explicitly; otherwise the progress bar cannot show percentage or ETA.
    fold_iterator = tqdm(kf.split(X), total=kf.get_n_splits(), desc=f'{name} 10-Fold CV')
    for train_index, val_index in fold_iterator:
        # Positional selection of this fold's training and validation rows
        X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
        y_train_fold, y_val_fold = y.iloc[train_index], y.iloc[val_index]

        # Train on the fold's training rows
        classifier.fit(X_train_fold, y_train_fold)
        # Predicted labels for the validation rows
        y_pred = classifier.predict(X_val_fold)
        # Predicted probability of class 1 (second column of predict_proba)
        y_prob = classifier.predict_proba(X_val_fold)[:, 1]

        # Accumulate this fold's results
        y_trues.extend(y_val_fold)
        y_preds.extend(y_pred)
        y_probs.extend(y_prob)

    # Print the report computed over the pooled out-of-fold predictions
    print(f'{name} 10-Fold Cross Validation Report:\n')
    print(classification_report(y_trues, y_preds, digits=5))

    # Convert the accumulated lists to NumPy arrays
    y_trues = np.array(y_trues)
    y_preds = np.array(y_preds)
    y_probs = np.array(y_probs)

    # One row per validation sample: model label, prediction, truth, probability
    df_results = pd.DataFrame({
        "model_name": [name] * len(y_trues),
        "model_preds": y_preds,
        "model_labels": y_trues,
        "model_prob": y_probs
    })

    # Save the out-of-fold results to a pickle file
    result_path = f"{name}_10Folds.pkl"
    df_results.to_pickle(result_path)

    # Save the model to a pickle file.
    # NOTE: `classifier` holds the fit from the last fold only (trained on that
    # fold's training rows), not a model trained on the full dataset.
    model_path = f"{name}_model.pkl"
    with open(model_path, 'wb') as file:
        pickle.dump(classifier, file)


# Plot the Curve

## AUROC (ROC curve)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Labels of the models whose saved cross-validation results are plotted
model_names = ['CNB', 'LR', 'DT', 'RF', 'SVM', 'MLP', 'XGBoost', 'LightGBM']

# Square figure with a white plotting area and a dark-grey frame
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_facecolor("white")
for side in ("top", "bottom", "left", "right"):
    ax.spines[side].set_color("#3C3C3C")

# One ROC curve per model, built from its pooled out-of-fold predictions
for name in model_names:
    df_results = pd.read_pickle(f"{name}_10Folds.pkl")

    # True labels and predicted positive-class probabilities
    y_true = df_results["model_labels"]
    y_prob = df_results["model_prob"]

    # False-positive rate, true-positive rate and area under the curve
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)

    ax.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.4f})')

# Grid lines
ax.grid(True, color="#9D9D9D")

# Diagonal reference line (chance level)
ax.plot([0, 1], [0, 1], color='#BEBEBE', linestyle='--')

# Axis ticks every 0.2 and axis labels
axis_ticks = np.arange(0.0, 1.05, step=0.2)
ax.set_xticks(axis_ticks)
ax.set_xlabel("1-Specificity", fontsize=14)
ax.set_yticks(axis_ticks)
ax.set_ylabel("Sensitivity", fontsize=14)

# Title and legend
ax.set_title('Receiver Operating Characteristic Curve', fontweight='bold', fontsize=15)
ax.legend(prop={'size': 13}, loc='lower right', facecolor="white")

# Save the figure as a PNG, then display it
fig.savefig("AUROC.png", dpi=300)
plt.show()

## AUPRC (PR Curve)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, average_precision_score

# Labels of the models whose saved cross-validation results are plotted
model_names = ['CNB', 'LR', 'DT', 'RF', 'SVM', 'MLP', 'XGBoost', 'LightGBM']

# Figure size
fig = plt.figure(figsize=(10, 10))

# White plotting area with a dark-grey frame
ax = plt.axes()
ax.set_facecolor("white")
ax.spines["top"].set_color("#3C3C3C")
ax.spines["bottom"].set_color("#3C3C3C")
ax.spines["left"].set_color("#3C3C3C")
ax.spines["right"].set_color("#3C3C3C")

# Share of positive samples; it is the precision of a no-skill classifier
positive_rate = None

for name in model_names:

    # Load the saved out-of-fold results for this model
    df_results = pd.read_pickle(f"{name}_10Folds.pkl")

    # True labels and predicted positive-class probabilities
    y_true = df_results["model_labels"]
    y_prob = df_results["model_prob"]
    positive_rate = float(np.mean(y_true))

    # Precision/recall pairs for the curve
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    # The legend reports "AP", so compute average precision itself rather than
    # a trapezoidal area (which also relied on `auc` imported by another cell).
    pr_auc = average_precision_score(y_true, y_prob)

    # Draw the PR curve
    plt.plot(recall, precision, lw=2, label=f'{name} (AP = {pr_auc:.4f})')

# Grid lines
plt.grid(True, color="#9D9D9D")

# No-skill baseline: a horizontal line at the positive-class prevalence
# (a line at y = 0 would coincide with the x-axis and carry no information).
if positive_rate is not None:
    plt.plot([0, 1], [positive_rate, positive_rate], color='#BEBEBE', linestyle='--')

# X-axis ticks and label
plt.xticks(np.arange(0.0, 1.05, step=0.2))
plt.xlabel("Recall", fontsize=14)

# Y-axis ticks and label
plt.yticks(np.arange(0.0, 1.05, step=0.2))
plt.ylabel("Precision", fontsize=14)

# Title
plt.title('Precision-Recall Curve', fontweight='bold', fontsize=15)

# Legend
plt.legend(prop={'size': 13}, loc='lower left', facecolor="white")

# Save the figure as a PNG
plt.savefig("PR_curve.png", dpi=300)

# Display the figure
plt.show()

# SHAP
- [SHAP Website](https://shap.readthedocs.io/en/latest/#)
- TreeExplainer (XGBoost/LightGBM/CatBoost/scikit-learn models)

In [None]:
%pip install shap

In [None]:
import shap

# Initialise SHAP's JavaScript support in the notebook
# (presumably only needed for SHAP's interactive plots — confirm against the SHAP docs).
shap.initjs()

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

In [None]:
# Reload the original dataset (without the injected and imputed values) for the SHAP section
data = load_breast_cancer()
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Separate the label from the features
y = df['target']
X = df.drop(columns='target')

# Hold out 10% of the rows as a test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=42,
)


In [None]:
# Train a LightGBM classifier on the full dataset (X, y) for the SHAP analysis.
# The seed is passed as `random_state`, the scikit-learn API parameter used by
# the other models in this notebook, instead of the `random_seed` alias.
model = lgb.LGBMClassifier(verbose=-1, random_state=12)
model.fit(X, y)

In [None]:
# Build a tree explainer for the fitted model and compute SHAP explanations
# for every row of X (y is passed along as the second argument).
explainer = shap.TreeExplainer(model)
shap_values = explainer(X, y)

In [None]:
# Beeswarm plot of the SHAP values
shap.plots.beeswarm(shap_values)

In [None]:
# Beeswarm plot of the absolute SHAP values, drawn with the "shap_red" colour
shap.plots.beeswarm(shap_values.abs, color="shap_red")

In [None]:
# Waterfall plot for the first explained sample (index 0)
shap.plots.waterfall(shap_values[0])

In [None]:
# Bar plot of the SHAP values, showing at most 30 entries
shap.plots.bar(shap_values, max_display=30)