# 復發預測
利用Deephit 全連接 + 多任務學習 + 殘差網路

## 資料讀取


In [15]:
import pandas as pd
import openpyxl

# Read the Excel file
df = pd.read_excel('.\\data\\answer.xlsx')

# Display the first few rows of the dataframe
df.head()

Unnamed: 0,ID,復發,身分證統一編號,性別,出生日期,戶籍地址代碼,診斷年齡,癌症發生順序號碼,個案分類,診斷狀態分類,...,SSF2,SSF3,SSF4,SSF5,SSF6,SSF7,SSF8,SSF9,SSF10,復發資訊
0,201802000141,V,T120626632,1,19591226,729,57,1,1,1,...,10,20,40,10,0,0,0,60,,復發
1,201802000193,V,E201069975,2,19471020,736,69,1,2,2,...,20,988,220,0,0,10,0,988,,復發
2,201802000211,V,S120316738,1,19570114,738,59,1,2,2,...,10,999,200,0,0,0,0,50,,復發
3,201802000214,V,E220800558,2,19670930,733,49,1,1,1,...,20,30,200,0,0,10,0,988,,復發
4,201802000233,V,E100776335,1,19400303,732,77,2,1,1,...,10,10,450,0,0,10,0,988,,復發


In [16]:
# 保留所需的列
columns_to_keep = [
    'ID', '復發', '原發部位', '側性', '組織型態', '性態碼', '腫瘤大小', '區域淋巴檢查數目', '區域淋巴結侵犯數目', 
    '臨床 T', '臨床 N', '臨床 M', '臨床期別組合', '臨床分期字根字首', '病理 T', '病理 N', '病理 M', 
    '病理期別組合', '病理分期字根字首', '身高', '體重', '吸菸行為', '嚼檳榔行為', '喝酒行為','存活狀態'
]
df_filtered = df.filter(columns_to_keep)

# 顯示過濾後的數據框
df_filtered.head()

Unnamed: 0,ID,復發,原發部位,側性,組織型態,性態碼,腫瘤大小,區域淋巴檢查數目,區域淋巴結侵犯數目,臨床 T,...,病理 N,病理 M,病理期別組合,病理分期字根字首,身高,體重,吸菸行為,嚼檳榔行為,喝酒行為,存活狀態
0,201802000141,V,C209,0,8140,3,46,0,98,3,...,X,B,2A,4,171,81,989800,88,9,0
1,201802000193,V,C199,0,8140,3,45,29,0,3,...,0,B,2A,0,151,48,88,88,0,0
2,201802000211,V,C209,0,8480,3,36,25,24,3,...,2B,B,3C,4,160,74,88,88,0,0
3,201802000214,V,C199,0,8140,3,50,12,0,4B,...,0,B,2A,4,162,51,88,88,0,1
4,201802000233,V,C199,0,8140,3,48,12,0,4B,...,0,B,1,4,157,55,88,88,0,0


In [17]:
from sklearn.preprocessing import LabelEncoder

# 對非數字欄位進行編碼
label_encoders = {}
for column in df_filtered.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df_filtered[column] = le.fit_transform(df_filtered[column].astype(str))
    label_encoders[column] = le

# 將所有欄位轉換為數字
df_filtered = df_filtered.apply(pd.to_numeric, errors='coerce')

# 顯示轉換後的數據框
df_filtered.head()


Unnamed: 0,ID,復發,原發部位,側性,組織型態,性態碼,腫瘤大小,區域淋巴檢查數目,區域淋巴結侵犯數目,臨床 T,...,病理 N,病理 M,病理期別組合,病理分期字根字首,身高,體重,吸菸行為,嚼檳榔行為,喝酒行為,存活狀態
0,201802000141,0,9,0,8140,3,46,0,98,2,...,11,0,1,4,171,81,989800,88,9,0
1,201802000193,0,8,0,8140,3,45,29,0,2,...,0,0,1,0,151,48,88,88,0,0
2,201802000211,0,9,0,8480,3,36,25,24,2,...,9,0,6,4,160,74,88,88,0,0
3,201802000214,0,8,0,8140,3,50,12,0,6,...,0,0,1,4,162,51,88,88,0,1
4,201802000233,0,8,0,8140,3,48,12,0,6,...,0,0,0,4,157,55,88,88,0,0


In [59]:
import torch

import torch.nn as nn
import torch.nn.functional as F

# Adjust the model to handle dynamic number of tasks
class DynamicMultiTaskModel(nn.Module):
    def __init__(self, input_dim, shared_dim, task_dims):
        super(DynamicMultiTaskModel, self).__init__()
        
        # Shared layers
        self.shared_fc1 = nn.Linear(input_dim, shared_dim)
        self.shared_fc2 = nn.Linear(shared_dim, shared_dim)
        self.shared_fc3 = nn.Linear(shared_dim, shared_dim)
        
        # Task specific layers
        self.task_layers = nn.ModuleList()
        for task_dim in task_dims:
            task_layer = nn.ModuleList([
                nn.Linear(shared_dim, task_dim),
                nn.Linear(task_dim, task_dim),
                nn.Linear(task_dim, 1)  # Assuming binary classification for each task
            ])
            self.task_layers.append(task_layer)
        
    def forward(self, x):
        # Shared layers with residual connections
        shared_out = F.relu(self.shared_fc1(x))
        shared_out = F.relu(self.shared_fc2(shared_out) + shared_out)
        shared_out = F.relu(self.shared_fc3(shared_out) + shared_out)
        
        # Task specific layers
        task_outputs = []
        for task_layer in self.task_layers:    
            task_out = F.relu(task_layer[0](shared_out))
            task_out = F.relu(task_layer[1](task_out) + task_out)
            task_out = torch.sigmoid(task_layer[2](task_out))
            task_outputs.append(task_out)
        return task_outputs

# Example usage
input_dim = 23  # Number of input features
shared_dim = 64  # Dimension of shared layers
task_dims = [1,1]  # Dimensions of task specific layers for each task

model = DynamicMultiTaskModel(input_dim, shared_dim, task_dims)
print(model)

DynamicMultiTaskModel(
  (shared_fc1): Linear(in_features=23, out_features=64, bias=True)
  (shared_fc2): Linear(in_features=64, out_features=64, bias=True)
  (shared_fc3): Linear(in_features=64, out_features=64, bias=True)
  (task_layers): ModuleList(
    (0-1): 2 x ModuleList(
      (0): Linear(in_features=64, out_features=1, bias=True)
      (1-2): 2 x Linear(in_features=1, out_features=1, bias=True)
    )
  )
)


In [71]:
from sklearn.model_selection import train_test_split

import torch.optim as optim

# 將資料分成訓練集和測試集
X = df_filtered.drop(columns=['復發', '存活狀態']).values
y = df_filtered[['復發', '存活狀態']].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 將資料轉換為 PyTorch 張量
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# 定義損失函數和優化器
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# 訓練模型
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    # 前向傳播
    outputs = model(X_train_tensor)
    #print(outputs)
    # 計算損失
    loss = sum(criterion(output.squeeze(), y_train_tensor.squeeze()[:, i]) for i, output in enumerate(outputs))
    
    # 反向傳播和優化
    loss.backward()
    optimizer.step()
    
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')
        
        # 驗證模型
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test_tensor)
            test_loss = sum(criterion(test_output.squeeze(), y_test_tensor.squeeze()[:, i]) for i, test_output in enumerate(test_outputs))
            print(f'eval Loss: {test_loss.item():.4f}')
            # 計算準確度
            correct = 0
            total = 0
            for i, test_output in enumerate(test_outputs):
                predicted = (test_output.squeeze() > 0.5).float()
                total += y_test_tensor.squeeze()[:, i].size(0)
                correct += (predicted == y_test_tensor.squeeze()[:, i]).sum().item()
            
            accuracy = correct / total
            print(f'Accuracy: {accuracy:.4f}')

Epoch [10/100], Loss: 132.2581
eval Loss: 136.1702
Accuracy: 0.3191
Epoch [20/100], Loss: 132.2581
eval Loss: 136.1702
Accuracy: 0.3191
Epoch [30/100], Loss: 132.2581
eval Loss: 136.1702
Accuracy: 0.3191
Epoch [40/100], Loss: 132.2581
eval Loss: 136.1702
Accuracy: 0.3191
Epoch [50/100], Loss: 132.2581
eval Loss: 136.1702
Accuracy: 0.3191
Epoch [60/100], Loss: 132.2581
eval Loss: 136.1702
Accuracy: 0.3191
Epoch [70/100], Loss: 132.2581
eval Loss: 136.1702
Accuracy: 0.3191
Epoch [80/100], Loss: 132.2581
eval Loss: 136.1702
Accuracy: 0.3191
Epoch [90/100], Loss: 132.2581
eval Loss: 136.1702
Accuracy: 0.3191
Epoch [100/100], Loss: 132.2581
eval Loss: 136.1702
Accuracy: 0.3191


In [67]:
y_train_tensor.squeeze()[:, i].shape

torch.Size([186])