In [1]:
import pandas as pd
import numpy as np

# 1. **데이터 설명**
1. pathway_train, pathway_test, pathway_valid : CCLE에서 pathway score 계산 한 데이터
2. ic_train, ic_test, ic_val : 민감군, 저항군 라벨
3. ccle : 유전자 raw data -> 민감군, 저항군 간 유전자 발현량 차이가 가장 큰 유전자 3개를 genes에 저장
4. tcga: 환자 데이터로 약물 민감/저항 라벨 부착 예정

In [2]:
# Load every input table for one drug.
# The drug name and the expression-data directory are kept in two constants so the
# notebook can be pointed at another drug or machine by editing them in one place;
# the file paths built below are the same strings as before.
DRUG_NAME = "TOZASERTIB"
EXPR_DATA_DIR = "C:/Users/User/BAF-의료/data"

# Pathway scores for the train / test / validation splits.
pathway_train = pd.read_csv(f"train_pathway_score_{DRUG_NAME}.csv")
pathway_test = pd.read_csv(f"test_pathway_score_{DRUG_NAME}.csv")
pathway_valid = pd.read_csv(f"val_pathway_score_{DRUG_NAME}.csv")

# Sensitive / resistant labels for the same three splits.
ic_train = pd.read_csv(f"ic_train_{DRUG_NAME}.csv")
ic_test = pd.read_csv(f"ic_test_{DRUG_NAME}.csv")
ic_val = pd.read_csv(f"ic_val_{DRUG_NAME}.csv")

# CCLE gene expression; the first CSV column is used as the row index
# (the column itself is kept in the frame).
ccle = pd.read_csv(f"{EXPR_DATA_DIR}/#_filtered_CCLE_gene_expression.csv")
ccle.index = ccle["Unnamed: 0"]

# TCGA pathway scores and TCGA gene expression.
tcga_path = pd.read_csv(f"tcga_pathway_score_{DRUG_NAME}.csv")
tcga = pd.read_csv(f"{EXPR_DATA_DIR}/TCGA_final_0419.csv")


In [3]:
def _index_by(frame, column):
    """Return `frame` indexed by `column`.

    If the frame's index is already named `column` (i.e. this cell has been run
    before), the frame is returned unchanged, so re-running the cell is safe.
    Otherwise `DataFrame.set_index` is used, which raises KeyError when the
    column does not exist.
    """
    if frame.index.name == column:
        return frame
    return frame.set_index(column)


# Pathway-score tables are keyed by sample ID.
pathway_train = _index_by(pathway_train, "SampleID")
pathway_test = _index_by(pathway_test, "SampleID")
pathway_valid = _index_by(pathway_valid, "SampleID")

# Label tables carry their row key in the unnamed first CSV column.
ic_train = _index_by(ic_train, "Unnamed: 0")
ic_test = _index_by(ic_test, "Unnamed: 0")
ic_val = _index_by(ic_val, "Unnamed: 0")

# TCGA tables: pathway scores by sample ID, expression by the unnamed first column.
tcga_path = _index_by(tcga_path, "SampleID")
tcga = _index_by(tcga, "Unnamed: 0")

In [4]:
# The three genes whose expression differs most between the sensitive and
# resistant groups, as determined by the DEG analysis.
genes = [
    "CCND2",
    "LYZ",
    "SELPLG",
]

# 2. **전처리**
1. pathway score scale
2. pathway score에 대해 PCA 진행 후 90% 설명력까지 사용
3. train 기준으로 fit하고 test, valid, TCGA에는 transform만 해주기
4. ccle, tcga 유전자들도 scale해주기
5. ic라벨 인코딩

In [5]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Standardise the pathway scores before PCA (done again here even if the scores
# were already z-scored). The scaler is fitted on the training split only and is
# then applied as-is to the test, validation and TCGA tables.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(pathway_train)
X_test_scaled, X_val_scaled, tcga_path_scaled = (
    scaler.transform(table) for table in (pathway_test, pathway_valid, tcga_path)
)

# PCA fitted on the scaled training split; n_components=0.90 keeps as many
# leading components as are needed to explain 90% of the variance.
pca = PCA(n_components=0.90)
pathway_pca_train = pca.fit_transform(X_scaled)
pathway_pca_test, pathway_pca_valid, pathway_pca_tcga = (
    pca.transform(scaled) for scaled in (X_test_scaled, X_val_scaled, tcga_path_scaled)
)


In [6]:
# Wrap the PCA arrays back into DataFrames: columns are named PC1..PCk (k = number
# of components PCA retained) and rows keep the sample IDs of the source table.
# The frames are only built here, not displayed.
n_components_kept = pathway_pca_train.shape[1]
pca_columns = [f"PC{k}" for k in range(1, n_components_kept + 1)]

pca_train_df_all = pd.DataFrame(pathway_pca_train, columns=pca_columns, index=pathway_train.index)
pca_valid_df_all = pd.DataFrame(pathway_pca_valid, columns=pca_columns, index=pathway_valid.index)
pca_test_df_all = pd.DataFrame(pathway_pca_test, columns=pca_columns, index=pathway_test.index)
pca_tcga_df_all = pd.DataFrame(pathway_pca_tcga, columns=pca_columns, index=tcga_path.index)


In [7]:
# Keep only the selected genes from the CCLE expression matrix.
ccle = ccle[genes]

# Expression rows for each split, looked up by the sample IDs of the matching
# pathway-score table (a missing ID raises KeyError).
ccle_expr_train = ccle.loc[pathway_train.index]
ccle_expr_valid = ccle.loc[pathway_valid.index]
ccle_expr_test = ccle.loc[pathway_test.index]

# log2(x + 1) transform of every split.
ccle_log_train = np.log2(ccle_expr_train + 1)
ccle_log_valid = np.log2(ccle_expr_valid + 1)
ccle_log_test = np.log2(ccle_expr_test + 1)

# Gene-level standardisation, fitted on the training split only.
# NOTE: this rebinds `scaler`, replacing the pathway-score scaler created earlier.
scaler = StandardScaler()
ccle_scaled_train = pd.DataFrame(
    scaler.fit_transform(ccle_log_train),
    index=ccle_expr_train.index,
    columns=ccle_expr_train.columns,
)
ccle_scaled_valid = pd.DataFrame(
    scaler.transform(ccle_log_valid),
    index=ccle_expr_valid.index,
    columns=ccle_expr_valid.columns,
)
ccle_scaled_test = pd.DataFrame(
    scaler.transform(ccle_log_test),
    index=ccle_expr_test.index,
    columns=ccle_expr_test.columns,
)

In [8]:
# Keep only the selected genes from the TCGA expression matrix.
tcga = tcga[genes]

# Align patients to the TCGA pathway-score table, the same way the CCLE splits are
# aligned to their pathway-score indexes. Looking rows up by the frame's own index
# would be a no-op for unique IDs and would multiply rows for duplicated IDs; this
# lookup instead raises KeyError if a pathway-scored sample has no expression row,
# rather than letting NaN rows appear when the two tables are concatenated later.
tcga_expr = tcga.loc[tcga_path.index]
tcga_log = np.log2(tcga_expr + 1)

# `scaler` here must be the gene-level StandardScaler fitted on the CCLE training
# split (the most recent binding of `scaler`), so TCGA is expressed in CCLE units.
# NOTE(review): TCGA and CCLE expression may be on different scales; check that the
# scaled TCGA gene columns are roughly centred before trusting predictions.
tcga_scaled = pd.DataFrame(
    scaler.transform(tcga_log),
    columns=tcga_expr.columns,
    index=tcga_expr.index,
)


In [9]:
# Final feature tables: PCA pathway components placed side by side with the
# scaled gene-expression columns (rows are matched on the index).
X_train = pd.concat([pca_train_df_all, ccle_scaled_train], axis="columns")
X_valid = pd.concat([pca_valid_df_all, ccle_scaled_valid], axis="columns")
X_test = pd.concat([pca_test_df_all, ccle_scaled_test], axis="columns")
X_tcga = pd.concat([pca_tcga_df_all, tcga_scaled], axis="columns")

# One more standardisation over the combined features; statistics come from the
# training table only and are reused for the other three.
final_scaler = StandardScaler()
X_train_final = final_scaler.fit_transform(X_train)
X_valid_final, X_test_final, X_tcga_final = (
    final_scaler.transform(features) for features in (X_valid, X_test, X_tcga)
)

# Display the (unscaled) combined TCGA feature table.
X_tcga

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC41,PC42,PC43,PC44,PC45,PC46,PC47,CCND2,LYZ,SELPLG
TCGA.A2.A25D.01,-0.701719,-1.510414,0.070322,1.757629,2.429550,1.554154,0.737629,0.426931,1.537782,-0.793547,...,-0.564286,0.118989,-0.320622,-0.277875,0.526196,-0.091792,-0.530800,1.513339,2.839325,3.239253
TCGA.BH.A201.01,8.187501,8.147141,2.039770,1.488943,-0.039922,-0.029932,-1.344652,2.345067,0.005963,0.294464,...,-1.506772,0.119846,1.083582,0.713179,0.115587,-0.202217,-0.297659,1.642246,1.957932,1.859119
TCGA.AC.A23C.01,10.414578,6.941708,1.923020,1.884281,0.735348,0.126041,-1.745734,3.047178,0.860106,-0.454126,...,-0.013963,0.876192,1.911586,0.950005,0.394922,0.662439,-0.379005,1.672693,2.312363,2.904958
TCGA.AR.A5QP.01,-5.466996,-2.672094,-0.305852,-0.325318,1.724593,-1.010020,1.450354,-0.420352,-1.950753,-0.777733,...,0.214656,-0.024682,0.299358,-0.302704,0.695664,-0.157369,-0.448169,1.338155,1.832413,2.568035
TCGA.C8.A12P.01,0.495915,-0.608682,0.018269,0.613215,1.321306,0.114212,-2.519685,-2.568175,3.027821,1.137615,...,-0.920432,-1.194987,2.142686,-0.526065,0.083534,-0.200149,0.388126,1.299838,2.115426,2.587012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA.LL.A5YP.01,-4.415981,0.782237,-1.277562,-0.465203,-0.354761,0.488744,0.018209,-1.720087,0.302833,1.257225,...,0.342701,0.499458,-0.593479,0.813698,-0.218666,0.137832,1.517793,1.078478,2.299545,2.738434
TCGA.AO.A03L.01,3.823017,1.543038,-0.372449,2.604970,-1.756090,1.467586,-0.186135,1.009325,0.421235,-0.759486,...,-0.458627,-1.389755,0.107731,-0.491017,-0.496692,0.287621,0.093902,1.392217,2.179881,2.648512
TCGA.BH.A42T.01,3.655633,-0.790984,0.382277,2.540332,1.157169,1.842694,1.045143,0.947561,1.423271,-1.173992,...,0.439654,0.216617,0.534904,0.709332,0.619385,0.418618,0.460736,1.134096,1.866079,3.267370
TCGA.A2.A04W.01,-8.898838,-6.570026,-1.778598,-2.742514,0.009789,-0.348434,0.560011,-4.047984,1.146161,1.544842,...,1.265643,-0.270048,-0.785691,0.352068,0.677687,-0.803460,1.151087,1.463771,1.504530,2.019016


In [10]:
# Convert the string labels to integers (sensitive = 1, resistant = 0).
# The encoder is fitted on the training labels only and then reused for the
# validation and test labels, so all three splits share one mapping, `le.classes_`
# describes the training classes, and an unseen label raises instead of being
# silently assigned a new code.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train = le.fit_transform(ic_train.iloc[:, 0])
y_valid = le.transform(ic_val.iloc[:, 0])
y_test = le.transform(ic_test.iloc[:, 0])

# **3. 모델링**
1. 로지스틱, 랜덤포레스트, SVM, KNN으로 진행
2. 전처리를 R로 해야 했기에 pipeline으로 묶을 수 없어 cross validation이 힘듦
3. 평가지표는 AUC, F1 score 중점으로 사용
4. 수동으로 튜닝 후 성능이 제일 좋은 모델 선택 후 TCGA 라벨 예측

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

In [13]:
from sklearn.ensemble import AdaBoostClassifier

# Every baseline below is trained on the training matrix and then applied to the
# test matrix. `y_pred_*` holds the predicted class, `y_proba_*` holds column 1 of
# predict_proba (the probability of the class encoded as 1).

## Logistic Regression
lr = LogisticRegression(
    C=0.05,
    max_iter=10000,
    class_weight="balanced",
    solver="liblinear",
).fit(X_train_final, y_train)
y_pred_lr = lr.predict(X_test_final)
y_proba_lr = lr.predict_proba(X_test_final)[:, 1]

## Random Forest
rf = RandomForestClassifier(
    n_estimators=250,
    max_depth=5,
    random_state=42,
    class_weight="balanced",
).fit(X_train_final, y_train)
y_pred_rf = rf.predict(X_test_final)
y_proba_rf = rf.predict_proba(X_test_final)[:, 1]

## Support Vector Machine (SVM); probability=True is required for predict_proba
svm = SVC(
    C=1,
    kernel='linear',
    probability=True,
    random_state=42,
    class_weight="balanced",
).fit(X_train_final, y_train)
y_pred_svm = svm.predict(X_test_final)
y_proba_svm = svm.predict_proba(X_test_final)[:, 1]

## K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_final, y_train)
y_pred_knn = knn.predict(X_test_final)
y_proba_knn = knn.predict_proba(X_test_final)[:, 1]

## AdaBoost
ada = AdaBoostClassifier(n_estimators=50, random_state=42).fit(X_train_final, y_train)
y_pred_ada = ada.predict(X_test_final)
y_proba_ada = ada.predict_proba(X_test_final)[:, 1]

In [14]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


def _report_validation(title, model):
    """Score one fitted classifier on the validation split and print its report.

    Prints a header with `title`, the classification report, the ROC-AUC and the
    confusion matrix, all computed against `y_valid` on `X_valid_final`.

    Parameters
    ----------
    title : str
        Human-readable model name shown in the section header.
    model : fitted classifier
        Must provide `predict` and `predict_proba`.

    Returns
    -------
    tuple
        `(y_pred, y_proba)`: predicted classes and column 1 of `predict_proba`.
    """
    y_pred = model.predict(X_valid_final)
    y_proba = model.predict_proba(X_valid_final)[:, 1]
    print(f"\n--- {title} ---")
    print(classification_report(y_valid, y_pred, target_names=le.classes_))
    print("ROC-AUC:", roc_auc_score(y_valid, y_proba))
    print("Confusion matrix:\n", confusion_matrix(y_valid, y_pred))
    return y_pred, y_proba


print("===== Validation Set Evaluation =====")

# One report per model, in the same order as they were trained. The per-model
# prediction variables are kept so later cells can still use them.
y_pred_lr_valid, y_proba_lr_valid = _report_validation("Logistic Regression", lr)
y_pred_rf_valid, y_proba_rf_valid = _report_validation("Random Forest", rf)
y_pred_svm_valid, y_proba_svm_valid = _report_validation("Support Vector Machine", svm)
y_pred_knn_valid, y_proba_knn_valid = _report_validation("K-Nearest Neighbors", knn)
y_pred_ada_valid, y_proba_ada_valid = _report_validation("AdaBoost", ada)

===== Validation Set Evaluation =====

--- Logistic Regression ---
              precision    recall  f1-score   support

   resistant       0.70      0.57      0.63        28
   sensitive       0.59      0.71      0.64        24

    accuracy                           0.63        52
   macro avg       0.64      0.64      0.63        52
weighted avg       0.65      0.63      0.63        52

ROC-AUC: 0.6651785714285715
Confusion matrix:
 [[16 12]
 [ 7 17]]

--- Random Forest ---
              precision    recall  f1-score   support

   resistant       0.58      0.79      0.67        28
   sensitive       0.57      0.33      0.42        24

    accuracy                           0.58        52
   macro avg       0.58      0.56      0.54        52
weighted avg       0.58      0.58      0.55        52

ROC-AUC: 0.6145833333333334
Confusion matrix:
 [[22  6]
 [16  8]]

--- Support Vector Machine ---
              precision    recall  f1-score   support

   resistant       0.73      0.68    

In [15]:
from sklearn.ensemble import VotingClassifier
# ── Soft Voting Ensemble ──
# Combine the five base classifiers with soft voting, train the ensemble on the
# training matrix and report its performance on the test matrix.
ensemble_members = [
    ('lr', lr),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('ada', ada),
]
voting_clf = VotingClassifier(estimators=ensemble_members, voting='soft')
voting_clf.fit(X_train_final, y_train)

# Predicted class and probability of the class encoded as 1 for each test sample.
y_pred_voting = voting_clf.predict(X_test_final)
y_proba_voting = voting_clf.predict_proba(X_test_final)[:, 1]

print("\n===== Soft Voting Ensemble =====")
print(classification_report(y_test, y_pred_voting, target_names=le.classes_))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_voting))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_voting))


===== Soft Voting Ensemble =====
              precision    recall  f1-score   support

   resistant       0.66      0.70      0.68        27
   sensitive       0.62      0.57      0.59        23

    accuracy                           0.64        50
   macro avg       0.64      0.63      0.63        50
weighted avg       0.64      0.64      0.64        50

ROC-AUC: 0.6312399355877618
Confusion matrix:
 [[19  8]
 [10 13]]
