In [1]:
import pandas as pd
import numpy as np

# 1. **데이터 설명**
1. pathway_train, pathway_test, pathway_valid : CCLE에서 pathway score 계산 한 데이터
2. ic_train, ic_test, ic_val : 민감군, 저항군 라벨
3. ccle : 유전자 raw data -> 민감군, 저항군 간 유전자 발현량 차이가 가장 큰 유전자 3개를 genes에 저장
4. tcga: 환자 데이터로 약물 민감/저항 라벨 부착 예정

In [2]:
# Load every input table used by the TRAMETINIB analysis.

# Pathway-score tables for the train / test / validation splits.
pathway_train, pathway_test, pathway_valid = (
    pd.read_csv(csv_name)
    for csv_name in (
        "train_pathway_score_TRAMETINIB.csv",
        "test_pathway_score_TRAMETINIB.csv",
        "val_pathway_score_TRAMETINIB.csv",
    )
)

# Label tables for the same three splits.
ic_train, ic_test, ic_val = (
    pd.read_csv(csv_name)
    for csv_name in (
        "ic_train_TRAMETINIB.csv",
        "ic_test_TRAMETINIB.csv",
        "ic_val_TRAMETINIB.csv",
    )
)

# CCLE gene-expression table; its "Unnamed: 0" column becomes the row index
# (the column itself stays in the frame).
# NOTE(review): this and the TCGA path below are absolute, machine-specific
# paths - consider a configurable data directory.
ccle = pd.read_csv("C:/Users/User/BAF-의료/data/#_filtered_CCLE_gene_expression.csv")
ccle.index = ccle["Unnamed: 0"]

# TCGA pathway scores and TCGA gene-level table.
tcga_path = pd.read_csv("tcga_pathway_score_TRAMETINIB.csv")
tcga = pd.read_csv("C:/Users/User/BAF-의료/data/TCGA_final_0419.csv")


In [3]:
def _index_by(frame, column):
    """Return ``frame`` with ``column`` as its row index.

    When ``frame`` is already indexed by ``column`` (the cell is being run a
    second time) it is returned unchanged, so re-running the cell does not
    raise ``KeyError``. A genuinely missing column still raises ``KeyError``
    from ``set_index``.
    """
    if frame.index.name == column and column not in frame.columns:
        return frame
    return frame.set_index(column)


# Pathway-score tables are keyed by their "SampleID" column.
pathway_train = _index_by(pathway_train, "SampleID")
pathway_test = _index_by(pathway_test, "SampleID")
pathway_valid = _index_by(pathway_valid, "SampleID")

# Label tables carry their identifiers in the unnamed first CSV column.
ic_train = _index_by(ic_train, "Unnamed: 0")
ic_test = _index_by(ic_test, "Unnamed: 0")
ic_val = _index_by(ic_val, "Unnamed: 0")

# TCGA tables follow the same two conventions.
tcga_path = _index_by(tcga_path, "SampleID")
tcga = _index_by(tcga, "Unnamed: 0")

In [4]:
# Gene symbols selected from the CCLE / TCGA expression tables and used as
# extra model features next to the pathway principal components.
genes = [
    "COL1A1",
    "INPP5D",
    "ITGA11",
]

# 2. **전처리**
1. pathway score scale
2. pathway score에 대해 PCA 진행 후 90% 설명력까지 사용
3. train 기준으로 fit하고 test, valid, TCGA에는 transform만 해주기
4. ccle, tcga 유전자들도 scale해주기
5. ic 라벨 인코딩

In [5]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Standardise the pathway scores before PCA (recommended even if the scores
# were already z-scored). The scaler is fitted on the training split only;
# test, validation and TCGA reuse the training statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(pathway_train)
X_test_scaled, X_val_scaled, tcga_path_scaled = (
    scaler.transform(frame) for frame in (pathway_test, pathway_valid, tcga_path)
)

# PCA fitted on the training split. A float n_components (0.90) keeps as many
# components as are needed to explain 90% of the variance - it is not a fixed
# count of 2 components.
pca = PCA(n_components=0.90)
pathway_pca_train = pca.fit_transform(X_scaled)
pathway_pca_test, pathway_pca_valid, pathway_pca_tcga = (
    pca.transform(matrix) for matrix in (X_test_scaled, X_val_scaled, tcga_path_scaled)
)

In [6]:
# Wrap the PCA score arrays in DataFrames: columns are labelled PC1..PCk and
# each frame keeps the sample index of the split it was computed from.
# (The former bare mid-cell expressions were removed: only a cell's last
# expression is displayed, so they produced no output.)
pca_columns = [f"PC{k}" for k in range(1, pathway_pca_train.shape[1] + 1)]

pca_train_df_all = pd.DataFrame(pathway_pca_train, index=pathway_train.index, columns=pca_columns)
pca_valid_df_all = pd.DataFrame(pathway_pca_valid, index=pathway_valid.index, columns=pca_columns)
pca_test_df_all = pd.DataFrame(pathway_pca_test, index=pathway_test.index, columns=pca_columns)
pca_tcga_df_all = pd.DataFrame(pathway_pca_tcga, index=tcga_path.index, columns=pca_columns)

In [7]:
def _log2p1(expr):
    """Apply log2(x + 1) element-wise to an expression frame."""
    return np.log2(expr + 1)


def _like(values, template):
    """Wrap ``values`` in a DataFrame carrying ``template``'s columns and index."""
    return pd.DataFrame(values, columns=template.columns, index=template.index)


# Keep only the selected gene columns of the CCLE table.
ccle = ccle[genes]

# Training split: rows are picked by the pathway training index, and the gene
# scaler is fitted here. This rebinds `scaler`, replacing the pathway-score
# scaler created earlier in the notebook.
ccle_expr_train = ccle.loc[pathway_train.index]
ccle_log_train = _log2p1(ccle_expr_train)
scaler = StandardScaler()
ccle_scaled_train = _like(scaler.fit_transform(ccle_log_train), ccle_expr_train)

# Validation split: transformed with the training-fitted gene scaler.
ccle_expr_valid = ccle.loc[pathway_valid.index]
ccle_log_valid = _log2p1(ccle_expr_valid)
ccle_scaled_valid = _like(scaler.transform(ccle_log_valid), ccle_expr_valid)

# Test split: same treatment as validation.
ccle_expr_test = ccle.loc[pathway_test.index]
ccle_log_test = _log2p1(ccle_expr_test)
ccle_scaled_test = _like(scaler.transform(ccle_log_test), ccle_expr_test)

In [8]:
# Restrict TCGA to the selected genes and apply the same log2(x + 1) transform
# that was used for the CCLE expression values.
tcga = tcga[genes]
# Previously `tcga.loc[tcga.index]`: a self-lookup that returns the same rows
# when the index is unique and would repeat rows if the index contained
# duplicate labels. A plain copy states the intent directly.
tcga_expr = tcga.copy()
tcga_log = np.log2(tcga_expr + 1)

# `scaler` must be the gene-expression scaler fitted on the CCLE training
# split (the most recent rebinding of the name), not the pathway-score scaler;
# run the CCLE gene-scaling cell immediately before this one.
tcga_scaled = pd.DataFrame(
    scaler.transform(tcga_log),
    columns=tcga_expr.columns,
    index=tcga_expr.index,
)


In [9]:
# Build the model inputs: pathway principal components placed side by side
# with the scaled gene columns (column-wise concat, aligned on the row index).
X_train, X_valid, X_test, X_tcga = (
    pd.concat([pc_frame, gene_frame], axis=1)
    for pc_frame, gene_frame in (
        (pca_train_df_all, ccle_scaled_train),
        (pca_valid_df_all, ccle_scaled_valid),
        (pca_test_df_all, ccle_scaled_test),
        (pca_tcga_df_all, tcga_scaled),
    )
)

# Final standardisation of the combined matrix: fitted on the training
# features, then applied unchanged to validation, test and TCGA.
final_scaler = StandardScaler()
X_train_final = final_scaler.fit_transform(X_train)
X_valid_final, X_test_final, X_tcga_final = (
    final_scaler.transform(features) for features in (X_valid, X_test, X_tcga)
)

# Display the TCGA feature frame as it is before the final scaling step.
X_tcga

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,...,PC41,PC42,PC43,PC44,PC45,PC46,PC47,COL1A1,INPP5D,ITGA11
TCGA.A2.A25D.01,-0.449199,-1.277509,0.386248,1.593499,0.260806,0.844710,1.067935,-0.810487,0.524101,1.337414,...,0.416157,0.309027,-1.655401,0.485688,-0.227038,0.087684,-0.058599,1.830318,2.220841,1.296291
TCGA.BH.A201.01,8.844229,3.882238,5.116272,0.647882,0.813863,-1.868032,1.288526,0.462354,0.185408,-1.748471,...,-0.461211,-0.568675,-0.073309,-1.093853,1.625817,-0.442240,-0.758550,2.736347,1.563121,1.940333
TCGA.AC.A23C.01,9.569072,2.524610,5.092145,1.249081,0.827775,-0.291009,0.464333,0.601303,0.646854,-1.300668,...,-0.392595,-1.105562,1.665281,-1.119473,1.556477,-1.070270,-0.304149,2.517620,1.909064,2.056365
TCGA.AR.A5QP.01,-4.723412,-1.113025,-1.110859,-0.377260,0.619748,1.164356,-0.232820,-1.143085,-0.501955,0.863565,...,0.134282,-0.004488,0.003276,-0.322157,-0.926914,1.047201,0.512494,2.493365,1.797713,1.404978
TCGA.C8.A12P.01,0.118596,-0.199319,-0.099081,0.967495,0.327276,0.641202,-1.411725,0.951967,0.881183,1.621114,...,0.860287,-0.247998,-0.697620,-0.141268,-0.658539,-0.767391,-1.024569,2.647744,1.751537,2.176518
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA.LL.A5YP.01,-4.657924,2.316802,0.381357,-0.728604,0.023503,-0.025363,-0.845761,-0.067850,-2.265164,0.361732,...,2.225536,0.547610,-1.681968,0.881103,-0.865240,0.363534,-0.022430,2.553621,1.767518,1.959113
TCGA.AO.A03L.01,3.518239,1.028470,0.687774,0.894550,-0.097524,-2.119128,1.127740,-0.436496,-0.106641,-0.143143,...,-1.031947,-0.124671,0.191382,-0.268203,-0.558590,-1.190544,-0.975229,2.489624,1.832160,1.755779
TCGA.BH.A42T.01,3.144793,-1.239935,0.717447,2.044672,0.138473,0.786618,0.887360,-0.575847,-0.511377,1.998781,...,-0.317032,-0.022614,-0.634561,0.632965,0.146135,1.588517,-0.571053,2.441191,2.018631,1.862630
TCGA.A2.A04W.01,-9.247603,-1.654121,-4.116136,-1.905534,-0.488720,1.086193,-0.938622,-0.189387,-0.417404,1.123044,...,-0.153290,0.170549,-0.677232,0.878591,-0.592151,0.395272,-0.620347,2.658690,1.212120,1.839494


In [10]:
# Convert the string labels to integer codes (sensitive = 1, resistant = 0).
# LabelEncoder numbers classes in sorted order, which gives that mapping.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Fit the label -> code mapping on the training labels only ...
y_train = le.fit_transform(ic_train.iloc[:, 0])
# ... and reuse that same mapping for validation and test. Calling
# fit_transform here would refit the encoder on each split, so the codes (and
# le.classes_) would no longer be guaranteed to match the training mapping.
# transform() raises ValueError if a split contains a label unseen in training.
# NOTE(review): labels are taken in ic_* row order; this assumes the rows are
# in the same order as the corresponding pathway_* tables - confirm.
y_valid = le.transform(ic_val.iloc[:, 0])
y_test = le.transform(ic_test.iloc[:, 0])

# **3. 모델링**
1. 로지스틱 회귀, 랜덤포레스트, SVM, KNN, AdaBoost 및 Soft Voting 앙상블로 진행
2. 전처리를 R로 해야했기에 pipeline으로 묶을 수 없어 cross validation이 힘듦
3. 평가지표는 AUC, F1 score 중점으로 사용
4. 수동으로 튜닝 후 성능이 제일 좋은 모델 선택 후 TCGA 라벨 예측

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

In [12]:
from sklearn.ensemble import AdaBoostClassifier

# Fit each classifier on the final training matrix, then store its hard
# predictions and class-1 probabilities for the test matrix.
# (estimator.fit returns the fitted estimator, so construction and fitting
# are chained.)

## Logistic Regression
lr = LogisticRegression(C=0.01, max_iter=10000).fit(X_train_final, y_train)
y_pred_lr, y_proba_lr = lr.predict(X_test_final), lr.predict_proba(X_test_final)[:, 1]

## Random Forest
rf = RandomForestClassifier(n_estimators=250, max_depth=5, random_state=42).fit(X_train_final, y_train)
y_pred_rf, y_proba_rf = rf.predict(X_test_final), rf.predict_proba(X_test_final)[:, 1]

## Support Vector Machine (SVM); probability=True enables predict_proba
svm = SVC(probability=True, random_state=42).fit(X_train_final, y_train)
y_pred_svm, y_proba_svm = svm.predict(X_test_final), svm.predict_proba(X_test_final)[:, 1]

## K-Nearest Neighbors (KNN)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_final, y_train)
y_pred_knn, y_proba_knn = knn.predict(X_test_final), knn.predict_proba(X_test_final)[:, 1]

## AdaBoost
ada = AdaBoostClassifier(n_estimators=50, random_state=42).fit(X_train_final, y_train)
y_pred_ada, y_proba_ada = ada.predict(X_test_final), ada.predict_proba(X_test_final)[:, 1]

In [13]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

def _report_validation(header, model):
    """Print the validation-set report for a fitted classifier.

    Prints ``header``, the classification report, the ROC-AUC computed from
    the class-1 probabilities, and the confusion matrix, all against
    ``y_valid``. Returns ``(predictions, class_1_probabilities)``.
    """
    pred = model.predict(X_valid_final)
    proba = model.predict_proba(X_valid_final)[:, 1]
    print(header)
    print(classification_report(y_valid, pred, target_names=le.classes_))
    print("ROC-AUC:", roc_auc_score(y_valid, proba))
    print("Confusion matrix:\n", confusion_matrix(y_valid, pred))
    return pred, proba


print("===== Validation Set Evaluation =====")

# One report per fitted model, in the same order as they were trained.
y_pred_lr_valid, y_proba_lr_valid = _report_validation("\n--- Logistic Regression ---", lr)
y_pred_rf_valid, y_proba_rf_valid = _report_validation("\n--- Random Forest ---", rf)
y_pred_svm_valid, y_proba_svm_valid = _report_validation("\n--- Support Vector Machine ---", svm)
y_pred_knn_valid, y_proba_knn_valid = _report_validation("\n--- K-Nearest Neighbors ---", knn)
y_pred_ada_valid, y_proba_ada_valid = _report_validation("\n--- AdaBoost ---", ada)

===== Validation Set Evaluation =====

--- Logistic Regression ---
              precision    recall  f1-score   support

   resistant       0.74      0.74      0.74        27
   sensitive       0.72      0.72      0.72        25

    accuracy                           0.73        52
   macro avg       0.73      0.73      0.73        52
weighted avg       0.73      0.73      0.73        52

ROC-AUC: 0.8192592592592592
Confusion matrix:
 [[20  7]
 [ 7 18]]

--- Random Forest ---
              precision    recall  f1-score   support

   resistant       0.75      0.78      0.76        27
   sensitive       0.75      0.72      0.73        25

    accuracy                           0.75        52
   macro avg       0.75      0.75      0.75        52
weighted avg       0.75      0.75      0.75        52

ROC-AUC: 0.8311111111111111
Confusion matrix:
 [[21  6]
 [ 7 18]]

--- Support Vector Machine ---
              precision    recall  f1-score   support

   resistant       0.83      0.70    

In [14]:
from sklearn.ensemble import VotingClassifier

# ── Soft Voting Ensemble ──
# Soft voting combines the predicted class probabilities of the logistic
# regression, random forest, SVM and KNN models (AdaBoost is not a member).
# The ensemble is fitted on the training matrix and evaluated on the test set.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[('lr', lr), ('rf', rf), ('svm', svm), ('knn', knn)],
)
voting_clf.fit(X_train_final, y_train)

y_pred_voting = voting_clf.predict(X_test_final)
y_proba_voting = voting_clf.predict_proba(X_test_final)[:, 1]  # class-1 probability

print("\n===== Soft Voting Ensemble =====")
print(classification_report(y_test, y_pred_voting, target_names=le.classes_))
print("ROC-AUC:", roc_auc_score(y_test, y_proba_voting))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_voting))





===== Soft Voting Ensemble =====
              precision    recall  f1-score   support

   resistant       0.73      0.73      0.73        26
   sensitive       0.71      0.71      0.71        24

    accuracy                           0.72        50
   macro avg       0.72      0.72      0.72        50
weighted avg       0.72      0.72      0.72        50

ROC-AUC: 0.75
Confusion matrix:
 [[19  7]
 [ 7 17]]
