In [1]:
import pandas as pd
import numpy as np

# 1. **데이터 설명**
1. pathway_train, pathway_test, pathway_valid : CCLE에서 pathway score 계산 한 데이터
2. ic_train, ic_test, ic_val : 민감군, 저항군 라벨
3. ccle : 유전자 raw data -> 민감군, 저항군 간 유전자 발현량 차이가 가장 큰 유전자 3개를 genes에 저장
4. tcga: 환자 데이터로 약물 민감/저항 라벨 부착 예정

In [3]:
# Load every input table for the AZ960 analysis.
#
# The directory prefix used to be repeated in all nine paths, spelled once as
# "C:/Users/USER" and once as "C:/Users/User". Windows resolves paths
# case-insensitively by default, so a single spelling is kept here; to run the
# notebook on another machine only DATA_ROOT has to change.
DATA_ROOT = "C:/Users/USER/비어플 의료"
AZ960_DIR = f"{DATA_ROOT}/0415_final/pathway scores/AZ960"

# Pathway-score tables for the train / test / validation splits.
pathway_train = pd.read_csv(f"{AZ960_DIR}/train_pathway_score_AZ9601.csv")
pathway_test = pd.read_csv(f"{AZ960_DIR}/test_pathway_score_AZ9601.csv")
pathway_valid = pd.read_csv(f"{AZ960_DIR}/val_pathway_score_AZ9601.csv")

# Label tables for the same three splits.
ic_train = pd.read_csv(f"{AZ960_DIR}/ic_train_AZ9601.csv")
ic_test = pd.read_csv(f"{AZ960_DIR}/ic_test_AZ9601.csv")
ic_val = pd.read_csv(f"{AZ960_DIR}/ic_val_AZ9601.csv")

# CCLE gene-expression table, indexed by its first (unnamed) CSV column.
# The column itself is left in the frame, exactly as before.
ccle = pd.read_csv(f"{DATA_ROOT}/#_filtered_CCLE_gene_expression.csv")
ccle.index = ccle["Unnamed: 0"]

# TCGA pathway scores and TCGA gene-level table.
tcga_path = pd.read_csv(f"{AZ960_DIR}/tcga_pathway_score_AZ960.csv")
tcga = pd.read_csv(f"{DATA_ROOT}/TCGA_final_0419.csv")



In [4]:
# Move each table's identifier column into the row index so that later cells
# can select and align rows by label. Pathway-score tables carry the id in
# "SampleID"; the label tables and the TCGA gene table carry it in the first,
# unnamed CSV column, which pandas exposes as "Unnamed: 0".
# NOTE: like the in-place form, this cell is not re-runnable on its own,
# because the id column no longer exists after the first run.
pathway_train = pathway_train.set_index("SampleID")
pathway_test = pathway_test.set_index("SampleID")
pathway_valid = pathway_valid.set_index("SampleID")
tcga_path = tcga_path.set_index("SampleID")

ic_train = ic_train.set_index("Unnamed: 0")
ic_test = ic_test.set_index("Unnamed: 0")
ic_val = ic_val.set_index("Unnamed: 0")
tcga = tcga.set_index("Unnamed: 0")

In [5]:
# Gene columns used as extra features next to the pathway principal components.
# Per the data-description notes at the top of this notebook, these are the
# three genes chosen from the raw CCLE expression data by comparing the
# sensitive and resistant groups. Both the CCLE and TCGA tables are later
# subset to exactly these columns.
genes = ["WAS", "MAP4K1", "SELPLG"]

# 2. **전처리**
1. pathway score scale
2. pathway score에 대해 PCA 진행 후 90% 설명력까지 사용
3. train 기준으로 fit하고 test, valid, TCGA에는 transform만 해주기
4. ccle, tcga 유전자들도 scale해주기
5. ic라벨 인코딩

In [7]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Scaling: standardize the pathway scores once more before PCA, even if they
# were already z-scored upstream. The scaler is fitted on the training split
# only; test, validation and TCGA are transformed with the training statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(pathway_train)
X_test_scaled = scaler.transform(pathway_test)
X_val_scaled = scaler.transform(pathway_valid)
tcga_path_scaled = scaler.transform(tcga_path)

# PCA: a float n_components between 0 and 1 tells scikit-learn to keep as many
# leading components as are needed to explain that share of the variance
# (90% here) -- it is NOT a fixed reduction to 2 components. The projection is
# fitted on the training split and then applied unchanged to the other tables.
pca = PCA(n_components=0.90)
pathway_pca_train = pca.fit_transform(X_scaled)
pathway_pca_test  = pca.transform(X_test_scaled)
pathway_pca_valid = pca.transform(X_val_scaled)
pathway_pca_tcga = pca.transform(tcga_path_scaled)

In [8]:
# Column labels PC1..PCk, where k is the number of components PCA kept.
pca_columns = [f"PC{i+1}" for i in range(pathway_pca_train.shape[1])]


def _pca_frame(scores, sample_index):
    """Wrap a PCA score matrix in a DataFrame labelled PC1..PCk.

    Parameters
    ----------
    scores : array-like with one column per retained component.
    sample_index : row labels to attach (the index of the source table).

    Returns
    -------
    pandas.DataFrame with `pca_columns` as columns and `sample_index` as index.
    """
    return pd.DataFrame(scores, columns=pca_columns, index=sample_index)


# The bare `pca_*_df_all` expressions that used to sit between these
# assignments were removed: only a cell's final expression is displayed by
# default, so they produced no output.
pca_train_df_all = _pca_frame(pathway_pca_train, pathway_train.index)
pca_valid_df_all = _pca_frame(pathway_pca_valid, pathway_valid.index)
pca_test_df_all = _pca_frame(pathway_pca_test, pathway_test.index)
pca_tcga_df_all = _pca_frame(pathway_pca_tcga, tcga_path.index)

In [9]:
def _log2_plus_one(expr):
    """Return log2(expr + 1), element-wise."""
    return np.log2(expr + 1)


def _frame_like(values, template):
    """Build a DataFrame from `values` that reuses `template`'s columns and index."""
    return pd.DataFrame(values, columns=template.columns, index=template.index)


# Keep only the selected gene columns, then pick the rows of each split using
# the corresponding pathway table's index.
ccle = ccle[genes]
ccle_expr_train = ccle.loc[pathway_train.index]
ccle_expr_valid = ccle.loc[pathway_valid.index]
ccle_expr_test = ccle.loc[pathway_test.index]

# log2(x + 1) transform of every split.
ccle_log_train = _log2_plus_one(ccle_expr_train)
ccle_log_valid = _log2_plus_one(ccle_expr_valid)
ccle_log_test = _log2_plus_one(ccle_expr_test)

# Standardize with statistics fitted on the training split only.
# NOTE: this rebinds `scaler`; the scaler fitted on the pathway scores is no
# longer reachable under that name after this cell.
scaler = StandardScaler()
ccle_scaled_train = _frame_like(scaler.fit_transform(ccle_log_train), ccle_expr_train)
ccle_scaled_valid = _frame_like(scaler.transform(ccle_log_valid), ccle_expr_valid)
ccle_scaled_test = _frame_like(scaler.transform(ccle_log_test), ccle_expr_test)

In [10]:
# Prepare the TCGA gene features the same way as the CCLE splits.
tcga = tcga[genes]

# Select the TCGA rows by the TCGA pathway table's index, mirroring how each
# CCLE split is selected by its pathway table. The previous
# `tcga.loc[tcga.index]` selected the frame by its own index, which aligned
# nothing with `tcga_path`. Raises KeyError if a sample in `tcga_path` is
# missing from `tcga`.
tcga_expr = tcga.loc[tcga_path.index]
tcga_log = np.log2(tcga_expr + 1)

# Reuse `scaler` as fitted on the CCLE training genes (transform only).
tcga_scaled = pd.DataFrame(
    scaler.transform(tcga_log),
    columns=tcga_expr.columns,
    index=tcga_expr.index,
)


In [11]:
# Join the pathway principal components and the scaled gene columns side by
# side for every data set. Rows are matched on the index labels.
X_train, X_valid, X_test, X_tcga = (
    pd.concat([pc_part, gene_part], axis=1)
    for pc_part, gene_part in (
        (pca_train_df_all, ccle_scaled_train),
        (pca_valid_df_all, ccle_scaled_valid),
        (pca_test_df_all, ccle_scaled_test),
        (pca_tcga_df_all, tcga_scaled),
    )
)

# Standardize the combined matrix: fit on train, transform-only elsewhere.
final_scaler = StandardScaler()
X_train_final = final_scaler.fit_transform(X_train)
X_valid_final, X_test_final, X_tcga_final = (
    final_scaler.transform(part) for part in (X_valid, X_test, X_tcga)
)

In [12]:
# Convert the string class labels to integers. LabelEncoder numbers the
# classes in sorted order, so resistant = 0 and sensitive = 1.
from sklearn.preprocessing import LabelEncoder

# Fit the mapping on the training labels only, then reuse it for validation
# and test. Calling fit_transform on every split refitted the encoder each
# time: the mapping was not guaranteed to match the training one, and
# `le.classes_` ended up reflecting the test split. `transform` raises
# ValueError if a split contains a label that was not seen in training.
le = LabelEncoder()
y_train = le.fit_transform(ic_train.iloc[:, 0])
y_valid = le.transform(ic_val.iloc[:, 0])
y_test = le.transform(ic_test.iloc[:, 0])

# **3. 모델링**
1. 로지스틱, 랜덤포레스트, SVM, KNN으로 진행
2. 전처리를 R로 해야 했기에 pipeline으로 묶을 수 없어 cross validation이 힘듦
3. 평가지표는 AUC, F1 score 중점으로 사용
4. 수동으로 튜닝 후 성능이 제일 좋은 모델 선택 후 TCGA 라벨 예측

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix

In [15]:
from sklearn.ensemble import AdaBoostClassifier


def _fit_and_score_test(clf):
    """Fit `clf` on the training matrix and score the test matrix.

    Returns a tuple (predicted labels, predicted probability of the class in
    column 1 of predict_proba) for `X_test_final`.
    """
    clf.fit(X_train_final, y_train)
    test_labels = clf.predict(X_test_final)
    test_scores = clf.predict_proba(X_test_final)[:, 1]
    return test_labels, test_scores


# Logistic regression with class re-weighting.
lr = LogisticRegression(class_weight="balanced", C=0.01, max_iter=10000)
y_pred_lr, y_proba_lr = _fit_and_score_test(lr)

# Random forest: 250 depth-3 trees, class re-weighting, fixed seed.
rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    max_depth=3,
    n_estimators=250,
)
y_pred_rf, y_proba_rf = _fit_and_score_test(rf)

# Support vector machine; probability=True is required for predict_proba.
svm = SVC(class_weight="balanced", C=1.5, random_state=42, probability=True)
y_pred_svm, y_proba_svm = _fit_and_score_test(svm)

# K-nearest neighbours with k = 5.
knn = KNeighborsClassifier(n_neighbors=5)
y_pred_knn, y_proba_knn = _fit_and_score_test(knn)

# AdaBoost with 50 estimators and a fixed seed.
ada = AdaBoostClassifier(random_state=42, n_estimators=50)
y_pred_ada, y_proba_ada = _fit_and_score_test(ada)



In [16]:
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix


def _report_validation(title, clf):
    """Print validation-set metrics for a fitted classifier.

    Parameters
    ----------
    title : heading printed above the report.
    clf : fitted classifier exposing `predict` and `predict_proba`.

    Returns
    -------
    (y_pred, y_proba) : predicted labels and the probability taken from
        column 1 of `predict_proba`, both for `X_valid_final`.
    """
    y_pred = clf.predict(X_valid_final)
    y_proba = clf.predict_proba(X_valid_final)[:, 1]
    print(f"\n--- {title} ---")
    print(classification_report(y_valid, y_pred, target_names=le.classes_))
    print("ROC-AUC:", roc_auc_score(y_valid, y_proba))
    print("Confusion matrix:\n", confusion_matrix(y_valid, y_pred))
    return y_pred, y_proba


print("===== Validation Set Evaluation =====")

y_pred_lr_valid, y_proba_lr_valid = _report_validation("Logistic Regression", lr)
y_pred_rf_valid, y_proba_rf_valid = _report_validation("Random Forest", rf)
y_pred_svm_valid, y_proba_svm_valid = _report_validation("Support Vector Machine", svm)

# KNN is trained above and is part of the modelling plan and the voting
# ensemble, but it had no validation report; it is evaluated here as well so
# that all base models can be compared on the same split.
y_pred_knn_valid, y_proba_knn_valid = _report_validation("K-Nearest Neighbors", knn)

y_pred_ada_valid, y_proba_ada_valid = _report_validation("AdaBoost", ada)



===== Validation Set Evaluation =====

--- Logistic Regression ---
              precision    recall  f1-score   support

   resistant       0.43      0.43      0.43        23
   sensitive       0.54      0.54      0.54        28

    accuracy                           0.49        51
   macro avg       0.49      0.49      0.49        51
weighted avg       0.49      0.49      0.49        51

ROC-AUC: 0.517080745341615
Confusion matrix:
 [[10 13]
 [13 15]]

--- Random Forest ---
              precision    recall  f1-score   support

   resistant       0.44      0.35      0.39        23
   sensitive       0.55      0.64      0.59        28

    accuracy                           0.51        51
   macro avg       0.49      0.50      0.49        51
weighted avg       0.50      0.51      0.50        51

ROC-AUC: 0.41304347826086957
Confusion matrix:
 [[ 8 15]
 [10 18]]

--- Support Vector Machine ---
              precision    recall  f1-score   support

   resistant       0.48      0.48    

In [17]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the five base classifiers. The ensemble is fitted
# on the training matrix and then scored on the test matrix.
ensemble_members = [
    ('lr', lr),
    ('rf', rf),
    ('svm', svm),
    ('knn', knn),
    ('ada', ada),
]
voting_clf = VotingClassifier(voting='soft', estimators=ensemble_members)
voting_clf.fit(X_train_final, y_train)

y_pred_voting = voting_clf.predict(X_test_final)
y_proba_voting = voting_clf.predict_proba(X_test_final)[:, 1]

# Test-set report: per-class metrics, ROC-AUC, then the confusion matrix.
voting_report = classification_report(y_test, y_pred_voting, target_names=le.classes_)
voting_auc = roc_auc_score(y_test, y_proba_voting)
voting_cm = confusion_matrix(y_test, y_pred_voting)

print("\n===== Soft Voting Ensemble =====")
print(voting_report)
print("ROC-AUC:", voting_auc)
print("Confusion matrix:\n", voting_cm)





===== Soft Voting Ensemble =====
              precision    recall  f1-score   support

   resistant       0.38      0.39      0.38        23
   sensitive       0.48      0.46      0.47        28

    accuracy                           0.43        51
   macro avg       0.43      0.43      0.43        51
weighted avg       0.43      0.43      0.43        51

ROC-AUC: 0.4627329192546583
Confusion matrix:
 [[ 9 14]
 [15 13]]


