In [336]:
import pandas as pd             
import numpy as np             
import matplotlib.pyplot as plt  
import seaborn as sns           

from sklearn.preprocessing import OneHotEncoder # 원핫인코딩
from sklearn.preprocessing import LabelEncoder # 라벨인코딩
from sklearn.model_selection import train_test_split  # 데이터 분할
from sklearn.preprocessing import StandardScaler      # 스케일링
from sklearn.impute import SimpleImputer              # 결측치 처리
from sklearn.ensemble import RandomForestClassifier   # 분류 모델
from sklearn.linear_model import LogisticRegression   # 분류 모델
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # 모델 평가
from sklearn.linear_model import LinearRegression     # 회귀 모델
from sklearn.ensemble import RandomForestRegressor   # 회귀 모델
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # 모델 평가

# 경고 메시지 무시
import warnings
warnings.filterwarnings('ignore')

## 한글 깨짐 방지
#pip install koreanize_matplotlib
import koreanize_matplotlib 

# sampleMap 데이터

In [337]:
sampleMap = pd.read_csv('/Users/zoohunn/Desktop/비어플/비어플[의료]/tcga/sampleMap_BRCA_clinicalMatrix.csv', sep=',')
print(sampleMap.shape)
sampleMap.head()

(1218, 205)


Unnamed: 0.1,Unnamed: 0,sampleID,AJCC_Stage_nature2012,Age_at_Initial_Pathologic_Diagnosis_nature2012,CN_Clusters_nature2012,Converted_Stage_nature2012,Days_to_Date_of_Last_Contact_nature2012,Days_to_date_of_Death_nature2012,ER_Status_nature2012,Gender_nature2012,...,pan.samplesID,cancer.type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other,Subtype_Selected
0,0,TCGA-3C-AAAU-01,,,,,,,,,...,TCGA-3C-AAAU-01,BRCA,LumA,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,BRCA.LumA
1,1,TCGA-3C-AALI-01,,,,,,,,,...,TCGA-3C-AALI-01,BRCA,Her2,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,BRCA.Her2
2,2,TCGA-3C-AALJ-01,,,,,,,,,...,TCGA-3C-AALJ-01,BRCA,LumB,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,BRCA.LumB
3,3,TCGA-3C-AALK-01,,,,,,,,,...,TCGA-3C-AALK-01,BRCA,LumA,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,BRCA.LumA
4,4,TCGA-4H-AAAK-01,,,,,,,,,...,TCGA-4H-AAAK-01,BRCA,LumA,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,BRCA.LumA


In [338]:
sampleMap = sampleMap.drop('Unnamed: 0', axis=1)
sampleMap.replace('NA_character_', np.nan, inplace=True)
sampleMap.replace(-2147483648, np.nan, inplace=True)
sampleMap.isnull().sum()

sampleID                                             0
AJCC_Stage_nature2012                              439
Age_at_Initial_Pathologic_Diagnosis_nature2012     285
CN_Clusters_nature2012                             445
Converted_Stage_nature2012                         427
                                                  ... 
Subtype_miRNA                                     1218
Subtype_CNA                                       1218
Subtype_Integrative                               1218
Subtype_other                                     1218
Subtype_Selected                                     0
Length: 204, dtype: int64

sampleMape 데이터는 (1218, 205)의 구조며, 많은 열에 결측치가 과도하게 존재   
결측치가 대부분인 열을 우선 제거하고, 생존 분석과 직접 관련이 없는 변수들을 유형별로 묶어 삭제   
변수 하나하나를 모두 설명하지 않고, 생존 분석과 무관한 변수들을 유형별로 구분하여 삭제하는 방향으로 진행하겠음

생존 분석 무관 변수 유형:   
	•	유전자 클러스터 변수   
	•	행정적 메타 정보 변수   
	•	연구 목적 변수   
	•	이차 정보(중복) 변수   
	•   GENOMIC_ID_TCGA_BRCA_로 시작하는 변수
	•   조직(tissue) 변수

In [339]:
## 유전자 클러스터 변수 제거
# 고차원 데이터로 너무 많은 변수가 존재하면 모델이 복잡해지고, 과적합이 발생할 수 있음
# 생존 분석의 중요한 명확하고 직관적인 변수가 아닌, 클러스터 변수는 의미가 모호

# 유전자 클러스터 변수 목록
genomic_cluster_columns = [
    'Integrated_Clusters_no_exp__nature2012',
    'Integrated_Clusters_unsup_exp__nature2012',
    'Integrated_Clusters_with_PAM50__nature2012',
    'CN_Clusters_nature2012',
    'methylation_Clusters_nature2012',
    'miRNA_Clusters_nature2012',
    'RPPA_Clusters_nature2012',
    'SigClust_Intrinsic_mRNA_nature2012',
    'SigClust_Unsupervised_mRNA_nature2012'
]

## 연구 목적 변수 제거
# 연구 목적 변수들은 특정 프로젝트나 연구에서 데이터 관리를 위해 추가된 변수
# 연구 목적 변수들은 이러한 생존 시간과 직접적인 관련이 없으므로 불필요

# 연구 목적 변수 목록
research_columns = [
    'Survival_Data_Form_nature2012',
    '_PANCAN_CNA_PANCAN_K8',
    '_PANCAN_Cluster_Cluster_PANCAN',
    '_PANCAN_DNAMethyl_BRCA',
    '_PANCAN_DNAMethyl_PANCAN',
    '_PANCAN_RPPA_PANCAN_K8',
    '_PANCAN_UNC_RNAseq_PANCAN_K16',
    '_PANCAN_miRNA_PANCAN',
    '_PANCAN_mirna_BRCA',
    '_PANCAN_mutation_PANCAN',
    '_PATIENT',
    '_cohort',
    '_primary_disease',
    '_primary_site'
]

## 행정적 메타 변수 제거
# 데이터 관리 및 식별을 위해 추가된 변수
# 환자 식별자, 코호트 정보, 연구 프로젝트 정보 등을 포함(중복됨)

# 행정적 메타 정보 변수 목록
admin_columns = [
    'bcr_patient_barcode', 'bcr_sample_barcode', 'bcr_followup_barcode', 
    '_cohort', '_primary_disease', '_primary_site', '_INTEGRATION', '_PATIENT', 
    'Redaction', 'pan.samplesID'
]

In [340]:
# 제거할 변수 목록 합치기
remove_columns = genomic_cluster_columns + research_columns + admin_columns

# 실제 데이터에 존재하는 변수만 필터링
remove_columns = [col for col in remove_columns if col in sampleMap.columns]

# 불필요 변수 제거
sampleMap2 = sampleMap.drop(columns=remove_columns)

# 제거 완료 확인
print(f"제거된 변수 개수: {len(remove_columns)}")
print("제거된 변수 목록:")
for col in remove_columns:
    print(f"- {col}")

# 제거 후 데이터 확인
print("제거 후 데이터 형태:", sampleMap2.shape)

제거된 변수 개수: 32
제거된 변수 목록:
- Integrated_Clusters_no_exp__nature2012
- Integrated_Clusters_unsup_exp__nature2012
- Integrated_Clusters_with_PAM50__nature2012
- CN_Clusters_nature2012
- methylation_Clusters_nature2012
- miRNA_Clusters_nature2012
- RPPA_Clusters_nature2012
- SigClust_Intrinsic_mRNA_nature2012
- SigClust_Unsupervised_mRNA_nature2012
- Survival_Data_Form_nature2012
- _PANCAN_CNA_PANCAN_K8
- _PANCAN_Cluster_Cluster_PANCAN
- _PANCAN_DNAMethyl_BRCA
- _PANCAN_DNAMethyl_PANCAN
- _PANCAN_RPPA_PANCAN_K8
- _PANCAN_UNC_RNAseq_PANCAN_K16
- _PANCAN_miRNA_PANCAN
- _PANCAN_mirna_BRCA
- _PANCAN_mutation_PANCAN
- _PATIENT
- _cohort
- _primary_disease
- _primary_site
- bcr_patient_barcode
- bcr_sample_barcode
- bcr_followup_barcode
- _cohort
- _primary_disease
- _primary_site
- _INTEGRATION
- _PATIENT
- pan.samplesID
제거 후 데이터 형태: (1218, 176)


In [341]:
## 어느정도 결측치의 비율이 있는 데이터를 제거해야할지 확신이 안서지만 70% 이상의 결측치가 있는 열을 제거해보겠음,,

# 결측치 비율 계산
missing_percentage = sampleMap2.isnull().mean() * 100

# 70% 이상 결측치인 열 목록
high_missing_columns = missing_percentage[missing_percentage > 70].index.tolist()

sampleMap2 = sampleMap2.drop(columns=high_missing_columns)

In [342]:
## GENOMIC_ID_TCGA_BRCA_ 시작하는 변수들 제거
# 주로 유전자 발현, 유전자 복제 수 변이, DNA 메틸화 등을 포함
# 생존 분석의 관점에서 보면 대부분 직접적으로 필요하지 않음

genomic_id_columns = [col for col in sampleMap2.columns if col.startswith('_GENOMIC_ID_TCGA_BRCA')]


sampleMap2 = sampleMap2.drop(columns=genomic_id_columns)
print(f"제거된 '_GENOMIC_ID_TCGA_BRCA'로 시작하는 변수 개수: {len(genomic_id_columns)}")
print("제거된 변수 목록:")
for col in genomic_id_columns:
    print(f"- {col}")

제거된 '_GENOMIC_ID_TCGA_BRCA'로 시작하는 변수 개수: 18
제거된 변수 목록:
- _GENOMIC_ID_TCGA_BRCA_exp_HiSeqV2_exon
- _GENOMIC_ID_TCGA_BRCA_exp_HiSeqV2_PANCAN
- _GENOMIC_ID_TCGA_BRCA_RPPA_RBN
- _GENOMIC_ID_TCGA_BRCA_mutation
- _GENOMIC_ID_TCGA_BRCA_PDMRNAseq
- _GENOMIC_ID_TCGA_BRCA_hMethyl450
- _GENOMIC_ID_TCGA_BRCA_RPPA
- _GENOMIC_ID_TCGA_BRCA_PDMRNAseqCNV
- _GENOMIC_ID_TCGA_BRCA_mutation_curated_wustl_gene
- _GENOMIC_ID_TCGA_BRCA_PDMarrayCNV
- _GENOMIC_ID_TCGA_BRCA_miRNA_HiSeq
- _GENOMIC_ID_TCGA_BRCA_mutation_wustl_gene
- _GENOMIC_ID_TCGA_BRCA_exp_HiSeqV2_percentile
- _GENOMIC_ID_TCGA_BRCA_gistic2thd
- _GENOMIC_ID_TCGA_BRCA_G4502A_07_3
- _GENOMIC_ID_TCGA_BRCA_exp_HiSeqV2
- _GENOMIC_ID_TCGA_BRCA_gistic2
- _GENOMIC_ID_TCGA_BRCA_PDMarray


In [343]:
## tissue 변수 제거

# 'tissue'라는 단어가 포함된 모든 변수명을 필터링
tissue_columns = [col for col in sampleMap2.columns if 'tissue' in col]

# tissue 관련 변수들 목록 출력
print(f"'tissue' 관련 변수 개수: {len(tissue_columns)}")
print("tissue 관련 변수 목록:")
for col in tissue_columns:
    print(f"- {col}")

'tissue' 관련 변수 개수: 4
tissue 관련 변수 목록:
- tissue_prospective_collection_indicator
- tissue_retrospective_collection_indicator
- tissue_source_site
- tumor_tissue_site


In [344]:
# 생존 분석 관점에서 'tissue' 변수는 필요하지 않다고 판단하여 제거
sampleMap2 = sampleMap2.drop(columns=tissue_columns)

In [345]:
## 두 변수 의미가 중복돼 하나 제거(암 병기 변수)
print(sampleMap2[['AJCC_Stage_nature2012', 'Converted_Stage_nature2012']].value_counts())
sampleMap2 = sampleMap2.drop(columns=['Converted_Stage_nature2012'])
cleaned_sampleMap_df = sampleMap2.drop(columns=['_GENOMIC_ID_data/public/TCGA/BRCA/miRNA_HiSeq_gene', 'cancer.type','Subtype_mRNA'])

AJCC_Stage_nature2012  Converted_Stage_nature2012
Stage IIA              Stage IIA                     209
Stage IIB              Stage IIB                      95
Stage IIIA             Stage IIIA                     67
Stage IA               Stage I                        60
Stage I                Stage I                        56
Stage IIB              No_Conversion                  53
Stage IIIA             No_Conversion                  32
Stage IIA              No_Conversion                  31
Stage IIIC             Stage IIIC                     22
Stage II               Stage IIA                      18
                       No_Conversion                  11
Stage III              Stage IIIA                     11
Stage II               Stage IIB                      11
Stage IV               No_Conversion                  11
Stage IIIC             No_Conversion                   9
Stage IIIB             Stage IIIB                      9
                       No_Conversion  

In [346]:
cleaned_sampleMap_df.isnull().sum(), cleaned_sampleMap_df.shape

(sampleID                                            0
 AJCC_Stage_nature2012                             439
 Age_at_Initial_Pathologic_Diagnosis_nature2012    285
 Days_to_Date_of_Last_Contact_nature2012           351
 ER_Status_nature2012                              436
                                                  ... 
 targeted_molecular_therapy                        583
 vial_number                                       192
 vital_status                                        3
 year_of_initial_pathologic_diagnosis                5
 Subtype_Selected                                    0
 Length: 75, dtype: int64,
 (1218, 75))

# Survival 데이터

In [347]:
survival = pd.read_csv('/Users/zoohunn/Desktop/비어플/비어플[의료]/tcga/survival_BRCA_survival_(BRCA).csv')
print(survival.shape)
survival.head()

(1215, 22)


Unnamed: 0.1,Unnamed: 0,sample,_PATIENT,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,...,pan.samplesID,cancer.type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other,Subtype_Selected
0,0,TCGA-3C-AAAU-01,TCGA-3C-AAAU,0,4047.0,0.0,4047.0,1.0,1808.0,1,...,TCGA-3C-AAAU-01,BRCA,LumA,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,BRCA.LumA
1,1,TCGA-3C-AALI-01,TCGA-3C-AALI,0,4005.0,0.0,4005.0,0.0,4005.0,0,...,TCGA-3C-AALI-01,BRCA,Her2,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,BRCA.Her2
2,2,TCGA-3C-AALJ-01,TCGA-3C-AALJ,0,1474.0,0.0,1474.0,0.0,1474.0,0,...,TCGA-3C-AALJ-01,BRCA,LumB,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,BRCA.LumB
3,3,TCGA-3C-AALK-01,TCGA-3C-AALK,0,1448.0,0.0,1448.0,,,0,...,TCGA-3C-AALK-01,BRCA,LumA,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,BRCA.LumA
4,4,TCGA-4H-AAAK-01,TCGA-4H-AAAK,0,348.0,0.0,348.0,0.0,348.0,0,...,TCGA-4H-AAAK-01,BRCA,LumA,NA_character_,-2147483648,NA_character_,NA_character_,NA_character_,NA_character_,BRCA.LumA


In [348]:
survival = survival.drop('Unnamed: 0', axis=1)
survival.replace('NA_character_', np.nan, inplace=True)
survival.replace(-2147483648, np.nan, inplace=True)
survival.isnull().sum() 

sample                    0
_PATIENT                  0
OS                        0
OS.time                   1
DSS                      30
DSS.time                  1
DFI                     177
DFI.time                178
PFI                       0
PFI.time                  1
Redaction              1209
pan.samplesID             0
cancer.type               0
Subtype_mRNA              0
Subtype_DNAmeth        1215
Subtype_protein        1215
Subtype_miRNA          1215
Subtype_CNA            1215
Subtype_Integrative    1215
Subtype_other          1215
Subtype_Selected          0
dtype: int64

- 전체 행, 대부분이 결측치인 열들 제거
- 중복된 의미의 변수 _PATIENT, pan.samplesID,'cancer.type','Subtype_mRNA' 제거
- DFI, DFI.time 각각 177, 178개의 결측치가 같은 행에 동시에 존재하는 지 확인 후 제거
- OS.time, DSS.time, PFI.time 결측치 한 개가 같은 행에 동시에 존재하는 지 확인 후 제거

In [349]:
survival2 = survival.copy()

In [350]:
# 같은 행에서 DFI와 DFI.time이 결측치인 행 필터링
same_missing = survival2[survival2['DFI'].isnull() & survival2['DFI.time'].isnull()]

# 같은 행에서 결측치인 행 개수와 목록 출력
print(f"DFI와 DFI.time이 같은 행에서 결측치인 경우: {same_missing.shape[0]}개")
print(same_missing[['sample', 'DFI', 'DFI.time']])

DFI와 DFI.time이 같은 행에서 결측치인 경우: 177개
               sample  DFI  DFI.time
3     TCGA-3C-AALK-01  NaN       NaN
5     TCGA-5L-AAT0-01  NaN       NaN
6     TCGA-5L-AAT1-01  NaN       NaN
7     TCGA-5T-A9QA-01  NaN       NaN
16    TCGA-A1-A0SK-01  NaN       NaN
...               ...  ...       ...
1193  TCGA-PL-A8LY-01  NaN       NaN
1194  TCGA-PL-A8LZ-01  NaN       NaN
1205  TCGA-UL-AAZ6-01  NaN       NaN
1206  TCGA-UU-A93S-01  NaN       NaN
1213  TCGA-Z7-A8R5-01  NaN       NaN

[177 rows x 3 columns]


In [351]:
# DFI와 DFI.time이 모두 결측치인 행 제거
cleaned_survival_df = survival2[~(survival2['DFI'].isnull() & survival2['DFI.time'].isnull())]

# 제거된 행 개수 확인
removed_count = survival2.shape[0] - cleaned_survival_df.shape[0]
print(f"제거된 행 개수: {removed_count}")

제거된 행 개수: 177


In [352]:
cleaned_survival_df[
    cleaned_survival_df[["OS.time", "DSS.time", "PFI.time"]].isnull().any(axis=1)
]
# 같은 행에 결측치 존재, 삭제하기

Unnamed: 0,sample,_PATIENT,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time,...,pan.samplesID,cancer.type,Subtype_mRNA,Subtype_DNAmeth,Subtype_protein,Subtype_miRNA,Subtype_CNA,Subtype_Integrative,Subtype_other,Subtype_Selected
1176,TCGA-OL-A66H-01,TCGA-OL-A66H,0,,0.0,,0.0,,0,,...,TCGA-OL-A66H-01,BRCA,LumA,,,,,,,BRCA.LumA


In [353]:
cleaned_survival_df = cleaned_survival_df.drop(columns=['_PATIENT', 'Subtype_DNAmeth','Subtype_protein','Subtype_miRNA','Subtype_CNA','Subtype_Integrative','Subtype_other','Redaction','pan.samplesID','cancer.type','Subtype_mRNA'])
cleaned_survival_df = cleaned_survival_df[cleaned_survival_df["sample"] != "TCGA-OL-A66H-01"].reset_index(drop=True)

In [354]:
cleaned_survival_df.shape, cleaned_survival_df.head()

((1037, 10),
             sample  OS  OS.time  DSS  DSS.time  DFI  DFI.time  PFI  PFI.time  \
 0  TCGA-3C-AAAU-01   0   4047.0  0.0    4047.0  1.0    1808.0    1    1808.0   
 1  TCGA-3C-AALI-01   0   4005.0  0.0    4005.0  0.0    4005.0    0    4005.0   
 2  TCGA-3C-AALJ-01   0   1474.0  0.0    1474.0  0.0    1474.0    0    1474.0   
 3  TCGA-4H-AAAK-01   0    348.0  0.0     348.0  0.0     348.0    0     348.0   
 4  TCGA-A1-A0SB-01   0    259.0  0.0     259.0  0.0     259.0    0     259.0   
 
   Subtype_Selected  
 0        BRCA.LumA  
 1        BRCA.Her2  
 2        BRCA.LumB  
 3        BRCA.LumA  
 4      BRCA.Normal  )

# cleaned_sampleMap_df와 cleaned_survival_df 데이터 병합

In [355]:
# 공통된 환자 ID를 기준으로 데이터 병합
cleaned_survival_df['sampleID'] = cleaned_survival_df['sample']
cleaned_survival_df = cleaned_survival_df.drop('sample', axis=1)
merged_data = pd.merge(cleaned_sampleMap_df, cleaned_survival_df, how='inner', on='sampleID')

# 병합 결과 확인
print("병합 후 데이터 크기:", merged_data.shape)


병합 후 데이터 크기: (1037, 84)


In [357]:
# 중복 여부 확인 및 공통 ID 수 계산
sampleMap_unique_ids = set(cleaned_sampleMap_df['sampleID'].dropna().unique())
survival_unique_ids = set(cleaned_survival_df['sampleID'].dropna().unique())

# 공통 ID 수 계산
common_ids = sampleMap_unique_ids.intersection(survival_unique_ids)
num_common_ids = len(common_ids)

print(f"공통된 고유 환자 ID 수: {num_common_ids}")

공통된 고유 환자 ID 수: 1037


그러므로 cleaned_sampleMap_df 데이터와 cleaned_survival_df 데이터의 공통된 고유 환자 ID는 1037명이다.

In [358]:
merged_data.drop(columns=['Subtype_Selected_y'], axis=1, inplace=True)
merged_data.rename(columns={'Subtype_Selected_x': 'Subtype'}, inplace=True)
merged_data.head()

Unnamed: 0,sampleID,AJCC_Stage_nature2012,Age_at_Initial_Pathologic_Diagnosis_nature2012,Days_to_Date_of_Last_Contact_nature2012,ER_Status_nature2012,Gender_nature2012,HER2_Final_Status_nature2012,Metastasis_Coded_nature2012,Metastasis_nature2012,Node_Coded_nature2012,...,year_of_initial_pathologic_diagnosis,Subtype,OS,OS.time,DSS,DSS.time,DFI,DFI.time,PFI,PFI.time
0,TCGA-3C-AAAU-01,,,,,,,,,,...,2004.0,BRCA.LumA,0,4047.0,0.0,4047.0,1.0,1808.0,1,1808.0
1,TCGA-3C-AALI-01,,,,,,,,,,...,2003.0,BRCA.Her2,0,4005.0,0.0,4005.0,0.0,4005.0,0,4005.0
2,TCGA-3C-AALJ-01,,,,,,,,,,...,2011.0,BRCA.LumB,0,1474.0,0.0,1474.0,0.0,1474.0,0,1474.0
3,TCGA-4H-AAAK-01,,,,,,,,,,...,2013.0,BRCA.LumA,0,348.0,0.0,348.0,0.0,348.0,0,348.0
4,TCGA-A1-A0SB-01,Stage I,70.0,259.0,Positive,FEMALE,Negative,Negative,M0,Negative,...,2008.0,BRCA.Normal,0,259.0,0.0,259.0,0.0,259.0,0,259.0


In [359]:
merged_data.isnull().sum()

sampleID                                            0
AJCC_Stage_nature2012                             353
Age_at_Initial_Pathologic_Diagnosis_nature2012    248
Days_to_Date_of_Last_Contact_nature2012           258
ER_Status_nature2012                              359
                                                 ... 
DSS.time                                            0
DFI                                                 0
DFI.time                                            0
PFI                                                 0
PFI.time                                            0
Length: 83, dtype: int64

In [363]:
merged_data.to_csv('/Users/zoohunn/Desktop/비어플/비어플[의료]/merged_data.csv', index=False)