In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Display settings
pd.set_option('display.max_columns', 100)
sns.set(style='whitegrid')

In [15]:
# Load the unzipped expression file
expr = pd.read_csv('../data/TCGA-BRCA.expression.tsv', sep='\t', index_col=0)

print("Expression shape:", expr.shape)
expr.iloc[:5, :5]  # preview

Expression shape: (20530, 1218)


Unnamed: 0_level_0,TCGA-AR-A5QQ-01,TCGA-D8-A1JA-01,TCGA-BH-A0BQ-01,TCGA-BH-A0BT-01,TCGA-A8-A06X-01
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ARHGEF10L,9.5074,7.4346,9.3216,9.0198,9.6417
HIF3A,1.5787,3.6607,2.7224,1.3414,0.5819
RNF17,0.0,0.6245,0.5526,0.0,0.0
RNF10,11.3676,11.9181,11.9665,13.1881,12.0036
RNF11,11.1292,13.5273,11.4105,11.0911,11.2545


In [16]:
# Load survival metadata
clin = pd.read_csv('../data/TCGA-CDR.survival.txt', sep='\t', index_col=0)

print("Clinical shape:", clin.shape)
clin[['OS.time', 'OS']].dropna().head()

Clinical shape: (1236, 10)


Unnamed: 0_level_0,OS.time,OS
sample,Unnamed: 1_level_1,Unnamed: 2_level_1
TCGA-3C-AAAU-01,4047.0,0
TCGA-3C-AALI-01,4005.0,0
TCGA-3C-AALJ-01,1474.0,0
TCGA-3C-AALK-01,1448.0,0
TCGA-4H-AAAK-01,348.0,0


In [17]:
print("Expression sample IDs:")
print(expr.columns[:5].tolist())

print("\nClinical sample IDs:")
print(clin.index[:5].tolist())

Expression sample IDs:
['TCGA-AR-A5QQ-01', 'TCGA-D8-A1JA-01', 'TCGA-BH-A0BQ-01', 'TCGA-BH-A0BT-01', 'TCGA-A8-A06X-01']

Clinical sample IDs:
['TCGA-3C-AAAU-01', 'TCGA-3C-AALI-01', 'TCGA-3C-AALJ-01', 'TCGA-3C-AALK-01', 'TCGA-4H-AAAK-01']


In [18]:
# Keep only BRCA patients (prefix filter)
clin_brca = clin[clin.index.str.startswith(('TCGA-A', 'TCGA-B', 'TCGA-C', 'TCGA-D'))]

print("Filtered clinical shape (BRCA only):", clin_brca.shape)


Filtered clinical shape (BRCA only): (889, 10)


In [19]:
# Rename survival columns for clarity
clin_brca = clin_brca.rename(columns={'OS.time': 'time', 'OS': 'status'})

In [20]:
# Drop duplicated sample IDs (if any) in both datasets
expr = expr.loc[:, ~expr.columns.duplicated()]
clin_brca = clin_brca[~clin_brca.index.duplicated(keep='first')]

In [21]:
# Get intersecting sample IDs and align datasets
common_ids = expr.columns.intersection(clin_brca.index)
print("Number of matched BRCA samples:", len(common_ids))

# Keep only essential survival columns before subsetting
clin_brca = clin_brca[['time', 'status']]

# Subset both datasets (order matters for Cox regression)
expr = expr[common_ids]
clin_brca = clin_brca.loc[common_ids]

print("✅ Final Expression shape:", expr.shape)
print("✅ Final Survival shape:", clin_brca.shape)
print("✅ Survival columns:", clin_brca.columns.tolist())

Number of matched BRCA samples: 877
✅ Final Expression shape: (20530, 877)
✅ Final Survival shape: (877, 2)
✅ Survival columns: ['time', 'status']


In [22]:
# Check for missing survival data
print("Missing survival data:")
print(clin_brca[['time', 'status']].isnull().sum())

# Remove samples with missing survival data
clin_brca = clin_brca.dropna(subset=['time', 'status'])

Missing survival data:
time      0
status    0
dtype: int64


In [23]:
# Validate survival data
print(f"Survival status distribution: {clin_brca['status'].value_counts()}")
print(f"Survival time range: {clin_brca['time'].min():.1f} - {clin_brca['time'].max():.1f}")

Survival status distribution: status
0    705
1    172
Name: count, dtype: int64
Survival time range: 0.0 - 8605.0


In [24]:
expr.to_csv('../data/processed_expression.tsv', sep='\t')
clin_brca.to_csv('../data/processed_clinical.tsv', sep='\t')