In [42]:
import pandas as pd

In [43]:
df = pd.read_csv('dataset/meta.csv')
df_to_merged = pd.read_csv('dataset/upenn.csv')

In [44]:
print(df.head())
print(df_to_merged.head())

             id   death   age                 gt grade     sex mortality  \
0     C3N-01199     1.0  49.9  Oligodendroglioma    G2    male      Dead   
1  TCGA-CS-4938  9999.0  31.0        Astrocytoma    G2  female     Alive   
2  TCGA-CS-4944  9999.0  50.0        Astrocytoma    G2    male     Alive   
3  TCGA-CS-5390  9999.0  47.0  Oligodendroglioma    G2  female      Dead   
4  TCGA-CS-6667  9999.0  39.0        Astrocytoma    G2  female     Alive   

                  treatment  
0                       NaN  
1    Radiation Therapy, NOS  
2  Radiation, External Beam  
3  Radiation, External Beam  
4           Steroid Therapy  
                   ID Gender  Age_at_scan_years  \
0  UPENN-GBM-00001_11      F              52.16   
1  UPENN-GBM-00002_11      F              61.30   
2  UPENN-GBM-00003_11      M              42.82   
3  UPENN-GBM-00004_11      M              33.43   
4  UPENN-GBM-00005_11      M              53.33   

  Survival_from_surgery_days_UPDATED Survival_Status Sur

In [45]:
df_to_merged = df_to_merged.rename(columns={
    'ID': 'id',
    'Age_at_scan_years': 'age',
    'Gender': 'sex',
    'Survival_from_surgery_days_UPDATED': 'death',
    "Survival_Status": "mortality",
})[['id', 'age', 'sex', 'death', 'mortality']]
df_to_merged['id'] = df_to_merged['id'].str.rsplit('_', n=1).str[0]
df_to_merged = df_to_merged.drop_duplicates(subset='id', keep='last')


In [46]:
print(df_to_merged.head())
print(len(df_to_merged))

                id    age sex death mortality
0  UPENN-GBM-00001  52.16   F   960  Deceased
1  UPENN-GBM-00002  61.30   F   291  Deceased
2  UPENN-GBM-00003  42.82   M  2838  Deceased
3  UPENN-GBM-00004  33.43   M   623  Deceased
4  UPENN-GBM-00005  53.33   M  1143  Deceased
630


In [47]:
# Step 1: Normalize mortality values
df_to_merged['mortality'] = df_to_merged['mortality'].apply(
    lambda x: 'Dead' if str(x).strip().lower() == 'deceased' else 'Alive'
)

# Step 2: Set death to 9999 where mortality is 'Alive'
df_to_merged.loc[df_to_merged['mortality'] == 'Alive', 'death'] = 9999

df_to_merged['gt'] = 'Glioblastoma'
df_to_merged['grade'] = 'G4'


In [48]:
print(df_to_merged.head())
print(len(df_to_merged))

                id    age sex death mortality            gt grade
0  UPENN-GBM-00001  52.16   F   960      Dead  Glioblastoma    G4
1  UPENN-GBM-00002  61.30   F   291      Dead  Glioblastoma    G4
2  UPENN-GBM-00003  42.82   M  2838      Dead  Glioblastoma    G4
3  UPENN-GBM-00004  33.43   M   623      Dead  Glioblastoma    G4
4  UPENN-GBM-00005  53.33   M  1143      Dead  Glioblastoma    G4
630


In [49]:
missing_age = df_to_merged.isna().sum()
missing_sex = df_to_merged.isna().sum()

print(f"Missing values in 'age': {missing_age}")
print(f"Missing values in 'sex': {missing_sex}")


Missing values in 'age': id           0
age          0
sex          0
death        0
mortality    0
gt           0
grade        0
dtype: int64
Missing values in 'sex': id           0
age          0
sex          0
death        0
mortality    0
gt           0
grade        0
dtype: int64


In [50]:
# Select columns from df_to_merged except 'id' (which is key)
merged_df = pd.concat([df, df_to_merged], ignore_index=True)


print(len(merged_df))
print(merged_df.head())
merged_df.to_csv('./dataset/meta2.csv')



1773
             id   death   age                 gt grade     sex mortality  \
0     C3N-01199     1.0  49.9  Oligodendroglioma    G2    male      Dead   
1  TCGA-CS-4938  9999.0  31.0        Astrocytoma    G2  female     Alive   
2  TCGA-CS-4944  9999.0  50.0        Astrocytoma    G2    male     Alive   
3  TCGA-CS-5390  9999.0  47.0  Oligodendroglioma    G2  female      Dead   
4  TCGA-CS-6667  9999.0  39.0        Astrocytoma    G2  female     Alive   

                  treatment  
0                       NaN  
1    Radiation Therapy, NOS  
2  Radiation, External Beam  
3  Radiation, External Beam  
4           Steroid Therapy  
