# ML-diagnosis-of-esophageal-cancer
## Data Cleaning & Preprocessing Notebook
Authors: Robert Franklin, Brianna O'Connor, Marisa Duong

Date: 2023-03-13

In [1]:
# Dependencies & Installs
import pandas as pd
# pip install xlrd
# pip install openpyxl

In [2]:
# Load the CB cohort sheet of the raw workbook into a DataFrame.
# The path is relative to the directory the notebook is run from.
file_path = "../Data_Source/RAW data 2cohorts_QIMR.xlsx"
CB_raw = pd.read_excel(
    file_path,
    sheet_name='Protein Intensity_CBcohort',
    engine='openpyxl',
)
# Preview the first two rows
CB_raw.head(2)

Unnamed: 0.1,Unnamed: 0,Patient Group,O75636_AAL,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,...,Race,BMI (kg/m2),On Acid Supp Regimen?,Collected Date,Treatment Type & Date,Treatment naïve,Comment,Dx (Per Path); highest recorded,Diagnosis When Sample Was Taken (Per Path),Comorbidities
0,CB10001,EAC,2499187.0,231901.565681,531675.915634,130229.593177,45742780.0,74278760.0,1399114.0,62766.439036,...,Caucasian,31.46,Y,2015-06-03,No Txmt w/Ochsner; Stented Prior to Coming to ...,Yes,Underwent surgery unrelated to BE/EAC,Adenocarcinoma,Adenocarcinoma,"DM Type II, HTN"
1,CB10002,BE-ID,2828146.0,225768.925899,392680.876091,147433.34004,46299340.0,32400030.0,1193564.0,56995.90123,...,Caucasian,41.4,Y,2014-12-10,"Cryo (12/10/2014); RFA (4/29/2014, 12/4/2013, ...",No,Received treatment before blood collection,BE w/LGD,Intestinal Metaplasia; Limited Interpretation ...,"Thyroid Disease, Bipolar, GERD, COPD"


In [3]:
# Load the PN cohort sheet from the same workbook
# (file_path is the path assigned when the CB cohort was loaded).
PN_raw = pd.read_excel(
    file_path,
    sheet_name='Protein Intensity_PNcohort',
    engine='openpyxl',
)
# Preview the first two rows
PN_raw.head(2)

Unnamed: 0,PNUM,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,...,Normalized Protein (μg/μL),gender,consentdate,collectiondate,City of collection,Hospital collected,agecollect,height,weight,bmi
0,PN200001,BE-LGD,190246.4723,731790.8471,51185.55315,99026.39829,17511369.52,1281657.844,146252.4045,424532.4302,...,79.431861,M,2009-11-25,2009-11-25 00:00:00,Sydney,St Vincents Clinic,64,,,
1,PN200003,NSE,259245.139,729337.2285,59015.99648,108490.756,16696711.48,1356981.217,147436.0885,449091.9729,...,76.910367,M,2009-05-13,2009-05-13 00:00:00,Sydney,St Vincents Clinic,57,178.0,94.4,29.8


In [4]:
# The CB sample identifier is loaded under the placeholder header 'Unnamed: 0';
# give it the explicit name 'ID'.
CB_raw = CB_raw.rename({'Unnamed: 0': 'ID'}, axis='columns')
CB_raw.head(1)

Unnamed: 0,ID,Patient Group,O75636_AAL,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,...,Race,BMI (kg/m2),On Acid Supp Regimen?,Collected Date,Treatment Type & Date,Treatment naïve,Comment,Dx (Per Path); highest recorded,Diagnosis When Sample Was Taken (Per Path),Comorbidities
0,CB10001,EAC,2499187.0,231901.565681,531675.915634,130229.593177,45742780.0,74278760.0,1399114.0,62766.439036,...,Caucasian,31.46,Y,2015-06-03,No Txmt w/Ochsner; Stented Prior to Coming to ...,Yes,Underwent surgery unrelated to BE/EAC,Adenocarcinoma,Adenocarcinoma,"DM Type II, HTN"


In [5]:
# The PN sample identifier is stored under 'PNUM'; rename it to 'ID'
# so that it carries the same name as in the CB cohort.
PN_raw = PN_raw.rename({'PNUM': 'ID'}, axis='columns')
PN_raw.head(1)

Unnamed: 0,ID,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,...,Normalized Protein (μg/μL),gender,consentdate,collectiondate,City of collection,Hospital collected,agecollect,height,weight,bmi
0,PN200001,BE-LGD,190246.4723,731790.8471,51185.55315,99026.39829,17511369.52,1281657.844,146252.4045,424532.4302,...,79.431861,M,2009-11-25,2009-11-25 00:00:00,Sydney,St Vincents Clinic,64,,,


In [6]:
# Compare the column names of the two cohorts:
#   common_cols - names present in both tables
#   diff_cols   - names present in exactly one of the tables
common_cols = set(CB_raw.columns) & set(PN_raw.columns)
diff_cols = set(CB_raw.columns).symmetric_difference(PN_raw.columns)
print("Different columns: ", diff_cols)
print(len(diff_cols))

Different columns:  {'Q14520_EPHA', 'P02675_EPHA', 'Comorbidities', 'Q08380_NPL', 'Age at Collection', 'P55058_AAL', 'Q16610_JAC', 'O75636_EPHA', 'O75636_JAC', 'Q16610_EPHA', 'P02741_JAC', 'Q08380_AAL', 'P02741_AAL', 'P05090_NPL', 'weight', 'P32119_AAL', 'P05090_EPHA', 'Assay ID', 'P01012_AAL', 'P08519_JAC', 'P02649_EPHA', 'P55058_JAC', 'Normalized Protein (μg/μL)', 'Gender', 'P02741_EPHA', 'P05090_AAL', 'collectiondate', 'P29622_NPL', 'P07996_EPHA', 'Q7Z7A1_JAC', 'consentdate', 'P02760_NPL', 'Q96IY4_NPL', 'Q5T011_NPL', 'P19652_NPL', 'Q7Z7A1_AAL', 'Treatment naïve', 'P01012_EPHA', 'Collected Date', 'P02671_AAL', 'P29622_JAC', 'Q96PD5_EPHA', 'Q08380_EPHA', 'P33981_NPL', 'City of collection', 'height', 'agecollect', 'Q14966_NPL', 'Sample Type (Progressor/Non-progressor)', 'P00736_EPHA', 'P08185_NPL', 'BMI (kg/m2)', 'P02647_EPHA', 'P12259_EPHA', 'P12259_AAL', 'P33981_AAL', 'Treatment Type & Date', 'P07996_JAC', 'P02760_AAL', 'bmi', 'Q14520_AAL', 'P13671_EPHA', 'P02649_JAC', 'Q96PD5_JAC', 

In [7]:
# Give the PN age, BMI, protein and gender columns the names used by the CB cohort
PN_raw = PN_raw.rename(
    {
        'agecollect': 'Age at Collection',
        'bmi': 'BMI (kg/m2)',
        'Normalized Protein (μg/μL)': 'Protein (μg/μL)',
        'gender': 'Gender',
    },
    axis='columns',
)

In [8]:
# Recompute the shared / non-shared column names now that the PN columns are renamed
common_cols = set(CB_raw.columns) & set(PN_raw.columns)
diff_cols = set(CB_raw.columns).symmetric_difference(PN_raw.columns)
print("Different columns: ", diff_cols)
print(len(diff_cols))

Different columns:  {'Q14520_EPHA', 'P02675_EPHA', 'Comorbidities', 'Q08380_NPL', 'P55058_AAL', 'Q16610_JAC', 'O75636_EPHA', 'O75636_JAC', 'Q16610_EPHA', 'P02741_JAC', 'Q08380_AAL', 'P02741_AAL', 'P05090_NPL', 'weight', 'P32119_AAL', 'P05090_EPHA', 'Assay ID', 'P01012_AAL', 'P08519_JAC', 'P02649_EPHA', 'P55058_JAC', 'P02741_EPHA', 'P05090_AAL', 'collectiondate', 'P29622_NPL', 'P07996_EPHA', 'Q7Z7A1_JAC', 'consentdate', 'P02760_NPL', 'Q96IY4_NPL', 'Q5T011_NPL', 'P19652_NPL', 'Q7Z7A1_AAL', 'Treatment naïve', 'P01012_EPHA', 'Collected Date', 'P02671_AAL', 'P29622_JAC', 'Q96PD5_EPHA', 'Q08380_EPHA', 'P33981_NPL', 'City of collection', 'height', 'Q14966_NPL', 'Sample Type (Progressor/Non-progressor)', 'P00736_EPHA', 'P08185_NPL', 'P02647_EPHA', 'P12259_AAL', 'P12259_EPHA', 'P33981_AAL', 'Treatment Type & Date', 'P07996_JAC', 'P02760_AAL', 'Q14520_AAL', 'P13671_EPHA', 'P02649_JAC', 'Q96PD5_JAC', 'P32119_NPL', 'P19652_JAC', 'P02671_EPHA', 'P55058_NPL', 'P02679_AAL', 'P01012_JAC', 'Q96IY4_JAC'

In [9]:
# Drop, in a single call, every PN column that is listed in diff_cols
# (i.e. that does not exist in both cohorts).
pn_unshared_cols = [name for name in PN_raw.columns if name in diff_cols]
PN_raw = PN_raw.drop(columns=pn_unshared_cols)
PN_raw.head(2)

Unnamed: 0,ID,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,...,P01023_LPPNVVEESAR_Ratio_NPL,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Gender,Age at Collection,BMI (kg/m2)
0,PN200001,BE-LGD,190246.4723,731790.8471,51185.55315,99026.39829,17511369.52,1281657.844,146252.4045,424532.4302,...,1.021711,1.009847,2.098434,1.01716,0.434495,1.237117,79.431861,M,64,
1,PN200003,NSE,259245.139,729337.2285,59015.99648,108490.756,16696711.48,1356981.217,147436.0885,449091.9729,...,0.824941,1.033205,4.127104,1.085205,0.724996,1.840604,76.910367,M,57,29.8


In [10]:
# Drop, in a single call, every CB column that is listed in diff_cols
# (i.e. that does not exist in both cohorts).
cb_unshared_cols = [name for name in CB_raw.columns if name in diff_cols]
CB_raw = CB_raw.drop(columns=cb_unshared_cols)
CB_raw.head(2)

Unnamed: 0,ID,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,...,P01023_LPPNVVEESAR_Ratio_NPL,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Gender,Age at Collection,BMI (kg/m2)
0,CB10001,EAC,231901.565681,531675.915634,130229.593177,45742780.0,74278760.0,1399114.0,62766.439036,8785951.0,...,2.03006,3.13863,6.545236,2.455497,1.362097,3.987969,65.49768,M,63.0,31.46
1,CB10002,BE-ID,225768.925899,392680.876091,147433.34004,46299340.0,32400030.0,1193564.0,56995.90123,8329383.0,...,5.897053,3.089863,6.585498,4.543369,0.778761,2.623578,68.052035,F,69.0,41.4


In [11]:
# CB data cleaning: build CB_df from CB_raw, discarding every row
# that has at least one missing value.
CB_df = CB_raw.dropna()

In [12]:
# PN data cleaning: build PN_df from PN_raw, discarding every row
# that has at least one missing value.
PN_df = PN_raw.dropna()

In [13]:
# Remove every CB column whose name contains 'EPHA' or 'heavy'
cb_unwanted = CB_df.columns.str.contains('EPHA|heavy')
CB_df = CB_df.loc[:, ~cb_unwanted]
CB_df.head(2)

Unnamed: 0,ID,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,...,P01023_LPPNVVEESAR_Ratio_NPL,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Gender,Age at Collection,BMI (kg/m2)
0,CB10001,EAC,231901.565681,531675.915634,130229.593177,45742780.0,74278760.0,1399114.0,62766.439036,8785951.0,...,2.03006,3.13863,6.545236,2.455497,1.362097,3.987969,65.49768,M,63.0,31.46
1,CB10002,BE-ID,225768.925899,392680.876091,147433.34004,46299340.0,32400030.0,1193564.0,56995.90123,8329383.0,...,5.897053,3.089863,6.585498,4.543369,0.778761,2.623578,68.052035,F,69.0,41.4


In [14]:
# Remove every PN column whose name contains 'EPHA' or 'heavy'
pn_unwanted = PN_df.columns.str.contains('EPHA|heavy')
PN_df = PN_df.loc[:, ~pn_unwanted]
PN_df.head(2)

Unnamed: 0,ID,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,...,P01023_LPPNVVEESAR_Ratio_NPL,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Gender,Age at Collection,BMI (kg/m2)
1,PN200003,NSE,259245.139,729337.2,59015.99648,108490.756,16696711.48,1356981.217,147436.0885,449091.9729,...,0.824941,1.033205,4.127104,1.085205,0.724996,1.840604,76.910367,M,57,29.8
2,PN200009,NSE,273950.2686,1194245.0,53057.44721,116871.9276,24306476.52,1766197.287,141270.2367,669375.7039,...,0.67533,1.522049,6.423165,0.842477,0.629588,1.208206,69.476569,F,38,45.0


In [15]:
# Combine the two cohorts by stacking their rows.
# An outer pd.merge without `on=` joins on every shared column, which here
# means matching on many float-valued measurements and on 'Age at Collection',
# whose dtype differs between the cohorts (float64 vs int64). The goal is simply
# one table holding all samples, so concatenate instead; ignore_index=True gives
# a fresh 0..n-1 index.
# NOTE(review): rows keep their per-cohort order (CB first, then PN) - confirm
# nothing downstream relies on a different ordering.
joined_df = pd.concat([CB_df, PN_df], ignore_index=True)
joined_df

Unnamed: 0,ID,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,...,P01023_LPPNVVEESAR_Ratio_NPL,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Gender,Age at Collection,BMI (kg/m2)
0,CB10001,EAC,231901.565681,531675.915634,130229.593177,4.574278e+07,7.427876e+07,1.399114e+06,62766.439036,8.785951e+06,...,2.030060,3.138630,6.545236,2.455497,1.362097,3.987969,65.497680,M,63.0,31.46
1,CB10002,BE-ID,225768.925899,392680.876091,147433.340040,4.629934e+07,3.240003e+07,1.193564e+06,56995.901230,8.329383e+06,...,5.897053,3.089863,6.585498,4.543369,0.778761,2.623578,68.052035,F,69.0,41.40
2,CB10003,NSE,317408.869810,624661.826663,166048.185854,5.879864e+07,3.151861e+07,2.006994e+06,66677.865179,8.918962e+06,...,4.502031,1.858580,11.655492,4.471990,0.821500,2.668353,72.455779,F,54.0,30.04
3,CB10004,BE,196448.722270,383654.430349,81930.599409,4.418331e+07,1.274380e+07,1.121787e+06,47417.392161,6.726070e+06,...,1.153663,1.179382,6.028578,3.230195,0.731139,1.341246,68.811932,M,55.0,30.26
4,CB10006,BE,513410.263339,665023.074428,168785.312885,8.292568e+07,3.314564e+07,2.811941e+06,96794.597616,1.307532e+07,...,6.688398,2.745901,10.891096,9.083470,1.226545,4.634702,54.474587,M,68.0,30.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262,PN520198-2,BE-HGD,212909.415400,409214.656100,46791.487300,1.044341e+05,1.897595e+07,1.341625e+06,109005.583000,2.811791e+05,...,1.084367,1.105802,4.535125,1.143341,0.667606,1.862710,84.361363,M,71.0,30.60
263,PN520223,BE-LGD,264552.337400,443446.696400,53288.416120,1.013776e+05,1.906240e+07,1.276686e+06,161909.319100,4.063519e+05,...,1.099950,0.826197,2.938648,1.569241,0.477460,2.483334,61.603283,M,76.0,27.20
264,PN520230,BE,311888.143400,779222.794900,59896.271210,1.419347e+05,3.572879e+07,1.880594e+06,167653.314800,6.303756e+05,...,0.721562,1.085453,3.612338,1.061732,0.564546,1.251242,79.354873,M,49.0,35.60
265,PN520246,BE-LGD,262882.689100,317187.906000,38295.881410,1.148620e+05,8.537090e+06,1.429449e+06,134570.142600,3.732240e+05,...,0.759669,0.618220,7.036068,1.250521,0.403141,1.231198,67.244436,M,49.0,0.00


In [16]:
# Drop the ID column, which contains the cohort number (the cohort is instead
# referenced in the dataframe name).
# The column is removed by name rather than by position: slicing with
# iloc[:, 1:] removes whatever column happens to be first, so re-running the
# cell would silently drop 'Patient Group' as well. errors='ignore' makes a
# re-run a no-op once 'ID' is gone.
CB_df = CB_df.drop(columns='ID', errors='ignore')
PN_df = PN_df.drop(columns='ID', errors='ignore')
joined_df = joined_df.drop(columns='ID', errors='ignore')
CB_df.head()

Unnamed: 0,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,P01008_AAL,...,P01023_LPPNVVEESAR_Ratio_NPL,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Gender,Age at Collection,BMI (kg/m2)
0,EAC,231901.565681,531675.915634,130229.593177,45742780.0,74278760.0,1399114.0,62766.439036,8785951.0,1865353.0,...,2.03006,3.13863,6.545236,2.455497,1.362097,3.987969,65.49768,M,63.0,31.46
1,BE-ID,225768.925899,392680.876091,147433.34004,46299340.0,32400030.0,1193564.0,56995.90123,8329383.0,2990847.0,...,5.897053,3.089863,6.585498,4.543369,0.778761,2.623578,68.052035,F,69.0,41.4
2,NSE,317408.86981,624661.826663,166048.185854,58798640.0,31518610.0,2006994.0,66677.865179,8918962.0,3406724.0,...,4.502031,1.85858,11.655492,4.47199,0.8215,2.668353,72.455779,F,54.0,30.04
3,BE,196448.72227,383654.430349,81930.599409,44183310.0,12743800.0,1121787.0,47417.392161,6726070.0,1723110.0,...,1.153663,1.179382,6.028578,3.230195,0.731139,1.341246,68.811932,M,55.0,30.26
4,BE,513410.263339,665023.074428,168785.312885,82925680.0,33145640.0,2811941.0,96794.597616,13075320.0,4179410.0,...,6.688398,2.745901,10.891096,9.08347,1.226545,4.634702,54.474587,M,68.0,30.4


In [17]:
# Split the CB columns into two views for easier inspection:
# columns whose name ends in 'JAC', 'AAL' or 'NPL' (protein measurements)
# and all remaining columns.
CB_protein_cols = list(
    filter(lambda name: name.endswith(('JAC', 'AAL', 'NPL')), CB_df.columns)
)
CB_df_protein = CB_df.loc[:, CB_protein_cols]
CB_df_other = CB_df.drop(columns=CB_protein_cols)

# Report the dtypes of each group
print("Protein columns data types:")
print(CB_df_protein.dtypes)

print("\nOther columns data types:")
print(CB_df_other.dtypes)

Protein columns data types:
O95445_AAL                        float64
P00450_AAL                        float64
P00734_AAL                        float64
P00736_AAL                        float64
P00738_AAL                        float64
                                   ...   
P02748_LSPIYNLVPVK_Ratio_NPL      float64
P04114_SPAFTDLHLR_Ratio_NPL       float64
P06396_AVEVLPK_Ratio_NPL          float64
P0C0L5_GSFEFPVGDAVSK_Ratio_NPL    float64
P10643_LTPLYELVK_Ratio_NPL        float64
Length: 185, dtype: object

Other columns data types:
Patient Group         object
Protein (μg/μL)      float64
Gender                object
Age at Collection    float64
BMI (kg/m2)          float64
dtype: object


In [18]:
# Split the PN columns into two views for easier inspection:
# columns whose name ends in 'JAC', 'AAL' or 'NPL' (protein measurements)
# and all remaining columns.
PN_protein_cols = list(
    filter(lambda name: name.endswith(('JAC', 'AAL', 'NPL')), PN_df.columns)
)
PN_df_protein = PN_df.loc[:, PN_protein_cols]
PN_df_other = PN_df.drop(columns=PN_protein_cols)

# Report the dtypes of each group
print("Protein columns data types:")
print(PN_df_protein.dtypes)

print("\nOther columns data types:")
print(PN_df_other.dtypes)

Protein columns data types:
O95445_AAL                        float64
P00450_AAL                        float64
P00734_AAL                        float64
P00736_AAL                        float64
P00738_AAL                        float64
                                   ...   
P02748_LSPIYNLVPVK_Ratio_NPL      float64
P04114_SPAFTDLHLR_Ratio_NPL       float64
P06396_AVEVLPK_Ratio_NPL          float64
P0C0L5_GSFEFPVGDAVSK_Ratio_NPL    float64
P10643_LTPLYELVK_Ratio_NPL        float64
Length: 185, dtype: object

Other columns data types:
Patient Group         object
Protein (μg/μL)      float64
Gender                object
Age at Collection      int64
BMI (kg/m2)          float64
dtype: object


In [19]:
# One-hot encode the categorical columns of the CB cohort.
# cols_to_encode is reused when encoding the other dataframes.
cols_to_encode = ['Gender']
CB_df = pd.get_dummies(data=CB_df, columns=cols_to_encode)

In [20]:
# One-hot encode the columns listed in cols_to_encode for the PN cohort
PN_df = pd.get_dummies(data=PN_df, columns=cols_to_encode)
PN_df.head(2)

Unnamed: 0,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,P01008_AAL,...,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Age at Collection,BMI (kg/m2),Gender_F,Gender_M
1,NSE,259245.139,729337.2,59015.99648,108490.756,16696711.48,1356981.217,147436.0885,449091.9729,171866.0975,...,1.033205,4.127104,1.085205,0.724996,1.840604,76.910367,57,29.8,0,1
2,NSE,273950.2686,1194245.0,53057.44721,116871.9276,24306476.52,1766197.287,141270.2367,669375.7039,154653.578,...,1.522049,6.423165,0.842477,0.629588,1.208206,69.476569,38,45.0,1,0


In [21]:
# One-hot encode the columns listed in cols_to_encode for the combined table
joined_df = pd.get_dummies(data=joined_df, columns=cols_to_encode)
joined_df.head(2)

Unnamed: 0,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,P01008_AAL,...,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Age at Collection,BMI (kg/m2),Gender_F,Gender_M
0,EAC,231901.565681,531675.915634,130229.593177,45742780.0,74278760.0,1399114.0,62766.439036,8785951.0,1865353.0,...,3.13863,6.545236,2.455497,1.362097,3.987969,65.49768,63.0,31.46,0,1
1,BE-ID,225768.925899,392680.876091,147433.34004,46299340.0,32400030.0,1193564.0,56995.90123,8329383.0,2990847.0,...,3.089863,6.585498,4.543369,0.778761,2.623578,68.052035,69.0,41.4,1,0


In [22]:
# Preview the first two rows of the CB dataframe
CB_df.head(n=2)

Unnamed: 0,Patient Group,O95445_AAL,P00450_AAL,P00734_AAL,P00736_AAL,P00738_AAL,P00747_AAL,P00748_AAL,P00751_AAL,P01008_AAL,...,P02748_LSPIYNLVPVK_Ratio_NPL,P04114_SPAFTDLHLR_Ratio_NPL,P06396_AVEVLPK_Ratio_NPL,P0C0L5_GSFEFPVGDAVSK_Ratio_NPL,P10643_LTPLYELVK_Ratio_NPL,Protein (μg/μL),Age at Collection,BMI (kg/m2),Gender_F,Gender_M
0,EAC,231901.565681,531675.915634,130229.593177,45742780.0,74278760.0,1399114.0,62766.439036,8785951.0,1865353.0,...,3.13863,6.545236,2.455497,1.362097,3.987969,65.49768,63.0,31.46,0,1
1,BE-ID,225768.925899,392680.876091,147433.34004,46299340.0,32400030.0,1193564.0,56995.90123,8329383.0,2990847.0,...,3.089863,6.585498,4.543369,0.778761,2.623578,68.052035,69.0,41.4,1,0


In [23]:
# Save cleaned data to csv files.
# index must be the boolean False: the string 'False' is non-empty and therefore
# truthy, so the row index was still being written as an extra unnamed first
# column in every file.
CB_df.to_csv('../Data_Cleaned/CB_df_cleaned.csv', index=False)
PN_df.to_csv('../Data_Cleaned/PN_df_cleaned.csv', index=False)
joined_df.to_csv('../Data_Cleaned/Joined_df_cleaned.csv', index=False)