The Raw Data and Metadata can be found both on originally in the GEO, and then processed in the Autoimmune Diseases Explore (ADEx).

After analysis of both Metadata files, the one from ADEx has a good format for our downstream analysis, but it is missing information which can be found on the Series Matrix File from GEO.

Here we open and inspect both files, and add all necessary extra info to an Updated Metadata File,in order to be used in the next steps of our process

In [9]:
import pandas as pd

# File paths
adex_file = '/content/sample_data/Thesis/GSE108497_metadata.tsv'
geo_file = '/content/sample_data/Thesis/GSE108497_series_matrix.txt'
output_file = '/content/sample_data/Thesis/GSE108497_updated_metadata.tsv'

In [10]:
# Load the ADEx metadata (TSV) and preview it.
# Preview 1: top of dataset
adex_df = pd.read_csv(adex_file, sep='\t')
adex_df['Sample'] = adex_df['Sample'].astype(str).str.strip().str.strip('"')
print("Preview of ADEx metadata (first and last 5 rows):")
adex_df.head()

Preview of ADEx metadata (first and last 5 rows):


Unnamed: 0,Sample,GSE,Experimental Strategy,GPL,Condition,Tissue,Cell Type,Gender,Age,Ethnicity
0,GSM2901826,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino
1,GSM2901827,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino
2,GSM2901828,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino
3,GSM2901829,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino
4,GSM2901830,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,31-40,Not Hispanic or Latino


In [11]:
# Preview 2: bottom of dataset
adex_df.tail()

Unnamed: 0,Sample,GSE,Experimental Strategy,GPL,Condition,Tissue,Cell Type,Gender,Age,Ethnicity
507,GSM2902333,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,31-40,Not Hispanic or Latino
508,GSM2902334,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,31-40,Not Hispanic or Latino
509,GSM2902335,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,31-40,Not Hispanic or Latino
510,GSM2902336,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,21-30,Not Hispanic or Latino
511,GSM2902337,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,21-30,Not Hispanic or Latino


In [13]:
adex_df.shape
# As we can see below, it contains 512 rows and 10 columns
# The rows correspond to the samples, so the samples are 512, and the columns correspond to the charasteristics of each sample.

(512, 10)

In [25]:
# Extract sample-level characteristics from the GEO file.
sample_ids = None
characteristics_dict = {}

with open(geo_file, 'r') as file:
    for line in file:
        line = line.strip()
        # Extract sample IDs from the GEO file.
        if line.startswith('!Sample_geo_accession'):
            parts = line.split('\t')
            # Clean each sample id by stripping extra spaces and quotes.
            sample_ids = [s.strip().strip('"') for s in parts[1:]]
            for sid in sample_ids:
                characteristics_dict[sid] = {}
        # Process lines starting with '!Sample_characteristics_ch1'
        elif line.startswith('!Sample_characteristics_ch1'):
            parts = line.split('\t')
            values = parts[1:]
            if sample_ids is None:
                continue
            # Adjust the list so its length matches the number of sample IDs.
            if len(values) < len(sample_ids):
                values += [""] * (len(sample_ids) - len(values))
            elif len(values) > len(sample_ids):
                values = values[:len(sample_ids)]
            # Use enumerate to loop over each value with its index.
            for i, cell in enumerate(values):
                if cell:
                    if ':' in cell:
                        key, val = cell.split(':', 1)  # Split on first colon only.
                        # Clean the property name and value.
                        key = key.strip().strip('"')
                        val = val.strip().strip('"')
                        # Check if the property already exists for this sample.
                        current = characteristics_dict[sample_ids[i]].get(key)
                        if current is None:
                            characteristics_dict[sample_ids[i]][key] = val
                        else:
                            # If the property already exists, combine the values.
                            if isinstance(current, list):
                                current.append(val)
                                characteristics_dict[sample_ids[i]][key] = current
                            else:
                                characteristics_dict[sample_ids[i]][key] = [current, val]
                    else:
                        continue


In [26]:
# After processing, convert any lists into a single string (joined by "; ").
for id, props in characteristics_dict.items():
    for key, value in props.items():
        if isinstance(value, list):
            characteristics_dict[id][key] = "; ".join(value)

In [27]:
# Convert the GEO characteristics dictionary into a DataFrame.
geo_characteristics_df = pd.DataFrame.from_dict(characteristics_dict, orient='index')
geo_characteristics_df.index.name = 'Sample'
geo_characteristics_df.reset_index(inplace=True)

In [28]:
# Clean sample IDs in the GEO characteristics DataFrame as well.
geo_characteristics_df['Sample'] = geo_characteristics_df['Sample'].astype(str).str.strip().str.strip('"')
print("Preview of GEO characteristics (first 5 rows):")
geo_characteristics_df.head()

Preview of GEO characteristics (first 5 rows):


Unnamed: 0,Sample,tissue,grp_p_tp,sample_name,donor_id,age,gender,race,ethnicity,sle,...,fd,nnd,pl_insuff,iugr,sga,batch,time_point,ga_at_collection,ga_at_end_of_pregnancy,if_pe_before_or_after_36_weeks
0,GSM2901826,whole blood,HC_NP_5,HC2013_1,106346,25,Female,C,Not Hispanic or Latino,0,...,0,0,0,0,0,3,,,,
1,GSM2901827,whole blood,HC_NP_5,HC2013-15,134642,25,Female,C,Not Hispanic or Latino,0,...,0,0,0,0,0,3,,,,
2,GSM2901828,whole blood,HC_NP_5,HC-85,HD-85,25,Female,AA,Not Hispanic or Latino,0,...,0,0,0,0,0,1,,,,
3,GSM2901829,whole blood,HC_NP_5,HC2013-10,139353,24,Female,C,Not Hispanic or Latino,0,...,0,0,0,0,0,3,,,,
4,GSM2901830,whole blood,HC_NP_5,HC2013-9,149039,33,Female,C,Not Hispanic or Latino,0,...,0,0,0,0,0,3,,,,


In [29]:
geo_characteristics_df.tail()

Unnamed: 0,Sample,tissue,grp_p_tp,sample_name,donor_id,age,gender,race,ethnicity,sle,...,fd,nnd,pl_insuff,iugr,sga,batch,time_point,ga_at_collection,ga_at_end_of_pregnancy,if_pe_before_or_after_36_weeks
507,GSM2902333,whole blood,SLE_P_NC_5,231X02H9,T82,37,Female,C,Not Hispanic or Latino,1,...,0,0,0,0,0,4,PP,11.3 WPP,38.0,
508,GSM2902334,whole blood,SLE_P_NC_5,496-Wnd9,T87,35,Female,AA,Not Hispanic or Latino,1,...,0,0,0,0,0,2,PP,17.4 WPP,40.1,
509,GSM2902335,whole blood,SLE_P_NC_5,U28(136)3MPP,U28,37,Female,H,Not Hispanic or Latino,1,...,0,0,0,0,0,4,PP,13.2 WPP,40.1,
510,GSM2902336,whole blood,SLE_P_NC_5,655X01K9,U68,28,Female,C,Not Hispanic or Latino,1,...,0,0,0,0,0,4,PP,15.3 WPP,37.6,
511,GSM2902337,whole blood,SLE_P_NC_5,723X01K9,U78,30,Female,C,Not Hispanic or Latino,1,...,0,0,0,0,0,4,PP,16.6 WPP,38.2,


In [30]:
# Merge the ADEx metadata with the GEO characteristics DataFrame and preview the first and last rows
merged_df = adex_df.merge(geo_characteristics_df, on='Sample', how='left')

print("Preview of merged metadata (first 5 rows):")
merged_df.head()

Preview of merged metadata (first 5 rows):


Unnamed: 0,Sample,GSE,Experimental Strategy,GPL,Condition,Tissue,Cell Type,Gender,Age,Ethnicity,...,fd,nnd,pl_insuff,iugr,sga,batch,time_point,ga_at_collection,ga_at_end_of_pregnancy,if_pe_before_or_after_36_weeks
0,GSM2901826,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,
1,GSM2901827,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,
2,GSM2901828,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,1,,,,
3,GSM2901829,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,
4,GSM2901830,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,31-40,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,


In [31]:
merged_df.tail()

Unnamed: 0,Sample,GSE,Experimental Strategy,GPL,Condition,Tissue,Cell Type,Gender,Age,Ethnicity,...,fd,nnd,pl_insuff,iugr,sga,batch,time_point,ga_at_collection,ga_at_end_of_pregnancy,if_pe_before_or_after_36_weeks
507,GSM2902333,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,31-40,Not Hispanic or Latino,...,0,0,0,0,0,4,PP,11.3 WPP,38.0,
508,GSM2902334,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,31-40,Not Hispanic or Latino,...,0,0,0,0,0,2,PP,17.4 WPP,40.1,
509,GSM2902335,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,31-40,Not Hispanic or Latino,...,0,0,0,0,0,4,PP,13.2 WPP,40.1,
510,GSM2902336,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,4,PP,15.3 WPP,37.6,
511,GSM2902337,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,4,PP,16.6 WPP,38.2,


In [32]:
# Save the merged DataFrame to a new TSV file.
merged_df.to_csv(output_file, sep='\t', index=False)
print(f"Merged file saved as '{output_file}'")

Merged file saved as '/content/sample_data/Thesis/GSE108497_updated_metadata.tsv'


In [33]:
# Reload the saved file and print a preview to verify the output.
reloaded_df = pd.read_csv(output_file, sep='\t')
print("Preview of reloaded merged file (first 5 rows):")
reloaded_df.head()

Preview of reloaded merged file (first 5 rows):


Unnamed: 0,Sample,GSE,Experimental Strategy,GPL,Condition,Tissue,Cell Type,Gender,Age,Ethnicity,...,fd,nnd,pl_insuff,iugr,sga,batch,time_point,ga_at_collection,ga_at_end_of_pregnancy,if_pe_before_or_after_36_weeks
0,GSM2901826,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,
1,GSM2901827,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,
2,GSM2901828,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,1,,,,
3,GSM2901829,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,
4,GSM2901830,GSE108497,Expression,GPL10558,Healthy,Whole blood,,Female,31-40,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,


In [34]:
reloaded_df.tail()

Unnamed: 0,Sample,GSE,Experimental Strategy,GPL,Condition,Tissue,Cell Type,Gender,Age,Ethnicity,...,fd,nnd,pl_insuff,iugr,sga,batch,time_point,ga_at_collection,ga_at_end_of_pregnancy,if_pe_before_or_after_36_weeks
507,GSM2902333,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,31-40,Not Hispanic or Latino,...,0,0,0,0,0,4,PP,11.3 WPP,38.0,
508,GSM2902334,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,31-40,Not Hispanic or Latino,...,0,0,0,0,0,2,PP,17.4 WPP,40.1,
509,GSM2902335,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,31-40,Not Hispanic or Latino,...,0,0,0,0,0,4,PP,13.2 WPP,40.1,
510,GSM2902336,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,4,PP,15.3 WPP,37.6,
511,GSM2902337,GSE108497,Expression,GPL10558,SLE,Whole blood,,Female,21-30,Not Hispanic or Latino,...,0,0,0,0,0,4,PP,16.6 WPP,38.2,


In [35]:
# NOW I want to perfect the appearence of my metadata by dropping empty or duplicate columns, and changes their titles and order
# Rename columns as needed
rename_dict = {
    'donor_id': 'Donor_id',
    'sample_name': 'Sample_name',
    'race': 'Race',
    'Age': 'Age Group',
    'age': 'Age'
}
reloaded_df = reloaded_df.rename(columns=rename_dict)

In [36]:
# Drop empty or duplicate columns
columns_to_drop = [
    'Cell Type', # empty
    'tissue', # duplicate
    'ethnicity', # duplicate
    'gender' # duplicate
]
# Drop only columns that exist to avoid errors
columns_to_drop = [col for col in columns_to_drop if col in reloaded_df.columns]
reloaded_df = reloaded_df.drop(columns=columns_to_drop)

In [37]:
# Check to see if changes are working correctly
reloaded_df.head()

Unnamed: 0,Sample,GSE,Experimental Strategy,GPL,Condition,Tissue,Gender,Age Group,Ethnicity,grp_p_tp,...,fd,nnd,pl_insuff,iugr,sga,batch,time_point,ga_at_collection,ga_at_end_of_pregnancy,if_pe_before_or_after_36_weeks
0,GSM2901826,GSE108497,Expression,GPL10558,Healthy,Whole blood,Female,21-30,Not Hispanic or Latino,HC_NP_5,...,0,0,0,0,0,3,,,,
1,GSM2901827,GSE108497,Expression,GPL10558,Healthy,Whole blood,Female,21-30,Not Hispanic or Latino,HC_NP_5,...,0,0,0,0,0,3,,,,
2,GSM2901828,GSE108497,Expression,GPL10558,Healthy,Whole blood,Female,21-30,Not Hispanic or Latino,HC_NP_5,...,0,0,0,0,0,1,,,,
3,GSM2901829,GSE108497,Expression,GPL10558,Healthy,Whole blood,Female,21-30,Not Hispanic or Latino,HC_NP_5,...,0,0,0,0,0,3,,,,
4,GSM2901830,GSE108497,Expression,GPL10558,Healthy,Whole blood,Female,31-40,Not Hispanic or Latino,HC_NP_5,...,0,0,0,0,0,3,,,,


In [39]:
# Reorder columns
desired_order = [
    'Sample', 'GSE', 'Experiment', 'GPL', 'Condition', 'Tissue', 'Gender', 'Age Group', 'Age', 'Race', 'Ethnicity', 'grp_p_tp', 'Sample_name', 'Donor_id',
    'sle', 'apl', 'lac', 'tp', 'pe', 'fd', 'nnd', 'pl_insuff', 'iugr', 'sga', 'batch', 'time_point', 'ga_at_collection', 'ga_at_end_of_pregnancy', 'if_pe_before_or_after_36_weeks'
]
reloaded_df = reloaded_df[[col for col in desired_order if col in reloaded_df.columns]]


In [40]:
# Check to see if changes are working correctly
reloaded_df.head()

Unnamed: 0,Sample,GSE,GPL,Condition,Tissue,Gender,Age Group,Age,Race,Ethnicity,...,fd,nnd,pl_insuff,iugr,sga,batch,time_point,ga_at_collection,ga_at_end_of_pregnancy,if_pe_before_or_after_36_weeks
0,GSM2901826,GSE108497,GPL10558,Healthy,Whole blood,Female,21-30,25,C,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,
1,GSM2901827,GSE108497,GPL10558,Healthy,Whole blood,Female,21-30,25,C,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,
2,GSM2901828,GSE108497,GPL10558,Healthy,Whole blood,Female,21-30,25,AA,Not Hispanic or Latino,...,0,0,0,0,0,1,,,,
3,GSM2901829,GSE108497,GPL10558,Healthy,Whole blood,Female,21-30,24,C,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,
4,GSM2901830,GSE108497,GPL10558,Healthy,Whole blood,Female,31-40,33,C,Not Hispanic or Latino,...,0,0,0,0,0,3,,,,


In [43]:
# Check to see if changes are working correctly
reloaded_df.shape

(512, 28)

In [53]:
# Save the result
reloaded_df.to_csv(output_file, sep='\t', index=False)