## Metadata Scraper for samples in the Mammalian Methylation Consortium (MMC) dataset (GSE223748)

In [1]:
# Import libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from IPython.display import display, clear_output

In [2]:
### Functions

# Function to build a simple text loading bar
def print_loading_bar(progress):
    """
    Build a simple text loading bar.

    Despite its name, this function does not print anything: it returns the
    bar as a string so the caller decides how to display it.

    Parameters
    ----------
    progress : float
        Fraction of the work completed. Values outside [0, 1] are clamped so
        the bar keeps a fixed width and the percentage stays within 0-100.

    Returns
    -------
    str
        A 50-character bar made of '=' (done) and '-' (remaining), wrapped in
        square brackets and followed by the integer percentage.
    """
    bar_length = 50
    # Clamp so an over- or under-shooting caller cannot produce a malformed bar
    progress = min(max(progress, 0.0), 1.0)
    progress_length = int(progress * bar_length)
    bar = '=' * progress_length + '-' * (bar_length - progress_length)
    return f"[{bar}] {int(progress * 100)}%"

## 1. Loading Data

In [3]:
### Read the complete methylation table from its CSV file
complete_methylation_data = pd.read_csv(filepath_or_buffer="GSE223748_datBetaNormalized.csv")

In [235]:
# Show the dimensions (rows, columns) of the complete methylation table
complete_methylation_data.shape

(37554, 15044)

In [4]:
### Read the human methylation table from its CSV file
human_methylation_data = pd.read_csv(filepath_or_buffer="GSE184211_datBetaNormalized.csv")

In [53]:
### Read the metadata file downloaded from https://mydata.clockfoundation.org/app/mammalian-consortium-data-browser
metadata = pd.read_csv(filepath_or_buffer="complete_metadada_MMDB.csv")

## 2. Data Preprocessing

### 2.1. Data Cleaning

In [230]:
# Preview the first rows of the downloaded metadata
metadata.head()

Unnamed: 0,SpeciesCommonName,SpeciesLatinName,Tissue,Female,Age,SID
0,Olive baboon,Papio hamadryas,Cortex,1.0,22.12,X202794570001_R01C01
1,Olive baboon,Papio hamadryas,Cortex,0.0,8.04,X202794570001_R01C02
2,Olive baboon,Papio hamadryas,Cortex,1.0,18.95,X202794570001_R02C01
3,Olive baboon,Papio hamadryas,Cortex,0.0,18.64,X202794570001_R02C02
4,Olive baboon,Papio hamadryas,Cortex,1.0,18.61,X202794570001_R03C01


In [6]:
# Report the dimensions of both tables so their sample counts can be compared
metadata_shape = metadata.shape
methylation_shape = complete_methylation_data.shape
print(f'Metadata shape: {metadata_shape}')
print(f'Complete methylation shape: {methylation_shape}')

Metadata shape: (15037, 6)
Complete methylation shape: (37554, 15044)


In [12]:
### Look into the sample number discrepancy

# Strip the leading "X" from every SID and keep the result in a separate DataFrame.
# The pattern is anchored to the start of the string so only a prefix is removed;
# an unanchored replace would also delete any "X" occurring elsewhere in an ID.
metadata_missing = metadata.copy()
metadata_missing["SID"] = metadata_missing["SID"].str.replace(r"^X", "", regex=True)

# Samples present in the methylation data (every column except the first)
# but absent from the metadata
missing_samples = set(complete_methylation_data.columns[1:]) - set(metadata_missing['SID'])
print(f'Number of samples missing: {len(missing_samples)}')
missing_samples

Number of samples missing: 6


{'204529320081_R02C01',
 '204529320081_R02C02',
 '204529320081_R03C01',
 '204529320081_R04C01',
 '204529320081_R04C02',
 '204529320081_R05C01'}

The number of samples in the metadata downloaded from the MMC website does not match the methylation data from the 2023 paper: the MMC metadata is missing 6 samples.

'204529320081_R02C01',
'204529320081_R02C02',
'204529320081_R03C01',
'204529320081_R04C01',
'204529320081_R04C02',
'204529320081_R05C01',

Metadata for these samples will be entered manually. 

### 2.2. Data Scraping

We need the GEO ID of each sample, as it is not available in the metadata file. To get it, we scrape the GEO page associated with the data:
https://www.ncbi.nlm.nih.gov/geo/browse/?view=samples&display=500&page=1&series=223748&suppl=IDAT&zsort=date

In [13]:
# Collect the sample names of the complete methylation data:
# every column label except the first one. The result is a pandas Index,
# not a plain Python list.
sample_names = complete_methylation_data.columns[1:]

In [256]:
# Display the sample names extracted from the methylation table
sample_names

Index(['202897220093_R01C01', '202897220093_R02C01', '202897220093_R03C01',
       '202897220093_R04C01', '202897220093_R05C01', '202897220093_R06C01',
       '202897220093_R01C02', '202897220093_R02C02', '202897220093_R03C02',
       '202897220093_R04C02',
       ...
       '206139140103_R02C02', '206139140103_R03C02', '206139140103_R04C02',
       '206139140103_R05C02', '206139140103_R06C02', '206139140104_R01C01',
       '206139140104_R02C01', '206139140104_R03C01', '206139140104_R04C01',
       '206139140104_R05C01'],
      dtype='object', length=15043)

In [222]:
### Fetch pages from NCBI GEO database ###

# Fetching every page once and keeping the raw content in a dictionary makes
# the later scraping step faster and lets it be re-run without new requests.

# For loading bar
total_pages = 31
processed_pages = 0

# Seconds to wait for the server before giving up on a page, so a stalled
# connection cannot hang the cell indefinitely
request_timeout = 60

# Fetch all pages' content and store it in a dictionary keyed by page number
page_contents = {}
failed_pages = []  # page numbers whose download raised an error
with requests.Session() as session:  # reuse one connection for all pages
    for page_num in range(1, total_pages + 1):  # Adjust range as needed
        url = f'https://www.ncbi.nlm.nih.gov/geo/browse/?view=samples&display=1000&series=223748&page={page_num}'
        try:
            response = session.get(url, timeout=request_timeout)
            response.raise_for_status()
            page_contents[page_num] = response.content

        except requests.RequestException as e:
            # Remember the failure: this message is wiped by the next clear_output
            failed_pages.append(page_num)
            print(f"Error fetching page {page_num}: {e}")

        processed_pages += 1
        progress = processed_pages / total_pages
        loading_bar = print_loading_bar(progress)
        clear_output(wait=True)
        display(loading_bar)

# Report the outcome after the loop so it stays visible in the cell output
if failed_pages:
    print(f"Failed to fetch {len(failed_pages)} of {total_pages} pages: {failed_pages}")
else:
    print("All pages fetched successfully.")



All pages fetched successfully.


In [257]:
### Scrape the GEO IDs from the fetched HTML content ###

# Compile one pattern per sample up front instead of rebuilding it for every page.
# Each pattern captures a GSM accession that is followed by whitespace and then,
# with no newline in between, by the sample name delimited by word boundaries.
sample_patterns = {
    sample_name: re.compile(r'\b(GSM\d+)\s+[^\n]*\b{}\b'.format(re.escape(sample_name)))
    for sample_name in sample_names  # sample_names is defined above
}

# sample name -> GEO accession; the first match wins, so a sample can never
# contribute more than one row to the resulting table
geo_id_by_sample = {}

# Progress is measured against the pages that were actually fetched
total_pages = len(page_contents)
processed_pages = 0
failed_pages = []  # pages whose processing raised an error

# Process the downloaded HTML content
for page_num, content in page_contents.items():
    try:
        soup = BeautifulSoup(content, 'html.parser')
        html_text = soup.get_text()

        for sample_name, pattern in sample_patterns.items():
            # Skip samples already resolved on an earlier page
            if sample_name in geo_id_by_sample:
                continue
            match = pattern.search(html_text)
            if match:
                geo_id_by_sample[sample_name] = match.group(1)

    except Exception as e:
        # Remember the failure: this message is wiped by the next clear_output
        failed_pages.append(page_num)
        print(f"Error processing page {page_num}: {e}")

    processed_pages += 1
    progress = processed_pages / total_pages
    loading_bar = print_loading_bar(progress)
    clear_output(wait=True)
    display(loading_bar)

if failed_pages:
    print(f"Pages that could not be processed: {failed_pages}")

# Always build the DataFrame (possibly empty) so the code below never hits an
# undefined or stale geo_ID_df
geo_ID_df = pd.DataFrame(list(geo_id_by_sample.items()), columns=['Sample', 'GEO_ID'])
if geo_ID_df.empty:
    print("No samples found.")

# Check for missing samples (set lookup keeps this linear in the number of samples)
found_samples = geo_ID_df['Sample'].tolist()
found_lookup = set(found_samples)
missing_samples = [sample_name for sample_name in sample_names if sample_name not in found_lookup]

# Print counts of samples found and not found
print("Total samples:", len(sample_names))
print("Samples found:", len(found_samples))
print("Samples not found:", len(missing_samples))
print('List of Samples not found:', missing_samples)
geo_ID_df




Total samples: 15043
Samples found: 15043
Samples not found: 0
List of Samples not found: []


Unnamed: 0,Sample,GEO_ID
0,202897220093_R01C01,GSM6979529
1,202897220093_R02C01,GSM6979530
2,202897220093_R03C01,GSM6979531
3,202897220093_R04C01,GSM6979532
4,202897220093_R05C01,GSM6979533
...,...,...
15038,206139140104_R01C01,GSM6994567
15039,206139140104_R02C01,GSM6994568
15040,206139140104_R03C01,GSM6994569
15041,206139140104_R04C01,GSM6994570


In [31]:
### Add an X to all the samples in the geo_ID_df dataframe

# Only prefix names that do not already start with "X", so re-running this
# cell does not turn "X2028..." into "XX2028...".
sample_ids = geo_ID_df["Sample"]
already_prefixed = sample_ids.str.startswith("X", na=False)
geo_ID_df["Sample"] = sample_ids.where(already_prefixed, "X" + sample_ids)

In [32]:
### Write the GEO ID table to a csv file (so the scraping does not have to be run again)
geo_ID_df.to_csv(path_or_buf="GSE223748_Geo_ID_df.csv", index=False)

In [14]:
### Read the GEO ID table back from the csv file
geo_ID_df = pd.read_csv(filepath_or_buffer="GSE223748_Geo_ID_df.csv")

In [35]:
# Preview the first rows of the sample-to-GEO-ID table
geo_ID_df.head()

Unnamed: 0,Sample,GEO_ID
0,X202897220093_R01C01,GSM6979529
1,X202897220093_R02C01,GSM6979530
2,X202897220093_R03C01,GSM6979531
3,X202897220093_R04C01,GSM6979532
4,X202897220093_R05C01,GSM6979533


I want to align the metadata downloaded from the website with the metadata file I created in the notebook 'Methylation_Patterns_Human'. The changes needed are:
1. Rename the column 'SID' to 'Sample'
2. Add a column 'GEO_ID'
3. Change the sex format from 0/1 to Male/Female
4. Add the missing samples

In [54]:
### 1. Rename the SID column to Sample
metadata = metadata.rename({"SID": "Sample"}, axis="columns")

In [55]:
### 2. Add the GEO_ID to the metadata by merging the metadata and the geo_ID_df dataframes

# Drop a GEO_ID column left over from a previous run of this cell; otherwise
# the merge would produce GEO_ID_x / GEO_ID_y instead of a single GEO_ID.
metadata = metadata.drop(columns='GEO_ID', errors='ignore')

# The left merge keeps every metadata row. validate='many_to_one' makes pandas
# raise if a Sample appears more than once in geo_ID_df, which would otherwise
# silently duplicate metadata rows.
metadata = pd.merge(metadata, geo_ID_df, on='Sample', how='left', validate='many_to_one')

# Flag samples that did not receive a GEO accession
n_unmatched = int(metadata['GEO_ID'].isna().sum())
if n_unmatched:
    print(f'Samples without a GEO_ID: {n_unmatched}')
metadata.head()

Unnamed: 0,SpeciesCommonName,SpeciesLatinName,Tissue,Female,Age,Sample,GEO_ID
0,Olive baboon,Papio hamadryas,Cortex,1.0,22.12,X202794570001_R01C01,GSM6981727
1,Olive baboon,Papio hamadryas,Cortex,0.0,8.04,X202794570001_R01C02,GSM6981733
2,Olive baboon,Papio hamadryas,Cortex,1.0,18.95,X202794570001_R02C01,GSM6981728
3,Olive baboon,Papio hamadryas,Cortex,0.0,18.64,X202794570001_R02C02,GSM6981734
4,Olive baboon,Papio hamadryas,Cortex,1.0,18.61,X202794570001_R03C01,GSM6981729


In [57]:
### 3. Rename the column Female to Sex and recode 1/0 as Female/Male respectively

# Numeric code -> label
sex_labels = {1.0: 'Female', 0.0: 'Male'}

# Rename the column first, then swap the codes for their labels
metadata = metadata.rename(columns={'Female': 'Sex'})
metadata['Sex'] = metadata['Sex'].replace(sex_labels)
metadata.head()


Unnamed: 0,SpeciesCommonName,SpeciesLatinName,Tissue,Sex,Age,Sample,GEO_ID
0,Olive baboon,Papio hamadryas,Cortex,Female,22.12,X202794570001_R01C01,GSM6981727
1,Olive baboon,Papio hamadryas,Cortex,Male,8.04,X202794570001_R01C02,GSM6981733
2,Olive baboon,Papio hamadryas,Cortex,Female,18.95,X202794570001_R02C01,GSM6981728
3,Olive baboon,Papio hamadryas,Cortex,Male,18.64,X202794570001_R02C02,GSM6981734
4,Olive baboon,Papio hamadryas,Cortex,Female,18.61,X202794570001_R03C01,GSM6981729


'204529320081_R02C01',
'204529320081_R02C02', 
'204529320081_R03C01', 
'204529320081_R04C01', 
'204529320081_R04C02', 
'204529320081_R05C01',

In [62]:
### Add missing samples to the metadata dataframe

# Manually entered metadata for the six samples that are absent from the
# downloaded metadata file
new_samples = pd.DataFrame({
    'SpeciesCommonName': ['Prairie vole'] * 6,
    'SpeciesLatinName': ['Microtus ochrogaster'] * 6,
    'Tissue': ['Ear', 'Liver', 'Liver', 'Liver', 'Ear', 'Liver'],
    'Sex': ['Female', 'Male', 'Female', 'Male', 'Male', 'Female'],
    'Age': [0.3671, 0.8493, 0.789, 0.3644, 0.874, 0.5808],
    'Sample': ['X204529320081_R02C01', 'X204529320081_R02C02', 'X204529320081_R03C01',
               'X204529320081_R04C01', 'X204529320081_R04C02', 'X204529320081_R05C01'],
    'GEO_ID': ['GSM6989921', 'GSM6989927', 'GSM6989922', 'GSM6989923', 'GSM6989929', 'GSM6989924']
})

# Append only the samples that are not in the metadata yet, so re-running this
# cell does not add the same rows a second time
samples_to_add = new_samples.loc[~new_samples['Sample'].isin(metadata['Sample'])]
if not samples_to_add.empty:
    metadata = pd.concat([metadata, samples_to_add], ignore_index=True)
metadata.head()


Unnamed: 0,SpeciesCommonName,SpeciesLatinName,Tissue,Sex,Age,Sample,GEO_ID
0,Olive baboon,Papio hamadryas,Cortex,Female,22.12,X202794570001_R01C01,GSM6981727
1,Olive baboon,Papio hamadryas,Cortex,Male,8.04,X202794570001_R01C02,GSM6981733
2,Olive baboon,Papio hamadryas,Cortex,Female,18.95,X202794570001_R02C01,GSM6981728
3,Olive baboon,Papio hamadryas,Cortex,Male,18.64,X202794570001_R02C02,GSM6981734
4,Olive baboon,Papio hamadryas,Cortex,Female,18.61,X202794570001_R03C01,GSM6981729


In [72]:
# Compare the size of the metadata with that of the complete methylation data
n_rows, n_cols = complete_methylation_data.shape
# Leave the first column out of the count, exactly as slicing with iloc[:, 1:] would
methylation_shape = (n_rows, n_cols - 1)
print(f'Metadata shape: {metadata.shape}')
print(f'Complete methylation shape: {methylation_shape}')

Metadata shape: (15043, 7)
Complete methylation shape: (37554, 15043)


We now have the same number of samples in the metadata as in the methylation data.

In [None]:
### Write the complete metadata to a csv file
metadata.to_csv(path_or_buf="GSE223748_complete_metadata.csv", index=False)