<a href="https://colab.research.google.com/github/AnzorGozalishvili/NASA_ODSR_DATA/blob/main/dataset_curation_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Requirements

In [1]:
!pip install bio
!pip install biopython
!pip install boto3 torch



# Import Libraries
- for data access on s3
- for data reading and manipulation

In [2]:
from Bio import SeqIO
import gzip
import pandas as pd
import boto3
from botocore import UNSIGNED
from botocore.client import Config
import requests
import os
from torch.utils.data import Dataset, DataLoader
import torch

import zipfile
import io
import csv


In [3]:
# remove downloads folder to start from scratch if needed
# !rm -rf downloaded_data

## Retrieving all `OSD-XXX` experiment directories from public s3 bucket `nasa-osdr`

Reference: https://registry.opendata.aws/nasa-osdr/

In [4]:
# Name of the public S3 bucket that is listed throughout this notebook.
BUCKET_NAME = "nasa-osdr"

In [5]:
# S3 client configured with `signature_version=UNSIGNED`, i.e. requests are sent
# without credentials (anonymous access).
CLIENT = boto3.client('s3', config=Config(signature_version=UNSIGNED))
# Shared paginator for `list_objects_v2`; the listing helpers below iterate its pages.
PAGINATOR = CLIENT.get_paginator('list_objects_v2')

In [6]:
def list_all_directories(directory_prefix, delimiter='/', show_errors=True, bucket_name=None):
    """List the "directories" (common key prefixes) found under a prefix.

    Args:
        directory_prefix: Key prefix to list under, e.g. 'OSD-' or
            'OSD-1/version-6/rna_seq/'.
        delimiter: Delimiter used to group keys into common prefixes; it is
            also stripped from the end of every returned name.
        show_errors: When True, print a diagnostic line for every entry that
            could not be processed.
        bucket_name: Bucket to list. Defaults to the module-level BUCKET_NAME.

    Returns:
        list[str]: The common prefixes, without the trailing delimiter.
    """
    if bucket_name is None:
        bucket_name = BUCKET_NAME

    result = PAGINATOR.paginate(Bucket=bucket_name, Prefix=directory_prefix, Delimiter=delimiter)

    all_directories = []
    # NOTE(review): `search` appears to yield a placeholder (None) for pages
    # that have no 'CommonPrefixes'; such entries end up in the except branch.
    for prefix in result.search('CommonPrefixes'):
        try:
            all_directories.append(prefix.get('Prefix').removesuffix(delimiter))
        except Exception as e:
            if show_errors is True:
                # `bucket_name` is now a local, so this line can no longer
                # fail with NameError on a fresh kernel.
                print(bucket_name, directory_prefix, delimiter, e)

    return all_directories

In [7]:
# Sanity check that the anonymous listing works. The bucket is a live data set
# (451 experiments when this notebook was written), so the check is on a
# non-empty result rather than on an exact count that may change over time.
assert len(list_all_directories(directory_prefix='OSD-', delimiter='/')) > 0, \
    "no 'OSD-' directories found in the bucket"

## Get all versions, sorted, so that only the latest records are accessed

In [8]:
def get_osdr_versions_sorted(osdr_directory_name, delimiter='/', show_errors=True):
    """Return the `version-N` sub-directories of an experiment, oldest first.

    Args:
        osdr_directory_name: Experiment directory without trailing delimiter,
            e.g. 'OSD-1'.
        delimiter: Delimiter used to group keys into common prefixes.
        show_errors: When True, print a message for every sub-directory whose
            name is not of the form 'version-<integer>'.

    Returns:
        list[tuple[str, int]]: `(directory_name, version_number)` pairs sorted
        by ascending version number. Empty when no valid version is found.
    """
    parent_prefix = osdr_directory_name + delimiter
    result = PAGINATOR.paginate(
        Bucket=BUCKET_NAME,
        Prefix=parent_prefix,
        Delimiter=delimiter
    )

    all_versions = []
    for prefix in result.search('CommonPrefixes'):
        # NOTE(review): `search` appears to yield None for pages without
        # 'CommonPrefixes'; there is no directory name to validate in that case.
        if prefix is None:
            continue

        version_directory_name = prefix.get('Prefix').removesuffix(delimiter).removeprefix(parent_prefix)

        # Explicit validation instead of `assert` + bare `except`: asserts are
        # removed under `python -O`, and a bare except also hides unrelated errors.
        version_number = None
        if version_directory_name.startswith('version-'):
            try:
                version_number = int(version_directory_name.split('-')[-1])
            except ValueError:
                version_number = None

        if version_number is None:
            if show_errors is True:
                print(f'skipping invalid version directory name:{version_directory_name} under: {osdr_directory_name}')
            continue

        all_versions.append((version_directory_name, version_number))

    # Sort numerically, so 'version-10' comes after 'version-9'.
    return sorted(all_versions, key=lambda x: x[1])

In [9]:
# Preview: (experiment, sorted versions) for the first ten experiment directories.
list(
    (osd_name, get_osdr_versions_sorted(osdr_directory_name=osd_name, delimiter='/'))
    for osd_name in list_all_directories(directory_prefix='OSD-', delimiter='/')[:10]
)

[('OSD-1', [('version-6', 6)]),
 ('OSD-100', [('version-5', 5)]),
 ('OSD-101', [('version-5', 5)]),
 ('OSD-102', [('version-5', 5)]),
 ('OSD-103', [('version-5', 5)]),
 ('OSD-104', [('version-4', 4)]),
 ('OSD-105', [('version-4', 4)]),
 ('OSD-106', [('version-1', 1)]),
 ('OSD-107', [('version-1', 1)]),
 ('OSD-108', [('version-2', 2)])]

## Count the distribution of the number of versions per `OSD-XXX` **(takes about a minute to run!)**

In [10]:
# Experiment directory -> its ascending list of (version directory, version number).
osdr_versions_sorted = {
    osd_name: get_osdr_versions_sorted(osdr_directory_name=osd_name, delimiter='/')
    for osd_name in list_all_directories(directory_prefix='OSD-', delimiter='/')
}

# Experiment directory -> number of version directories found for it.
osdr_versions_counts = dict(
    zip(osdr_versions_sorted.keys(), map(len, osdr_versions_sorted.values()))
)

# How many experiments have 1, 2, ... versions.
pd.Series(osdr_versions_counts).value_counts()

1    417
2     34
dtype: int64

In [11]:
# Show the version list of every experiment as a Series (pandas truncates the display).
pd.Series(osdr_versions_sorted)

OSD-1      [(version-6, 6)]
OSD-100    [(version-5, 5)]
OSD-101    [(version-5, 5)]
OSD-102    [(version-5, 5)]
OSD-103    [(version-5, 5)]
                 ...       
OSD-95     [(version-1, 1)]
OSD-96     [(version-6, 6)]
OSD-97     [(version-1, 1)]
OSD-98     [(version-8, 8)]
OSD-99     [(version-5, 5)]
Length: 451, dtype: object

In [12]:
# Latest version per experiment = last entry of its ascending version list.
# Experiments with an empty list (no valid `version-N` directory) are left out
# instead of raising IndexError on `[-1]`.
osdr_latest_versions = {
    osdr: sorted_versions[-1]
    for osdr, sorted_versions in osdr_versions_sorted.items()
    if sorted_versions
}

In [13]:
# Preview the first five (experiment, latest version) pairs.
list(osdr_latest_versions.items())[:5]

[('OSD-1', ('version-6', 6)),
 ('OSD-100', ('version-5', 5)),
 ('OSD-101', ('version-5', 5)),
 ('OSD-102', ('version-5', 5)),
 ('OSD-103', ('version-5', 5))]

## Access latest metadata files for each `OSD-XXX`

In [14]:
def get_osdr_versioned_study_names(osdr_versioned_directory_name, delimiter='/', show_errors=True):
    """List the directory names directly under a versioned experiment directory.

    Args:
        osdr_versioned_directory_name: Prefix without trailing delimiter,
            e.g. 'OSD-1/version-6'.
        delimiter: Delimiter used to group keys into common prefixes.
        show_errors: When True, print a message if no directory was found or
            if none of them is named 'metadata'.

    Returns:
        list[str]: Directory names relative to the versioned directory
        (possibly empty). Validation failures are reported, never raised.
    """
    parent_prefix = osdr_versioned_directory_name + delimiter
    result = PAGINATOR.paginate(
        Bucket=BUCKET_NAME,
        Prefix=parent_prefix,
        Delimiter=delimiter
    )

    all_study_names = []
    for prefix in result.search('CommonPrefixes'):
        # NOTE(review): `search` appears to yield None for pages without
        # 'CommonPrefixes'; skip those instead of failing on `.get`.
        if prefix is None:
            continue
        all_study_names.append(prefix.get('Prefix').removesuffix(delimiter).removeprefix(parent_prefix))

    # An empty list has no 'metadata' entry either, so this single condition
    # covers both checks that used to be asserts (asserts vanish under `python -O`).
    if 'metadata' not in all_study_names and show_errors is True:
        print(f'Validation Failed for: {osdr_versioned_directory_name}\tStudies found: {all_study_names}')

    return all_study_names

In [15]:
# Preview: study directories of the latest version for the first five experiments.
list(
    get_osdr_versioned_study_names(
        osdr_versioned_directory_name=f'{osd_name}/{latest[0]}',
        delimiter='/',
    )
    for osd_name, latest in list(osdr_latest_versions.items())[:5]
)

[['array', 'metadata', 'microarray'],
 ['design', 'metadata', 'rna-seq', 'rna_seq', 'wgbs'],
 ['epigenomics',
  'epitranscriptomics',
  'metadata',
  'proteomics',
  'rna_seq',
  'sup',
  'transcriptomics'],
 ['design',
  'epigenomics',
  'epitranscriptomics',
  'metadata',
  'proteomics',
  'rna_seq',
  'transcriptomics'],
 ['design', 'metadata', 'proteomics', 'rna-seq', 'rna_seq', 'wgbs', 'wtbs']]

In [19]:
# Experiment -> study directory names found under its latest version.
osdr_latest_version_studies = {
    osd_name: get_osdr_versioned_study_names(
        osdr_versioned_directory_name=f'{osd_name}/{latest[0]}',
        delimiter='/',
    )
    for osd_name, latest in osdr_latest_versions.items()
}

# Experiment -> number of study directories.
osdr_latest_version_studies_counts = dict(
    zip(osdr_latest_version_studies.keys(), map(len, osdr_latest_version_studies.values()))
)

# How many experiments have 2, 3, ... study directories.
pd.Series(osdr_latest_version_studies_counts).value_counts()

Validation Failed for: OSD-235/version-6	Studies found: ['rna-seq', 'rna_seq']
Validation Failed for: OSD-245/version-14	Studies found: ['rna-seq', 'rna_seq']


2    232
3    177
5     18
4     15
7      4
6      4
8      1
dtype: int64

In [20]:
# Count in how many experiments each study directory name occurs.
# `explode()` flattens the per-experiment lists in one pass; summing the lists
# with `Series.sum()` re-copies the growing list at every step.
pd.Series(osdr_latest_version_studies).explode().value_counts().to_dict()

{'metadata': 449,
 'microarray': 153,
 'rna-seq': 123,
 'rna_seq': 107,
 'array': 68,
 'proteomics': 29,
 'wgs': 28,
 'transcriptomics': 22,
 'Amplicon': 17,
 'GAmplicon': 17,
 'metagenomics': 14,
 'epigenomics': 13,
 'sequencing': 13,
 'micoarray': 10,
 'wgbs': 8,
 'metabolomics': 7,
 'microCT': 7,
 'GSpatialTranscriptomics': 6,
 'sup': 6,
 'Histology': 6,
 'SpatialTranscriptomics': 6,
 'miRNA-Seq': 6,
 'design': 5,
 'scRNA-Seq': 4,
 'scRNA_Seq': 4,
 'GMetagenomics': 4,
 'RT-PCR': 3,
 'western-blot': 3,
 'Epigenomics': 3,
 'rnaseq': 3,
 'epitranscriptomics': 3,
 'miRNA_microarray': 3,
 'Gwgbs': 2,
 'snRNA-Seq': 2,
 'EPM': 2,
 'IHC': 2,
 'Bone_Biomechanical': 2,
 'wtbs': 2,
 'GsnATAC-Seq': 2,
 'snATAC-Seq': 2,
 'GsnRNA-Seq': 2,
 'microscopy': 2,
 'histomorphometry': 2,
 'immunostaining_microscopy': 2,
 'Balance_Beam': 1,
 'Radial_Arm_Water_Maze': 1,
 'Novel_Object_Recognition': 1,
 'Illumina sequencing': 1,
 'Flow_Cytometry': 1,
 'peripheral_quantitative_computed_tomography_pQCT': 1,
 

**Need to find the location of the metadata tables for `OSD-245` and `OSD-235`!**

In [29]:
def list_all_file_by_extensions(directory_prefix, show_errors=True):
    """Group every object key under a prefix by its file extension.

    The extension is everything after the first '.' of the file name, so
    'a_raw.fastq.gz' is grouped under 'fastq.gz'. Files without a '.' are
    grouped under the empty string.

    Args:
        directory_prefix: Key prefix to list; it is removed from the returned keys.
        show_errors: When True, print a diagnostic line for every listing
            entry that could not be processed.

    Returns:
        dict[str, list[str]]: extension -> keys relative to `directory_prefix`,
        in listing order. Extensions appear in first-seen order.
    """
    result = PAGINATOR.paginate(Bucket=BUCKET_NAME, Prefix=directory_prefix)

    files_by_extensions = {}
    # NOTE(review): `search` appears to yield None for pages without 'Contents';
    # such entries fall into the except branch below.
    for prefix in result.search('Contents'):
        try:
            relative_key = prefix.get('Key').removeprefix(directory_prefix)
            # Use only the last path component, so that a '.' in a
            # sub-directory name does not become part of the extension.
            file_name = relative_key.rsplit('/', 1)[-1]
            file_extension = ".".join(file_name.split('.')[1:])
        except Exception as e:
            if show_errors is True:
                print(BUCKET_NAME, directory_prefix, e)
            continue

        # A plain dict keeps insertion order; building the keys from a set
        # made the order vary between runs.
        files_by_extensions.setdefault(file_extension, []).append(relative_key)

    return files_by_extensions

In [30]:
# The `rna_seq` study of OSD-235 must contain at least one file: an empty
# grouping dict is falsy and fails the assertion.
assert list_all_file_by_extensions(directory_prefix='OSD-235/version-6/rna_seq/')

In [33]:
def explore_file_types_in_studies_to_identify_metadata(osdrs):
    """Print a report of versions, studies and file types for the given experiments.

    Reads the module-level `osdr_versions_sorted`, `osdr_latest_versions` and
    `osdr_latest_version_studies` dicts, and lists the bucket once per study
    for sub-directories and once for files. Nothing is returned.

    Args:
        osdrs: Experiment directory names, e.g. ['OSD-245', 'OSD-235'].
            Iterated three times.
    """
    rule = "=" * 80 + '\n'

    # Section 1: all versions found per experiment.
    print("Versions\n", rule)
    for osd_name in osdrs:
        print(osd_name, 'versions found: ', osdr_versions_sorted[osd_name])

    # Section 2: study directories of the latest version.
    print("Studies\n", rule)
    for osd_name in osdrs:
        version_name = osdr_latest_versions[osd_name][0]
        study_names = osdr_latest_version_studies[osd_name]
        print(rule)
        print(osd_name, version_name, 'studies found:', study_names)

    # Section 3: contents of every study (the header is printed per experiment).
    for osd_name in osdrs:
        version_name = osdr_latest_versions[osd_name][0]
        study_names = osdr_latest_version_studies[osd_name]

        print("Studies Contents\n", rule)
        for study in study_names:
            study_prefix = f'{osd_name}/{version_name}/{study}/'

            # Nested directories inside the study, if any.
            nested_directories = list_all_directories(
                directory_prefix=study_prefix,
                delimiter='/',
                show_errors=False,
            )
            print(rule)
            print(osd_name, version_name, study, 'subdirectories found:', nested_directories)

            # Files of the study grouped by extension.
            grouped_files = list_all_file_by_extensions(
                directory_prefix=study_prefix,
                show_errors=False,
            )
            try:
                grouped_series = pd.Series(grouped_files)
                counts_per_extension = grouped_series.apply(len).to_dict()
                # At most two example file names per extension.
                examples_per_extension = grouped_series.apply(lambda x: x[:2]).to_dict()
                print(rule)
                print(
                    osd_name, version_name, study,
                    'files grouped by extensions:', counts_per_extension, examples_per_extension
                )
            except Exception as e:
                print(e)

In [34]:
# Inspect the two experiments for which the validation above reported no `metadata` directory.
explore_file_types_in_studies_to_identify_metadata(['OSD-245', 'OSD-235'])

Versions

OSD-245 versions found:  [('version-14', 14)]
OSD-235 versions found:  [('version-6', 6)]
Studies


OSD-245 version-14 studies found: ['rna-seq', 'rna_seq']

OSD-235 version-6 studies found: ['rna-seq', 'rna_seq']
Studies Contents


OSD-245 version-14 rna-seq subdirectories found: []

OSD-245 version-14 rna-seq files grouped by extensions: {'fastq.gz': 118, 'zip': 107, 'html': 107} {'fastq.gz': ['GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R1_raw.fastq.gz', 'GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R2_raw.fastq.gz'], 'zip': ['GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R1_raw_fastqc.zip', 'GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R2_raw_fastqc.zip'], 'html': ['GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R1_raw_fastqc.html', 'GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R2_raw_fastqc.html']}

OSD-245 version-14 rna_seq subdirectories found: []

OSD-245 version-14 rna_seq files grouped by extensions: {'sortedByCoord.out.bam':

## Find `samples` and `assays` tables in `metadata` and check consistency to join them.

In [None]:
# setup s3 dataset params
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
bucket_name = 'nasa-osdr'

# experiment name -> file names of the metadata zips found for it
directory_file_dict = {}
# experiment directories ('OSD-XXX/') that have at least one metadata zip, no duplicates
directory_list = []

# List directories under the base directory.
# `list_all_directories` takes no `bucket_name` argument; it lists BUCKET_NAME.
directories = list_all_directories(directory_prefix='OSD-', delimiter='/')

# A single `list_objects` call returns only the first page of keys, so the
# listing is paginated to see every object of an experiment.
metadata_paginator = s3.get_paginator('list_objects_v2')

# For each directory, list zip files and download them
for directory in directories:
    # The trailing '/' keeps 'OSD-1' from also matching 'OSD-10', 'OSD-100', ...
    # NOTE(review): this collects metadata zips from every version directory,
    # not only from the latest version computed earlier in the notebook.
    zip_files = [
        obj['Key']
        for page in metadata_paginator.paginate(Bucket=bucket_name, Prefix=directory + '/')
        for obj in page.get('Contents', [])
        if obj['Key'].endswith('.zip') and 'metadata' in obj['Key']
    ]

    # Initialize a list to store filenames for this directory
    directory_filenames = []

    # Download each zip file
    for zip_file_key in zip_files:
        # Extract filename from file key
        filename = zip_file_key.split('/')[-1]

        # Add the filename to the list for this directory
        directory_filenames.append(filename)

        # First path component of the key plus '/', e.g. 'OSD-1/'. Later cells
        # rely on the trailing slash. Recorded once per experiment so those
        # cells do not process the same directory repeatedly.
        experiment_dir = zip_file_key.split('/')[0] + '/'
        if experiment_dir not in directory_list:
            directory_list.append(experiment_dir)

        # Create a directory to store downloaded files
        local_dir = os.path.join('downloaded_data', experiment_dir)
        os.makedirs(local_dir, exist_ok=True)

        # Skip files that are already on disk
        local_path = os.path.join(local_dir, filename)
        if os.path.exists(local_path):
            continue

        # Download the zip file
        s3.download_file(bucket_name, zip_file_key, local_path)

    # Add the list of filenames to the dictionary under the directory name
    directory_name = directory.removesuffix('/')  # Remove trailing slash
    directory_file_dict[directory_name] = directory_filenames


In [None]:
# First key of `directory_file_dict` (raises StopIteration if the dict is empty).
next(iter(directory_file_dict))

In [None]:
# First ten entries of `directory_list`.
directory_list[:10]

In [None]:
# Unpack every downloaded zip archive next to itself, then delete the archive.
for directory in directory_list:
    # Folder holding the downloaded archives of this experiment
    downloaded_dir = f'downloaded_data/{directory}'

    # The folder is listed once, before anything is extracted into it
    zip_names = [name for name in os.listdir(downloaded_dir) if name.endswith('.zip')]

    for filename in zip_names:
        zip_filepath = os.path.join(downloaded_dir, filename)

        # Every member of the archive is extracted (not only text files)
        with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
            zip_ref.extractall(downloaded_dir)

        # The archive is no longer needed once its content is on disk
        os.remove(zip_filepath)

In [None]:
from genericpath import exists
import pandas as pd


# Define the delimiter used in the text files
delimiter = '\t'  # For tab-separated files
# Only files with this extension are converted
target_extension = '.txt'

for experiment_dir in directory_list:
    # Root folder of the extracted metadata of this experiment
    root_dir = 'downloaded_data/' + experiment_dir

    # Loop through all directories and subdirectories
    for dirpath, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.endswith(target_extension):
                continue
            file_path = os.path.join(dirpath, filename)

            # Files with "investigation" in their name are deleted, not converted.
            if "investigation" in filename.lower():
                os.remove(file_path)
                continue

            # Read the text file and convert to CSV. On failure the file is
            # reported and left in place. Silently ignoring the error would
            # write the previous file's DataFrame under this file's name and
            # then delete the source file.
            try:
                df = pd.read_csv(file_path, delimiter=delimiter, encoding='cp1252', on_bad_lines='skip')
            except Exception as error:
                print(f'could not parse {file_path}: {error}')
                continue

            # Swap only the trailing extension; `str.replace` would also
            # rewrite a '.txt' occurring elsewhere in the path.
            csv_file = file_path[:-len(target_extension)] + '.csv'
            df.to_csv(csv_file, index=False)
            os.remove(file_path)



In [None]:
# Spot-check the files on disk for one experiment.
!ls downloaded_data/OSD-136/

In [None]:
# experiment directory -> CSV files that have a 'source name' column
per_dir_assays = {}
# experiment directory -> CSV files that have a 'sample name' column but no 'source name' column
per_dir_samples = {}
# NOTE(review): confirm that a 'source name' column really marks the assay
# tables and not the sample tables; the mapping is kept as it was written.
for directory in directory_list:
    per_dir_assays[directory] = []
    per_dir_samples[directory] = []

    root_dir = 'downloaded_data/' + directory

    for dirpath, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.endswith('.csv'):
                continue

            file_path = os.path.join(dirpath, filename)
            # Try cp1252 first and fall back to pandas' default encoding only
            # when the bytes cannot be decoded; other errors are not hidden.
            try:
                df = pd.read_csv(file_path, encoding='cp1252')
            except UnicodeDecodeError:
                df = pd.read_csv(file_path)

            # Normalise the header names. `str` has no `.remove()` method, so
            # the previous expression raised AttributeError; `.strip()` matches
            # the normalisation used in the later cells.
            df_columns_lowercase = [str(x).strip().lower() for x in df.columns]

            if 'source name' in df_columns_lowercase:
                per_dir_assays[directory].append(file_path)

            elif 'sample name' in df_columns_lowercase:
                per_dir_samples[directory].append(file_path)

In [None]:
# First (experiment directory, sample files) pair.
next(iter(per_dir_samples.items()))

In [None]:
# How many experiments have 0, 1, 2, ... files in `per_dir_samples`.
pd.Series(per_dir_samples).apply(len).value_counts()

In [None]:
# How many experiments have 0, 1, 2, ... files in `per_dir_assays`.
pd.Series(per_dir_assays).apply(len).value_counts()

In [None]:
# First experiment whose list in `per_dir_assays` is empty.
# NOTE(review): `.index[0]` raises IndexError when no experiment has an empty
# list, and only the first such experiment is selected.
bad_experiment_id = pd.Series(per_dir_assays)[pd.Series(per_dir_assays).apply(len) == 0].index[0]

In [None]:
# Remove the experiment without assay files. The membership test keeps the
# cell re-runnable: a second execution no longer raises KeyError.
if bad_experiment_id in per_dir_assays:
    del per_dir_assays[bad_experiment_id]

In [None]:
# Number of distinct (stripped, lower-cased) column names over the first sample
# table of every experiment. Experiments without a sample table are skipped
# instead of failing on `[0]`.
len({
    column.strip().lower()
    for sample_files in per_dir_samples.values()
    if sample_files
    for column in pd.read_csv(sample_files[0]).columns
})

In [None]:
# Number of distinct (stripped, lower-cased) column names over the first assay
# table of every experiment. Experiments without an assay table are skipped
# instead of failing on `[0]`.
len({
    column.strip().lower()
    for assay_files in per_dir_assays.values()
    if assay_files
    for column in pd.read_csv(assay_files[0]).columns
})

In [None]:
# Build one table from the first assay file of every experiment.
all_assay_dfs = []
for experiment_dir, assay_files in per_dir_assays.items():
    # Experiments without any assay file have nothing to read.
    if not assay_files:
        continue

    # take first file only!
    df = pd.read_csv(assay_files[0])
    # Normalise header names so the same column lines up across experiments.
    df.columns = [x.strip().lower() for x in df.columns]
    # Keys carry a trailing '/', e.g. 'OSD-1/'.
    df['osdr_experiment_id'] = experiment_dir.removesuffix('/')
    all_assay_dfs.append(df)

# `ignore_index=True` gives a fresh 0..n-1 index in one step.
assays_df = pd.concat(all_assay_dfs, ignore_index=True)
assays_df.to_csv('assays.csv')

In [None]:
# all_sample_dfs = []
# for dir, sample_files in per_dir_samples.items():
#   # take first file only!
#   df = pd.read_csv(sample_files[0])
#   df.columns = [x.strip().lower() for x in df.columns]
#   df['osdr_experiment_id'] = dir.removesuffix('/')
#   all_sample_dfs.append(df)

# samples_df = pd.concat(all_sample_dfs)
# samples_df.reset_index(drop=True, inplace=True)
# samples_df.to_csv('samples.csv')

In [None]:
# The assays and samples tables have different numbers of records; they still need to be merged somehow.

In [None]:
# (rows, columns) of the combined assays table.
assays_df.shape

In [None]:
# dtype of every column of the combined assays table.
assays_df.dtypes

In [None]:
# Same dtypes as a plain {column name: dtype} dict.
assays_df.dtypes.to_dict()

In [None]:
import json
# Writes the list of column names of `assays_df` as JSON.
# NOTE(review): the file is called `assays_dtypes.json`, but its content is the
# column names, not the dtypes — confirm which of the two is intended.
with open('assays_dtypes.json', 'w') as file:
  json.dump(assays_df.columns.tolist(), file)