<a href="https://colab.research.google.com/github/AnzorGozalishvili/NASA_ODSR_DATA/blob/main/dataset_curation_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install Requirements

In [1]:
!pip install bio
!pip install biopython
!pip install boto3 torch

Collecting bio
  Downloading bio-1.5.9-py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython>=1.80 (from bio)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Collecting mygene (from bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting gprofiler-official (from bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting biothings-client>=0.2.6 (from mygene->bio)
  Downloading biothings_client-0.3.0-py2.py3-none-any.whl (29 kB)
Installing collected packages: biopython, gprofiler-official, biothings-client, mygene, bio
Successfully installed bio-1.5.9 biopython-1.81 biothings-client-0.3.0 gprofiler-official-1.0.0 mygene-3.2.2
Collecting boto3
  Downloading boto3-1.28

In [91]:
from tqdm import tqdm
tqdm.pandas()

# Import Libraries
- for data access on s3
- for data reading and manipulation

In [2]:
from Bio import SeqIO
import gzip
import pandas as pd
import boto3
from botocore import UNSIGNED
from botocore.client import Config
import requests
import os
from torch.utils.data import Dataset, DataLoader
import torch

import zipfile
import io
import csv


In [3]:
# remove downloads folder to start from scratch if needed
# !rm -rf download_dir

## Retrieving all `OSD-XXX` experiment directories from public s3 bucket `nasa-osdr`

Reference: https://registry.opendata.aws/nasa-osdr/

In [4]:
# Name of the S3 bucket that every listing and download helper in this notebook queries.
BUCKET_NAME='nasa-osdr'

In [5]:
# Unsigned S3 client: requests are sent without AWS credentials.
CLIENT = boto3.client('s3', config=Config(signature_version=UNSIGNED))
# Shared paginator for `list_objects_v2`, used by all listing helpers.
PAGINATOR = CLIENT.get_paginator('list_objects_v2')

In [6]:
def list_all_directories(directory_prefix, delimiter='/', show_errors=True):
  """List the "directories" (common prefixes) found directly under an S3 prefix.

  Args:
      directory_prefix: Key prefix to list under, e.g. ``'OSD-'``.
      delimiter: Character used by S3 to group keys into directories.
      show_errors: When ``True``, print one diagnostic line for every entry
          that could not be processed; otherwise skip such entries silently.

  Returns:
      list[str]: The common prefixes with the trailing ``delimiter`` removed.
  """
  result = PAGINATOR.paginate(Bucket=BUCKET_NAME, Prefix=directory_prefix, Delimiter=delimiter)

  all_directories = []
  for prefix in result.search('CommonPrefixes'):
      try:
          all_directories.append(prefix.get('Prefix').removesuffix(delimiter))
      except Exception as e:
          # Reached e.g. when `prefix` is None because the listing has no
          # sub-directories. The diagnostic must use the module constant
          # BUCKET_NAME: the previously used `bucket_name` is undefined and
          # turned every handled error into a NameError.
          if show_errors is True:
            print(BUCKET_NAME, directory_prefix, delimiter, e)

  return all_directories

In [7]:
# Sanity check against a snapshot of the bucket: 451 `OSD-*` directories were
# listed when this notebook was last run.
# NOTE(review): presumably the live bucket can change, in which case this
# hard-coded count needs updating — confirm before relying on it.
assert len(list_all_directories(directory_prefix='OSD-', delimiter='/')) == 451

## Get all versions, sorted, so that only the latest records are accessed

In [8]:
def get_osdr_versions_sorted(osdr_directory_name, delimiter='/', show_errors=True):
    """Return the ``version-N`` sub-directories of an experiment, oldest first.

    Args:
        osdr_directory_name: Experiment directory, e.g. ``'OSD-1'``.
        delimiter: Character used by S3 to group keys into directories.
        show_errors: When ``True``, print a line for every sub-directory whose
            name is not of the form ``version-<int>``.

    Returns:
        list[tuple[str, int]]: ``(directory_name, version_number)`` pairs sorted
        by ascending version number. Empty when nothing valid is found.
    """
    directory_prefix = osdr_directory_name + delimiter
    result = PAGINATOR.paginate(
        Bucket=BUCKET_NAME,
        Prefix=directory_prefix,
        Delimiter=delimiter
        )

    all_versions = []
    for prefix in result.search('CommonPrefixes'):
        if prefix is None:
            # The listing has no sub-directories; nothing to parse.
            continue

        version_directory_name = prefix.get('Prefix').removesuffix(delimiter).removeprefix(directory_prefix)

        # Explicit validation instead of `assert` inside a bare `except`:
        # asserts vanish under `python -O`, and a bare except also swallows
        # KeyboardInterrupt.
        version_number = None
        if version_directory_name.startswith('version-'):
            try:
                version_number = int(version_directory_name.split('-')[-1])
            except ValueError:
                version_number = None

        if version_number is None:
            if show_errors is True:
                print(f'skipping invalid version directory name:{version_directory_name} under: {osdr_directory_name}')
            continue

        all_versions.append((version_directory_name, version_number))

    all_versions_sorted = sorted(all_versions, key=lambda x: x[1])

    return all_versions_sorted

In [9]:
# Preview: sorted version directories for the first 10 experiments only.
[
    (osd, get_osdr_versions_sorted(osdr_directory_name=osd, delimiter='/'))
    for osd in list_all_directories(directory_prefix='OSD-', delimiter='/')[:10]
]

[('OSD-1', [('version-6', 6)]),
 ('OSD-100', [('version-5', 5)]),
 ('OSD-101', [('version-5', 5)]),
 ('OSD-102', [('version-5', 5)]),
 ('OSD-103', [('version-5', 5)]),
 ('OSD-104', [('version-4', 4)]),
 ('OSD-105', [('version-4', 4)]),
 ('OSD-106', [('version-1', 1)]),
 ('OSD-107', [('version-1', 1)]),
 ('OSD-108', [('version-2', 2)])]

## Count the distribution of the number of versions per `OSD-XXX` **(takes about a minute to run!)**

In [10]:
# Map every experiment directory to its sorted list of (version_dir, number)
# pairs. This issues one S3 listing per experiment.
osdr_versions_sorted = {
    osdr: get_osdr_versions_sorted(osdr_directory_name=osdr, delimiter='/')
    for osdr in list_all_directories(directory_prefix='OSD-', delimiter='/')
}
# Number of version directories found for each experiment.
osdr_versions_counts = {
    osdr: len(versions_sorted)
    for  osdr, versions_sorted in osdr_versions_sorted.items()
}

# How many experiments have 1, 2, ... version directories.
pd.Series(osdr_versions_counts).value_counts()

1    417
2     34
dtype: int64

In [11]:
pd.Series(osdr_versions_sorted)

OSD-1      [(version-6, 6)]
OSD-100    [(version-5, 5)]
OSD-101    [(version-5, 5)]
OSD-102    [(version-5, 5)]
OSD-103    [(version-5, 5)]
                 ...       
OSD-95     [(version-1, 1)]
OSD-96     [(version-6, 6)]
OSD-97     [(version-1, 1)]
OSD-98     [(version-8, 8)]
OSD-99     [(version-5, 5)]
Length: 451, dtype: object

In [12]:
# Keep only the last (highest-numbered) version per experiment; the lists are
# sorted ascending, so `[-1]` is the latest.
# NOTE(review): `[-1]` raises IndexError for an experiment whose version list
# is empty — none were in the run recorded here.
osdr_latest_versions = {
    osdr: sorted_versions[-1]
    for osdr, sorted_versions in osdr_versions_sorted.items()
}

In [13]:
list(osdr_latest_versions.items())[:5]

[('OSD-1', ('version-6', 6)),
 ('OSD-100', ('version-5', 5)),
 ('OSD-101', ('version-5', 5)),
 ('OSD-102', ('version-5', 5)),
 ('OSD-103', ('version-5', 5))]

## Access latest metadata files for each `OSD-XXX`

In [14]:
def get_osdr_versioned_study_names(osdr_versioned_directory_name, delimiter='/', show_errors=True):
    """List the study sub-directories of one experiment version.

    Args:
        osdr_versioned_directory_name: e.g. ``'OSD-1/version-6'``.
        delimiter: Character used by S3 to group keys into directories.
        show_errors: When ``True``, print a "Validation Failed" line if the
            version has no ``metadata`` sub-directory (this also covers the
            case where it has no sub-directories at all).

    Returns:
        list[str]: Sub-directory names relative to the versioned directory,
        e.g. ``['array', 'metadata', 'microarray']``.
    """
    directory_prefix = osdr_versioned_directory_name + delimiter
    result = PAGINATOR.paginate(
        Bucket=BUCKET_NAME,
        Prefix=directory_prefix,
        Delimiter=delimiter
    )

    all_study_names = []
    for prefix in result.search('CommonPrefixes'):
        if prefix is None:
            # The listing has no sub-directories; previously this raised
            # AttributeError on `None.get`.
            continue
        study_name = prefix.get('Prefix').removesuffix(delimiter).removeprefix(directory_prefix)
        all_study_names.append(study_name)

    # Explicit check instead of assert + bare except: it survives `python -O`
    # and does not swallow unrelated errors. An empty list necessarily lacks
    # 'metadata', so a single membership test covers both original asserts.
    if 'metadata' not in all_study_names and show_errors is True:
        print(f'Validation Failed for: {osdr_versioned_directory_name}\tStudies found: {all_study_names}')

    return all_study_names

In [15]:
# Preview: study directories in the latest version of the first 5 experiments.
[
    get_osdr_versioned_study_names(
        osdr_versioned_directory_name=osdr + '/' + latest_version[0],
        delimiter='/',
    )
    for osdr, latest_version in list(osdr_latest_versions.items())[:5]
]

[['array', 'metadata', 'microarray'],
 ['design', 'metadata', 'rna-seq', 'rna_seq', 'wgbs'],
 ['epigenomics',
  'epitranscriptomics',
  'metadata',
  'proteomics',
  'rna_seq',
  'sup',
  'transcriptomics'],
 ['design',
  'epigenomics',
  'epitranscriptomics',
  'metadata',
  'proteomics',
  'rna_seq',
  'transcriptomics'],
 ['design', 'metadata', 'proteomics', 'rna-seq', 'rna_seq', 'wgbs', 'wtbs']]

In [16]:
# Study directory names found in the latest version of every experiment
# (one S3 listing per experiment). Experiments lacking a `metadata`
# directory are reported by the helper as "Validation Failed".
osdr_latest_version_studies = {
    osdr: get_osdr_versioned_study_names(
        osdr_versioned_directory_name=osdr + '/' + latest_version[0],
        delimiter='/',
    )
    for osdr, latest_version in list(osdr_latest_versions.items())
}
# Number of study directories (including `metadata`) per experiment.
osdr_latest_version_studies_counts = {
    osdr: len(studies)
    for  osdr, studies in osdr_latest_version_studies.items()
}

# How many experiments have 2, 3, ... study directories.
pd.Series(osdr_latest_version_studies_counts).value_counts()

Validation Failed for: OSD-235/version-6	Studies found: ['rna-seq', 'rna_seq']
Validation Failed for: OSD-245/version-14	Studies found: ['rna-seq', 'rna_seq']


2    232
3    177
5     18
4     15
7      4
6      4
8      1
dtype: int64

In [17]:
# Frequency of each study directory name across all experiments: `.sum()` on a
# Series of lists concatenates them into one flat list before counting.
pd.Series(pd.Series(osdr_latest_version_studies).sum()).value_counts().to_dict()

{'metadata': 449,
 'microarray': 153,
 'rna-seq': 123,
 'rna_seq': 107,
 'array': 68,
 'proteomics': 29,
 'wgs': 28,
 'transcriptomics': 22,
 'Amplicon': 17,
 'GAmplicon': 17,
 'metagenomics': 14,
 'epigenomics': 13,
 'sequencing': 13,
 'micoarray': 10,
 'wgbs': 8,
 'metabolomics': 7,
 'microCT': 7,
 'GSpatialTranscriptomics': 6,
 'sup': 6,
 'Histology': 6,
 'SpatialTranscriptomics': 6,
 'miRNA-Seq': 6,
 'design': 5,
 'scRNA-Seq': 4,
 'scRNA_Seq': 4,
 'GMetagenomics': 4,
 'RT-PCR': 3,
 'western-blot': 3,
 'Epigenomics': 3,
 'rnaseq': 3,
 'epitranscriptomics': 3,
 'miRNA_microarray': 3,
 'Gwgbs': 2,
 'snRNA-Seq': 2,
 'EPM': 2,
 'IHC': 2,
 'Bone_Biomechanical': 2,
 'wtbs': 2,
 'GsnATAC-Seq': 2,
 'snATAC-Seq': 2,
 'GsnRNA-Seq': 2,
 'microscopy': 2,
 'histomorphometry': 2,
 'immunostaining_microscopy': 2,
 'Balance_Beam': 1,
 'Radial_Arm_Water_Maze': 1,
 'Novel_Object_Recognition': 1,
 'Illumina sequencing': 1,
 'Flow_Cytometry': 1,
 'peripheral_quantitative_computed_tomography_pQCT': 1,
 

**Try to find the location of metadata tables for `OSD-245` and `OSD-235` !**

In [18]:
def list_all_file_by_extensions(directory_prefix, show_errors=True):
  """Group every object under an S3 prefix by its file extension.

  The extension is everything after the FIRST dot of the key relative to
  ``directory_prefix`` (so ``'x_raw.fastq.gz'`` maps to ``'fastq.gz'``); keys
  without a dot are grouped under ``''``.

  Args:
      directory_prefix: Key prefix to list recursively, e.g.
          ``'OSD-1/version-6/metadata/'``.
      show_errors: When ``True``, print a diagnostic line for an empty listing
          and for every entry that cannot be processed.

  Returns:
      dict[str, list[str]]: Extension -> relative keys, in listing order.
      Empty dict when nothing exists under the prefix.
  """
  result = PAGINATOR.paginate(Bucket=BUCKET_NAME, Prefix=directory_prefix)

  files_by_extensions = {}
  for entry in result.search('Contents'):
      if entry is None:
          # Empty listing: handled explicitly rather than by catching the
          # AttributeError raised by `None.get(...)`.
          if show_errors is True:
              print(BUCKET_NAME, directory_prefix, 'no objects found under this prefix')
          continue

      try:
          file_prefix_without_directory_prefix = entry.get('Key').removeprefix(directory_prefix)
          file_extension = ".".join(file_prefix_without_directory_prefix.split('.')[1:])
      except Exception as e:
          if show_errors is True:
              print(BUCKET_NAME, directory_prefix, e)
          continue

      # setdefault keeps first-seen key order, unlike building keys from a set.
      files_by_extensions.setdefault(file_extension, []).append(file_prefix_without_directory_prefix)

  return files_by_extensions

In [19]:
# Sanity check: the `rna_seq` study directory of OSD-235 is not empty
# (a non-empty dict is truthy).
assert list_all_file_by_extensions(
            directory_prefix=f'OSD-235/version-6/rna_seq/',
        )

In [20]:
def explore_file_types_in_studies_to_identify_metadata(osdrs):
    """Print a three-part report (versions, studies, study contents) for experiments.

    Reads the notebook-level dicts ``osdr_versions_sorted``,
    ``osdr_latest_versions`` and ``osdr_latest_version_studies`` and lists the
    latest version of each study on S3 to show its sub-directories and its
    files grouped by extension (counts plus up to two example names).

    Args:
        osdrs: Iterable of experiment ids, e.g. ``['OSD-245', 'OSD-235']``.

    Returns:
        None. Everything is printed.
    """
    banner = f"{'=' * 80}\n"

    # Part 1: all known versions per experiment.
    print("Versions\n", banner)
    for experiment in osdrs:
        print(experiment, 'versions found: ', osdr_versions_sorted[experiment])

    # Part 2: study directories of the latest version.
    print("Studies\n", banner)
    for experiment in osdrs:
        version_name = osdr_latest_versions[experiment][0]
        studies = osdr_latest_version_studies[experiment]
        print(banner)
        print(experiment, version_name, 'studies found:', studies)

    # Part 3: what each study directory actually contains.
    for experiment in osdrs:
        version_name = osdr_latest_versions[experiment][0]
        studies = osdr_latest_version_studies[experiment]

        print("Studies Contents\n", banner)
        for study in studies:
            study_prefix = f'{experiment}/{version_name}/{study}/'

            # Nested directories, if any.
            nested_directories = list_all_directories(
                directory_prefix=study_prefix,
                delimiter='/',
                show_errors=False,
            )
            print(banner)
            print(experiment, version_name, study, 'subdirectories found:', nested_directories)

            # Files grouped by extension, summarised as counts and examples.
            grouped_files = list_all_file_by_extensions(
                directory_prefix=study_prefix,
                show_errors=False,
            )
            try:
                grouped_series = pd.Series(grouped_files)
                counts_by_extension = grouped_series.apply(len).to_dict()
                examples_by_extension = grouped_series.apply(lambda names: names[:2]).to_dict()
                print(banner)
                print(
                    experiment, version_name, study,
                    'files grouped by extensions:', counts_by_extension, examples_by_extension
                )
            except Exception as error:
                print(error)

In [21]:
explore_file_types_in_studies_to_identify_metadata(['OSD-245', 'OSD-235'])

Versions

OSD-245 versions found:  [('version-14', 14)]
OSD-235 versions found:  [('version-6', 6)]
Studies


OSD-245 version-14 studies found: ['rna-seq', 'rna_seq']

OSD-235 version-6 studies found: ['rna-seq', 'rna_seq']
Studies Contents


OSD-245 version-14 rna-seq subdirectories found: []

OSD-245 version-14 rna-seq files grouped by extensions: {'html': 107, 'fastq.gz': 118, 'zip': 107} {'html': ['GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R1_raw_fastqc.html', 'GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R2_raw_fastqc.html'], 'fastq.gz': ['GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R1_raw.fastq.gz', 'GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R2_raw.fastq.gz'], 'zip': ['GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R1_raw_fastqc.zip', 'GLDS-245_rna-seq_Mmus_C57-6T_LVR_BSL_ISS-T_Rep10_B2_R2_raw_fastqc.zip']}

OSD-245 version-14 rna_seq subdirectories found: []

OSD-245 version-14 rna_seq files grouped by extensions: {'final.out': 59, 'html':

## Find `samples` and `assays` tables in `metadata` and check consistency to join them.

In [27]:
list_all_file_by_extensions(directory_prefix=f'OSD-235/version-6/metadata/', show_errors=False)

{}

In [29]:
# For every experiment, group the files of its latest `metadata/` directory by
# extension. Experiments without such a directory map to an empty dict.
osdr_metadata_files_by_extension = {
    osdr: list_all_file_by_extensions(directory_prefix=f'{osdr}/{latest_version[0]}/metadata/')
    for osdr, latest_version in list(osdr_latest_versions.items())
}

nasa-osdr OSD-235/version-6/metadata/ 'NoneType' object has no attribute 'get'
nasa-osdr OSD-245/version-14/metadata/ 'NoneType' object has no attribute 'get'


Check that every metadata directory contains a zip file whose name ends with `ISA.zip`

In [33]:
# Per experiment: one boolean per zip file telling whether its name ends with
# 'ISA.zip' (empty list when the metadata listing is empty), then count patterns.
# NOTE(review): `x.get('zip')` is None for a non-empty listing without zips,
# which would make the comprehension raise TypeError.
pd.Series(osdr_metadata_files_by_extension).apply(lambda x: [item.endswith('ISA.zip') for item in x.get('zip')] if x else []).value_counts()

[True]     448
[]           2
[False]      1
dtype: int64

In [36]:
# Show the experiments whose only zip file does NOT end with 'ISA.zip'
# (the `[False]` pattern from the previous cell).
pd.Series(osdr_metadata_files_by_extension)[pd.Series(osdr_metadata_files_by_extension).apply(lambda x: [item.endswith('ISA.zip') for item in x.get('zip')] if x else []).apply(lambda x: x == [False])]

OSD-640    {'zip': ['OSD-640_metadata_OSD-640-ISA 2.zip']}
dtype: object

## Create a table where we define the following:
- `experiment_id`
- `total_versions`
- `version`
- `all_studies`
- `study_file_extensions`
- `metadata_file_extensions`
- `metadata_uri`


In [52]:
# Example for one experiment: first zip whose name ends with 'ISA.zip' or 'ISA 2.zip'.
[item for item in osdr_metadata_files_by_extension['OSD-102'].get('zip', []) if item.endswith('ISA.zip') or item.endswith('ISA 2.zip')][0]

'OSD-102_metadata_OSD-102-ISA.zip'

In [61]:
def get_metadata_zip_file_name(osdr):
    """Return the ISA metadata zip file name recorded for an experiment.

    Looks the experiment up in ``osdr_metadata_files_by_extension`` and returns
    the first zip whose name ends with ``'ISA.zip'`` or ``'ISA 2.zip'``.

    Args:
        osdr: Experiment id, e.g. ``'OSD-102'``.

    Returns:
        str: The matching file name, or ``""`` when there is none.
    """
    zip_names = osdr_metadata_files_by_extension[osdr].get('zip', [])
    for zip_name in zip_names:
        if zip_name.endswith(('ISA.zip', 'ISA 2.zip')):
            return zip_name
    return ""

In [62]:
# One summary row per experiment, built from the listings gathered earlier in
# this notebook.
stats = pd.DataFrame(
    [
        {
            "experiment_id": osdr,
            "total_versions": len(osdr_versions_sorted[osdr]),
            "version": latest_version[1],
            "all_studies": osdr_latest_version_studies[osdr],
            "metadata_file_extensions": osdr_metadata_files_by_extension[osdr],
            "metadata_filename": metadata_filename,
            # S3 object keys always use '/', so join explicitly rather than
            # with os.path.join, whose separator depends on the host OS.
            # An empty file name yields a key ending in 'metadata/'.
            "metadata_path": "/".join([osdr, latest_version[0], 'metadata', metadata_filename]),
        }
        for osdr, latest_version in osdr_latest_versions.items()
        # Single-element loop: binds the file name once per experiment so the
        # lookup is not repeated for the two columns that need it.
        for metadata_filename in [get_metadata_zip_file_name(osdr)]
    ]
)

In [63]:
stats

Unnamed: 0,experiment_id,total_versions,version,all_studies,metadata_file_extensions,metadata_filename,metadata_path
0,OSD-1,1,6,"[array, metadata, microarray]",{'zip': ['OSD-1_metadata_OSD-1-ISA.zip']},OSD-1_metadata_OSD-1-ISA.zip,OSD-1/version-6/metadata/OSD-1_metadata_OSD-1-...
1,OSD-100,1,5,"[design, metadata, rna-seq, rna_seq, wgbs]",{'zip': ['OSD-100_metadata_OSD-100-ISA.zip']},OSD-100_metadata_OSD-100-ISA.zip,OSD-100/version-5/metadata/OSD-100_metadata_OS...
2,OSD-101,1,5,"[epigenomics, epitranscriptomics, metadata, pr...",{'zip': ['GLDS-101_metadata_GLDS-101-ISA.zip']},GLDS-101_metadata_GLDS-101-ISA.zip,OSD-101/version-5/metadata/GLDS-101_metadata_G...
3,OSD-102,1,5,"[design, epigenomics, epitranscriptomics, meta...",{'zip': ['OSD-102_metadata_OSD-102-ISA.zip']},OSD-102_metadata_OSD-102-ISA.zip,OSD-102/version-5/metadata/OSD-102_metadata_OS...
4,OSD-103,1,5,"[design, metadata, proteomics, rna-seq, rna_se...",{'zip': ['OSD-103_metadata_OSD-103-ISA.zip']},OSD-103_metadata_OSD-103-ISA.zip,OSD-103/version-5/metadata/OSD-103_metadata_OS...
...,...,...,...,...,...,...,...
446,OSD-95,1,1,"[metadata, sequencing]",{'zip': ['GLDS-95_metadata_GSE82341-ISA.zip']},GLDS-95_metadata_GSE82341-ISA.zip,OSD-95/version-1/metadata/GLDS-95_metadata_GSE...
447,OSD-96,1,6,"[metadata, rna-seq, rna_seq]",{'zip': ['GLDS-96_metadata_GLDS-96-ISA.zip']},GLDS-96_metadata_GLDS-96-ISA.zip,OSD-96/version-6/metadata/GLDS-96_metadata_GLD...
448,OSD-97,1,1,"[metadata, microarray]",{'zip': ['GLDS-97_metadata_GSE9464-ISA.zip']},GLDS-97_metadata_GSE9464-ISA.zip,OSD-97/version-1/metadata/GLDS-97_metadata_GSE...
449,OSD-98,1,8,"[epigenomics, metadata, proteomics, rna_seq, t...",{'zip': ['OSD-98_metadata_OSD-98-ISA.zip']},OSD-98_metadata_OSD-98-ISA.zip,OSD-98/version-8/metadata/OSD-98_metadata_OSD-...


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.
Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


## Download metadata zip files

In [68]:
# Local directory that receives the downloaded metadata zip files;
# created on first run and reused afterwards.
DOWNLOAD_DIR = "download_dir"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

In [95]:
def download_single_metadata_zip(metadata_path, metadata_filename):
    """Download one metadata zip from the bucket into ``DOWNLOAD_DIR``.

    Args:
        metadata_path: S3 key of the zip file.
        metadata_filename: File name to store it under; an empty value means
            the experiment has no metadata zip and nothing is downloaded.

    Returns:
        str: Local path of the downloaded file, or ``""`` when skipped.
    """
    if not metadata_filename:
        return ""

    destination = os.path.join(DOWNLOAD_DIR, metadata_filename)
    CLIENT.download_file(BUCKET_NAME, metadata_path, destination)
    return destination

In [96]:
# Download every metadata zip (row-wise, with a progress bar) and record where
# it was stored; rows without a metadata file get an empty string.
# `progress_apply` requires `tqdm.pandas()` to have been called beforehand.
stats['local_metadata_path'] = stats[['metadata_filename', 'metadata_path']].progress_apply(lambda x: download_single_metadata_zip(**x.to_dict()), axis=1)

100%|██████████| 451/451 [01:20<00:00,  5.60it/s]


## Analyze tabular files in metadata zips

In [174]:
def list_filenames_in_zip(local_file_path):
    """List the regular files stored in a zip archive.

    Directory entries (names ending in ``'/'``) and macOS ``DS_Store`` files
    are left out.

    Args:
        local_file_path: Path of the zip archive; a falsy value (e.g. ``""``
            for experiments without metadata) yields an empty list.

    Returns:
        list[str]: Member names in archive order.
    """
    if not local_file_path:
        return []

    # The context manager closes the archive; previously the ZipFile handle
    # was never closed, leaking one open file per call.
    with zipfile.ZipFile(local_file_path, 'r') as archive:
        return [
            member.filename
            for member in archive.infolist()
            if not member.filename.endswith(('/', 'DS_Store'))
        ]

In [175]:
# File names contained in each downloaded metadata zip (empty list when no zip was downloaded).
stats['metadata_filenames'] = stats['local_metadata_path'].progress_apply(list_filenames_in_zip)

100%|██████████| 451/451 [00:00<00:00, 12488.40it/s]


#### Check file types in metadata_zips

In [176]:
# For each zip, take the set of distinct final extensions of its members, then
# count across all zips how often each extension occurs (once per zip).
pd.Series(stats['metadata_filenames'].apply(
    lambda x: list(set([item.split('.')[-1] for item in x]))
).sum()).value_counts()

txt    449
zip      2
dtype: int64

#### Try to match study names with metadata files using a case-insensitive substring match

In [177]:
def match_study_with_file(all_studies, metadata_filenames):
    """Map each study name to the metadata files whose name contains it.

    The comparison is a case-insensitive substring test, and the ``'metadata'``
    study itself is skipped.

    Args:
        all_studies: Study directory names of one experiment.
        metadata_filenames: File names found in the experiment's metadata zip.

    Returns:
        dict[str, list[str]]: Study name -> matching file names (possibly
        empty), in the original file order.
    """
    lowered_filenames = [(filename, filename.lower()) for filename in metadata_filenames]
    return {
        study: [
            filename
            for filename, lowered in lowered_filenames
            if study.lower() in lowered
        ]
        for study in all_studies
        if study != 'metadata'
    }

In [178]:
# Per experiment: which metadata files mention each (non-metadata) study name.
stats['study_metadata_filenames'] = stats[['all_studies', 'metadata_filenames']].progress_apply(lambda x: match_study_with_file(**x.to_dict()), axis=1)

100%|██████████| 451/451 [00:00<00:00, 29111.87it/s]


In [179]:
# Distribution of the total number of (study, file) matches per experiment.
stats['study_metadata_filenames'].apply(lambda x: sum(len(item) for item in x.values())).value_counts()

1    186
0    175
2     83
3      4
6      2
4      1
Name: study_metadata_filenames, dtype: int64

In [180]:
# Distribution of the number of studies per experiment; the `-1` discounts the
# `metadata` entry.
# NOTE(review): experiments without a `metadata` directory (OSD-235, OSD-245 in
# the run recorded here) are therefore under-counted by one.
stats['all_studies'].apply(lambda x: len(x)-1).value_counts()

1    232
2    177
4     18
3     15
6      4
5      4
7      1
Name: all_studies, dtype: int64

## Detect Samples Metadata Tables

In [192]:
stats.head(1).T

Unnamed: 0,0
experiment_id,OSD-1
total_versions,1
version,6
all_studies,"[array, metadata, microarray]"
metadata_file_extensions,{'zip': ['OSD-1_metadata_OSD-1-ISA.zip']}
metadata_filename,OSD-1_metadata_OSD-1-ISA.zip
metadata_path,OSD-1/version-6/metadata/OSD-1_metadata_OSD-1-...
local_metadata_path,download_dir/OSD-1_metadata_OSD-1-ISA.zip
metadata_filenames,"[s_OSD-1.txt, i_Investigation.txt, a_OSD-1_tra..."
study_metadata_filenames,{'array': ['a_OSD-1_transcription-profiling_dn...


In [245]:
!pip install chardet



In [328]:
def read_from_metadata_zip(metadata_zip_path, file_path):
    """Return the raw bytes of one member of a metadata zip.

    Args:
        metadata_zip_path: Path of the zip archive.
        file_path: Name of the member inside the archive.

    Returns:
        bytes: The full content of the member.
    """
    # Trace which archive/member pair is being read.
    print(metadata_zip_path, file_path)
    with zipfile.ZipFile(metadata_zip_path, 'r') as archive, archive.open(file_path, 'r') as member:
        return member.read()

def detect_encoding(fp):
    """Guess the text encoding of a binary file object with ``chardet``.

    Lines are fed to the detector one at a time; feeding stops as soon as the
    detector reports that it is done.

    Args:
        fp: Iterable of byte lines, e.g. a file opened in binary mode.

    Returns:
        The ``'encoding'`` entry of the detector result.
    """
    import chardet

    encoding_detector = chardet.universaldetector.UniversalDetector()
    for raw_line in fp:
        encoding_detector.feed(raw_line)
        if encoding_detector.done:
            break
    # The result is only final after close().
    encoding_detector.close()

    return encoding_detector.result['encoding']

def read_df_with_right_encoding(fp, encoding='utf-8'):
    """Parse a tab-separated table, trying several encodings in turn.

    The encodings tried are, in order: ``encoding``, ``'utf-8'`` and
    ``'windows-1252'`` (duplicates are skipped).

    Args:
        fp: Open file object (binary or text) or a path accepted by
            ``pd.read_csv``.
        encoding: First encoding to try, typically the detected one.

    Returns:
        pd.DataFrame: The parsed table, or an empty DataFrame when every
        attempt fails (best-effort behaviour is kept on purpose).
    """
    if hasattr(fp, 'read'):
        # Buffer the content once. A failed parse leaves the original handle
        # partially consumed, so retrying on that same handle (as before)
        # would parse from the middle of the file or find nothing at all.
        raw = fp.read()
        if isinstance(raw, str):
            make_source = lambda: io.StringIO(raw)
        else:
            make_source = lambda: io.BytesIO(raw)
    else:
        # Path-like input: pandas reopens it on every attempt.
        make_source = lambda: fp

    tried = []
    for candidate in (encoding, 'utf-8', 'windows-1252'):
        if candidate in tried:
            continue
        tried.append(candidate)
        try:
            return pd.read_csv(make_source(), delimiter='\t', encoding=candidate)
        except (UnicodeError, LookupError, ValueError):
            # UnicodeError: wrong encoding; LookupError: unknown codec name;
            # ValueError: pandas parser / empty-data errors. Try the next one.
            continue

    return pd.DataFrame()

def read_dataframe_from_zip(metadata_zip_path, file_path):
    """Load one tab-separated member of a metadata zip as a DataFrame.

    The member is opened twice: once to detect its encoding and once, from the
    start, to parse it with that encoding.

    Args:
        metadata_zip_path: Path of the zip archive.
        file_path: Name of the member inside the archive.

    Returns:
        Whatever ``read_df_with_right_encoding`` returns for the member.
    """
    with zipfile.ZipFile(metadata_zip_path, 'r') as archive:
        # Pass 1: encoding detection consumes the handle.
        with archive.open(file_path, 'r') as member:
            detected_encoding = detect_encoding(member)

        # Pass 2: a fresh handle so parsing starts at the first byte.
        with archive.open(file_path, 'r') as member:
            return read_df_with_right_encoding(member, detected_encoding)

def is_sample_table(df):
    """Tell whether a DataFrame looks like a samples table.

    A samples table is one whose column names, compared case-insensitively,
    include both ``'source name'`` and ``'sample name'``.

    Args:
        df: DataFrame whose column labels are strings.

    Returns:
        bool: ``True`` when both columns are present.
    """
    lowered_columns = {column.lower() for column in df.columns}
    return {'source name', 'sample name'} <= lowered_columns

In [342]:
# For every experiment, read each file of its metadata zip and record whether
# it is a samples table: {file name -> bool}. Rows with no files get {}.
stats['metadata_sample_filenames'] = stats[['local_metadata_path', 'metadata_filenames']].progress_apply(
    lambda x: {
        metadata_filename: is_sample_table(
            read_dataframe_from_zip(
                x['local_metadata_path'], metadata_filename
            )
        )
        for metadata_filename in x['metadata_filenames']
    },
    axis=1
)

100%|██████████| 451/451 [00:52<00:00,  8.60it/s]


In [343]:
# Distribution of the number of samples tables detected per experiment
# (summing the booleans counts the True entries).
stats['metadata_sample_filenames'].apply(
    lambda x: sum(list(x.values()))
).value_counts()

1    449
0      2
Name: metadata_sample_filenames, dtype: int64

## Merge All Samples Tables

In [347]:
# Pick the first file flagged as a samples table per experiment ("" if none).
# Note: inside the lambda, `is_sample_table` is the boolean flag from the dict
# and shadows the function of the same name.
stats['metadata_sample_filename'] = stats['metadata_sample_filenames'].apply(
    lambda x: [file for file, is_sample_table in x.items() if is_sample_table is True]
).apply(lambda x: x[0] if x else "")

In [358]:
stats.head(1).T

Unnamed: 0,0
experiment_id,OSD-1
total_versions,1
version,6
all_studies,"[array, metadata, microarray]"
metadata_file_extensions,{'zip': ['OSD-1_metadata_OSD-1-ISA.zip']}
metadata_filename,OSD-1_metadata_OSD-1-ISA.zip
metadata_path,OSD-1/version-6/metadata/OSD-1_metadata_OSD-1-...
local_metadata_path,download_dir/OSD-1_metadata_OSD-1-ISA.zip
metadata_filenames,"[s_OSD-1.txt, i_Investigation.txt, a_OSD-1_tra..."
study_metadata_filenames,{'array': ['a_OSD-1_transcription-profiling_dn...


In [360]:
# Read the samples table of every experiment that has one, tag its rows with
# experiment-level attributes, and stack everything into one DataFrame that is
# also written to 'samples.csv'.
all_sample_dfs = []
for idx, row in stats.iterrows():
    # experiments without a detected samples table have an empty file name
    if row['metadata_sample_filename']:
        # read samples dataframe
        df = read_dataframe_from_zip(row['local_metadata_path'], row['metadata_sample_filename'])

        # add experiment level attributes
        df['experiment_id'] = row['experiment_id']
        df['version'] = row['version']
        df['metadata_filename'] = row['metadata_filename']
        df['metadata_path'] = row['metadata_path']
        df['metadata_sample_filename'] = row['metadata_sample_filename']

        # normalize columns
        df.columns = [x.lower().strip() for x in df.columns]

        all_sample_dfs.append(df)
# Columns are aligned by name across tables.
# NOTE(review): without `ignore_index=True` each table presumably keeps its own
# row labels, so the index written to the CSV would not be unique — confirm.
samples_df = pd.concat(all_sample_dfs)
samples_df.to_csv('samples.csv')

In [361]:
samples_df.shape

(24436, 799)

In [363]:
# Save the merged (normalized) column names for later inspection.
import json
with open('samples_columns.json', 'w') as file:
    json.dump(samples_df.columns.tolist(), file)

# TODO: Try to simplify column names and match between sample tables