<a href="https://colab.research.google.com/github/AnzorGozalishvili/NASA_ODSR_DATA/blob/main/dataset_curation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install bio
!pip install biopython
!pip install boto3 torch

Collecting bio
  Downloading bio-1.5.9-py3-none-any.whl (276 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/276.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m153.6/276.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython>=1.80 (from bio)
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
Collecting mygene (from bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting gprofiler-official (from bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting biothings-client>=0.2.6 (from mygene->bio)
  Downloading biothings_client-0.3.0-py2.py3-none-any.

In [3]:
from Bio import SeqIO
import gzip
import pandas as pd
import boto3
from botocore import UNSIGNED
from botocore.client import Config
import requests
import os
from torch.utils.data import Dataset, DataLoader
import torch

import zipfile
import io
import csv


In [4]:
# Initialize S3 client with no-sign-request configuration
s3 = boto3.client('s3', config=Config(signature_version=UNSIGNED))
# dir_list = ["466","465"] #,"524","520"
# Specify your S3 bucket name and directory
bucket_name = 'nasa-osdr'
directory = 'OSD-1'

directory_file_dict = {}
directory_list = []

# for i in dir_list:
#   directory = 'OSD-'+i

# List directories under the base directory
response = s3.list_objects(Bucket=bucket_name, Prefix=directory, Delimiter='/')
directories = [common_prefix['Prefix'] for common_prefix in response.get('CommonPrefixes', [])]

# For each directory, list zip files and download them
for directory in directories:
    # List zip files in the directory
    response = s3.list_objects(Bucket=bucket_name, Prefix=directory)
    zip_files = [obj['Key'] for obj in response['Contents'] if obj['Key'].endswith('.zip') if "metadata" in obj["Key"]]

    # Initialize a list to store filenames for this directory
    directory_filenames = []

    # Download each zip file
    for zip_file_key in zip_files:
        # Extract filename from file key
        filename = zip_file_key.split('/')[-1]

        # Add the filename to the list for this directory
        directory_filenames.append(filename)
        dir = "/".join(zip_file_key.split('/')[:1]) + "/"
        directory_list.append(dir)

        # Create a directory to store downloaded files
        os.makedirs(f'downloaded_data/'+ dir, exist_ok=True)

        # Check if the file already exists
        if os.path.exists(f'downloaded_data/{dir}'+"/"+filename):
            # print(f'{filename} already exists. Skipping...')
            continue
        # Download the zip file
        s3.download_file(bucket_name, zip_file_key, f'downloaded_data/{dir}'+"/"+filename)

    # Add the list of filenames to the dictionary under the directory name
    directory_name = directory.rstrip('/')  # Remove trailing slash
    directory_file_dict[directory_name] = directory_filenames


In [8]:
directory_file_dict

{'OSD-1': ['OSD-1_metadata_OSD-1-ISA.zip'],
 'OSD-100': ['OSD-100_metadata_OSD-100-ISA.zip'],
 'OSD-101': ['GLDS-101_metadata_GLDS-101-ISA.zip'],
 'OSD-102': ['OSD-102_metadata_OSD-102-ISA.zip'],
 'OSD-103': ['OSD-103_metadata_OSD-103-ISA.zip'],
 'OSD-104': ['GLDS-104_metadata_GLDS-104-ISA.zip'],
 'OSD-105': ['GLDS-105_metadata_GLDS-105-ISA.zip'],
 'OSD-106': ['GLDS-106_metadata_GSE90166-ISA.zip'],
 'OSD-107': ['GLDS-107_metadata_E-GEOD-78980-ISA.zip'],
 'OSD-108': ['GLDS-108_metadata_STS135_Liver_MET-ISA.zip'],
 'OSD-109': ['GLDS-109_metadata_E-GEOD-68874-ISA.zip'],
 'OSD-11': ['OSD-11_metadata_GLDS-11-ISA.zip'],
 'OSD-110': ['GLDS-110_metadata_PXD002096-ISA.zip'],
 'OSD-111': ['GLDS-111_metadata_GSE80223-ISA.zip'],
 'OSD-112': ['GLDS-112_metadata_GSE71770-ISA.zip'],
 'OSD-113': ['OSD-113_metadata_OSD-113-ISA.zip'],
 'OSD-114': ['GLDS-114_metadata_GSE93860-ISA.zip'],
 'OSD-115': ['GLDS-115_metadata_E-GEOD-12647-ISA.zip'],
 'OSD-116': ['GLDS-116_metadata_STS-135_Skin-ISA.zip'],
 'OSD-1

In [5]:
directory_list

['OSD-1/',
 'OSD-100/',
 'OSD-101/',
 'OSD-102/',
 'OSD-103/',
 'OSD-104/',
 'OSD-105/',
 'OSD-106/',
 'OSD-107/',
 'OSD-108/',
 'OSD-109/',
 'OSD-11/',
 'OSD-110/',
 'OSD-111/',
 'OSD-112/',
 'OSD-113/',
 'OSD-114/',
 'OSD-115/',
 'OSD-116/',
 'OSD-117/',
 'OSD-118/',
 'OSD-119/',
 'OSD-12/',
 'OSD-120/',
 'OSD-120/',
 'OSD-121/',
 'OSD-122/',
 'OSD-123/',
 'OSD-124/',
 'OSD-125/',
 'OSD-126/',
 'OSD-127/',
 'OSD-128/',
 'OSD-129/',
 'OSD-13/',
 'OSD-130/',
 'OSD-131/',
 'OSD-132/',
 'OSD-133/',
 'OSD-134/',
 'OSD-135/',
 'OSD-136/',
 'OSD-137/',
 'OSD-137/',
 'OSD-138/',
 'OSD-139/',
 'OSD-14/',
 'OSD-140/',
 'OSD-141/',
 'OSD-144/',
 'OSD-145/',
 'OSD-146/',
 'OSD-147/',
 'OSD-147/',
 'OSD-148/',
 'OSD-149/',
 'OSD-15/',
 'OSD-151/',
 'OSD-152/',
 'OSD-153/',
 'OSD-154/',
 'OSD-155/',
 'OSD-156/',
 'OSD-157/',
 'OSD-158/',
 'OSD-159/',
 'OSD-16/',
 'OSD-160/',
 'OSD-161/',
 'OSD-162/',
 'OSD-163/',
 'OSD-164/',
 'OSD-165/',
 'OSD-166/',
 'OSD-167/',
 'OSD-168/',
 'OSD-17/',
 'OSD-17

In [9]:
for directory in directory_list:
  # Define the directory containing the downloaded zip files
  downloaded_dir = f'downloaded_data/{directory}'
  # Loop through the downloaded zip files
  for filename in os.listdir(downloaded_dir):
      # print(filename)
      if filename.endswith('.txt'):
        continue
      if filename.endswith('.zip'):
          zip_filepath = os.path.join(downloaded_dir, filename)

          # Extract only text files from the specified folder
          with zipfile.ZipFile(zip_filepath, 'r') as zip_ref:
              for item in zip_ref.infolist():
                zip_ref.extract(item, downloaded_dir)

          # Remove the zip file after extraction
          os.remove(zip_filepath)
          # print(f'Removed {filename}')
      else:
          continue
      #     # Add the code you want to execute if zip files are found here

In [None]:
# lst_file_keys[:5]

In [10]:
from genericpath import exists
import pandas as pd


# Define the delimiter used in the text files
delimiter = '\t'  # For tab-separated files
# found_files = []
for dir in directory_list:
  # Define the root directory
  root_dir = f'downloaded_data/'+ dir  # Replace with the actual root directory
  # Define the file extension you're looking for
  target_extension = '.txt'  # Change to the extension you're interested in

  # List to store found file paths

  # Loop through all directories and subdirectories
  for dirpath, dirnames, filenames in os.walk(root_dir):
      for filename in filenames:
          target_extension = '.txt'
          if filename.endswith(target_extension):
            file_path = os.path.join(dirpath, filename)

            if "investigation" in filename.lower():
              os.remove(file_path)
              continue

            # Read the text file and convert to CSV
            df = pd.read_csv(file_path, delimiter=delimiter, encoding='cp1252', on_bad_lines='skip')
            csv_file = file_path.replace('.txt', '.csv')  # Replace .txt with .csv in the file name
            df.to_csv(csv_file, index=False)
            os.remove(file_path)



In [27]:
!ls downloaded_data/OSD-136/

a_GLDS-136_microarray_metadata.csv  s_GLDS-136_microarray_metadata.csv
i_GLDS-136_microarray_metadata.csv


In [34]:
per_dir_assays = {}
per_dir_samples = {}
for directory in directory_list:
  per_dir_assays[directory] = []
  per_dir_samples[directory] = []

  root_dir = f'downloaded_data/'+ directory

  for dirpath, dirnames, filenames in os.walk(root_dir):
    for filename in filenames:

      if filename.endswith('.csv'):
        file_path = os.path.join(dirpath, filename)
        df = pd.read_csv(file_path, encoding='cp1252')
        df_columns_lowercase = [str(x.strip().lower()) for x in df.columns]

        if 'source name' in df_columns_lowercase:
          per_dir_assays[directory].append(file_path)

        elif 'sample name' in df_columns_lowercase:
          per_dir_samples[directory].append(file_path)

In [None]:
per_dir_samples

In [41]:
pd.Series(per_dir_samples).apply(len).value_counts()

1    79
2     7
4     7
3     3
dtype: int64

In [42]:
pd.Series(per_dir_assays).apply(len).value_counts()

1    94
2     2
dtype: int64

In [55]:
len(pd.Series(pd.Series(per_dir_samples).apply(lambda x: [x.strip().lower() for x in pd.read_csv(x[0]).columns]).sum()).unique().tolist())

274

In [54]:
len(pd.Series(pd.Series(per_dir_assays).apply(lambda x: [x.strip().lower() for x in pd.read_csv(x[0]).columns]).sum()).unique().tolist())

336

In [71]:
all_assay_dfs = []
for dir, assay_files in per_dir_assays.items():
  # take first file only!
  df = pd.read_csv(assay_files[0])
  df.columns = [x.strip().lower() for x in df.columns]
  df['osdr_experiment_id'] = dir.rstrip('/')
  all_assay_dfs.append(df)

assays_df = pd.concat(all_assay_dfs)
assays_df.reset_index(drop=True, inplace=True)
assays_df.to_csv('assays.csv')

In [72]:
all_sample_dfs = []
for dir, sample_files in per_dir_samples.items():
  # take first file only!
  df = pd.read_csv(sample_files[0])
  df.columns = [x.strip().lower() for x in df.columns]
  df['osdr_experiment_id'] = dir.rstrip('/')
  all_sample_dfs.append(df)

samples_df = pd.concat(all_sample_dfs)
samples_df.reset_index(drop=True, inplace=True)
samples_df.to_csv('samples.csv')