In [1]:
import ast
import datetime
import glob
import json
import os
import re
import shutil
import subprocess
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pysam
import seaborn as sns

In [2]:
def format_date_string(date_str):
  """
  Extract a date from a string and return it as a YYYY-MM-DD string.

  Two layouts are recognised, tried in this order:
    1. A ``YYYY-MM-DD`` prefix at the very start of the string.
    2. A run of digits anywhere in the string, read as ``MMDDYYYY``
       (8 digits) or, failing that, ``MMDDYY`` (6 digits, with the
       two-digit year prefixed by '20').

  Every candidate is checked against the calendar, so digit runs that only
  look like a date (e.g. ``99999999``) are skipped instead of being returned
  as an impossible date, and later candidates in the string still get a
  chance to match.

  Args:
      date_str (str): The input date string.

  Returns:
      str: The consistently formatted date string (YYYY-MM-DD).
      None: If the input is None/empty or does not contain a valid date.
  """

  def _checked(year, month, day):
    # datetime.date raises ValueError for impossible dates
    # (month 13, day 32, Feb 30, year 0, ...).
    try:
      datetime.date(int(year), int(month), int(day))
    except ValueError:
      return None
    # Return the original digit strings so zero padding is preserved.
    return f"{year}-{month}-{day}"

  if not date_str:
    return None

  # Attempt to match YYYY-MM-DD at the beginning
  match = re.match(r'^(\d{4})-(\d{2})-(\d{2})', date_str)
  if match:
    formatted = _checked(*match.groups())
    if formatted:
      return formatted

  # Otherwise, look for MMDDYYYY first, then MMDDYY, anywhere in the string.
  # finditer (rather than search) lets a later digit run succeed when an
  # earlier one is not a real date.
  for pattern in (r'\d{8}', r'\d{6}'):
    for match in re.finditer(pattern, date_str):
      digits = match.group(0)
      year = digits[4:] if len(digits) == 8 else '20' + digits[4:]
      formatted = _checked(year, digits[:2], digits[2:4])
      if formatted:
        return formatted

  # If no valid date found
  return None

def parse_config(input_list):
  """
  Parse a list of command-line style arguments (strings) into a dictionary.

  Only items that contain '=' are kept; bare flags such as '--read_until'
  are skipped. Each kept item is split at its FIRST '=', so the value may
  itself contain '='. Leading dashes are removed from the key and
  surrounding double quotes are removed from the value.

  Values that look like a list ('[...]') are parsed as Python literals;
  if that fails the raw string is kept. All other values stay strings.
  When the same key occurs more than once, the last occurrence wins.

  Args:
      input_list (list): A list of strings representing command-line arguments.

  Returns:
      dict: A dictionary containing the parsed key-value pairs.
  """
  config_dict = {}
  for arg in input_list:
    if '=' not in arg:
      continue

    # partition splits once, so 'key=a=b' yields value 'a=b' instead of
    # raising "too many values to unpack".
    key, _, value = arg.partition('=')

    # Remove leading/trailing quotes if present around the value
    value = value.strip('"')

    if value.startswith('[') and value.endswith(']'):
      try:
        # SECURITY: this used to be eval(), which would execute arbitrary
        # code embedded in a report file. literal_eval only accepts Python
        # literals (lists, numbers, strings, ...).
        value = ast.literal_eval(value)
      except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid literal (e.g. unquoted names): keep it as a string
        pass

    # lstrip removes only the leading dashes; str.strip('--') treats its
    # argument as a character set and would also eat trailing dashes.
    config_dict[key.lstrip('-')] = value
  return config_dict

In [3]:
dpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/experiment_data/"

# Collect one metadata record (dict) per run directory, then build the
# DataFrame once at the end.
rows = []

for subdir in os.listdir(dpath):
    date = format_date_string(subdir)

    # glob.escape keeps wildcard characters in a directory name from being
    # interpreted as part of the pattern.
    search = os.path.join(glob.escape(os.path.join(dpath, subdir)), "report*.json")
    report_paths = glob.glob(search)
    if not report_paths:
        # Entries without a report (stray files, incomplete runs) used to
        # abort the whole cell with an IndexError.
        print(f"Skipping {subdir}: no report*.json found")
        continue
    report_path = report_paths[0]
    print(report_path)

    # `data` is intentionally left in the notebook namespace: a later cell
    # inspects the args of the last report that was loaded.
    with open(report_path, 'r') as file:
        data = json.load(file)

    run_info = data['protocol_run_info']
    user_info = run_info['user_info']
    tags = run_info['meta_info']['tags']

    row = {
        'run_name' : subdir,
        'date' : date,
        'device' : run_info['device']['device_type'],
        'server_path' : user_info['protocol_group_id'],
        'flow_cell_id' : user_info['user_specified_flow_cell_id'],
        'product_code' : user_info['user_specified_product_code'],
        # None when the report has no use_count. The old `else: use_count`
        # branch either raised NameError or silently reused the value from
        # the previous run.
        'flowcell_use_count' : run_info['flow_cell'].get('use_count'),
        'kit' : tags['kit']['string_value'],
        # Keep only the file name of the config path
        'config_file' : tags['config path']['string_value'].split("/")[-1],
    }

    # Add every key=value argument of the run as extra columns
    row.update(parse_config(run_info['args']))
    rows.append(row)

res = pd.DataFrame(rows)
res.head()

/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/experiment_data/2024-04-26-iHSC-AS-6th-Run/report_FAX16607_20240426_1111_e8a5d8e8.json
/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/experiment_data/fifth_AS_test_03142024/report_PAS55331_20240314_1638_bfc3b4f2.json
/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/experiment_data/first_AS_test_03122024/report_PAS55331_20240312_1414_1cbba94d.json
/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/experiment_data/sixth_AS_test_03142024/report_PAS55331_20240314_1654_9357e86a.json
/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/experiment_data/2024-04-26-iHSC-AS-7th-Run/report_FAX16607_20240426_1531_0e7ed665.json
/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/experiment_data/second_AS_test_03132024/report_PAS55331_20240313_1306_36a181e6.json
/nfs/turbo/umms-indikar/shared/projects/HSC/data/adaptive_sampling/experiment_data/seventh_

Unnamed: 0,run_name,date,device,server_path,flow_cell_id,product_code,flowcell_use_count,kit,config_file,fast5,...,guppy_filename,min_qscore,min_read_length,split_files_by_barcode,pod5_reads_per_file,fastq_reads_per_file,raw,bam_only_write_primary_alignment,fast5_reads_per_file,enable
0,2024-04-26-iHSC-AS-6th-Run,2024-04-26,GRIDION,2024-04-26-iHSC-AS-6th-Run,FAX16607,FLO-MIN106,5,SQK-PCS111,custom_MIN106_DNA.toml,on,...,dna_r9.4.1_450bps_hac.cfg,9.0,200,off,,,,,,
1,fifth_AS_test_03142024,2024-03-14,P2_SOLO,2024-03-14-Adaptive-Sampling-05,PAS55331,FLO-PRO114M,7,SQK-LSK114,custom_PRO114_DNA.toml,off,...,dna_r10.4.1_e8.2_400bps_5khz_fast_prom.cfg,,200,,4000.0,4000.0,,,,
2,first_AS_test_03122024,2024-03-12,P2_SOLO,2024-03-12-iHSc-Adaptive-Sequencing-01,PAS55331,FLO-PRO114,3,SQK-LSK114,custom_PRO114_DNA.toml,off,...,dna_r10.4.1_e8.2_400bps_5khz_fast_prom.cfg,,200,,4000.0,4000.0,,,,
3,sixth_AS_test_03142024,2024-03-14,P2_SOLO,2024-03-14-06,PAS55331,FLO-PRO114M,8,SQK-LSK114,custom_PRO114_DNA.toml,off,...,dna_r10.4.1_e8.2_400bps_5khz_fast_prom.cfg,,200,,4000.0,4000.0,,,,
4,2024-04-26-iHSC-AS-7th-Run,2024-04-26,GRIDION,2024-04-26-iHSC-7th-Run,FAX16607,FLO-MIN106,7,SQK-PCS111,custom_MIN106_DNA.toml,on,...,dna_r9.4.1_450bps_hac.cfg,9.0,200,off,,,,,,


In [4]:
# Order the runs by name, then write the table to metadata.csv
# (the DataFrame index is not written).
sort_column = 'run_name'
res = res.sort_values(by=sort_column)
res.to_csv('metadata.csv', index=False)

In [5]:
# Show the raw argument list of the report most recently loaded into `data`
# (the variable is left over from the report-loading loop).
protocol_run_info = data['protocol_run_info']
t = protocol_run_info['args']
t

['--fast5=on',
 '--pod5=off',
 '--fastq=on',
 '--bam=on',
 '--bam_only_write_primary_alignment=off',
 '--generate_bulk_file=on',
 '--active_channel_selection=on',
 '--base_calling=on',
 '--read_until',
 'reference_files=["/data/ADAPTIVE_SAMPLING_iHSC/2000_closest.fasta"]',
 'filter_type=enrich',
 'first_channel=1',
 'last_channel=512',
 '--fast5_reads_per_file=4000',
 '--fast5_data',
 'vbz_compress',
 '--fastq_reads_per_file=4000',
 '--fastq_data',
 'compress',
 '--bulk_file_content',
 'events=[[1,512]]',
 'read_table=[[1,512]]',
 '--mux_scan_period=1.5',
 '--pore_reserve=on',
 '--guppy_filename=dna_r9.4.1_450bps_hac.cfg',
 '--alignment',
 'reference_files=["/data/ADAPTIVE_SAMPLING_iHSC/2000_closest.fasta"]',
 '--min_read_length=200']

In [6]:
# Parse the argument list shown above into a key -> value dictionary.
parse_config(input_list=t)

{'fast5': 'on',
 'pod5': 'off',
 'fastq': 'on',
 'bam': 'on',
 'bam_only_write_primary_alignment': 'off',
 'generate_bulk_file': 'on',
 'active_channel_selection': 'on',
 'base_calling': 'on',
 'reference_files': ['/data/ADAPTIVE_SAMPLING_iHSC/2000_closest.fasta'],
 'filter_type': 'enrich',
 'first_channel': '1',
 'last_channel': '512',
 'fast5_reads_per_file': '4000',
 'fastq_reads_per_file': '4000',
 'events': [[1, 512]],
 'read_table': [[1, 512]],
 'mux_scan_period': '1.5',
 'pore_reserve': 'on',
 'guppy_filename': 'dna_r9.4.1_450bps_hac.cfg',
 'min_read_length': '200'}