## TODO
* `gb_accession` and `gisaid_accession` are not yet available for new sequences; how do we append the new rows to `metadata.csv` without them?
* metadata format for NCBI
* support tools for manual sanity checks

In [3]:
from bjorn import *

In [4]:
!cat run.sh

#!/bin/bash

python3 bjorn.py --not-dry-run --include-bams --out-dir /home/gk/southpark/2020-11-21_release --sample-sheet /home/gk/code/hCoV19/release_summary_csv/2020-11-20_seq_summary.csv --cpus 25 --analysis-folder /home/gk/analysis/ --output-metadata /home/gk/analysis/hcov-19-genomics/metadata.csv


In [7]:
# ---- Run switches -----------------------------------------------------------
# dry_run=True means no files are transferred (see the `if not dry_run` guard
# further down); include_bams controls whether BAM files are part of the release.
dry_run = True
include_bams = False

# ---- Resources --------------------------------------------------------------
# number of CPU cores handed to the file transfer step
num_cpus = 4

# ---- Inputs -----------------------------------------------------------------
# samples sheet (make sure it is the most recent export)
sample_sheet_fpath = Path("/home/al/data/COVID_sequencing_summary-GISAID.csv")
# root folder holding the analysis results
analysis_fpath = Path("/home/gk/analysis/")
# metadata of the samples that have already been released
released_samples_fpath = Path("/home/gk/analysis/hcov-19-genomics/metadata.csv")
# NOTE: a reference-sequence path (for MSA and tree construction) was mentioned
# here previously, but no such path is set in this cell.

# ---- Outputs ----------------------------------------------------------------
# directory where results get saved
out_dir = Path("/home/al/data/bjorn_test")
# columns to include in the metadata.csv output
meta_columns = [
    "sample_id",
    "Virus name",
    "Submitting lab",
    "Location",
    "Collection date",
    "AVG_DEPTH",
    "COVERAGE",
]

In [31]:



# # Test
# out_dir = "/home/gk/southpark/2020-11-21_release"
# sample_sheet_fpath = "/home/gk/code/hCoV19/release_summary_csv/2020-11-20_seq_summary.csv"
# analysis_fpath = "/home/gk/analysis/"
# released_samples_fpath = "/home/gk/analysis/hcov-19-genomics/metadata.csv"
# dry_run = True

# Echo the active configuration so the cell output records what this run used.
# The empty first and last entries reproduce the blank lines around the banner.
config_banner = [
    "",
    f"Dry run: {dry_run}.",
    f"Include BAMS: {include_bams}.",
    f"Reading release summary file from {sample_sheet_fpath}.",
    f"Reading repository metadata from {released_samples_fpath}.",
    f"Searching analysis folder {analysis_fpath}.",
    "",
]
print("\n".join(config_banner))

# Collecting Sequence Data


# Build `analysis_df`: one row per SEARCH sample with the path of its consensus
# FASTA (column PATH_x) and, when available, its merged BAM (column PATH_y).
#
# NOTE: glob is called without recursive=True, so "**" matches exactly one
# directory level below `analysis_fpath`.
# The glob results are sorted because glob order depends on the file system;
# sorting makes the `keep='last'` de-duplication below deterministic (the
# lexicographically last path wins).
# NOTE(review): this presumably selects the most recent run if run folders are
# date-prefixed -- confirm against the analysis folder layout.

# grab all filepaths for bam data
bam_filepaths = [
    Path(fp)
    for fp in sorted(glob.glob("{}/**/merged_aligned_bams/illumina/*.bam".format(analysis_fpath)))
]
# consolidate sample ID format
bam_ids = get_ids(bam_filepaths)
bam_df = pd.DataFrame(data=list(zip(bam_ids, bam_filepaths)), columns=['sample_id', 'PATH'])

# grab all paths to consensus sequences
consensus_filepaths = [
    Path(fp)
    for fp in sorted(glob.glob("{}/**/consensus_sequences/illumina/*.fa".format(analysis_fpath)))
]
# consolidate sample ID format
consensus_ids = get_ids(consensus_filepaths)
consensus_df = pd.DataFrame(data=list(zip(consensus_ids, consensus_filepaths)), columns=['sample_id', 'PATH'])

# clean up cns and bam (remove duplicate IDs); reassign instead of mutating in
# place so re-running the cell always starts from freshly built frames
bam_df = bam_df.drop_duplicates(subset=['sample_id'], keep='last')
consensus_df = consensus_df.drop_duplicates(subset=['sample_id'], keep='last')
# include only SEARCH samples (na=False: a missing ID is treated as "not SEARCH")
consensus_df = consensus_df[consensus_df['sample_id'].str.contains('SEARCH', na=False)]

# merge consensus and bam filepaths for each sample ID; the shared 'PATH' column
# becomes PATH_x (consensus) and PATH_y (BAM) via the default merge suffixes
analysis_df = pd.merge(consensus_df, bam_df, on='sample_id', how='left')

# Always define the counter: the release log written later reads it even when
# BAM files are not part of the release.
num_samples_missing_bams = 0
if include_bams:
    has_bam = analysis_df['PATH_y'].notna()
    num_samples_missing_bams = int((~has_bam).sum())
    # exclude any samples that do not have BAM data
    analysis_df = analysis_df[has_bam]
    
    

# Select the samples to release: rows of the sample sheet flagged as ready for
# release that also have sequence files in `analysis_df`.

# load sample sheet data (GISAID) - make sure to download most recent one
seqsum = pd.read_csv(sample_sheet_fpath)
# clean up: drop rows whose SEARCH sample ID is missing or a spreadsheet '#REF!' error.
# .copy() gives an independent frame so the column assignment below does not
# write into a slice of the raw sheet (avoids SettingWithCopyWarning).
has_valid_id = seqsum['SEARCH SampleID'].notna() & (seqsum['SEARCH SampleID'] != '#REF!')
seqsum = seqsum[has_valid_id].copy()
# consolidate sample ID format
seqsum['sample_id'] = seqsum['SEARCH SampleID'].apply(process_id)
seqsum = seqsum.drop_duplicates(subset=['sample_id'], keep='last')
seqsum = seqsum[seqsum['New sequences ready for release'] == 'Yes']

# samples flagged for release that have no consensus sequence file; this counter
# is read when the release log is written, so it must always be defined here
# (sample_id is unique in seqsum after the de-duplication above)
num_samples_missing_cons = int((~seqsum['sample_id'].isin(consensus_df['sample_id'])).sum())

# JOIN summary sheet with analysis meta data
sequence_results = pd.merge(seqsum, analysis_df, on='sample_id', how='inner')
print("Preparing {} samples for release".format(sequence_results.shape[0]))

# ## Make sure to remove any samples that have already been uploaded to github (just an extra safety step)
# load metadata.csv from github repo, then clean up
meta_df = pd.read_csv(released_samples_fpath)
meta_df = meta_df[meta_df['ID'].str.contains('SEARCH', na=False)].copy()
# consolidate sample ID format
meta_df['sample_id'] = meta_df['ID'].apply(process_id)
# get IDs of samples that have already been released
released_seqs = meta_df['sample_id'].unique()

# NOTE(review): the released-sample filter below is currently disabled, so
# `final_result` still contains samples that are already in metadata.csv.
# Re-enable it if already-released samples must not be released again.
final_result = sequence_results.copy()
# final_result = sequence_results[~sequence_results['sample_id'].isin(released_seqs)]
# Transfer files (skipped entirely on a dry run)
if not dry_run:
    transfer_files(final_result, out_dir, include_bams=include_bams, ncpus=num_cpus)

# ## Getting coverage information
# One coverage report (.tsv) per run folder; "**" matches a single directory
# level here because glob is not called with recursive=True.
cov_filepaths = [
    Path(fp)
    for fp in glob.glob("{}/**/trimmed_bams/illumina/reports/*.tsv".format(analysis_fpath))
]
if not cov_filepaths:
    # pd.concat would otherwise fail with an opaque "No objects to concatenate"
    raise FileNotFoundError(
        "No coverage reports (*.tsv) found under {}".format(analysis_fpath)
    )

# read coverage data, remembering which report each row came from
cov_raw = pd.concat(
    (pd.read_csv(f, sep='\t').assign(path=f) for f in cov_filepaths),
    ignore_index=True,
)

# The `path` column holds Path objects, which have no .split(); derive the run
# folder as the first path component below the analysis folder instead of a
# hard-coded index into the absolute path, so any analysis_fpath depth works.
# NOTE(review): the first three dot-separated fields of the run folder name are
# presumably a date (e.g. YYYY.MM.DD) -- confirm against the folder naming.
analysis_root = Path(analysis_fpath)
cov_df = (
    cov_raw
    .assign(
        sample_id=cov_raw['SAMPLE'].apply(process_coverage_sample_ids),
        date=cov_raw['path'].apply(
            lambda p: ''.join(Path(p).relative_to(analysis_root).parts[0].split('.')[:3])
        ),
    )
    # keep only the row from the latest run folder for each sample
    .sort_values('date')
    .drop_duplicates(subset=['sample_id'], keep='last')
)


Dry run: True.
Include BAMS: False.
Reading release summary file from /home/al/data/COVID_sequencing_summary-GISAID.csv.
Reading repository metadata from /home/gk/analysis/hcov-19-genomics/metadata.csv.
Searching analysis folder /home/gk/analysis/.

Preparing 1205 samples for release


In [33]:
# ans.columns

In [40]:
# Build the metadata rows for the samples in this release, append them to the
# already-released metadata, and write metadata.csv plus a short log to out_dir.

# JOIN results with coverage info
new_metadata = (
    pd.merge(final_result, cov_df, on='sample_id')
    .assign(
        # normalise collection dates to ISO format (YYYY-MM-DD)
        collection_date=lambda x: pd.to_datetime(x["Collection date"]).dt.strftime("%Y-%m-%d")
    )
    .rename(columns={
        "SEARCH SampleID": "ID",
        "Location": "location",
        "COVERAGE": "percent_coverage_cds",
        "AVG_DEPTH": "avg_depth",
        "Authors": "authors",
        "Originating lab": "originating_lab"
    })
)
# COLUMNS TO INCLUDE IN METADATA
meta_cols = ["ID", "collection_date", "location", "percent_coverage_cds", "avg_depth", "authors", "originating_lab"]
#, "gb_accession", "gisaid_accession"
old_metadata = pd.read_csv(released_samples_fpath)

# `ans` is the combined (previously released + new) metadata; later cells read it
ans = pd.concat([old_metadata, new_metadata.loc[:, meta_cols]])

# On a dry run no transfer step has run, so make sure the output directory
# exists before writing into it. Path() also accepts out_dir given as a string.
out_dir_path = Path(out_dir)
out_dir_path.mkdir(parents=True, exist_ok=True)
ans.to_csv(out_dir_path / 'metadata.csv', index=False)

# NOTE: computed over the combined metadata, i.e. previously released samples
# with coverage below 90% are counted as well.
low_coverage_samples = ans[ans["percent_coverage_cds"] < 90]
# Data logging
with open(out_dir_path / 'data_release.log', 'w') as f:
    f.write('{} samples were found to have coverage below 90%\n'.format(low_coverage_samples.shape[0]))
    f.write('{} samples were ignored because they were missing consensus sequence files\n'.format(num_samples_missing_cons))
    f.write('{} samples were ignored because they were missing BAM sequence files\n'.format(num_samples_missing_bams))
print("Transfer Complete. All results saved in {}".format(out_dir))

Transfer Complete. All results saved in /home/al/data/bjorn_test


In [49]:
# ans

## JSON metadata for CViSB portal

In [48]:
def to_json_entry(x):
    """Build one CViSB Data Portal experiment record from a metadata row.

    Parameters
    ----------
    x : mapping-like
        Must support item access for the keys 'ID', 'collection_date' and
        'authors' (e.g. a DataFrame row or a plain dict).

    Returns
    -------
    dict
        Experiment metadata whose 'experimentID' is the row ID suffixed with
        '-sarscov2'; the 'data' list is left empty.
    """
    sample_id = x['ID']
    collected_on = x['collection_date']
    credited_to = x['authors']

    entry = {}
    entry["experimentID"] = f"{sample_id}-sarscov2"
    entry["privatePatientID"] = sample_id
    entry["variableMeasured"] = "SARS-CoV-2 virus sequence"
    entry["measurementTechnique"] = "Nucleic Acid Sequencing"
    entry["includedInDataset"] = "sarscov2-virus-seq"
    entry["dateModified"] = collected_on
    entry["citation"] = [{"name": credited_to, "@type": "Organization"}]
    entry["data"] = []
    return entry

# Apply to_json_entry to every row of `ans` (axis=1) and convert the result to a
# plain Python list, which is displayed as this cell's output.
ans.apply(to_json_entry, axis=1).tolist()

[{'experimentID': 'MG0987-sarscov2',
  'privatePatientID': 'MG0987',
  'variableMeasured': 'SARS-CoV-2 virus sequence',
  'measurementTechnique': 'Nucleic Acid Sequencing',
  'includedInDataset': 'sarscov2-virus-seq',
  'dateModified': '2020-03-18',
  'citation': [{'name': 'SEARCH Alliance San Diego', '@type': 'Organization'}],
  'data': []},
 {'experimentID': 'PC00101P-sarscov2',
  'privatePatientID': 'PC00101P',
  'variableMeasured': 'SARS-CoV-2 virus sequence',
  'measurementTechnique': 'Nucleic Acid Sequencing',
  'includedInDataset': 'sarscov2-virus-seq',
  'dateModified': '2020-03-11',
  'citation': [{'name': 'SEARCH Alliance San Diego', '@type': 'Organization'}],
  'data': []},
 {'experimentID': 'SEARCH-0007-SAN-sarscov2',
  'privatePatientID': 'SEARCH-0007-SAN',
  'variableMeasured': 'SARS-CoV-2 virus sequence',
  'measurementTechnique': 'Nucleic Acid Sequencing',
  'includedInDataset': 'sarscov2-virus-seq',
  'dateModified': '2020-03-21',
  'citation': [{'name': 'SEARCH Allian

## NCBI metadata