## Internal Mutations Report
This notebook contains development (work-in-progress) code for analyzing mutations in SARS-CoV-2 samples released by the Andersen Lab.

In [1]:
from bjorn import *
from bjorn_support import *
from onion_trees import *
import gffutils
import math
from mutationsaa import *

In [26]:
in_dir = Path('/home/al/code/HCoV-19-Genomics/consensus_sequences/')
out_dir = Path('/home/al/analysis/mutations/alab_git')
meta_fp = Path('/home/al/code/HCoV-19-Genomics/metadata.csv')

In [None]:
fasta_fp = concat_fasta(in_dir, out_dir/'cns_seqs')

In [8]:
msa_fp = align_fasta_reference(fasta_fp, num_cpus=20, ref_fp='/home/gk/code/hCoV19/db/NC045512.fasta')

In [9]:
msa_fp

'/home/al/analysis/mutations/alab_git/cns_seqs_aligned.fa'

In [36]:
def identify_replacements(input_fasta, 
                          meta_fp,
                          patient_zero: str='NC_045512.2', 
                          gene2pos: dict=GENE2POS):
    """Summarise nucleotide substitutions found in a multiple sequence alignment.

    Every aligned sequence is compared against the reference; each differing
    position is mapped to a gene, a codon number and the reference/alternative
    amino acids, joined with sample metadata and finally aggregated per
    substitution.

    Parameters
    ----------
    input_fasta : path to the FASTA multiple sequence alignment.
    meta_fp : path to the metadata CSV. The columns `fasta_hdr`,
        `collection_date`, `ID` and `location` are read from it.
    patient_zero : name of the reference sequence inside the alignment.
    gene2pos : mapping gene name -> {'start': ..., ...} used for the codon
        number and codon look-ups.
        NOTE(review): `map_gene_to_pos` is called with the position only, so
        it presumably relies on its own gene table - confirm both tables agree
        when passing a custom `gene2pos`.

    Returns
    -------
    pandas.DataFrame with one row per (gene, pos, ref_aa, codon_num, alt_aa)
    and the columns `num_samples`, `first_detected`, `last_detected`,
    `location_counts` and `samples`. `pos` is 1-based in the returned frame.
    """
    print(f"Loading Alignment file at: {input_fasta}")
    cns = AlignIO.read(input_fasta, 'fasta')
    print(f"Initial cleaning...")
    seqs, ref_seq = process_cns_seqs(cns, patient_zero,
                                     start_pos=0, end_pos=30000)
    print(f"Creating a dataframe...")
    seqsdf = (pd.DataFrame(index=seqs.keys(), 
                           data=seqs.values(), 
                           columns=['sequence'])
                .reset_index()
                .rename(columns={'index': 'idx'}))
    print(f"Identifying mutations...")
    # for each sample, identify list of substitutions (position:alt)
    seqsdf['replacements'] = seqsdf['sequence'].apply(find_replacements, args=(ref_seq,))
    # wide-to-long: one row per substitution; samples without any substitution
    # explode to NaN and are dropped. The index is rebuilt because explode()
    # leaves duplicated labels behind.
    seqsdf = (seqsdf.explode('replacements')
                    .dropna(subset=['replacements'])
                    .reset_index(drop=True))
    # 0-based position parsed from the 'position:alt' token
    seqsdf['pos'] = seqsdf['replacements'].apply(lambda x: int(x.split(':')[0]))
    print(f"Mapping Genes to mutations...")
    # identify gene of each substitution
    seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
    # filter out substitutions in non-gene positions (missing or 'nan' gene);
    # copy so that the column assignments below never target a slice
    in_gene = seqsdf['gene'].notna() & (seqsdf['gene'] != 'nan')
    seqsdf = seqsdf.loc[in_gene].copy()
    print(f"Compute codon numbers...")
    # compute codon number of each substitution (honours the gene2pos argument)
    seqsdf['codon_num'] = seqsdf.apply(compute_codon_num, args=(gene2pos,), axis=1)
    print(f"Fetch reference codon...")
    # fetch the reference codon for each substitution
    seqsdf['ref_codon'] = seqsdf.apply(get_ref_codon, args=(ref_seq, gene2pos), axis=1)
    print(f"Fetch alternative codon...")
    # fetch the alternative codon for each substitution
    seqsdf['alt_codon'] = seqsdf.apply(get_alt_codon, args=(gene2pos,), axis=1)
    print(f"Map amino acids...")
    # fetch the reference and alternative amino acids
    seqsdf['ref_aa'] = seqsdf['ref_codon'].apply(get_aa)
    seqsdf['alt_aa'] = seqsdf['alt_codon'].apply(get_aa)
    # filter out substitutions with non-amino acid alternates (bad consensus calls)
    seqsdf = seqsdf.loc[seqsdf['alt_aa']!='nan']
    print(f"Fuse with metadata...")
    # load and join metadata
    meta = pd.read_csv(meta_fp)
    seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='fasta_hdr')
    # clean and process sample collection dates: drop placeholder dates
    usable_date = ((seqsdf['collection_date']!='Unknown') 
                   & (seqsdf['collection_date']!='1900-01-00'))
    seqsdf = seqsdf.loc[usable_date].copy()
    # 'a/b' style dates keep only the part before the slash; na=False keeps
    # rows with a missing date out of the mask (they end up as NaT below)
    has_range = seqsdf['collection_date'].str.contains('/', na=False)
    seqsdf.loc[has_range, 'collection_date'] = (seqsdf.loc[has_range, 'collection_date']
                                                      .str.split('/').str[0])
    seqsdf['date'] = pd.to_datetime(seqsdf['collection_date'])
    # aggregate on each substitution, compute number of samples and other attributes
    subs = (seqsdf.groupby(['gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa'])
    .agg(
     num_samples=('ID', 'nunique'),
     first_detected=('date', 'min'),
     last_detected=('date', 'max'),
     # tuple of (distinct locations, number of rows per location)
     location_counts=('location', lambda x: np.unique(x, return_counts=True)),
     samples=('ID', 'unique')
    )
    .reset_index())
    # 1-based nucleotide position coordinate system
    subs['pos'] = subs['pos'] + 1
    return subs

def identify_deletions(input_filepath: str, 
                       meta_fp: str,
                       patient_zero: str='NC_045512.2', 
                       min_del_len: int=2,
                       start_pos: int=265, 
                       end_pos: int=29674) -> pd.DataFrame:
    """Identify deletions found in the aligned sequences.

    Parameters
    ----------
    input_filepath : path to the FASTA multiple sequence alignment.
    meta_fp : path to the metadata CSV. The columns `fasta_hdr`,
        `collection_date` and `location` are read from it.
    patient_zero : name of the reference sequence in the alignment.
    min_del_len : minimum length a deletion must have to be reported.
    start_pos, end_pos : forwarded to `process_cns_seqs`; `start_pos + 1` is
        also handed to `adjust_coords` to turn relative coordinates into
        absolute ones.

    Returns
    -------
    pandas.DataFrame with one row per distinct deletion and the columns
    `type`, `gene`, `absolute_coords`, `del_len`, `pos`, `ref_aa`,
    `codon_num`, `num_samples`, `first_detected`, `last_detected`,
    `location_counts`, `samples`, `ref_codon`, `prev_5nts`, `next_5nts`.
    """
    # read MSA file
    consensus_data = AlignIO.read(input_filepath, 'fasta')
    # process MSA to remove insertions and fix position coordinate systems
    seqs, ref_seq = process_cns_seqs(consensus_data, patient_zero, start_pos, end_pos)
    # load into dataframe
    seqsdf = (pd.DataFrame(index=seqs.keys(), data=seqs.values(), 
                           columns=['sequence'])
                .reset_index().rename(columns={'index': 'idx'}))
    # load and join metadata
    meta = pd.read_csv(meta_fp)
    seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='fasta_hdr')
    # clean and process sample collection dates: drop placeholder dates;
    # copy so that the column assignments below never target a slice
    usable_date = ((seqsdf['collection_date']!='Unknown') 
                   & (seqsdf['collection_date']!='1900-01-00'))
    seqsdf = seqsdf.loc[usable_date].copy()
    # 'a/b' style dates keep only the part before the slash; na=False keeps
    # rows with a missing date out of the mask (they end up as NaT below)
    has_range = seqsdf['collection_date'].str.contains('/', na=False)
    seqsdf.loc[has_range, 'collection_date'] = (seqsdf.loc[has_range, 'collection_date']
                                                      .str.split('/').str[0])
    seqsdf['date'] = pd.to_datetime(seqsdf['collection_date'])
    # compute length of each sequence
    seqsdf['seq_len'] = seqsdf['sequence'].str.len()
    # identify deletion positions
    seqsdf['del_positions'] = seqsdf['sequence'].apply(find_deletions)
    # sequences with one or more deletions, one row per deletion
    del_seqs = seqsdf.loc[seqsdf['del_positions'].str.len() > 0]
    del_seqs = del_seqs.explode('del_positions')
    # compute length of each deletion
    del_seqs['del_len'] = del_seqs['del_positions'].apply(len)
    # only consider deletions of at least `min_del_len` nucleotides
    del_seqs = del_seqs[del_seqs['del_len'] >= min_del_len].copy()
    # fetch coordinates of each deletion
    del_seqs['relative_coords'] = del_seqs['del_positions'].apply(get_indel_coords)
    # group samples by the deletion they share
    del_seqs = (del_seqs.groupby(['relative_coords', 'del_len'])
                        .agg(samples=('idx', 'unique'),
                             num_samples=('idx', 'nunique'),
                             first_detected=('date', 'min'),
                             last_detected=('date', 'max'),
                             # tuple of (distinct locations, rows per location)
                             location_counts=('location', lambda x: np.unique(x, return_counts=True)))
                        .reset_index()
                        .sort_values('num_samples'))
    del_seqs['type'] = 'deletion'
    # adjust coordinates to account for the nts trimmed from beginning e.g. 265nts
    del_seqs['absolute_coords'] = del_seqs['relative_coords'].apply(adjust_coords, args=(start_pos+1,))
    # first coordinate of the 'start:end' string
    del_seqs['pos'] = del_seqs['absolute_coords'].apply(lambda x: int(x.split(':')[0]))
    # approximate the gene where each deletion was identified
    del_seqs['gene'] = del_seqs['pos'].apply(map_gene_to_pos)
    # filter out deletions in non-gene positions (missing or 'nan' gene)
    in_gene = del_seqs['gene'].notna() & (del_seqs['gene'] != 'nan')
    del_seqs = del_seqs.loc[in_gene].copy()
    # compute codon number of each deletion start
    del_seqs['codon_num'] = del_seqs.apply(compute_codon_num, args=(GENE2POS,), axis=1)
    # fetch the reference codon for each deletion start
    del_seqs['ref_codon'] = del_seqs.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)
    # fetch the reference amino acid
    del_seqs['ref_aa'] = del_seqs['ref_codon'].apply(get_aa)

    def _nts_before(coords):
        # reference slice [start-5, start) for a 'start:end' coordinate string
        first = int(coords.split(':')[0])
        return ref_seq[first-5:first]

    def _nts_after(coords):
        # reference slice [end+1, end+6) for a 'start:end' coordinate string
        last = int(coords.split(':')[1])
        return ref_seq[last+1:last+6]

    # record the 5 nts before each deletion (based on reference seq)
    del_seqs['prev_5nts'] = del_seqs['absolute_coords'].apply(_nts_before)
    # record the 5 nts after each deletion (based on reference seq)
    del_seqs['next_5nts'] = del_seqs['absolute_coords'].apply(_nts_after)
    return del_seqs[['type', 'gene', 'absolute_coords', 'del_len', 'pos', 
                     'ref_aa', 'codon_num', 'num_samples',
                     'first_detected', 'last_detected',
                     'location_counts', 'samples',
                     'ref_codon', 'prev_5nts', 'next_5nts'
                     ]]

In [32]:
subs = identify_replacements(msa_fp, meta_fp)

Loading Alignment file at: /home/al/analysis/mutations/alab_git/cns_seqs_aligned.fa
Initial cleaning...
Creating a dataframe...
Identifying mutations...
Mapping Genes to mutations...
Compute codon numbers...
Fetch reference codon...
Fetch alternative codon...
Map amino acids...
Fuse with metadata...


In [33]:
subs.explode('samples')['samples'].unique().shape

(3252,)

In [34]:
subs.sort_values('num_samples', ascending=False).to_csv(out_dir/"replacements_22-12-2020_orig.csv", index=False)

In [37]:
dels = identify_deletions(msa_fp, meta_fp, min_del_len=1)

In [39]:
dels.sort_values('num_samples', ascending=False).to_csv(out_dir/"deletions_22-12-2020_orig.csv", index=False)

In [None]:
align_fasta_reference(seqs_fp, num_cpus=25, ref_fp=ref_fp)

## CNS Mutations Report

In [6]:
analysis_folder = Path('/home/al/code/HCoV-19-Genomics/consensus_sequences/')
meta_fp = Path('/home/al/code/HCoV-19-Genomics/metadata.csv')
ref_path = Path('/home/gk/code/hCoV19/db/NC045512.fasta')
patient_zero = 'NC_045512.2'
in_fp = '/home/al/analysis/mutations/S501Y/msa_aligned.fa'

In [3]:
subs = identify_replacements(in_fp, meta_fp)

In [4]:
subs.head()

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,locations,location_counts,samples
0,3UTR,29679,S,2,F,2,2020-07-14,2020-10-07,USA/California/San Diego,2,"[SEARCH-3119-SAN, SEARCH-4245-SAN]"
1,3UTR,29681,L,3,L,2,2020-07-11,2020-07-21,"[USA/California/Los Angeles, USA/California/Sa...","[1, 1]","[SEARCH-2600-SAN, SEARCH-2692-LAX]"
2,3UTR,29688,S,5,I,4,2020-03-25,2020-10-26,USA/California/San Diego,4,"[SEARCH-0113-SAN, SEARCH-2855-SAN, SEARCH-3609..."
3,3UTR,29690,V,6,L,1,2020-08-13,2020-08-13,USA/California/San Diego,1,[SEARCH-4455-SAN]
4,3UTR,29692,V,6,V,1,2020-10-03,2020-10-03,Jordan/Amman,1,[SEARCH-4034-JOR]


In [5]:
dels = identify_deletions(in_fp, meta_fp, patient_zero)
dels

Unnamed: 0,type,gene,absolute_coords,del_len,pos,ref_aa,codon_num,num_samples,first_detected,last_detected,locations,location_counts,samples,ref_codon,prev_5nts,next_5nts
30,deletion,ORF7a,27538:27572,35,27538,L,49,1,2020-04-28,2020-04-28,MEX/Baja California/Tijuana,1,[hCoV-19/MEX/SEARCH-1480-TIJ/2020],CTA,tcctc,actca
27,deletion,ORF6,27264:27290,27,27264,F,22,1,2020-07-28,2020-07-28,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-3236-SAN/2020],TTT,ggact,tacat
28,deletion,ORF6,27266:27293,28,27266,F,22,1,2020-04-23,2020-04-23,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-1921-SAN/2020],TTT,acttt,atcat
29,deletion,ORF7a,27498:27531,34,27498,S,36,1,2020-05-12,2020-05-12,USA/California/Imperial,1,[hCoV-19/USA/SEARCH-0573-IPL/2020],TCT,cttgc,atcct
59,deletion,ORF1ab,6656:6679,24,6656,N,2131,1,2020-06-13,2020-06-13,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-3559-SAN/2020],AAT,tgtta,ctaat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,deletion,ORF3a,26158:26161,4,26158,V,256,9,2020-07-17,2020-09-14,USA/California/San Diego,9,"[hCoV-19/USA/SEARCH-2907-SAN/2020, hCoV-19/USA...",GTT,agttg,tccag
16,deletion,ORF1ab,516:518,3,516,V,84,10,2020-07-22,2020-08-01,USA/California/San Diego,10,"[hCoV-19/USA/SEARCH-4342-SAN/2020, hCoV-19/USA...",GTT,catgt,ggttg
56,deletion,ORF1ab,686:694,9,686,K,141,17,2020-03-31,2020-10-03,"[Jordan/Amman, USA/California/San Diego, USA/L...","[2, 13, 2]","[hCoV-19/USA/SEARCH-0264-NBG/2020, hCoV-19/USA...",AAG,tctaa,actta
51,deletion,N,28890:28901,12,28890,S,206,23,2020-06-16,2020-08-06,USA/California/San Diego,23,"[hCoV-19/USA/SEARCH-2285-SAN/2020, hCoV-19/USA...",TCT,acttc,ggctg


In [3]:
dels[dels['gene']=='S'].sort_values('num_samples', ascending=False)#.to_csv('S_deletions_consensus.csv', index=False)

NameError: name 'dels' is not defined

In [4]:
identify_insertions(in_fp, patient_zero).to_csv('test.csv', index=False)

## dev 

In [10]:
# Nucleotide span of each annotated region, keyed by region name.
# 'start' is used as a 0-based offset: compute_codon_num, get_ref_codon and
# get_alt_codon add it to a codon offset and index straight into the sequences.
# 'end' is not read by any code in this notebook - presumably exclusive, TODO confirm.
# Note that the ORF7a and ORF7b spans overlap (27755-27759).
GENE2POS = {
            '5UTR': {'start': 0, 'end': 265},
            'ORF1ab': {'start': 265, 'end': 21555},
            'S': {'start': 21562, 'end': 25384},
            'ORF3a': {'start': 25392, 'end': 26220},
            'E': {'start': 26244, 'end': 26472},
            'M': {'start': 26522, 'end': 27191},
            'ORF6': {'start': 27201, 'end': 27387},
            'ORF7a': {'start': 27393, 'end': 27759},
            'ORF7b': {'start': 27755, 'end': 27887},
            'ORF8': {'start': 27893, 'end': 28259},
            'N': {'start': 28273, 'end': 29533},
            'ORF10': {'start': 29557, 'end': 29674},
            '3UTR': {'start': 29674, 'end': 29902}
           }

In [11]:
in_dir = '/home/al/analysis/mutations/fa/'
out_dir = '/home/al/analysis/mutations/msa/'

In [14]:
# WARNING: destructive. Recursively deletes the whole analysis directory
# (everything below /home/al/analysis/mutations, including earlier outputs)
# before recreating it together with the empty fa/ sub-directory.
!rm -r /home/al/analysis/mutations
!mkdir /home/al/analysis/mutations
!mkdir /home/al/analysis/mutations/fa

In [15]:
# Copy every consensus FASTA file (*fa / *fasta) from analysis_folder into the
# working fa/ directory.
# NOTE(review): .listdir() and .endswith() are not pathlib.Path methods, so
# `Path` here is presumably the path.py class pulled in by a star-import - confirm.
for filename in analysis_folder.listdir():
    if (filename.endswith('fa') or filename.endswith('fasta')):
        copy(filename, '/home/al/analysis/mutations/fa/')
#         print(filename)

In [179]:
copy(ref_path, in_dir)

'/home/al/analysis/mutations/fa/NC045512.fasta'

In [180]:
in_dir = '/home/al/analysis/mutations/fa/'
out_dir = '/home/al/analysis/mutations/msa'
concat_fasta(in_dir, out_dir)

'/home/al/analysis/mutations/msa.fa'

In [17]:
align_fasta_reference('/home/al/analysis/mutations/msa.fa',  num_cpus=12, ref_fp=ref_path)

'/home/al/analysis/mutations/msa_aligned.fa'

In [14]:
cns = AlignIO.read('/home/al/analysis/mutations/msa_aligned.fa', 'fasta')

In [15]:
ref_seq = get_seq(cns, patient_zero)

In [16]:
len(ref_seq)

29903

In [17]:
seqs = get_seqs(cns, 0, 30000)

In [18]:
seqsdf = (pd.DataFrame(index=seqs.keys(), data=seqs.values(), columns=['sequence'])
                .reset_index().rename(columns={'index': 'idx'}))

In [19]:
# seqsdf

In [20]:
def find_replacements(x, ref):
    """List the substitutions of sequence `x` relative to `ref`.

    Each entry is a 'position:nucleotide' string with a 0-based position.
    Positions where `x` equals the reference, holds a gap ('-') or holds an
    unknown base ('n') are skipped.
    """
    ignored = ('-', 'n')
    found = []
    for idx, nt in enumerate(x):
        # the reference is looked up first for every position, as before
        if nt == ref[idx] or nt in ignored:
            continue
        found.append(f'{idx}:{nt}')
    return found

In [21]:
seqsdf['replacements'] = seqsdf['sequence'].apply(find_replacements, args=(ref_seq,))

In [22]:
# Wide-to-long: one row per 'position:alt' substitution (samples with an empty
# list explode to a single NaN row). This cell rebinds `seqsdf`, so it is not
# meant to be re-run on its own output.
seqsdf = seqsdf.explode('replacements')
# -1 marks rows without a substitution
seqsdf['pos'] = -1
# parse the 0-based position out of every non-missing 'position:alt' token
seqsdf.loc[~seqsdf['replacements'].isna(), 'pos'] = seqsdf.loc[~seqsdf['replacements'].isna(), 'replacements'].apply(lambda x: int(x.split(':')[0]))
# drop the rows that still carry the -1 sentinel
seqsdf = seqsdf.loc[seqsdf['pos']!=-1]

In [23]:
def compute_codon_num(x, gene2pos: dict):
    """Return the 1-based codon number of a substitution within its gene.

    x: row-like object providing 'pos' (nucleotide position) and 'gene'.
    gene2pos: mapping gene name -> {'start': ...}; 'start' uses the same
        coordinate system as 'pos'.
    """
    nt_pos = x['pos']
    gene_start = gene2pos[x['gene']]['start']
    # 1-based rank of the nucleotide inside the gene
    rank_in_gene = nt_pos - gene_start + 1
    return math.ceil(rank_in_gene / 3)

In [24]:
seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
seqsdf = seqsdf.loc[~seqsdf['gene'].isna()]
seqsdf = seqsdf.loc[seqsdf['gene']!='nan']
seqsdf['codon_num'] = seqsdf.apply(compute_codon_num, args=(GENE2POS,), axis=1)

In [25]:
def get_ref_codon(x, ref_seq, gene2pos: dict):
    """Return the upper-cased reference codon holding a substitution.

    x: row-like object providing 'gene' and the 1-based 'codon_num'.
    ref_seq: reference sequence, sliced with the gene's 'start' offset.
    gene2pos: mapping gene name -> {'start': ...}.
    """
    gene_start = gene2pos[x['gene']]['start']
    # offset of the codon's first nucleotide inside the gene
    offset = (x['codon_num'] - 1) * 3
    first_nt = gene_start + offset
    codon = ref_seq[first_nt: first_nt + 3]
    return codon.upper()
seqsdf['ref_codon'] = seqsdf.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)

In [26]:
def get_alt_codon(x, gene2pos: dict):
    """Return the upper-cased codon of the sample sequence at a substitution.

    x: row-like object providing 'gene', the 1-based 'codon_num' and the
        sample's own 'sequence'.
    gene2pos: mapping gene name -> {'start': ...}.
    """
    gene_start = gene2pos[x['gene']]['start']
    # offset of the codon's first nucleotide inside the gene
    offset = (x['codon_num'] - 1) * 3
    first_nt = gene_start + offset
    codon = x['sequence'][first_nt: first_nt + 3]
    return codon.upper()
seqsdf['alt_codon'] = seqsdf.apply(get_alt_codon, args=(GENE2POS,), axis=1)

In [27]:
# Codon -> one-letter amino acid table ('_' marks a stop codon). Built once at
# definition time instead of on every get_aa() call, since get_aa is applied
# row by row over large frames.
_CODON2AA = { 
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                  
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
    'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 
    'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', 
} 


def get_aa(codon: str):
    """Translate an upper-case DNA codon into its one-letter amino acid.

    Returns '_' for stop codons and the string 'nan' for anything that is not
    one of the 64 upper-case A/C/G/T triplets (gaps, ambiguous bases,
    truncated or lower-case codons).
    """
    return _CODON2AA.get(codon, 'nan')
seqsdf['ref_aa'] = seqsdf['ref_codon'].apply(get_aa)
seqsdf['alt_aa'] = seqsdf['alt_codon'].apply(get_aa)
seqsdf = seqsdf.loc[seqsdf['alt_aa']!='nan']

In [28]:
seqsdf.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa'],
      dtype='object')

In [29]:
meta = pd.read_csv(meta_fp)
print(seqsdf['idx'].unique().shape)
seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='fasta_hdr')
print(seqsdf['idx'].unique().shape)

(2765,)
(2765,)


In [30]:
# Drop rows whose collection date is one of the placeholders 'Unknown' / '1900-01-00'.
seqsdf = seqsdf.loc[(seqsdf['collection_date']!='Unknown') 
                   & (seqsdf['collection_date']!='1900-01-00')]
# For dates written as 'a/b', keep only the part before the slash.
seqsdf.loc[seqsdf['collection_date'].str.contains('/'), 'collection_date'] = seqsdf['collection_date'].apply(lambda x: x.split('/')[0])
# Parsed timestamp used for the first/last detection aggregates.
seqsdf['date'] = pd.to_datetime(seqsdf['collection_date'])

In [31]:
seqsdf['date'].min()

Timestamp('2020-03-04 00:00:00')

In [32]:
# (seqsdf.groupby(['gene', 'ref_aa', 'codon_num', 'alt_aa'])
# .agg(
#      num_samples=('ID', 'nunique')))

In [35]:
def uniq_locs(x):
    """Return the sorted distinct values of `x` as a NumPy array."""
    distinct = np.unique(x)
    return distinct
def loc_counts(x):
    """Return how often each distinct value of `x` occurs.

    The counts follow the sorted order of the distinct values, i.e. they line
    up with what `np.unique(x)` returns.
    """
    return np.unique(x, return_counts=True)[1]

In [37]:
# One row per distinct substitution (gene, position, reference amino acid,
# codon number, alternative amino acid) with per-substitution summaries.
subs = (seqsdf.groupby(['gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa'])
.agg(
     # number of distinct samples carrying the substitution
     num_samples=('ID', 'nunique'),
     # earliest / latest collection date among those samples
     first_detected=('date', 'min'),
     last_detected=('date', 'max'),
     # distinct locations and, in the same order, the number of rows per location
     locations=('location', uniq_locs),
     location_counts=('location', loc_counts),
     # IDs of the samples carrying the substitution
     samples=('ID', 'unique')
    )
.reset_index())
# switch from 0-based to 1-based nucleotide positions
subs['pos'] = subs['pos'] + 1

In [175]:
(subs[subs['gene']=='S'].sort_values('num_samples', ascending=False)
 .to_csv('S_mutations_consensus.csv', index=False))

## Consolidate metadata ID and fasta headers

In [134]:
def fix_header(x):
    """Extract the sample identifier from a FASTA header.

    Headers containing 'Consensus' yield their second '_'-separated field;
    all other headers yield their third '/'-separated field.
    """
    separator, field = ('_', 1) if 'Consensus' in x else ('/', 2)
    return x.split(separator)[field]
seqsdf['n_ID'] = seqsdf['idx'].apply(fix_header)

In [135]:
seqsdf['n_ID'] = seqsdf['n_ID'].str.replace('ALSR', 'SEARCH')

In [136]:
meta = pd.read_csv(meta_fp)
meta['n_ID'] = meta['ID'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [137]:
seqsdf['n_ID'] = seqsdf['n_ID'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [138]:
tmp = pd.merge(seqsdf, meta, on='n_ID')

In [122]:
# tmp[tmp['ID'].str.contains('2112')]

In [98]:
# seqsdf

In [139]:
set(meta['n_ID'].unique()) - set(tmp['n_ID'].unique())

{'SEARCH-1668'}

In [140]:
seqsdf['idx'].unique().shape

(2765,)

In [141]:
meta['ID'].unique().shape

(2766,)

In [147]:
s = seqsdf[['n_ID', 'idx']].drop_duplicates()

In [151]:
# Left-join so that every metadata row is kept, even when no FASTA header
# matched its normalised ID (those rows get a missing header).
new_meta = pd.merge(meta, s, on='n_ID', how='left')
# Drop the helper join key, expose the matched header as `fasta_hdr` and write
# the result to 'metadata.csv' in the current working directory.
(new_meta.drop(columns=['n_ID'])
.rename(columns={'idx': 'fasta_hdr'})
.to_csv('metadata.csv', index=False))

In [152]:
new_meta.shape 

(2766, 11)

In [153]:
new_meta

Unnamed: 0,ID,gb_accession,gisaid_accession,collection_date,location,percent_coverage_cds,avg_depth,authors,originating_lab,n_ID,idx
0,MG0987,MT598172,EPI_ISL_416457,2020-03-18,USA/California/San Diego,99.5954,2465.60,SEARCH Alliance San Diego,Andersen lab at Scripps Research,MG0987,Consensus_MG0987
1,PC00101P,MT192765,EPI_ISL_414648,2020-03-11,USA/California/San Diego,99.7525,3516.14,SEARCH Alliance San Diego,Andersen lab at Scripps Research,PC00101P,Consensus_PC00101P_threshold_0_quality_20
2,SEARCH-0007-SAN,MT598171,EPI_ISL_429990,2020-03-21,USA/California/San Diego,100.0000,6215.17,SEARCH Alliance San Diego with Christina Clark...,Rady's Childrens Hospital,SEARCH-0007,Consensus_SEARCH-0007-SAN_L1_threshold_0_quali...
3,SEARCH-0016-SAN,MT598173,EPI_ISL_430016,2020-03-24,USA/California/San Diego,100.0000,6440.67,SEARCH Alliance San Diego,Andersen lab at Scripps Research,SEARCH-0016,Consensus_SEARCH-0016-SAN_L1_threshold_0_quali...
4,SEARCH-0017-SAN,MT598174,EPI_ISL_429991,2020-03-24,USA/California/San Diego,100.0000,4947.09,SEARCH Alliance San Diego,Andersen lab at Scripps Research,SEARCH-0017,Consensus_SEARCH-0017-SAN_L1_threshold_0_quali...
...,...,...,...,...,...,...,...,...,...,...,...
2761,SEARCH-4685-SAN,,,2020-11-02,USA/California/San Diego,100.0000,4831.37,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4685,hCoV-19/USA/SEARCH-4685-SAN/2020
2762,SEARCH-4686-SAN,,,2020-11-05,USA/California/San Diego,100.0000,3864.73,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4686,hCoV-19/USA/SEARCH-4686-SAN/2020
2763,SEARCH-4687-ORA,,,2020-11-02,USA/California/Orange,98.1400,3123.54,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4687,hCoV-19/USA/SEARCH-4687-ORA/2020
2764,SEARCH-4690-SAN,,,2020-05-28,USA/California/San Diego,98.7283,2625.16,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4690,hCoV-19/USA/SEARCH-4690-SAN/2020


In [81]:
len(ref_seq)

29903