## TODO
* `gb_accession` and `gisaid_accession` are not found for new sequences, how do we concat to `metadata.csv` without them?
* metadata format for NCBI
* support tools for manual sanity checks

In [2]:
from bjorn import *
from bjorn_support import *
from onion_trees import *
import gffutils
import math
from mutations import *

In [9]:

meta_fp = "/home/al/analysis/mutations/S501Y/metadata_2020-12-20_12-24.tsv"
out_dir = "/home/al/analysis/mutations/S501Y/"
ref_fp = "/home/al/data/test_inputs/NC045512.fasta"
patient_zero = 'NC_045512.2'

In [4]:
input_fasta = "/home/al/analysis/mutations/S501Y/msa_reference.fa"
## keep only seqs contained in meta_file and save to fasta file
## concat with internal SD file
## generate MSA
meta = pd.read_csv(meta_fp, sep='\t')
meta.columns

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Index(['strain', 'virus', 'gisaid_epi_isl', 'genbank_accession', 'date',
       'region', 'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'Nextstrain_clade', 'pangolin_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted'],
      dtype='object')

In [5]:
# consensus_data = SeqIO.to_dict(SeqIO.parse(seqs_fp, "fasta"))

In [6]:
strains = meta['strain'].unique().tolist()
len(strains)

273267

In [7]:
print(f"Loading Alignment file at: {input_fasta}")
cns = AlignIO.read(input_fasta, 'fasta')

Loading Alignment file at: /home/al/analysis/mutations/S501Y/msa_reference.fa


In [10]:
print(f"Initial cleaning...")
seqs, ref_seq = process_cns_seqs(cns, patient_zero,
                                 start_pos=0, end_pos=30000)

Initial cleaning...


In [11]:
print(f"Creating a dataframe...")
seqsdf = (pd.DataFrame(index=seqs.keys(), 
                       data=seqs.values(), 
                       columns=['sequence'])
            .reset_index()
            .rename(columns={'index': 'idx'}))

Creating a dataframe...


In [None]:
def find_replacements(x, ref):
    return [f'{i}:{n}' for i, n in enumerate(x)
            if n!=ref[i] and n!='-' and n!='n']

In [13]:
print(f"Identifying mutations...")
# for each sample, identify list of substitutions (position:alt)
seqsdf['replacements'] = seqsdf['sequence'].apply(find_replacements, args=(ref_seq,))

Identifying mutations...


In [14]:
# wide-to-long data manipulation
seqsdf = seqsdf.explode('replacements')

In [15]:
seqsdf

Unnamed: 0,idx,sequence,replacements
0,NC_045512.2,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,
1,Wuhan/IVDC-HB-01/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,
2,Wuhan/IVDC-HB-04/2020,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,27492:t
2,Wuhan/IVDC-HB-04/2020,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,28252:t
3,Wuhan/IVDC-HB-05/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,20669:a
...,...,...,...
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,18376:t
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,23402:g
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,23592:t
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,25562:t


In [16]:
seqsdf['pos'] = -1
# populate position column
seqsdf.loc[~seqsdf['replacements'].isna(), 'pos'] = (seqsdf.loc[~seqsdf['replacements'].isna(), 'replacements']
   .apply(lambda x: int(x.split(':')[0])))

In [17]:

# filter out non-substitutions
seqsdf = seqsdf.loc[seqsdf['pos']!=-1]
print(f"Mapping Genes to mutations...")
# identify gene of each substitution
seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
seqsdf = seqsdf.loc[~seqsdf['gene'].isna()]

Mapping Genes to mutations...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)


In [18]:
seqsdf

Unnamed: 0,idx,sequence,replacements,pos,gene
2,Wuhan/IVDC-HB-04/2020,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,27492:t,27492,ORF7a
2,Wuhan/IVDC-HB-04/2020,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,28252:t,28252,ORF8
3,Wuhan/IVDC-HB-05/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,20669:a,20669,ORF1ab
3,Wuhan/IVDC-HB-05/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,20678:a,20678,ORF1ab
4,Wuhan/IPBCAMS-WH-01/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,3777:g,3777,ORF1ab
...,...,...,...,...,...
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,18376:t,18376,ORF1ab
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,23402:g,23402,S
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,23592:t,23592,S
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,25562:t,25562,ORF3a


In [22]:
# filter our substitutions in non-gene positions
seqsdf = seqsdf.loc[seqsdf['gene']!='nan']
print(f"Compute codon numbers...")

# compute codon number of each substitution
seqsdf['codon_num'] = seqsdf.apply(compute_codon_num, args=(GENE2POS,), axis=1)


Compute codon numbers...


In [23]:

print(f"Fetch reference codon...")
# fetch the reference codon for each substitution
seqsdf['ref_codon'] = seqsdf.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)

Fetch reference codon...


In [24]:

print(f"Fetch alternative codon...")
# fetch the alternative codon for each substitution
seqsdf['alt_codon'] = seqsdf.apply(get_alt_codon, args=(GENE2POS,), axis=1)

Fetch alternative codon...


In [25]:
print(f"Map amino acids...")
# fetch the reference and alternative amino acids
seqsdf['ref_aa'] = seqsdf['ref_codon'].apply(get_aa)
seqsdf['alt_aa'] = seqsdf['alt_codon'].apply(get_aa)


# filter out substitutions with non-amino acid alternates (bad consensus calls)
seqsdf = seqsdf.loc[seqsdf['alt_aa']!='nan']

Map amino acids...


In [28]:
seqsdf

Unnamed: 0,idx,sequence,replacements,pos,gene,codon_num,ref_codon,alt_codon,ref_aa,alt_aa
2,Wuhan/IVDC-HB-04/2020,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,27492:t,27492,ORF7a,34,CCT,TCT,P,S
2,Wuhan/IVDC-HB-04/2020,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,28252:t,28252,ORF8,120,TTC,TTT,F,F
3,Wuhan/IVDC-HB-05/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,20669:a,20669,ORF1ab,6802,CGT,CAT,R,H
3,Wuhan/IVDC-HB-05/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,20678:a,20678,ORF1ab,6805,CGG,CAG,R,Q
4,Wuhan/IPBCAMS-WH-01/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,3777:g,3777,ORF1ab,1171,ACA,ACG,T,T
...,...,...,...,...,...,...,...,...,...,...
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,18376:t,18376,ORF1ab,6038,CAG,TAG,Q,_
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,23402:g,23402,S,614,GAT,GGT,D,G
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,23592:t,23592,S,677,CAG,CAT,Q,H
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,25562:t,25562,ORF3a,57,CAG,CAT,Q,H


In [31]:
meta.columns

Index(['strain', 'virus', 'gisaid_epi_isl', 'genbank_accession', 'date',
       'region', 'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'Nextstrain_clade', 'pangolin_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted'],
      dtype='object')

In [33]:
meta['strain']

0            Algeria/G0638_2264/2020
1            Algeria/G0640_2265/2020
2            Algeria/G0860_2262/2020
3                Andorra/202552/2020
4                   Anhui/SZ005/2020
                     ...            
273262    tiger/USA/NY-2-040420/2020
273263    tiger/USA/NY-3-040420/2020
273264    tiger/USA/NY-4-040420/2020
273265          tiger/USA/NY-P3/2020
273266          tiger/USA/NY-P4/2020
Name: strain, Length: 273267, dtype: object

In [32]:
seqsdf

Unnamed: 0,idx,sequence,replacements,pos,gene,codon_num,ref_codon,alt_codon,ref_aa,alt_aa
2,Wuhan/IVDC-HB-04/2020,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,27492:t,27492,ORF7a,34,CCT,TCT,P,S
2,Wuhan/IVDC-HB-04/2020,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,28252:t,28252,ORF8,120,TTC,TTT,F,F
3,Wuhan/IVDC-HB-05/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,20669:a,20669,ORF1ab,6802,CGT,CAT,R,H
3,Wuhan/IVDC-HB-05/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,20678:a,20678,ORF1ab,6805,CGG,CAG,R,Q
4,Wuhan/IPBCAMS-WH-01/2019,attaaaggtttataccttcccaggtaacaaaccaaccaactttcga...,3777:g,3777,ORF1ab,1171,ACA,ACG,T,T
...,...,...,...,...,...,...,...,...,...,...
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,18376:t,18376,ORF1ab,6038,CAG,TAG,Q,_
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,23402:g,23402,S,614,GAT,GGT,D,G
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,23592:t,23592,S,677,CAG,CAT,Q,H
273267,Canada/ON-SLB3080/2020,----------------------------------------------...,25562:t,25562,ORF3a,57,CAG,CAT,Q,H


In [34]:
print(f"Fuse with metadata...")
# load and join metadata
meta = pd.read_csv(meta_fp, sep='\t')
seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='strain')

Fuse with metadata...


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [None]:
# clean and process sample collection dates
seqsdf = seqsdf.loc[(seqsdf['collection_date']!='Unknown') 
               & (seqsdf['collection_date']!='1900-01-00')]
seqsdf.loc[seqsdf['collection_date'].str.contains('/'), 'collection_date'] = seqsdf['collection_date'].apply(lambda x: x.split('/')[0])
seqsdf['date'] = pd.to_datetime(seqsdf['collection_date'])

In [37]:
seqsdf['date'] = pd.to_datetime(seqsdf['date_submitted'])

In [39]:
seqsdf.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa', 'strain', 'virus',
       'gisaid_epi_isl', 'genbank_accession', 'date', 'region', 'country',
       'division', 'location', 'region_exposure', 'country_exposure',
       'division_exposure', 'segment', 'length', 'host', 'age', 'sex',
       'Nextstrain_clade', 'pangolin_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted'],
      dtype='object')

In [52]:
print(f"Aggregate final results...")
# aggregate on each substitutions, compute number of samples and other attributes
subs = (seqsdf.groupby(['gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa'])
.agg(
 num_samples=('idx', 'nunique'),
 first_detected=('date', 'min'),
 last_detected=('date', 'max'),
 num_locations=('location', 'nunique'),
 locations=('location', 'unique'),
 num_countries=('country', 'nunique'),
 countries=('country', 'unique'),
#  location_counts=('location', loc_counts)
)
.reset_index())
# 1-based nucleotide position coordinate system
subs['pos'] = subs['pos'] + 1

Aggregate final results...


In [53]:
subs.sort_values('num_samples', ascending=False)

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,num_locations,locations,num_countries,countries
42231,S,23403,D,614,G,249395,2020-01-31,2020-12-19,2158,"[Starnberg, nan, São Paulo, Wuhan, Milan, Berl...",129,"[Germany, Brazil, Finland, Mexico, Italy, Chin..."
10332,ORF1ab,3037,F,924,F,248474,2020-01-24,2020-12-19,2149,"[Pu'Er, Starnberg, nan, São Paulo, Xishuangban...",129,"[China, Germany, Brazil, Finland, Mexico, Ital..."
25627,ORF1ab,14408,L,4715,L,248394,2020-02-28,2020-12-19,2142,"[nan, São Paulo, Milan, Berlicum, Blaricum, De...",128,"[Germany, Brazil, Finland, Mexico, Italy, Swit..."
2249,5UTR,241,R,81,C,242249,2020-01-31,2020-12-19,2133,"[Starnberg, nan, São Paulo, Milan, Berlicum, B...",128,"[Germany, China, Brazil, Finland, Mexico, Ital..."
4335,N,28881,R,203,K,98146,2020-02-28,2020-12-19,1013,"[nan, Berlicum, Delft, Diemen, Eindhoven, Helm...",104,"[Germany, Mexico, Switzerland, United Kingdom,..."
...,...,...,...,...,...,...,...,...,...,...,...,...
33595,ORF1ab,20733,V,6823,G,1,2020-05-02,2020-05-02,0,[nan],1,[South Africa]
25096,ORF1ab,13999,C,4578,S,1,2020-06-29,2020-06-29,0,[nan],1,[India]
33597,ORF1ab,20735,T,6824,A,1,2020-09-16,2020-09-16,0,[nan],1,[Singapore]
9132,ORF1ab,2344,I,693,I,1,2020-08-30,2020-08-30,0,[nan],1,[Dominican Republic]


In [54]:
subs.loc[(subs['gene']=='S')&(subs['codon_num']==501)]

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,num_locations,locations,num_countries,countries
41777,S,23063,N,501,D,1,2020-01-24,2020-01-24,1,[Pu'Er],1,[China]
41778,S,23063,N,501,H,1,2020-06-23,2020-06-23,0,[nan],1,[United Kingdom]
41779,S,23063,N,501,V,1,2020-03-02,2020-03-02,1,[Xishuangbanna],1,[China]
41780,S,23063,N,501,Y,2419,2020-06-01,2020-12-19,20,"[Queens, nan, Recife, Yamhill County, Winnebag...",6,"[USA, Australia, Brazil, United Kingdom, Denma..."
41781,S,23064,N,501,S,8,2020-06-11,2020-11-18,1,"[nan, Rock County]",3,"[United Kingdom, USA, Australia]"
41782,S,23064,N,501,T,54,2020-02-17,2020-12-18,6,"[nan, iLembe, Taylor County WI, New York City,...",11,"[China, Netherlands, Germany, South Africa, It..."
41783,S,23064,N,501,V,1,2020-03-02,2020-03-02,1,[Xishuangbanna],1,[China]
41784,S,23065,N,501,N,5,2020-07-14,2020-11-06,0,[nan],2,"[United Kingdom, Australia]"
41785,S,23065,N,501,T,5,2020-02-17,2020-02-17,0,[nan],1,[China]
41786,S,23065,N,501,V,1,2020-03-02,2020-03-02,1,[Xishuangbanna],1,[China]


In [55]:
subs.loc[(subs['gene']=='S')&(subs['codon_num']==501)&(subs['alt_aa']=='Y'), 'countries'].values

array([array(['USA', 'Australia', 'Brazil', 'United Kingdom', 'Denmark',
       'South Africa'], dtype=object)], dtype=object)

In [7]:
identify_replacements(seqs_fp, meta_fp)

KeyboardInterrupt: 

In [None]:
align_fasta_reference(seqs_fp, num_cpus=25, ref_fp=ref_fp)

## CNS Mutations Report

In [6]:
analysis_folder = Path('/home/al/code/HCoV-19-Genomics/consensus_sequences/')
meta_fp = Path('/home/al/code/HCoV-19-Genomics/metadata.csv')
ref_path = Path('/home/gk/code/hCoV19/db/NC045512.fasta')
patient_zero = 'NC_045512.2'
in_fp = '/home/al/analysis/mutations/S501Y/msa_aligned.fa'

In [3]:
subs = identify_replacements(in_fp, meta_fp)

In [4]:
subs.head()

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,locations,location_counts,samples
0,3UTR,29679,S,2,F,2,2020-07-14,2020-10-07,USA/California/San Diego,2,"[SEARCH-3119-SAN, SEARCH-4245-SAN]"
1,3UTR,29681,L,3,L,2,2020-07-11,2020-07-21,"[USA/California/Los Angeles, USA/California/Sa...","[1, 1]","[SEARCH-2600-SAN, SEARCH-2692-LAX]"
2,3UTR,29688,S,5,I,4,2020-03-25,2020-10-26,USA/California/San Diego,4,"[SEARCH-0113-SAN, SEARCH-2855-SAN, SEARCH-3609..."
3,3UTR,29690,V,6,L,1,2020-08-13,2020-08-13,USA/California/San Diego,1,[SEARCH-4455-SAN]
4,3UTR,29692,V,6,V,1,2020-10-03,2020-10-03,Jordan/Amman,1,[SEARCH-4034-JOR]


In [5]:
dels = identify_deletions(in_fp, meta_fp, patient_zero)
dels

Unnamed: 0,type,gene,absolute_coords,del_len,pos,ref_aa,codon_num,num_samples,first_detected,last_detected,locations,location_counts,samples,ref_codon,prev_5nts,next_5nts
30,deletion,ORF7a,27538:27572,35,27538,L,49,1,2020-04-28,2020-04-28,MEX/Baja California/Tijuana,1,[hCoV-19/MEX/SEARCH-1480-TIJ/2020],CTA,tcctc,actca
27,deletion,ORF6,27264:27290,27,27264,F,22,1,2020-07-28,2020-07-28,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-3236-SAN/2020],TTT,ggact,tacat
28,deletion,ORF6,27266:27293,28,27266,F,22,1,2020-04-23,2020-04-23,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-1921-SAN/2020],TTT,acttt,atcat
29,deletion,ORF7a,27498:27531,34,27498,S,36,1,2020-05-12,2020-05-12,USA/California/Imperial,1,[hCoV-19/USA/SEARCH-0573-IPL/2020],TCT,cttgc,atcct
59,deletion,ORF1ab,6656:6679,24,6656,N,2131,1,2020-06-13,2020-06-13,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-3559-SAN/2020],AAT,tgtta,ctaat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,deletion,ORF3a,26158:26161,4,26158,V,256,9,2020-07-17,2020-09-14,USA/California/San Diego,9,"[hCoV-19/USA/SEARCH-2907-SAN/2020, hCoV-19/USA...",GTT,agttg,tccag
16,deletion,ORF1ab,516:518,3,516,V,84,10,2020-07-22,2020-08-01,USA/California/San Diego,10,"[hCoV-19/USA/SEARCH-4342-SAN/2020, hCoV-19/USA...",GTT,catgt,ggttg
56,deletion,ORF1ab,686:694,9,686,K,141,17,2020-03-31,2020-10-03,"[Jordan/Amman, USA/California/San Diego, USA/L...","[2, 13, 2]","[hCoV-19/USA/SEARCH-0264-NBG/2020, hCoV-19/USA...",AAG,tctaa,actta
51,deletion,N,28890:28901,12,28890,S,206,23,2020-06-16,2020-08-06,USA/California/San Diego,23,"[hCoV-19/USA/SEARCH-2285-SAN/2020, hCoV-19/USA...",TCT,acttc,ggctg


In [3]:
dels[dels['gene']=='S'].sort_values('num_samples', ascending=False)#.to_csv('S_deletions_consensus.csv', index=False)

NameError: name 'dels' is not defined

In [4]:
identify_insertions(in_fp, patient_zero).to_csv('test.csv', index=False)

## dev 

In [10]:
GENE2POS = {
            '5UTR': {'start': 0, 'end': 265},
            'ORF1ab': {'start': 265, 'end': 21555},
            'S': {'start': 21562, 'end': 25384},
            'ORF3a': {'start': 25392, 'end': 26220},
            'E': {'start': 26244, 'end': 26472},
            'M': {'start': 26522, 'end': 27191},
            'ORF6': {'start': 27201, 'end': 27387},
            'ORF7a': {'start': 27393, 'end': 27759},
            'ORF7b': {'start': 27755, 'end': 27887},
            'ORF8': {'start': 27893, 'end': 28259},
            'N': {'start': 28273, 'end': 29533},
            'ORF10': {'start': 29557, 'end': 29674},
            '3UTR': {'start': 29674, 'end': 29902}
           }

In [11]:
in_dir = '/home/al/analysis/mutations/fa/'
out_dir = '/home/al/analysis/mutations/msa/'

In [14]:
!rm -r /home/al/analysis/mutations
!mkdir /home/al/analysis/mutations
!mkdir /home/al/analysis/mutations/fa

In [15]:
for filename in analysis_folder.listdir():
    if (filename.endswith('fa') or filename.endswith('fasta')):
        copy(filename, '/home/al/analysis/mutations/fa/')
#         print(filename)

In [179]:
copy(ref_path, in_dir)

'/home/al/analysis/mutations/fa/NC045512.fasta'

In [180]:
in_dir = '/home/al/analysis/mutations/fa/'
out_dir = '/home/al/analysis/mutations/msa'
concat_fasta(in_dir, out_dir)

'/home/al/analysis/mutations/msa.fa'

In [17]:
align_fasta_reference('/home/al/analysis/mutations/msa.fa',  num_cpus=12, ref_fp=ref_path)

'/home/al/analysis/mutations/msa_aligned.fa'

In [14]:
cns = AlignIO.read('/home/al/analysis/mutations/msa_aligned.fa', 'fasta')

In [15]:
ref_seq = get_seq(cns, patient_zero)

In [16]:
len(ref_seq)

29903

In [17]:
seqs = get_seqs(cns, 0, 30000)

In [18]:
seqsdf = (pd.DataFrame(index=seqs.keys(), data=seqs.values(), columns=['sequence'])
                .reset_index().rename(columns={'index': 'idx'}))

In [19]:
# seqsdf

In [20]:
def find_replacements(x, ref):
    return [f'{i}:{n}' for i, n in enumerate(x) 
            if n!=ref[i] and n!='-' and n!='n']

In [21]:
seqsdf['replacements'] = seqsdf['sequence'].apply(find_replacements, args=(ref_seq,))

In [22]:
seqsdf = seqsdf.explode('replacements')
seqsdf['pos'] = -1
seqsdf.loc[~seqsdf['replacements'].isna(), 'pos'] = seqsdf.loc[~seqsdf['replacements'].isna(), 'replacements'].apply(lambda x: int(x.split(':')[0]))
seqsdf = seqsdf.loc[seqsdf['pos']!=-1]

In [23]:
def compute_codon_num(x, gene2pos: dict):
    pos = x['pos']
    ref_pos = gene2pos[x['gene']]['start']
    return math.ceil((pos - ref_pos + 1) / 3)

In [24]:
seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
seqsdf = seqsdf.loc[~seqsdf['gene'].isna()]
seqsdf = seqsdf.loc[seqsdf['gene']!='nan']
seqsdf['codon_num'] = seqsdf.apply(compute_codon_num, args=(GENE2POS,), axis=1)

In [25]:
def get_ref_codon(x, ref_seq, gene2pos: dict):
    ref_pos = gene2pos[x['gene']]['start']
    codon_start = ref_pos + ((x['codon_num'] - 1) * 3)
    return ref_seq[codon_start: codon_start+3].upper()
seqsdf['ref_codon'] = seqsdf.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)

In [26]:
def get_alt_codon(x, gene2pos: dict):
    ref_pos = gene2pos[x['gene']]['start']
    codon_start = ref_pos + ((x['codon_num'] - 1) * 3)
    return x['sequence'][codon_start: codon_start+3].upper()
seqsdf['alt_codon'] = seqsdf.apply(get_alt_codon, args=(GENE2POS,), axis=1)

In [27]:
def get_aa(codon: str):
    CODON2AA = { 
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                  
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', 
    } 
    return CODON2AA.get(codon, 'nan')
seqsdf['ref_aa'] = seqsdf['ref_codon'].apply(get_aa)
seqsdf['alt_aa'] = seqsdf['alt_codon'].apply(get_aa)
seqsdf = seqsdf.loc[seqsdf['alt_aa']!='nan']

In [28]:
seqsdf.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa'],
      dtype='object')

In [29]:
meta = pd.read_csv(meta_fp)
print(seqsdf['idx'].unique().shape)
seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='fasta_hdr')
print(seqsdf['idx'].unique().shape)

(2765,)
(2765,)


In [30]:
seqsdf = seqsdf.loc[(seqsdf['collection_date']!='Unknown') 
                   & (seqsdf['collection_date']!='1900-01-00')]
seqsdf.loc[seqsdf['collection_date'].str.contains('/'), 'collection_date'] = seqsdf['collection_date'].apply(lambda x: x.split('/')[0])
seqsdf['date'] = pd.to_datetime(seqsdf['collection_date'])

In [31]:
seqsdf['date'].min()

Timestamp('2020-03-04 00:00:00')

In [32]:
# (seqsdf.groupby(['gene', 'ref_aa', 'codon_num', 'alt_aa'])
# .agg(
#      num_samples=('ID', 'nunique')))

In [35]:
def uniq_locs(x):
    return np.unique(x)
def loc_counts(x):
    _, counts = np.unique(x, return_counts=True)
    return counts

In [37]:
subs = (seqsdf.groupby(['gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa'])
.agg(
     num_samples=('ID', 'nunique'),
     first_detected=('date', 'min'),
     last_detected=('date', 'max'),
     locations=('location', uniq_locs),
     location_counts=('location', loc_counts),
     samples=('ID', 'unique')
    )
.reset_index())
subs['pos'] = subs['pos'] + 1

In [175]:
(subs[subs['gene']=='S'].sort_values('num_samples', ascending=False)
 .to_csv('S_mutations_consensus.csv', index=False))

## Consolidate metadata ID and fasta headers

In [134]:
def fix_header(x):
    if 'Consensus' in x:
        return x.split('_')[1]
    else:
        return x.split('/')[2]
seqsdf['n_ID'] = seqsdf['idx'].apply(fix_header)

In [135]:
seqsdf['n_ID'] = seqsdf['n_ID'].str.replace('ALSR', 'SEARCH')

In [136]:
meta = pd.read_csv(meta_fp)
meta['n_ID'] = meta['ID'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [137]:
seqsdf['n_ID'] = seqsdf['n_ID'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [138]:
tmp = pd.merge(seqsdf, meta, on='n_ID')

In [122]:
# tmp[tmp['ID'].str.contains('2112')]

In [98]:
# seqsdf

In [139]:
set(meta['n_ID'].unique()) - set(tmp['n_ID'].unique())

{'SEARCH-1668'}

In [140]:
seqsdf['idx'].unique().shape

(2765,)

In [141]:
meta['ID'].unique().shape

(2766,)

In [147]:
s = seqsdf[['n_ID', 'idx']].drop_duplicates()

In [151]:
new_meta = pd.merge(meta, s, on='n_ID', how='left')
(new_meta.drop(columns=['n_ID'])
.rename(columns={'idx': 'fasta_hdr'})
.to_csv('metadata.csv', index=False))

In [152]:
new_meta.shape 

(2766, 11)

In [153]:
new_meta

Unnamed: 0,ID,gb_accession,gisaid_accession,collection_date,location,percent_coverage_cds,avg_depth,authors,originating_lab,n_ID,idx
0,MG0987,MT598172,EPI_ISL_416457,2020-03-18,USA/California/San Diego,99.5954,2465.60,SEARCH Alliance San Diego,Andersen lab at Scripps Research,MG0987,Consensus_MG0987
1,PC00101P,MT192765,EPI_ISL_414648,2020-03-11,USA/California/San Diego,99.7525,3516.14,SEARCH Alliance San Diego,Andersen lab at Scripps Research,PC00101P,Consensus_PC00101P_threshold_0_quality_20
2,SEARCH-0007-SAN,MT598171,EPI_ISL_429990,2020-03-21,USA/California/San Diego,100.0000,6215.17,SEARCH Alliance San Diego with Christina Clark...,Rady's Childrens Hospital,SEARCH-0007,Consensus_SEARCH-0007-SAN_L1_threshold_0_quali...
3,SEARCH-0016-SAN,MT598173,EPI_ISL_430016,2020-03-24,USA/California/San Diego,100.0000,6440.67,SEARCH Alliance San Diego,Andersen lab at Scripps Research,SEARCH-0016,Consensus_SEARCH-0016-SAN_L1_threshold_0_quali...
4,SEARCH-0017-SAN,MT598174,EPI_ISL_429991,2020-03-24,USA/California/San Diego,100.0000,4947.09,SEARCH Alliance San Diego,Andersen lab at Scripps Research,SEARCH-0017,Consensus_SEARCH-0017-SAN_L1_threshold_0_quali...
...,...,...,...,...,...,...,...,...,...,...,...
2761,SEARCH-4685-SAN,,,2020-11-02,USA/California/San Diego,100.0000,4831.37,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4685,hCoV-19/USA/SEARCH-4685-SAN/2020
2762,SEARCH-4686-SAN,,,2020-11-05,USA/California/San Diego,100.0000,3864.73,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4686,hCoV-19/USA/SEARCH-4686-SAN/2020
2763,SEARCH-4687-ORA,,,2020-11-02,USA/California/Orange,98.1400,3123.54,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4687,hCoV-19/USA/SEARCH-4687-ORA/2020
2764,SEARCH-4690-SAN,,,2020-05-28,USA/California/San Diego,98.7283,2625.16,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4690,hCoV-19/USA/SEARCH-4690-SAN/2020


In [81]:
len(ref_seq)

29903