## TODO
* `gb_accession` and `gisaid_accession` are not found for new sequences, how do we concat to `metadata.csv` without them?
* metadata format for NCBI
* support tools for manual sanity checks

In [1]:
from bjorn import *
from bjorn_support import *
from onion_trees import *
import gffutils
import math
from mutations import *

In [46]:
input_fasta = "/home/al/analysis/mutations/S501Y/msa_reference.fa"
meta_fp = "/home/al/analysis/mutations/S501Y/metadata_2020-12-20_12-24.tsv"
out_dir = "/home/al/analysis/mutations/S501Y/"
ref_fp = "/home/al/data/test_inputs/NC045512.fasta"
patient_zero = 'NC_045512.2'

In [47]:
## keep only seqs contained in meta_file and save to fasta file
## concat with internal SD file
## generate MSA
meta = pd.read_csv(meta_fp, sep='\t')
meta.columns

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Index(['strain', 'virus', 'gisaid_epi_isl', 'genbank_accession', 'date',
       'region', 'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'Nextstrain_clade', 'pangolin_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted'],
      dtype='object')

In [48]:
# consensus_data = SeqIO.to_dict(SeqIO.parse(seqs_fp, "fasta"))

In [49]:
strains = meta['strain'].unique().tolist()
len(strains)

273267

In [50]:
print(f"Loading Alignment file at: {input_fasta}")
cns = AlignIO.read(input_fasta, 'fasta')

Loading Alignment file at: /home/al/analysis/mutations/S501Y/msa_reference.fa


In [51]:
print(f"Initial cleaning...")
seqs, ref_seq = process_cns_seqs(cns, patient_zero,
                                 start_pos=0, end_pos=30000)

Initial cleaning...


In [52]:
print(f"Creating a dataframe...")
seqsdf = (pd.DataFrame(index=seqs.keys(), 
                       data=seqs.values(), 
                       columns=['sequence'])
            .reset_index()
            .rename(columns={'index': 'idx'}))

Creating a dataframe...


In [53]:
def find_replacements(x, ref):
    """List the substitutions of sequence `x` relative to `ref`.

    Each entry is the string ``'<index>:<base>'`` where ``index`` is the
    0-based offset into `x` and ``base`` is the character found in `x`.
    Positions where `x` holds a gap (``'-'``) or a lowercase ``'n'`` are
    not reported. `ref` is indexed with the offsets of `x`, so it must be
    at least as long as `x`.
    """
    ignored = ('-', 'n')
    found = []
    for offset, base in enumerate(x):
        if base == ref[offset] or base in ignored:
            continue
        found.append(f'{offset}:{base}')
    return found

In [54]:
print(f"Identifying mutations...")
# for each sample, identify list of substitutions (position:alt)
seqsdf['replacements'] = seqsdf['sequence'].apply(find_replacements, args=(ref_seq,))

Identifying mutations...


In [55]:
# wide-to-long data manipulation
seqsdf = seqsdf.explode('replacements')

In [56]:
# seqsdf

In [57]:
seqsdf['pos'] = -1
# populate position column
seqsdf.loc[~seqsdf['replacements'].isna(), 'pos'] = (seqsdf.loc[~seqsdf['replacements'].isna(), 'replacements']
   .apply(lambda x: int(x.split(':')[0])))

In [58]:
# filter out non-substitutions
seqsdf = seqsdf.loc[seqsdf['pos']!=-1]
print(f"Mapping Genes to mutations...")
# identify gene of each substitution
seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
seqsdf = seqsdf.loc[~seqsdf['gene'].isna()]

Mapping Genes to mutations...


In [59]:
# seqsdf

In [60]:
# filter out substitutions in non-gene positions
# (rows whose gene label is the literal string 'nan')
seqsdf = seqsdf.loc[seqsdf['gene']!='nan']
print(f"Compute codon numbers...")

# compute codon number of each substitution (row-wise, using each row's 'pos' and 'gene')
seqsdf['codon_num'] = seqsdf.apply(compute_codon_num, args=(GENE2POS,), axis=1)


Compute codon numbers...


In [61]:

print(f"Fetch reference codon...")
# fetch the reference codon for each substitution
seqsdf['ref_codon'] = seqsdf.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)

Fetch reference codon...


In [62]:

print(f"Fetch alternative codon...")
# fetch the alternative codon for each substitution
seqsdf['alt_codon'] = seqsdf.apply(get_alt_codon, args=(GENE2POS,), axis=1)

Fetch alternative codon...


In [63]:
print(f"Map amino acids...")
# fetch the reference and alternative amino acids
seqsdf['ref_aa'] = seqsdf['ref_codon'].apply(get_aa)
seqsdf['alt_aa'] = seqsdf['alt_codon'].apply(get_aa)


# filter out substitutions with non-amino acid alternates (bad consensus calls)
seqsdf = seqsdf.loc[seqsdf['alt_aa']!='nan']

Map amino acids...


In [64]:
print(f"Fuse with metadata...")
# load and join metadata
meta = pd.read_csv(meta_fp, sep='\t')
seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='strain')

Fuse with metadata...


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [65]:
# Parse the submission date into a datetime column named 'date'.
# NOTE(review): the metadata already has a 'date' column (see meta.columns above),
# which this assignment overwrites with 'date_submitted'; the first_detected /
# last_detected aggregates below are computed from this column — confirm that
# submission date (rather than the original 'date' field) is what is intended.
seqsdf['date'] = pd.to_datetime(seqsdf['date_submitted'])

In [66]:
seqsdf['month'] = seqsdf['date'].dt.month

In [67]:
seqsdf.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa', 'strain', 'virus',
       'gisaid_epi_isl', 'genbank_accession', 'date', 'region', 'country',
       'division', 'location', 'region_exposure', 'country_exposure',
       'division_exposure', 'segment', 'length', 'host', 'age', 'sex',
       'Nextstrain_clade', 'pangolin_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted', 'month'],
      dtype='object')

In [68]:
seqsdf.loc[seqsdf['location'].isna(), 'location'] = 'unk'

In [139]:
# Save the per-sample substitution table (without the bulky sequence column).
# `out_dir` is a plain string, so it is wrapped in Path before joining with `/`;
# `str / str` raises TypeError.
seqsdf.drop(columns=['sequence']).to_csv(Path(out_dir) / 'replacements_19-12-2020.csv', index=False)

In [140]:
# Save the aggregated substitution table.
# `out_dir` is a plain string, so it is wrapped in Path before joining with `/`.
# NOTE(review): `subs` is built by the "Aggregate final results" cell that appears
# further down in the notebook; run that cell before this one.
subs.to_csv(Path(out_dir) / 'replacements_aggregated_19-12-2020.csv', index=False)

In [141]:
# Save the sample identifier / sequence pairs.
# `out_dir` is a plain string, so it is wrapped in Path before joining with `/`.
seqsdf[['idx', 'sequence']].to_csv(Path(out_dir) / 'cns_sequences_19-12-2020.csv', index=False)

In [106]:
print(f"Aggregate final results...")


def _value_counts(values):
    """Return the (unique values, occurrence counts) pair produced by np.unique."""
    return np.unique(values, return_counts=True)


# One output row per distinct substitution; every aggregate below is computed
# over all sample rows carrying that substitution.
substitution_keys = ['gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa']
subs = (
    seqsdf
    .groupby(substitution_keys)
    .agg(
        num_samples=('idx', 'nunique'),
        first_detected=('date', 'min'),
        last_detected=('date', 'max'),
        num_locations=('location', 'nunique'),
        location_counts=('location', _value_counts),
        num_divisions=('division', 'nunique'),
        division_counts=('division', _value_counts),
        num_countries=('country', 'nunique'),
        country_counts=('country', _value_counts),
    )
    .reset_index()
)
# switch 'pos' to the 1-based nucleotide coordinate system
subs['pos'] = subs['pos'] + 1

Aggregate final results...


In [107]:
gisaid_subs = (subs.rename(columns={'num_samples': 'gisaid_num_samples', 'first_detected': 'gisaid_1st_detected', 'last_detected': 'gisaid_last_detected',
                                    'num_locations': 'gisaid_num_locations', 'locations': 'gisaid_locations', 'location_counts': 'gisaid_location_counts',
                                   'num_divisions': 'gisaid_num_states','divisions': 'gisaid_states', 'division_counts': 'gisaid_state_counts',
                                   'num_countries': 'gisaid_num_countries', 'countries': 'gisaid_countries', 'country_counts': 'gisaid_country_counts'})
               .drop(columns=['ref_aa', 'pos']))

In [108]:
gisaid_subs.columns

Index(['gene', 'codon_num', 'alt_aa', 'gisaid_num_samples',
       'gisaid_1st_detected', 'gisaid_last_detected', 'gisaid_num_locations',
       'gisaid_location_counts', 'gisaid_num_states', 'gisaid_state_counts',
       'gisaid_num_countries', 'gisaid_country_counts'],
      dtype='object')

In [114]:
# gisaid_subs.sort_values('gisaid_num_samples', ascending=False).iloc[0]['gisaid_country_counts']

In [115]:
our_subs = pd.read_csv("/home/al/analysis/mutations/alab_git/replacements_22-12-2020_orig.csv")
our_subs.shape

(3986, 10)

In [116]:
all_subs = pd.merge(our_subs, gisaid_subs, on=['gene', 'codon_num', 'alt_aa'], how='left').drop_duplicates(subset=['gene', 'codon_num', 'alt_aa'])

In [119]:
all_subs.to_csv("/home/al/analysis/mutations/alab_git/replacements_22-12-2020.csv", index=False)

In [28]:
subs.loc[(subs['gene']=='S')&(subs['alt_aa']=='L')&(subs['codon_num']==957)]

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,num_locations,locations,location_counts,num_countries,countries
23070,S,24432,Q,957,L,3,2020-10-27,2020-12-09,3,"[Counties Manukau, King County, Sydney]","[1, 1, 1]",3,"[USA, Australia, New Zealand]"


In [29]:
print(f"Aggregate final results...")
# aggregate on each substitutions, compute number of samples and other attributes
subs_mnth = (seqsdf.groupby(['month', 'gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa'])
.agg(
 num_samples=('idx', 'nunique'),
 first_detected_mnth=('date', 'min'),
 last_detected_mnth=('date', 'max'),
 num_locations=('location', 'nunique'),
 locations=('location', lambda x: list(np.unique(x))),
 location_counts=('location', lambda x: list(np.unique(x, return_counts=True)[1])),
 num_countries=('country', 'nunique'),
 countries=('country', lambda x: list(np.unique(x))),
 country_counts=('country', lambda x: list(np.unique(x, return_counts=True)[1])),
)
.reset_index())
# 1-based nucleotide position coordinate system
subs_mnth['pos'] = subs_mnth['pos'] + 1
subs_mnth = pd.merge(subs_mnth, subs[['gene', 'pos', 'alt_aa', 'first_detected', 'last_detected']], on=['gene', 'pos', 'alt_aa'])

Aggregate final results...


In [30]:
cols = ['month', 'ref_aa', 'codon_num', 'alt_aa', 'first_detected', 
        'last_detected', 'num_samples', 'num_countries', 
        'countries', 'country_counts', 'num_locations', 'locations', 'location_counts' ,
        'first_detected_mnth', 'last_detected_mnth']

In [72]:
# (subs_mnth[(subs_mnth['gene']=='S') & (subs_mnth['month']==12)]
#  .sort_values('num_samples', ascending=False)
#  .drop_duplicates(subset=['codon_num', 'alt_aa'], keep='first')
#  .iloc[:50]
#  .reset_index(drop=True))[cols]

In [132]:
# keys_df = seqsdf[['idx', 'sequence']]
# keys_df.to_csv('gisaid_replacements.csv', index=False)

In [133]:
# Collect every distinct, non-missing location label that mentions San Diego
# (case-insensitive substring match).
sd = [
    d
    for d in seqsdf['location'].dropna().unique()
    if 'san diego' in d.lower()
]

In [111]:
# Collect every distinct division label that contains 'cali' (case-insensitive).
# Missing divisions are dropped first: they are NaN floats with no .lower(),
# and would otherwise abort the loop with AttributeError (the location loop
# in this notebook applies the same dropna()).
ca = []
for d in seqsdf['division'].dropna().unique():
    if 'cali' in d.lower():
        ca.append(d)

In [73]:
# cols = ['idx', 'location', 'division', 'pos']
# seqsdf.loc[(seqsdf['codon_num']==681) & (seqsdf['gene']=='S')][cols]

## Deletions

In [124]:
# Configuration for the deletions analysis (same input files as the substitutions
# section at the top of the notebook).
input_fasta = "/home/al/analysis/mutations/S501Y/msa_reference.fa"
meta_fp = "/home/al/analysis/mutations/S501Y/metadata_2020-12-20_12-24.tsv"
out_dir = "/home/al/analysis/mutations/S501Y/"
ref_fp = "/home/al/data/test_inputs/NC045512.fasta"
# identifier of the reference record, passed to process_cns_seqs below
patient_zero = 'NC_045512.2'
# deletions shorter than this many nucleotides are discarded
min_del_len = 1
# window handed to process_cns_seqs; start_pos + 1 is later added to the relative
# deletion coordinates to obtain absolute ones. The values coincide with the
# ORF1ab 'start' and ORF10 'end' entries of the GENE2POS table defined in the
# dev section of this notebook.
start_pos = 265
end_pos = 29674

In [125]:
# read MSA file
consensus_data = AlignIO.read(input_fasta, 'fasta')
# prcess MSA to remove insertions and fix position coordinate systems
seqs, ref_seq = process_cns_seqs(consensus_data, patient_zero, start_pos=start_pos, end_pos=end_pos)
# load into dataframe
seqsdf = (pd.DataFrame(index=seqs.keys(), data=seqs.values(), 
                       columns=['sequence'])
            .reset_index().rename(columns={'index': 'idx'}))

In [126]:
# load and join metadata
meta = pd.read_csv(meta_fp, sep='\t')
print(seqsdf.shape)
seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='strain')
print(seqsdf.shape)
# # clean and process sample collection dates
# seqsdf = seqsdf.loc[(seqsdf['collection_date']!='Unknown') 
#                & (seqsdf['collection_date']!='1900-01-00')]
# seqsdf.loc[seqsdf['collection_date'].str.contains('/'), 'collection_date'] = seqsdf['collection_date'].apply(lambda x: x.split('/')[0])
seqsdf['date'] = pd.to_datetime(seqsdf['date_submitted'])

(273268, 2)
(273267, 29)


In [127]:
# compute length of each sequence
seqsdf['seq_len'] = seqsdf['sequence'].str.len()
# identify deletion positions
seqsdf['del_positions'] = seqsdf['sequence'].apply(find_deletions)

In [128]:
# sequences with one or more deletions
del_seqs = seqsdf.loc[seqsdf['del_positions'].str.len() > 0]
del_seqs = del_seqs.explode('del_positions')
# compute length of each deletion
del_seqs['del_len'] = del_seqs['del_positions'].apply(len)
# only consider deletions longer than 2nts
del_seqs = del_seqs[del_seqs['del_len'] >= min_del_len]
# fetch coordinates of each deletion
del_seqs['relative_coords'] = del_seqs['del_positions'].apply(get_indel_coords)
del_seqs.loc[del_seqs['location'].isna(), 'location'] = 'unk'
# group sample by the deletion they share
del_seqs = (del_seqs.groupby(['relative_coords', 'del_len'])
                    .agg(
                         samples=('idx', 'unique'),
                         num_samples=('idx', 'nunique'),
                         first_detected=('date', 'min'),
                         last_detected=('date', 'max'),
#                          locations=('location', lambda x: list(np.unique(x))),
                         location_counts=('location', lambda x: np.unique(x, return_counts=True)),
#                          divisions=('division', lambda x: list(np.unique(x))),
                         division_counts=('division', lambda x: np.unique(x, return_counts=True)),
#                          countries=('country', lambda x: list(np.unique(x))),
                         country_counts=('country', lambda x: np.unique(x, return_counts=True)),
                        )
                    .reset_index()
                    .sort_values('num_samples'))
del_seqs['type'] = 'deletion'
# adjust coordinates to account for the nts trimmed from beginning e.g. 265nts
del_seqs['absolute_coords'] = del_seqs['relative_coords'].apply(adjust_coords, args=(start_pos+1,))
del_seqs['pos'] = del_seqs['absolute_coords'].apply(lambda x: int(x.split(':')[0]))
# approximate the gene where each deletion was identified
del_seqs['gene'] = del_seqs['pos'].apply(map_gene_to_pos)
del_seqs = del_seqs.loc[~del_seqs['gene'].isna()]
# filter our substitutions in non-gene positions
del_seqs = del_seqs.loc[del_seqs['gene']!='nan']
# compute codon number of each substitution
del_seqs['codon_num'] = del_seqs.apply(compute_codon_num, args=(GENE2POS,), axis=1)
# fetch the reference codon for each substitution
del_seqs['ref_codon'] = del_seqs.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)
# fetch the reference and alternative amino acids
del_seqs['ref_aa'] = del_seqs['ref_codon'].apply(get_aa)
# record the 5 nts before each deletion (based on reference seq)
del_seqs['prev_5nts'] = del_seqs['absolute_coords'].apply(lambda x: ref_seq[int(x.split(':')[0])-5:int(x.split(':')[0])])
# record the 5 nts after each deletion (based on reference seq)
del_seqs['next_5nts'] = del_seqs['absolute_coords'].apply(lambda x: ref_seq[int(x.split(':')[1])+1:int(x.split(':')[1])+6])
del_seqs.sort_values('num_samples', ascending=False)

Unnamed: 0,relative_coords,del_len,samples,num_samples,first_detected,last_detected,location_counts,division_counts,country_counts,type,absolute_coords,pos,gene,codon_num,ref_codon,ref_aa,prev_5nts,next_5nts
969,21499:21504,6,"[England/SHEF-C43D2/2020, Australia/VIC1606/20...",6768,2020-05-27,2020-12-19,"([Benátky nad Jizerou, Bobo Dioulasso, Brno, C...","([Aargau, Agder, Auckland, Auvergne-Rhône-Alpe...","([Australia, Burkina Faso, Canada, Croatia, Cz...",deletion,21765:21770,21765,S,68,ATA,I,gctat,ctctg
236,0:76,77,"[Guangdong/ZQ-S2-P0061/2020, USA/CT-Yale-001/2...",3295,2020-03-09,2020-12-18,"([Alameda County, Barcelona, Barranquilla, Bro...","([Alaska, Antioquia, Atlantico, Auckland, Aust...","([Australia, Austria, Bangladesh, Belgium, Bra...",deletion,266:342,266,ORF1ab,1,ATG,M,taaga,ctcgt
1025,21725:21727,3,"[India/MH-1-27/2020, Netherlands/ZuidHolland_7...",2096,2020-03-06,2020-12-19,"([Alameda County, Allegheny County, Amadora, C...","([Abu Dhabi, Amman, Andalusia, Apulia, Aragon,...","([Australia, Bangladesh, Canada, Côte d'Ivoire...",deletion,21991:21993,21991,S,144,TAT,Y,gtgtt,tacca
470,1339:1341,3,"[France/ARA-739/2020, Netherlands/Andel_136506...",2064,2020-02-14,2020-12-18,"([Alcaniz, Algemesi, Alhaurin de la Torre, Alz...","([Agder, Andalusia, Aragon, Athens, Auckland, ...","([Australia, Belgium, Canada, Chile, Curacao, ...",deletion,1605:1607,1605,ORF1ab,447,AAT,N,cttaa,caacc
2654,29400:29408,9,"[Guangdong/DG-S9-P0045/2020, Guangdong/MM-S1-P...",2056,2020-03-09,2020-12-19,"([A Coruna, Alameda County, Alaquas, Brown Cou...","([Alaska, Amazonas, Aragon, Arizona, Auckland,...","([Australia, Austria, Bangladesh, Belgium, Bra...",deletion,29666:29674,29666,ORF10,37,CTC,L,taatc,aatct
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2074,27972:27974,3,[England/QEUH-B12C75/2020],1,2020-11-18,2020-11-18,"([unk], [1])","([England], [1])","([United Kingdom], [1])",deletion,28238:28240,28238,ORF8,116,GTT,V,ttcgt,gtttt
2058,2788:2844,57,[Japan/PG-4791/2020],1,2020-12-10,2020-12-10,"([unk], [1])","([Japan], [1])","([Japan], [1])",deletion,3054:3110,3054,ORF1ab,930,GAT,D,gagga,tggta
2059,27890:29408,1519,[NewZealand/20VR2530/2020],1,2020-10-14,2020-10-14,"([unk], [1])","([Auckland], [1])","([New Zealand], [1])",deletion,28156:29674,28156,ORF8,88,ATT,I,acaat,aatct
2060,27892:29408,1517,[NewZealand/20VR2503/2020],1,2020-10-14,2020-10-14,"([unk], [1])","([Auckland], [1])","([New Zealand], [1])",deletion,28158:29674,28158,ORF8,89,AAT,N,aatta,aatct


In [142]:
# Save the aggregated deletions table.
# `out_dir` is a plain string, so it is wrapped in Path before joining with `/`;
# `str / str` raises TypeError.
del_seqs.to_csv(Path(out_dir) / 'deletions_aggregated_19-12-2020.csv', index=False)

In [130]:
gisaid_dels = (del_seqs.rename(columns={'num_samples': 'gisaid_num_samples', 'first_detected': 'gisaid_1st_detected', 'last_detected': 'gisaid_last_detected',
                                   'locations': 'gisaid_locations', 'location_counts': 'gisaid_location_counts',
                                   'divisions': 'gisaid_states', 'division_counts': 'gisaid_state_counts',
                                   'countries': 'gisaid_countries', 'country_counts': 'gisaid_country_counts'})
               .drop(columns=['ref_aa', 'pos', 'type', 'samples', 'ref_codon', 'prev_5nts', 'next_5nts', 'relative_coords', 'del_len']))

In [131]:
our_dels = pd.read_csv("/home/al/analysis/mutations/alab_git/deletions_22-12-2020_orig.csv")
# our_dels

In [132]:
cols = ['type', 'gene', 'absolute_coords', 'del_len', 'pos', 
                 'ref_aa', 'codon_num', 'num_samples',
                 'first_detected', 'last_detected',
                 'location_counts', 'gisaid_num_samples',
                 'gisaid_1st_detected', 'gisaid_last_detected', 'gisaid_country_counts',
                 'gisaid_state_counts', 'gisaid_location_counts', 'samples',
                 'ref_codon', 'prev_5nts', 'next_5nts'
                 ]

In [133]:
our_dels = pd.merge(our_dels, gisaid_dels, on=['gene', 'codon_num', 'absolute_coords'], how='left')

In [137]:
our_dels[cols].sort_values('num_samples', ascending=False).to_csv("/home/al/analysis/mutations/alab_git/deletions_22-12-2020.csv", index=False)

In [None]:
# Align the sequences in `seqs_fp` against the reference at `ref_fp` using 25 CPUs.
# NOTE(review): `seqs_fp` is not assigned anywhere in the visible notebook (its only
# other mention is in a commented-out line) and this cell carries no execution
# count; define `seqs_fp` before running it.
align_fasta_reference(seqs_fp, num_cpus=25, ref_fp=ref_fp)

## CNS Mutations Report

In [6]:
analysis_folder = Path('/home/al/code/HCoV-19-Genomics/consensus_sequences/')
meta_fp = Path('/home/al/code/HCoV-19-Genomics/metadata.csv')
ref_path = Path('/home/gk/code/hCoV19/db/NC045512.fasta')
patient_zero = 'NC_045512.2'
in_fp = '/home/al/analysis/mutations/S501Y/msa_aligned.fa'

In [3]:
subs = identify_replacements(in_fp, meta_fp)

In [4]:
subs.head()

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,locations,location_counts,samples
0,3UTR,29679,S,2,F,2,2020-07-14,2020-10-07,USA/California/San Diego,2,"[SEARCH-3119-SAN, SEARCH-4245-SAN]"
1,3UTR,29681,L,3,L,2,2020-07-11,2020-07-21,"[USA/California/Los Angeles, USA/California/Sa...","[1, 1]","[SEARCH-2600-SAN, SEARCH-2692-LAX]"
2,3UTR,29688,S,5,I,4,2020-03-25,2020-10-26,USA/California/San Diego,4,"[SEARCH-0113-SAN, SEARCH-2855-SAN, SEARCH-3609..."
3,3UTR,29690,V,6,L,1,2020-08-13,2020-08-13,USA/California/San Diego,1,[SEARCH-4455-SAN]
4,3UTR,29692,V,6,V,1,2020-10-03,2020-10-03,Jordan/Amman,1,[SEARCH-4034-JOR]


In [5]:
dels = identify_deletions(in_fp, meta_fp, patient_zero)
dels

Unnamed: 0,type,gene,absolute_coords,del_len,pos,ref_aa,codon_num,num_samples,first_detected,last_detected,locations,location_counts,samples,ref_codon,prev_5nts,next_5nts
30,deletion,ORF7a,27538:27572,35,27538,L,49,1,2020-04-28,2020-04-28,MEX/Baja California/Tijuana,1,[hCoV-19/MEX/SEARCH-1480-TIJ/2020],CTA,tcctc,actca
27,deletion,ORF6,27264:27290,27,27264,F,22,1,2020-07-28,2020-07-28,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-3236-SAN/2020],TTT,ggact,tacat
28,deletion,ORF6,27266:27293,28,27266,F,22,1,2020-04-23,2020-04-23,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-1921-SAN/2020],TTT,acttt,atcat
29,deletion,ORF7a,27498:27531,34,27498,S,36,1,2020-05-12,2020-05-12,USA/California/Imperial,1,[hCoV-19/USA/SEARCH-0573-IPL/2020],TCT,cttgc,atcct
59,deletion,ORF1ab,6656:6679,24,6656,N,2131,1,2020-06-13,2020-06-13,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-3559-SAN/2020],AAT,tgtta,ctaat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,deletion,ORF3a,26158:26161,4,26158,V,256,9,2020-07-17,2020-09-14,USA/California/San Diego,9,"[hCoV-19/USA/SEARCH-2907-SAN/2020, hCoV-19/USA...",GTT,agttg,tccag
16,deletion,ORF1ab,516:518,3,516,V,84,10,2020-07-22,2020-08-01,USA/California/San Diego,10,"[hCoV-19/USA/SEARCH-4342-SAN/2020, hCoV-19/USA...",GTT,catgt,ggttg
56,deletion,ORF1ab,686:694,9,686,K,141,17,2020-03-31,2020-10-03,"[Jordan/Amman, USA/California/San Diego, USA/L...","[2, 13, 2]","[hCoV-19/USA/SEARCH-0264-NBG/2020, hCoV-19/USA...",AAG,tctaa,actta
51,deletion,N,28890:28901,12,28890,S,206,23,2020-06-16,2020-08-06,USA/California/San Diego,23,"[hCoV-19/USA/SEARCH-2285-SAN/2020, hCoV-19/USA...",TCT,acttc,ggctg


In [3]:
dels[dels['gene']=='S'].sort_values('num_samples', ascending=False)#.to_csv('S_deletions_consensus.csv', index=False)

NameError: name 'dels' is not defined

In [4]:
identify_insertions(in_fp, patient_zero).to_csv('test.csv', index=False)

## dev 

In [10]:
# Gene name -> {'start', 'end'} nucleotide coordinates on the reference genome.
# 'start' is used as a 0-based offset into the reference/sample sequences by
# compute_codon_num, get_ref_codon and get_alt_codon (codon k begins at
# start + (k - 1) * 3).
# NOTE(review): the ORF7a 'end' (27759) is larger than the ORF7b 'start' (27755),
# i.e. the two ranges overlap — confirm how map_gene_to_pos resolves positions
# in the overlap.
# NOTE(review): earlier cells already use a GENE2POS name (presumably provided by
# the star imports); this assignment replaces it for cells run afterwards.
GENE2POS = {
            '5UTR': {'start': 0, 'end': 265},
            'ORF1ab': {'start': 265, 'end': 21555},
            'S': {'start': 21562, 'end': 25384},
            'ORF3a': {'start': 25392, 'end': 26220},
            'E': {'start': 26244, 'end': 26472},
            'M': {'start': 26522, 'end': 27191},
            'ORF6': {'start': 27201, 'end': 27387},
            'ORF7a': {'start': 27393, 'end': 27759},
            'ORF7b': {'start': 27755, 'end': 27887},
            'ORF8': {'start': 27893, 'end': 28259},
            'N': {'start': 28273, 'end': 29533},
            'ORF10': {'start': 29557, 'end': 29674},
            '3UTR': {'start': 29674, 'end': 29902}
           }

In [11]:
in_dir = '/home/al/analysis/mutations/fa/'
out_dir = '/home/al/analysis/mutations/msa/'

In [14]:
!rm -r /home/al/analysis/mutations
!mkdir /home/al/analysis/mutations
!mkdir /home/al/analysis/mutations/fa

In [15]:
# Copy every consensus file whose name ends in 'fa' or 'fasta' from
# `analysis_folder` into the FASTA staging directory.
fasta_suffixes = ('fa', 'fasta')
for filename in analysis_folder.listdir():
    if filename.endswith(fasta_suffixes):
        copy(filename, '/home/al/analysis/mutations/fa/')

In [179]:
copy(ref_path, in_dir)

'/home/al/analysis/mutations/fa/NC045512.fasta'

In [180]:
in_dir = '/home/al/analysis/mutations/fa/'
out_dir = '/home/al/analysis/mutations/msa'
concat_fasta(in_dir, out_dir)

'/home/al/analysis/mutations/msa.fa'

In [17]:
align_fasta_reference('/home/al/analysis/mutations/msa.fa',  num_cpus=12, ref_fp=ref_path)

'/home/al/analysis/mutations/msa_aligned.fa'

In [14]:
cns = AlignIO.read('/home/al/analysis/mutations/msa_aligned.fa', 'fasta')

In [15]:
ref_seq = get_seq(cns, patient_zero)

In [16]:
len(ref_seq)

29903

In [17]:
seqs = get_seqs(cns, 0, 30000)

In [18]:
seqsdf = (pd.DataFrame(index=seqs.keys(), data=seqs.values(), columns=['sequence'])
                .reset_index().rename(columns={'index': 'idx'}))

In [19]:
# seqsdf

In [20]:
def find_replacements(x, ref):
    """Return the ``'<index>:<base>'`` substitutions of `x` against `ref`.

    ``index`` is the 0-based offset into `x`. Gap characters (``'-'``) and
    lowercase ``'n'`` in `x` are never reported. `ref` must be at least as
    long as `x` because it is indexed with the offsets of `x`.
    (The same function is also defined earlier in this notebook.)
    """
    substitutions = []
    for position, nucleotide in enumerate(x):
        matches_reference = nucleotide == ref[position]
        if matches_reference or nucleotide in ('-', 'n'):
            continue
        substitutions.append(f'{position}:{nucleotide}')
    return substitutions

In [21]:
seqsdf['replacements'] = seqsdf['sequence'].apply(find_replacements, args=(ref_seq,))

In [22]:
seqsdf = seqsdf.explode('replacements')
seqsdf['pos'] = -1
seqsdf.loc[~seqsdf['replacements'].isna(), 'pos'] = seqsdf.loc[~seqsdf['replacements'].isna(), 'replacements'].apply(lambda x: int(x.split(':')[0]))
seqsdf = seqsdf.loc[seqsdf['pos']!=-1]

In [23]:
def compute_codon_num(x, gene2pos: dict):
    """Return the 1-based codon number of position ``x['pos']`` within ``x['gene']``.

    `x` is a mapping (e.g. a DataFrame row) with keys ``'pos'`` and ``'gene'``;
    `gene2pos` maps each gene name to a dict holding its ``'start'`` coordinate.
    The gene's start position itself falls in codon 1.
    """
    nt_pos = x['pos']
    gene_start = gene2pos[x['gene']]['start']
    # number of nucleotides from the gene start up to and including nt_pos
    nts_into_gene = nt_pos - gene_start + 1
    return math.ceil(nts_into_gene / 3)

In [24]:
seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
seqsdf = seqsdf.loc[~seqsdf['gene'].isna()]
seqsdf = seqsdf.loc[seqsdf['gene']!='nan']
seqsdf['codon_num'] = seqsdf.apply(compute_codon_num, args=(GENE2POS,), axis=1)

In [25]:
def get_ref_codon(x, ref_seq, gene2pos: dict):
    """Return the upper-cased reference codon for ``x['codon_num']`` of ``x['gene']``.

    The codon is the three characters of `ref_seq` beginning at
    ``gene start + (codon_num - 1) * 3``; a slice running past the end of
    `ref_seq` yields a shorter (possibly empty) string.
    """
    gene_start = gene2pos[x['gene']]['start']
    first_nt = gene_start + 3 * (x['codon_num'] - 1)
    codon = ref_seq[first_nt:first_nt + 3]
    return codon.upper()
seqsdf['ref_codon'] = seqsdf.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)

In [26]:
def get_alt_codon(x, gene2pos: dict):
    """Return the upper-cased codon of the sample sequence ``x['sequence']``.

    The codon is located exactly like the reference codon: three characters
    beginning at ``gene start + (codon_num - 1) * 3`` for the gene and codon
    number stored in `x`.
    """
    gene_start = gene2pos[x['gene']]['start']
    first_nt = gene_start + 3 * (x['codon_num'] - 1)
    codon = x['sequence'][first_nt:first_nt + 3]
    return codon.upper()
seqsdf['alt_codon'] = seqsdf.apply(get_alt_codon, args=(GENE2POS,), axis=1)

In [27]:
def get_aa(codon: str):
    """Translate an upper-case DNA codon into its one-letter amino acid.

    Stop codons translate to ``'_'``. Anything that is not one of the 64
    upper-case A/C/G/T triplets (lower case, gaps, ambiguous bases, ...)
    yields the string ``'nan'``.
    """
    bases = 'TCAG'
    # Residues in the order produced by iterating first, second and third
    # base over `bases` (TTT, TTC, TTA, TTG, TCT, ...).
    residues = ('FFLLSSSSYY__CC_W'
                'LLLLPPPPHHQQRRRR'
                'IIIMTTTTNNKKSSRR'
                'VVVVAAAADDEEGGGG')
    triplets = [first + second + third
                for first in bases
                for second in bases
                for third in bases]
    CODON2AA = dict(zip(triplets, residues))
    return CODON2AA.get(codon, 'nan')
seqsdf['ref_aa'] = seqsdf['ref_codon'].apply(get_aa)
seqsdf['alt_aa'] = seqsdf['alt_codon'].apply(get_aa)
seqsdf = seqsdf.loc[seqsdf['alt_aa']!='nan']

In [28]:
seqsdf.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa'],
      dtype='object')

In [29]:
meta = pd.read_csv(meta_fp)
print(seqsdf['idx'].unique().shape)
seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='fasta_hdr')
print(seqsdf['idx'].unique().shape)

(2765,)
(2765,)


In [30]:
# Drop records whose collection date is one of the known placeholder values.
# .copy() makes the filtered frame independent, so the column assignments below
# do not write into a slice of the previous frame.
has_usable_date = ~seqsdf['collection_date'].isin(['Unknown', '1900-01-00'])
seqsdf = seqsdf.loc[has_usable_date].copy()
# Dates written as 'a/b' keep only the part before the first '/'; values without
# a '/' are left unchanged. Using .str.split on the whole column avoids the
# boolean mask from .str.contains('/'), which cannot be used for indexing when
# a collection date is missing (NaN).
seqsdf['collection_date'] = seqsdf['collection_date'].str.split('/').str[0]
seqsdf['date'] = pd.to_datetime(seqsdf['collection_date'])

In [31]:
seqsdf['date'].min()

Timestamp('2020-03-04 00:00:00')

In [32]:
# (seqsdf.groupby(['gene', 'ref_aa', 'codon_num', 'alt_aa'])
# .agg(
#      num_samples=('ID', 'nunique')))

In [35]:
def uniq_locs(x):
    """Return the sorted distinct values of `x` as computed by ``np.unique``."""
    distinct_values = np.unique(x)
    return distinct_values


def loc_counts(x):
    """Return how often each distinct value of `x` occurs.

    The counts are in the same order as the values returned by `uniq_locs`.
    """
    return np.unique(x, return_counts=True)[1]

In [37]:
subs = (seqsdf.groupby(['gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa'])
.agg(
     num_samples=('ID', 'nunique'),
     first_detected=('date', 'min'),
     last_detected=('date', 'max'),
     locations=('location', uniq_locs),
     location_counts=('location', loc_counts),
     samples=('ID', 'unique')
    )
.reset_index())
subs['pos'] = subs['pos'] + 1

In [175]:
(subs[subs['gene']=='S'].sort_values('num_samples', ascending=False)
 .to_csv('S_mutations_consensus.csv', index=False))

## Consolidate metadata ID and fasta headers

In [134]:
def fix_header(x):
    """Extract the sample identifier from a FASTA header.

    Headers containing ``'Consensus'`` are split on ``'_'`` and the second
    field is returned; all other headers are split on ``'/'`` and the third
    field is returned.
    """
    separator, field = ('_', 1) if 'Consensus' in x else ('/', 2)
    return x.split(separator)[field]
seqsdf['n_ID'] = seqsdf['idx'].apply(fix_header)

In [135]:
seqsdf['n_ID'] = seqsdf['n_ID'].str.replace('ALSR', 'SEARCH')

In [136]:
meta = pd.read_csv(meta_fp)
meta['n_ID'] = meta['ID'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [137]:
seqsdf['n_ID'] = seqsdf['n_ID'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [138]:
tmp = pd.merge(seqsdf, meta, on='n_ID')

In [122]:
# tmp[tmp['ID'].str.contains('2112')]

In [98]:
# seqsdf

In [139]:
set(meta['n_ID'].unique()) - set(tmp['n_ID'].unique())

{'SEARCH-1668'}

In [140]:
seqsdf['idx'].unique().shape

(2765,)

In [141]:
meta['ID'].unique().shape

(2766,)

In [147]:
s = seqsdf[['n_ID', 'idx']].drop_duplicates()

In [151]:
new_meta = pd.merge(meta, s, on='n_ID', how='left')
(new_meta.drop(columns=['n_ID'])
.rename(columns={'idx': 'fasta_hdr'})
.to_csv('metadata.csv', index=False))

In [152]:
new_meta.shape 

(2766, 11)

In [153]:
new_meta

Unnamed: 0,ID,gb_accession,gisaid_accession,collection_date,location,percent_coverage_cds,avg_depth,authors,originating_lab,n_ID,idx
0,MG0987,MT598172,EPI_ISL_416457,2020-03-18,USA/California/San Diego,99.5954,2465.60,SEARCH Alliance San Diego,Andersen lab at Scripps Research,MG0987,Consensus_MG0987
1,PC00101P,MT192765,EPI_ISL_414648,2020-03-11,USA/California/San Diego,99.7525,3516.14,SEARCH Alliance San Diego,Andersen lab at Scripps Research,PC00101P,Consensus_PC00101P_threshold_0_quality_20
2,SEARCH-0007-SAN,MT598171,EPI_ISL_429990,2020-03-21,USA/California/San Diego,100.0000,6215.17,SEARCH Alliance San Diego with Christina Clark...,Rady's Childrens Hospital,SEARCH-0007,Consensus_SEARCH-0007-SAN_L1_threshold_0_quali...
3,SEARCH-0016-SAN,MT598173,EPI_ISL_430016,2020-03-24,USA/California/San Diego,100.0000,6440.67,SEARCH Alliance San Diego,Andersen lab at Scripps Research,SEARCH-0016,Consensus_SEARCH-0016-SAN_L1_threshold_0_quali...
4,SEARCH-0017-SAN,MT598174,EPI_ISL_429991,2020-03-24,USA/California/San Diego,100.0000,4947.09,SEARCH Alliance San Diego,Andersen lab at Scripps Research,SEARCH-0017,Consensus_SEARCH-0017-SAN_L1_threshold_0_quali...
...,...,...,...,...,...,...,...,...,...,...,...
2761,SEARCH-4685-SAN,,,2020-11-02,USA/California/San Diego,100.0000,4831.37,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4685,hCoV-19/USA/SEARCH-4685-SAN/2020
2762,SEARCH-4686-SAN,,,2020-11-05,USA/California/San Diego,100.0000,3864.73,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4686,hCoV-19/USA/SEARCH-4686-SAN/2020
2763,SEARCH-4687-ORA,,,2020-11-02,USA/California/Orange,98.1400,3123.54,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4687,hCoV-19/USA/SEARCH-4687-ORA/2020
2764,SEARCH-4690-SAN,,,2020-05-28,USA/California/San Diego,98.7283,2625.16,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4690,hCoV-19/USA/SEARCH-4690-SAN/2020


In [81]:
len(ref_seq)

29903