## Internal Mutations Report
This notebook contains developmental code for analyzing mutations in SARS-CoV-2 samples released by Andersen Lab. 

In [2]:
import pandas as pd
from path import Path
from mutations import *
from onion_trees import *
from bjorn_support import *
from shutil import copy


In [3]:
df = pd.read_csv('/home/al/code/HCoV-19-Genomics/variants/substitutions_01-01-2021.csv')
df.head()

Unnamed: 0,gene,ref_codon,alt_codon,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,...,gisaid_last_detected,gisaid_num_locations,gisaid_locations,gisaid_location_counts,gisaid_num_states,gisaid_states,gisaid_state_counts,gisaid_num_countries,gisaid_countries,gisaid_country_counts
0,S,GAT,GGT,23403,D,614,G,3384,2020-03-04,2020-12-29,...,2020-12-31,2507.0,"[""'s-Gravenwezel"", 'A Coruna', 'Aarschot', 'Aa...","[1, 15, 3, 1, 7, 2, 8, 1, 1, 10, 1, 1, 1, 2, 1...",1043.0,"['Aalst', 'Aargau', 'Abruzzo', 'Abu Dhabi', 'A...","[5, 278, 107, 831, 17, 1, 2, 1, 1, 29, 11, 50,...",129.0,"['Algeria', 'Andorra', 'Argentina', 'Armenia',...","[3, 1, 50, 1, 9, 15065, 692, 126, 507, 8, 2770..."
1,ORF1ab,CTA,TTA,14408,L,4715,L,3384,2020-03-04,2020-12-29,...,2020-12-31,2489.0,"[""'s-Gravenwezel"", 'A Coruna', 'Aarschot', 'Aa...","[1, 15, 3, 1, 6, 2, 8, 1, 10, 1, 1, 1, 2, 12, ...",1028.0,"['Aalst', 'Aargau', 'Abruzzo', 'Abu Dhabi', 'A...","[5, 278, 85, 805, 17, 1, 2, 1, 1, 29, 11, 50, ...",128.0,"['Algeria', 'Andorra', 'Argentina', 'Armenia',...","[3, 1, 48, 1, 9, 15027, 688, 81, 486, 8, 2769,..."
2,ORF1ab,TTC,TTT,3037,F,924,F,3375,2020-03-04,2020-12-29,...,2020-12-31,2494.0,"[""'s-Gravenwezel"", 'A Coruna', 'Aarschot', 'Aa...","[1, 15, 3, 1, 7, 2, 8, 1, 10, 1, 1, 1, 2, 12, ...",1030.0,"['Aalst', 'Aargau', 'Abruzzo', 'Abu Dhabi', 'A...","[5, 267, 107, 774, 17, 1, 2, 1, 1, 29, 11, 50,...",129.0,"['Algeria', 'Andorra', 'Argentina', 'Armenia',...","[3, 1, 50, 1, 9, 15053, 679, 124, 485, 6, 2770..."
3,5UTR,CGT,TGT,241,R,81,C,3348,2020-03-04,2020-12-29,...,2020-12-31,2475.0,"[""'s-Gravenwezel"", 'A Coruna', 'Aarschot', 'Aa...","[1, 15, 3, 1, 7, 2, 8, 1, 10, 1, 1, 1, 2, 10, ...",1024.0,"['Aalst', 'Aargau', 'Abruzzo', 'Abu Dhabi', 'A...","[5, 275, 104, 835, 17, 1, 2, 1, 29, 1, 11, 50,...",128.0,"['Algeria', 'Argentina', 'Armenia', 'Aruba', '...","[3, 50, 1, 9, 13414, 688, 124, 501, 8, 2764, 4..."
4,N,TCA,TTA,28854,S,194,L,1654,2020-03-23,2020-12-17,...,2020-12-31,323.0,"['Ahmedabad', 'Alachua County', 'Alameda Count...","[120, 5, 163, 13, 4, 20, 1, 1, 3, 1, 1, 1, 1, ...",300.0,"['Aargau', 'Abu Dhabi', 'Agder', 'Aguascalient...","[13, 13, 10, 4, 1, 3, 7, 52, 1, 4, 4, 1, 4, 2,...",68.0,"['Aruba', 'Australia', 'Austria', 'Bahrain', '...","[4, 125, 1, 3, 33, 3, 36, 271, 2, 17, 7, 3, 1,..."


In [5]:
df['samples']

"['MG0987' 'PC00101P' 'SEARCH-0007-SAN' ... 'SEARCH-5570-SAN'\n 'SEARCH-5573-SAN' 'SEARCH-5574-SAN']"

In [89]:
msa_fp = Path('/home/al/analysis/mutations/S501Y/msa_reference.fa')

In [90]:
patient_zero = 'NC_045512.2'

In [3]:
fa_fp = Path('/home/al/analysis/mutations/alab_git/b117/b117_seqs.fasta')
ref_fp = '/home/al/data/test_inputs/NC045512.fasta'

align_fasta_reference(fa_fp, num_cpus=20, ref_fp=ref_fp)

'/home/al/analysis/mutations/alab_git/b117/b117_seqs_aligned.fa'

In [91]:
cns = AlignIO.read(msa_fp, 'fasta')


In [94]:
meta.columns

Index(['strain', 'virus', 'gisaid_epi_isl', 'genbank_accession', 'date',
       'region', 'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'Nextstrain_clade', 'pangolin_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted'],
      dtype='object')

In [100]:
meta = pd.read_csv('/home/al/analysis/mutations/alab_git/nextstrain_groups_neherlab_ncov_S.N501_metadata.tsv', sep='\t')
strains = meta['Strain'].unique().tolist()
len(strains)

6891

In [108]:
subs = pd.read_csv("/home/al/analysis/mutations/gisaid/gisaid_replacements_19-12-2020.csv")
subs = subs[subs['idx'].isin(strains)]

In [109]:
subs.columns

Index(['idx', 'replacements', 'pos', 'gene', 'codon_num', 'ref_codon',
       'alt_codon', 'ref_aa', 'alt_aa', 'strain', 'virus', 'gisaid_epi_isl',
       'genbank_accession', 'date', 'region', 'country', 'division',
       'location', 'region_exposure', 'country_exposure', 'division_exposure',
       'segment', 'length', 'host', 'age', 'sex', 'Nextstrain_clade',
       'pangolin_lineage', 'GISAID_clade', 'originating_lab', 'submitting_lab',
       'authors', 'url', 'title', 'paper_url', 'date_submitted', 'month',
       'mutation'],
      dtype='object')

In [111]:
# aggregate on each substitutions, compute number of samples and other attributes
subs = (subs.groupby(['gene', 'ref_codon', 'alt_codon', 'pos', 'ref_aa', 
                        'codon_num', 'alt_aa'])
.agg(
 num_samples=('idx', 'nunique'),
 first_detected=('date', 'min'),
 last_detected=('date', 'max'),
#      locations=('location', uniq_locs),
 location_counts=('location', 
                  lambda x: np.unique(x, return_counts=True)),
 samples=('idx', 'unique')
)
.reset_index())
subs['locations'] = subs['location_counts'].apply(lambda x: list(x[0]))
subs['location_counts'] = subs['location_counts'].apply(lambda x: list(x[1]))

In [114]:
positions = [26729, 28856, 1058, 17614, 15095, 17596, 26059, 24022]

In [115]:
subs.loc[subs['pos'].isin(positions)]

Unnamed: 0,gene,ref_codon,alt_codon,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,location_counts,samples,locations
293,M,GTT,CTT,26729,V,70,L,16,2020-11-25,2020-12-17,[16],"[England/ALDP-B5B4D2/2020, England/MILK-B55793...",[unk]
294,M,GTT,TTT,26729,V,70,F,2,2020-08-26,2020-12-09,"[1, 1]","[mink/Netherlands/NB-EMC-22-1/2020, USA/NY-NYC...","[New York City, unk]"
740,ORF1ab,ACC,ATC,1058,T,265,I,378,2020-03-27,2020-12-19,"[16, 1, 2, 1, 3, 4, 1, 59, 3, 1, 1, 1, 3, 1, 3...","[Iceland/84/2020, Iceland/219/2020, Senegal/00...","[Bitou, Brisbane, Brooklyn, Cali, Chris Hani, ..."
866,ORF1ab,AGC,GGC,17614,S,5784,G,926,2020-10-27,2020-12-19,"[2, 924]","[England/CAMC-A58BA4/2020, England/MILK-A06237...","[Sydney, unk]"
910,ORF1ab,ATA,ACA,15095,I,4944,T,25,2020-11-20,2020-12-17,[25],"[England/CAMC-B215A5/2020, England/CAMC-B21587...",[unk]
2182,ORF3a,ACT,ATT,26059,T,223,I,19,2020-10-27,2020-12-19,"[14, 1, 4]","[USA/MI-UM-10036367756/2020, USA/MI-UM-1003641...","[Cape Town, Sydney, unk]"


In [None]:
for rec in cns:
    if rec.name in 

In [86]:
print(f"Initial cleaning...")
seqs, ref_seq = process_cns_seqs(cns, patient_zero,
                                 start_pos=0, end_pos=30000)

ValueError: No records found in handle

In [47]:
seqsdf = identify_replacements_per_sample(seqs, None, ref_seq, GENE2POS)

Identifying mutations...
Mapping Genes to mutations...
Compute codon numbers...
Fetch reference codon...
Fetch alternative codon...
Map amino acids...
Fuse with metadata...


In [48]:
seqsdf.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa'],
      dtype='object')

In [49]:
subs = (seqsdf.groupby(['gene', 'ref_codon', 'alt_codon', 'pos', 'ref_aa', 
                            'codon_num', 'alt_aa'])
    .agg(
     num_samples=('idx', 'nunique'),
#      first_detected=('date', 'min'),
#      last_detected=('date', 'max'),
#      locations=('location', uniq_locs),
#      location_counts=('location', 
#                       lambda x: np.unique(x, return_counts=True)),
     samples=('idx', 'unique')
    )
    .reset_index())

In [51]:
subs.to_csv('b117_usa_substitutions.csv', index=False)

In [70]:
# dels
start_pos = 265
end_pos =29674
gene2pos = GENE2POS

In [71]:
seqs, ref_seq = process_cns_seqs(cns, patient_zero, start_pos, end_pos)
dels = identify_deletions_per_sample(seqs, None, 1)


In [72]:
del_seqs = (dels.groupby(['relative_coords', 'del_len'])
                    .agg(samples=('idx', 'unique'),
                         num_samples=('idx', 'nunique'))
#                          first_detected=('date', 'min'),
#                          last_detected=('date', 'max'),
#                              locations=('location', uniq_locs),
#                          location_counts=('location', 
#                                           lambda x: np.unique(x, return_counts=True)))
                    .reset_index()
                    .sort_values('num_samples'))
# del_seqs['locations'] = del_seqs['location_counts'].apply(lambda x: list(x[0]))
# del_seqs['location_counts'] = del_seqs['location_counts'].apply(lambda x: list(x[1]))
del_seqs['type'] = 'deletion'
# adjust coordinates to account for the nts trimmed from beginning e.g. 265nts
del_seqs['absolute_coords'] = del_seqs['relative_coords'].apply(adjust_coords, args=(start_pos+1,))
del_seqs['pos'] = del_seqs['absolute_coords'].apply(lambda x: int(x.split(':')[0]))
# approximate the gene where each deletion was identified
del_seqs['gene'] = del_seqs['pos'].apply(map_gene_to_pos)
del_seqs = del_seqs.loc[~del_seqs['gene'].isna()]
# filter our substitutions in non-gene positions
del_seqs = del_seqs.loc[del_seqs['gene']!='nan']
# compute codon number of each substitution
del_seqs['codon_num'] = del_seqs.apply(compute_codon_num, args=(gene2pos,), axis=1)
# fetch the reference codon for each substitution
del_seqs['ref_codon'] = del_seqs.apply(get_ref_codon, args=(ref_seq, gene2pos), axis=1)
# fetch the reference and alternative amino acids
del_seqs['ref_aa'] = del_seqs['ref_codon'].apply(get_aa)
# record the 5 nts before each deletion (based on reference seq)
del_seqs['prev_5nts'] = del_seqs['absolute_coords'].apply(lambda x: ref_seq[int(x.split(':')[0])-5:int(x.split(':')[0])])
# record the 5 nts after each deletion (based on reference seq)
del_seqs['next_5nts'] = del_seqs['absolute_coords'].apply(lambda x: ref_seq[int(x.split(':')[1])+1:int(x.split(':')[1])+6])
del_seqs[['type', 'gene', 'absolute_coords', 'del_len', 'pos', 
                 'ref_aa', 'codon_num', 'num_samples','samples',
                 'ref_codon', 'prev_5nts', 'next_5nts'
                 ]].to_csv('b117_usa_deletions.csv', index=False)

In [34]:
in_dir = Path('/home/al/code/HCoV-19-Genomics/consensus_sequences/')
out_dir = Path('/home/al/analysis/mutations/alab_git/cns_seqs')
ref_fp = '/home/al/data/test_inputs/NC045512.fasta'
# fa_fp = concat_fasta(in_dir, out_dir)

In [35]:
# align_fasta_reference(fa_fp, num_cpus=25, ref_fp=ref_fp)

In [36]:
in_dir.listdir()[0]

Path('/home/al/code/HCoV-19-Genomics/consensus_sequences/SEARCH-0247-SAN.fa')

In [37]:
for filename in in_dir.listdir():
    if ((filename.endswith('fa') or filename.endswith('fasta')) and ('JOR' not in filename)):
        copy(filename, '/home/al/analysis/mutations/alab_git/b117/fa')

In [25]:
meta = pd.read_csv('/home/al/analysis/mutations/alab_git/nextstrain_groups_neherlab_ncov_S.N501_metadata.tsv', sep='\t')

In [26]:
strains = meta['Strain'].unique().tolist()
len(strains)

6891

In [28]:
gisaid_seqs = Path('/home/al/analysis/mutations/gisaid/data/sequences_2020-12-29_08-08.fasta')
seqs_data = SeqIO.to_dict(SeqIO.parse(gisaid_seqs, "fasta"))

In [29]:
len(seqs_data.keys())

295507

In [33]:
my_seqs = {}
for name in seqs_data.keys():
    if name in strains:
        my_seqs[name] = seqs_data[name]
len(my_seqs)

6891

In [41]:
out_dir = Path('/home/al/analysis/mutations/alab_git/b117')

In [39]:
with open(out_dir/'b117_seqs.fa', 'w') as handle:
    SeqIO.write(my_seqs.values(), handle, 'fasta')

In [40]:
concat_fasta('/home/al/analysis/mutations/alab_git/b117/fa', 
            '/home/al/analysis/mutations/alab_git/b117/alab_usa')

'/home/al/analysis/mutations/alab_git/b117/alab_usa.fa'

In [None]:
fasta_filepath = out_dir/'b117_seqs.fa'
align_fasta_reference(fasta_filepath, ref_fp=)

In [None]:
consensus_data = SeqIO.to_dict(SeqIO.parse(aligned_seqs, "fasta"))

In [43]:
fa_fp = Path('/home/al/analysis/mutations/alab_git/b117/b117_usa.fasta')
msa_fp = align_fasta_reference(fa_fp, ref_fp='/home/al/data/test_inputs/NC045512.fasta')

In [None]:
msa_fp = 

In [44]:
patient_zero = 'NC_045512.2'

In [45]:
cns = AlignIO.read(msa_fp, 'fasta')
print(f"Initial cleaning...")
seqs, ref_seq = process_cns_seqs(cns, patient_zero,
                                 start_pos=0, end_pos=30000)

Initial cleaning...


In [47]:
seqsdf = identify_replacements_per_sample(seqs, None, ref_seq, GENE2POS)

Identifying mutations...
Mapping Genes to mutations...
Compute codon numbers...
Fetch reference codon...
Fetch alternative codon...
Map amino acids...
Fuse with metadata...


In [48]:
seqsdf.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa'],
      dtype='object')

In [49]:
subs = (seqsdf.groupby(['gene', 'ref_codon', 'alt_codon', 'pos', 'ref_aa', 
                            'codon_num', 'alt_aa'])
    .agg(
     num_samples=('idx', 'nunique'),
#      first_detected=('date', 'min'),
#      last_detected=('date', 'max'),
#      locations=('location', uniq_locs),
#      location_counts=('location', 
#                       lambda x: np.unique(x, return_counts=True)),
     samples=('idx', 'unique')
    )
    .reset_index())

In [51]:
subs.to_csv('b117_usa_substitutions.csv', index=False)

In [70]:
# dels
start_pos = 265
end_pos =29674
gene2pos = GENE2POS

In [71]:
seqs, ref_seq = process_cns_seqs(cns, patient_zero, start_pos, end_pos)
dels = identify_deletions_per_sample(seqs, None, 1)


In [72]:
del_seqs = (dels.groupby(['relative_coords', 'del_len'])
                    .agg(samples=('idx', 'unique'),
                         num_samples=('idx', 'nunique'))
#                          first_detected=('date', 'min'),
#                          last_detected=('date', 'max'),
#                              locations=('location', uniq_locs),
#                          location_counts=('location', 
#                                           lambda x: np.unique(x, return_counts=True)))
                    .reset_index()
                    .sort_values('num_samples'))
# del_seqs['locations'] = del_seqs['location_counts'].apply(lambda x: list(x[0]))
# del_seqs['location_counts'] = del_seqs['location_counts'].apply(lambda x: list(x[1]))
del_seqs['type'] = 'deletion'
# adjust coordinates to account for the nts trimmed from beginning e.g. 265nts
del_seqs['absolute_coords'] = del_seqs['relative_coords'].apply(adjust_coords, args=(start_pos+1,))
del_seqs['pos'] = del_seqs['absolute_coords'].apply(lambda x: int(x.split(':')[0]))
# approximate the gene where each deletion was identified
del_seqs['gene'] = del_seqs['pos'].apply(map_gene_to_pos)
del_seqs = del_seqs.loc[~del_seqs['gene'].isna()]
# filter our substitutions in non-gene positions
del_seqs = del_seqs.loc[del_seqs['gene']!='nan']
# compute codon number of each substitution
del_seqs['codon_num'] = del_seqs.apply(compute_codon_num, args=(gene2pos,), axis=1)
# fetch the reference codon for each substitution
del_seqs['ref_codon'] = del_seqs.apply(get_ref_codon, args=(ref_seq, gene2pos), axis=1)
# fetch the reference and alternative amino acids
del_seqs['ref_aa'] = del_seqs['ref_codon'].apply(get_aa)
# record the 5 nts before each deletion (based on reference seq)
del_seqs['prev_5nts'] = del_seqs['absolute_coords'].apply(lambda x: ref_seq[int(x.split(':')[0])-5:int(x.split(':')[0])])
# record the 5 nts after each deletion (based on reference seq)
del_seqs['next_5nts'] = del_seqs['absolute_coords'].apply(lambda x: ref_seq[int(x.split(':')[1])+1:int(x.split(':')[1])+6])
del_seqs[['type', 'gene', 'absolute_coords', 'del_len', 'pos', 
                 'ref_aa', 'codon_num', 'num_samples','samples',
                 'ref_codon', 'prev_5nts', 'next_5nts'
                 ]].to_csv('b117_usa_deletions.csv', index=False)

In [74]:
del_seqs[['type', 'gene', 'absolute_coords', 'del_len', 'pos', 
                 'ref_aa', 'codon_num', 'num_samples','samples',
                 'ref_codon', 'prev_5nts', 'next_5nts'
                 ]].to_csv('b117_usa_deletions.csv', index=False)

In [62]:
del

Unnamed: 0,idx,sequence,seq_len,del_positions,del_len,relative_coords
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,29903,"[11287, 11288, 11289, 11290, 11291, 11292, 112...",9,11287:11295
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,29903,"[21764, 21765, 21766, 21767, 21768, 21769]",6,21764:21769
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,29903,"[21990, 21991, 21992]",3,21990:21992
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,29903,[28270],1,28270:28270
2,MW422256.1,------------------------------accaaccaactttcga...,29903,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",30,0:29
2,MW422256.1,------------------------------accaaccaactttcga...,29903,"[11287, 11288, 11289, 11290, 11291, 11292, 112...",9,11287:11295
2,MW422256.1,------------------------------accaaccaactttcga...,29903,"[21764, 21765, 21766, 21767, 21768, 21769]",6,21764:21769
2,MW422256.1,------------------------------accaaccaactttcga...,29903,"[21990, 21991, 21992]",3,21990:21992
2,MW422256.1,------------------------------accaaccaactttcga...,29903,[28270],1,28270:28270
2,MW422256.1,------------------------------accaaccaactttcga...,29903,"[29866, 29867, 29868, 29869, 29870, 29871, 298...",37,29866:29902


In [14]:
dels = identify_deletions_per_sample(seqs, None, 1)
dels

Unnamed: 0,idx,sequence,seq_len,del_positions,del_len,relative_coords
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,29903,"[11287, 11288, 11289, 11290, 11291, 11292, 112...",9,11287:11295
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,29903,"[21764, 21765, 21766, 21767, 21768, 21769]",6,21764:21769
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,29903,"[21990, 21991, 21992]",3,21990:21992
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,29903,[28270],1,28270:28270


In [10]:
seqsdf['pos'] = seqsdf['pos'] + 1

In [11]:
positions = [3267, 5388, 6954, 11288, 23063, 23271, 23604, 23709, 24506, 24914,
             27972, 28048, 28111, 28280, 28977]
seqsdf[seqsdf['pos'].isin(positions)]

Unnamed: 0,idx,sequence,replacements,pos,gene,codon_num,ref_codon,alt_codon,ref_aa,alt_aa
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,3266:t,3267,ORF1ab,1001,ACT,ATT,T,I
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,5387:a,5388,ORF1ab,1708,GCT,GAT,A,D
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,6953:c,6954,ORF1ab,2230,ATA,ACA,I,T
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,23062:t,23063,S,501,AAT,TAT,N,Y
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,23270:a,23271,S,570,GCT,GAT,A,D
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,23603:a,23604,S,681,CCT,CAT,P,H
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,23708:t,23709,S,716,ACA,ATA,T,I
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,24505:g,24506,S,982,TCA,GCA,S,A
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,24913:c,24914,S,1118,GAC,CAC,D,H
1,hCoV-19/USA/SEARCH-5574-SAN/2020,nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnnn...,27971:t,27972,ORF8,27,CAA,TAA,Q,_


In [5]:
in_dir = Path('/home/al/code/HCoV-19-Genomics/consensus_sequences/')
out_dir = Path('/home/al/analysis/mutations/alab_git')
meta_fp = Path('/home/al/code/HCoV-19-Genomics/metadata.csv')
msa_fp = '/home/al/analysis/mutations/alab_git/cns_seqs_aligned.fa'

In [6]:
patient_zero = 'NC_045512.2'

In [8]:
cns = AlignIO.read(msa_fp, 'fasta')
seqs, ref_seq = process_cns_seqs(cns, patient_zero,
                                 start_pos=0, end_pos=30000)
seqs_df = identify_replacements_per_sample(seqs,
                                           meta_fp, 
                                           ref_seq,
                                           GENE2POS)

Identifying mutations...
Mapping Genes to mutations...
Compute codon numbers...
Fetch reference codon...
Fetch alternative codon...
Map amino acids...
Fuse with metadata...


In [13]:
q957l = seqs_df.loc[(seqs_df['gene']=='S') & (seqs_df['codon_num']==957) & (seqs_df['alt_aa']=='L')]

q957l.date.min(), q957l.date.max()

(Timestamp('2020-09-12 00:00:00'), Timestamp('2020-11-07 00:00:00'))

In [16]:
seqs_df['location'].unique()

array(['USA/California/San Diego', 'Jordan/Amman', 'Jordan/Irbid',
       'USA/Louisiana/New Orleans', 'USA/California/Imperial',
       'USA/California/Cruise_Ship_2', 'USA/California/Cruise_Ship_1',
       'MEX/Baja California/Tijuana', 'Jordan/Mafraq',
       'USA/California/Los Angeles', 'USA/California/Riverside',
       'USA/California/Santa Barbara ',
       'MEX/Sonora/San Luis Río Colorado', 'MEX/Baja California/Mexicali',
       'USA/Arizona/Yuma', 'USA/California/San Bernadino',
       'USA/West Virginia/Kanawha', 'USA/California/Orange',
       'MEX/Baja California/Ensenada', 'MEX/Baja California/Rosarito',
       'USA/California/Kern', 'Jordan/Zarqa', 'Jordan/Aqaba'],
      dtype=object)

In [42]:
jor_seqs = seqs_df[seqs_df['location'].str.contains('Jordan')]
print(jor_seqs['idx'].unique().shape)

(490,)


In [43]:
mutation_filter = (jor_seqs['gene']=='S') & (jor_seqs['codon_num']==957) & (jor_seqs['alt_aa']=='L')
jor_seqs['q957l'] = False
jor_seqs.loc[mutation_filter, 'q957l'] = True
jor_seqs['q957l'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jor_seqs['q957l'] = False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


False    8046
True      388
Name: q957l, dtype: int64

**Out of the 8046 total substitution-based mutations, 388 were Q957L among our Jordanian samples that have been released so far.**

In [44]:
jor_seqs['date'].min(), jor_seqs['date'].max()

(Timestamp('2020-03-16 00:00:00'), Timestamp('2020-11-07 00:00:00'))

In [45]:
jor_seqs[jor_seqs['q957l']]['date'].min(), jor_seqs[jor_seqs['q957l']]['date'].max()

(Timestamp('2020-09-12 00:00:00'), Timestamp('2020-11-07 00:00:00'))

In [50]:
jor_seqs['idx']

31       Consensus_SEARCH-0032-JOR_L1_L2_L3_L4_threshol...
32       Consensus_SEARCH-0032-JOR_L1_L2_L3_L4_threshol...
33       Consensus_SEARCH-0032-JOR_L1_L2_L3_L4_threshol...
34       Consensus_SEARCH-0032-JOR_L1_L2_L3_L4_threshol...
35       Consensus_SEARCH-0032-JOR_L1_L2_L3_L4_threshol...
                               ...                        
39761                     hCoV-19/JOR/SEARCH-5327-JOR/2020
39762                     hCoV-19/JOR/SEARCH-5327-JOR/2020
39763                     hCoV-19/JOR/SEARCH-5327-JOR/2020
39764                     hCoV-19/JOR/SEARCH-5327-JOR/2020
39765                     hCoV-19/JOR/SEARCH-5327-JOR/2020
Name: idx, Length: 8434, dtype: object

In [46]:
jor_seqs.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa', 'ID', 'gb_accession',
       'gisaid_accession', 'collection_date', 'location',
       'percent_coverage_cds', 'avg_depth', 'authors', 'originating_lab',
       'fasta_hdr', 'date', 'q957l'],
      dtype='object')

In [70]:
all_samples = jor_seqs['idx'].unique()
len(all_samples)

490

In [71]:
mutant_samples = jor_seqs[jor_seqs['q957l']]['idx'].unique()
len(mutant_samples)

388

In [73]:
jor_seqs.loc[:, 'is_nonsyn_mutation'] = False
jor_seqs.loc[jor_seqs['alt_aa']!=jor_seqs['ref_aa'], 'is_nonsyn_mutation'] = True
jor_seqs.loc[:, 'is_S_nonsyn_mutation'] = False
jor_seqs.loc[(jor_seqs['alt_aa']!=jor_seqs['ref_aa']) & (jor_seqs['gene']=='S'), 'is_S_nonsyn_mutation'] = True

In [103]:
mnthly_cnts = jor_seqs.groupby('month').agg(total_samples=('idx', 'nunique'), 
                                            mutated_samples=('q957l', 'sum'))
mnthly_cnts['mutation_freq'] = mnthly_cnts['mutated_samples'] / mnthly_cnts['total_samples']
mnthly_cnts

Unnamed: 0_level_0,total_samples,mutated_samples,mutation_freq
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,22,0,0.0
4,6,0,0.0
6,4,0,0.0
7,2,0,0.0
9,65,56,0.861538
10,347,300,0.864553
11,44,32,0.727273


In [74]:
jor_seqs[['is_nonsyn_mutation', 'is_S_nonsyn_mutation']].sum() / len(all_samples)

is_nonsyn_mutation      11.591837
is_S_nonsyn_mutation     1.983673
dtype: float64

In [75]:
jor_seqs[jor_seqs['idx'].isin(mutant_samples)][['is_nonsyn_mutation', 'is_S_nonsyn_mutation']].sum() / len(mutant_samples)

is_nonsyn_mutation      12.471649
is_S_nonsyn_mutation     2.195876
dtype: float64

In [94]:
jor_seqs[~jor_seqs['idx'].isin(mutant_samples)][['is_nonsyn_mutation', 'is_S_nonsyn_mutation']].sum() / (len(all_samples) - len(mutant_samples))

is_nonsyn_mutation      8.245098
is_S_nonsyn_mutation    1.176471
dtype: float64

In [100]:
print("All Samples\n")

print(f"{len(all_samples)} total samples from Jordan")
print(f"An average of 11.59 non-synonymous mutations per sample")
print(f"An average of 1.98 non-synonymous mutations in S gene per sample\n")

print("Mutated Samples\n")
print(f"{len(mutant_samples)} total samples from Jordan with Q957L mutation")
print(f"An average of 12.47 non-synonymous mutations per sample")
print(f"An average of 2.20 non-synonymous mutations in S gene per sample\n")

print("Non-mutated Samples\n")
print(f"{len(all_samples)-len(mutant_samples)} total samples from Jordan without Q957L mutation")
print(f"An average of 8.25 non-synonymous mutations per sample")
print(f"An average of 1.18 non-synonymous mutations in S gene per sample")

All Samples

490 total samples from Jordan
An average of 11.59 non-synonymous mutations per sample
An average of 1.98 non-synonymous mutations in S gene per sample

Mutated Samples

388 total samples from Jordan with Q957L mutation
An average of 12.47 non-synonymous mutations per sample
An average of 2.20 non-synonymous mutations in S gene per sample

Non-mutated Samples

102 total samples from Jordan without Q957L mutation
An average of 8.25 non-synonymous mutations per sample
An average of 1.18 non-synonymous mutations in S gene per sample


In [86]:
mutants = jor_seqs[(jor_seqs['idx'].isin(mutant_samples))].groupby('idx').agg(nonsyn_mutations=('is_nonsyn_mutation', 'sum'), s_nonsyn_mutations=('is_S_nonsyn_mutation', 'sum'))
print(mutants.shape)
mutants.head()

(388, 2)


Unnamed: 0_level_0,nonsyn_mutations,s_nonsyn_mutations
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
hCoV-19/JOR/SEARCH-3901-JOR/2020,12,2
hCoV-19/JOR/SEARCH-3903-JOR/2020,12,2
hCoV-19/JOR/SEARCH-3904-JOR/2020,11,2
hCoV-19/JOR/SEARCH-3905-JOR/2020,13,2
hCoV-19/JOR/SEARCH-3906-JOR/2020,11,3


In [87]:
nonmutants = jor_seqs[(~jor_seqs['idx'].isin(mutant_samples))].groupby('idx').agg(nonsyn_mutations=('is_nonsyn_mutation', 'sum'), s_nonsyn_mutations=('is_S_nonsyn_mutation', 'sum'))
print(nonmutants.shape)
nonmutants.head()

(102, 2)


Unnamed: 0_level_0,nonsyn_mutations,s_nonsyn_mutations
idx,Unnamed: 1_level_1,Unnamed: 2_level_1
Consensus_SEARCH-0032-JOR_L1_L2_L3_L4_threshold_0_quality_20,8,1
Consensus_SEARCH-0033-JOR_L1_L2_L3_L4_threshold_0_quality_20,6,2
Consensus_SEARCH-0034-JOR_L1_L2_L3_L4_threshold_0_quality_20,5,1
Consensus_SEARCH-0035-JOR_L1_L2_L3_L4_threshold_0_quality_20,7,0
Consensus_SEARCH-0036-JOR_L1_L2_L3_L4_threshold_0_quality_20,5,1


In [92]:
ttest_ind(mutants['nonsyn_mutations'].values, nonmutants['nonsyn_mutations'].values)

Ttest_indResult(statistic=16.776774111466757, pvalue=3.3066564792193805e-50)

In [93]:
ttest_ind(mutants['s_nonsyn_mutations'].values, nonmutants['s_nonsyn_mutations'].values)

Ttest_indResult(statistic=18.795936825009285, pvalue=1.0782681771236335e-59)

In [3]:
df = pd.read_csv("/home/al/analysis/2020-12-24_release/insertions.csv")
df

Unnamed: 0,type,gene,absolute_coords,ins_len,pos,ref_aa,codon_num,num_samples,first_detected,last_detected,locations,location_counts,samples,ref_codon,prev_5nts,next_5nts
0,insertion,ORF1ab,11077:11079,3,11077,Y,3605,1,2020-08-24,2020-08-24,['JOR/Amman'],[1],['hCoV-19/JOR/SEARCH-5444-JOR/2020'],TAC,tgttc,ttttt
1,insertion,ORF1ab,17702:17702,1,17702,_,5813,1,2020-08-22,2020-08-22,['JOR/Amman'],[1],['hCoV-19/JOR/SEARCH-5448-JOR/2020'],TAA,tctgc,aatta
2,insertion,ORF1ab,1026:1026,1,1026,E,254,1,2020-11-18,2020-11-18,['USA/California/San Diego'],[1],['hCoV-19/USA/SEARCH-5569-SAN/2020'],GAA,acacc,ttttg
3,insertion,ORF1ab,8819:8819,1,8819,_,2852,1,2020-09-14,2020-09-14,['JOR/Amman'],[1],['hCoV-19/JOR/SEARCH-5441-JOR/2020'],TAG,gcttg,cccat
4,insertion,ORF1ab,6697:6697,1,6697,V,2145,3,2020-08-31,2020-09-12,['JOR/Amman'],[3],['hCoV-19/JOR/SEARCH-5431-JOR/2020' 'hCoV-19/J...,GTT,aagcc,ttttc


In [47]:
codons_of_interest = [69, 70, 144, 145, 681, 716, 982, 1118, 501, 570]

In [52]:
df[(df['gene']=='S') & (df['codon_num'].isin(codons_of_interest))].sort_values('num_samples', ascending=False).iloc[0].values

array(['S', 23604, 'P', 681, 'H', 3, '2020-12-04', '2020-12-14', '[3]',
       "['SEARCH-5356-SAN' 'SEARCH-5365-SAN' 'SEARCH-5381-SAN']",
       "['USA/California/San Diego']"], dtype=object)

In [50]:
dels = pd.read_csv("/home/al/analysis/2020-12-24_release/deletions.csv")
dels[(dels['gene']=='S') & (dels['codon_num'].isin(codons_of_interest))].sort_values('num_samples', ascending=False)

Unnamed: 0,type,gene,absolute_coords,del_len,pos,ref_aa,codon_num,num_samples,first_detected,last_detected,locations,location_counts,samples,ref_codon,prev_5nts,next_5nts


In [54]:
subs = pd.read_csv("/home/al/code/HCoV-19-Genomics/variants/substitutions_22-12-2020.csv")
subs[(subs['gene']=='S') & (subs['codon_num'].isin(codons_of_interest))].sort_values('num_samples', ascending=False)

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,location_counts,samples,...,gisaid_last_detected,gisaid_num_locations,gisaid_location_counts,gisaid_num_states,gisaid_state_counts,gisaid_num_countries,gisaid_country_counts,gisaid_locations,gisaid_states,gisaid_countries
250,S,23604,P,681,H,13,2020-07-30,2020-12-03,[13],['SEARCH-3061-SAN' 'SEARCH-3067-SAN' 'SEARCH-3...,...,2020-12-19,33.0,"[5, 4, 1, 1, 3, 1, 1, 1, 1, 1, 1, 30, 1, 2, 2,...",40.0,"[8, 4, 5, 2, 66, 5, 1, 1848, 9, 1, 14, 2, 2, 1...",13.0,"[4, 41, 1, 50, 5, 11, 3, 3, 2, 6, 4, 182, 1869]","['Alachua County', 'Alameda County', 'Albany C...","['Arizona', 'Auckland', 'Balear Islands', 'Bas...","['Australia', 'Denmark', 'Germany', 'India', '..."
1002,S,24914,D,1118,Y,3,2020-04-26,2020-05-04,[3],['SEARCH-0237-SAN' 'SEARCH-0532-SAN' 'SEARCH-0...,...,2020-12-18,8.0,"[1, 1, 1, 2, 3, 1, 1, 63]",23.0,"[1, 1, 2, 3, 1, 13, 3, 1, 1, 1, 2, 2, 1, 2, 1,...",10.0,"[2, 1, 35, 1, 1, 1, 2, 13, 1, 16]","['Chicago', 'London', 'Multnomah County', 'New...","['Abu Dhabi', 'Australia', 'Bangkok', 'Califor...","['Australia', 'Bangladesh', 'Denmark', 'Nether..."
1042,S,23709,T,716,I,3,2020-05-16,2020-09-30,"[2, 1]",['SEARCH-0681-IPL' 'SEARCH-3916-JOR' 'SEARCH-3...,...,2020-12-19,8.0,"[3, 1, 1, 2, 1, 1, 2, 1908]",27.0,"[2, 1, 1, 1836, 1, 1, 13, 1, 1, 3, 1, 4, 9, 2,...",11.0,"[4, 1, 18, 1, 1, 2, 1, 13, 1, 13, 1864]","['Caddo Parish', 'Gibraltar', 'Greater Houston...","['Amman', 'Arizona', 'California', 'England', ...","['Australia', 'Canada', 'Denmark', 'France', '..."
1048,S,23604,P,681,R,3,2020-12-03,2020-12-03,[3],['SEARCH-5114-SAN' 'SEARCH-5118-SAN' 'SEARCH-5...,...,2020-12-14,6.0,"[1, 2, 3, 1, 1, 20]",11.0,"[2, 4, 8, 5, 3, 1, 1, 1, 1, 1, 1]",10.0,"[1, 5, 1, 4, 1, 10, 1, 1, 1, 3]","['Baney', 'Counties Manukau', 'Malabo', 'New Y...","['Auckland', 'Bioko Norte', 'Canterbury', 'Dha...","['Australia', 'Bangladesh', 'Denmark', 'Equato..."
2043,S,23064,N,501,I,1,2020-10-05,2020-10-05,[1],['SEARCH-5210-JOR'],...,,,,,,,,,,
2614,S,21994,Y,144,Y,1,2020-06-13,2020-06-13,[1],['SEARCH-3824-IPL'],...,2020-12-17,6.0,"[1, 2, 1, 2, 1, 11]",8.0,"[1, 3, 1, 2, 1, 8, 1, 1]",6.0,"[3, 1, 1, 3, 1, 9]","['Barcelona', 'Brahmanbaria', 'Imperial County...","['Abu Dhabi', 'California', 'Catalunya', 'Chat...","['Bangladesh', 'Spain', 'Sweden', 'USA', 'Unit..."
2615,S,21997,Y,145,Y,1,2020-09-25,2020-09-25,[1],['SEARCH-4479-SAN'],...,2020-12-18,9.0,"[1, 6, 1, 2, 1, 3, 2, 2, 29]",16.0,"[1, 3, 3, 1, 13, 2, 2, 1, 1, 2, 2, 1, 3, 3, 2, 7]",12.0,"[4, 2, 1, 2, 1, 1, 3, 7, 3, 1, 9, 13]","['Bitou', 'George', 'San Diego', 'Santa Clara ...","['Aargau', 'Basque Country', 'California', 'Eg...","['Australia', 'Denmark', 'Egypt', 'India', 'It..."
2625,S,21772,V,70,V,1,2020-04-22,2020-04-22,[1],['SEARCH-1700-TIJ'],...,2020-12-18,19.0,"[11, 1, 3, 2, 2, 1, 2, 1, 1, 1, 11, 40, 1, 2, ...",37.0,"[1, 40, 1, 4, 41, 2, 1, 2, 14, 1, 2, 1, 2, 2, ...",16.0,"[4, 1, 32, 1, 1, 2, 1, 1, 39, 1, 3, 8, 1, 2, 8...","['Azovo', 'Brooklyn', 'Ekaterinburg', 'El Paso...","['Baja California', 'California', 'Denmark', '...","['Australia', 'Chile', 'Denmark', 'Gambia', 'H..."
2634,S,21770,V,70,F,1,2020-07-04,2020-07-04,[1],['SEARCH-2559-TIJ'],...,2020-12-17,14.0,"[1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 1, 73]",25.0,"[1, 1, 1, 5, 1, 21, 1, 3, 1, 2, 1, 1, 2, 5, 4,...",11.0,"[3, 1, 6, 1, 1, 31, 3, 1, 1, 17, 24]","['Caddo Parish', 'Contra Costa County', 'Great...","['Arizona', 'Baja California', 'Cairo', 'Calif...","['Denmark', 'Egypt', 'India', 'Indonesia', 'Me..."


In [30]:
codons_of_interest = [501, 957, 67, 68, 69, 417, 484]

In [35]:
(df[(df['gene']=='S') & (df['codon_num'].isin(codons_of_interest))]
 .sort_values('num_samples', ascending=False)
 .iloc[:99])

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,location_counts,samples
1012,S,24432,Q,957,L,67,2020-08-13,2020-11-07,"(array(['JOR/Amman', 'JOR/Aqaba', 'JOR/Irbid']...",['SEARCH-5399-JOR' 'SEARCH-5400-JOR' 'SEARCH-5...
919,S,21767,H,69,Y,1,2020-09-03,2020-09-03,"(array(['JOR/Amman'], dtype=object), array([1]))",['SEARCH-5468-JOR']
965,S,23012,E,484,K,1,2020-12-04,2020-12-04,"(array(['USA/California/San Diego'], dtype=obj...",['SEARCH-5383-SAN']


In [36]:
ddf = pd.read_csv("/home/al/analysis/2020-12-24_release/deletions.csv")
(ddf[(ddf['gene']=='S') & (ddf['codon_num'].isin(codons_of_interest))]
 .sort_values('num_samples', ascending=False)
 .iloc[:99])

Unnamed: 0,type,gene,absolute_coords,del_len,pos,ref_aa,codon_num,num_samples,first_detected,last_detected,location_counts,samples,ref_codon,prev_5nts,next_5nts


In [15]:
df = pd.read_csv("/home/al/data/release_summary_csv/release_summary_24_12_2020.csv")
def check_date(x):
    if type(x)==str: return True
    return False
df = df[df['Collection date'].apply(check_date)]

In [None]:
Path.isfile()

In [16]:
# clean and process sample collection dates
df = df.loc[(df['Collection date']!='Unknown') 
               & (df['Collection date']!='1900-01-00')]
df.loc[df['Collection date'].str.contains('/'), 'Collection date'] = df['Collection date'].apply(lambda x: x.split('/')[0])
df['date'] = pd.to_datetime(df['Collection date'])

In [21]:
df.columns

Index(['Sample ID', 'SEARCH SampleID', 'Ready for release?',
       'New sequences ready for release', 'Released?', 'Submitter',
       'FASTA filename', 'Virus name', 'Type', 'Passage details/history',
       'Collection date', 'Location', 'Additional location information',
       'Host', 'Additional host information', 'Gender', 'Patient age',
       'Patient status', 'Specimen source', 'Outbreak', 'Last vaccinated',
       'Treatment', 'Sequencing technology', 'Assembly method', 'Coverage',
       'Originating lab', 'Address', 'Sample ID given by the sample provider',
       'Submitting lab', 'Address.1',
       'Sample ID given by the submitting laboratory', 'Authors', 'Comment',
       'Comment Icon', 'date', 'idx'],
      dtype='object')

In [22]:
df['idx'] = df['SEARCH SampleID'].apply(lambda x: x.split('-')[1]).astype(int)
df.loc[df['idx']>5325, 'New sequences ready for release'] = 'Yes'

In [23]:
df.to_csv("/home/al/data/release_summary_csv/release_summary_24_12_2020.csv", index=False)

In [2]:
import pandas as pd

In [11]:
from bjorn import *
from bjorn_support import *
from onion_trees import *
import gffutils
import math
from mutations import *
import ast

In [160]:
subs = pd.read_csv("/home/al/analysis/mutations/gisaid/gisaid_replacements_aggregated_19-12-2020.csv")

In [161]:
dels = pd.read_csv("/home/al/analysis/mutations/gisaid/gisaid_deletions_aggregated_19-12-2020.csv")

In [162]:
strain_dels = dels[(dels['gene']=='S') & (dels['codon_num'].isin([67, 68, 69]))].drop_duplicates(subset=['absolute_coords']).reset_index(drop=True)

In [163]:
strain_subs = subs[(subs['gene']=='S') & (subs['codon_num'].isin([501, 417, 484]))].drop_duplicates(subset=['gene', 'codon_num', 'alt_aa']).reset_index(drop=True)

In [164]:
strain_subs.shape, strain_dels.shape

((16, 17), (4, 21))

In [165]:
strain_subs.columns

Index(['gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa', 'num_samples',
       'first_detected', 'last_detected', 'num_locations', 'location_counts',
       'num_divisions', 'division_counts', 'num_countries', 'country_counts',
       'locations', 'divisions', 'countries'],
      dtype='object')

In [166]:
def is_in(x, loc):
    for i in eval(x):
        if loc in i.lower():
            return True
    return False
strain_subs['isin_SD'] = strain_subs['locations'].apply(is_in, args=('san diego',))
strain_subs['isin_CA'] = strain_subs['divisions'].apply(is_in, args=('california',))
strain_subs['isin_US'] = strain_subs['countries'].apply(is_in, args=('usa',))

In [167]:
strain_dels.columns

Index(['relative_coords', 'del_len', 'samples', 'num_samples',
       'first_detected', 'last_detected', 'location_counts', 'division_counts',
       'country_counts', 'type', 'absolute_coords', 'pos', 'gene', 'codon_num',
       'ref_codon', 'ref_aa', 'prev_5nts', 'next_5nts', 'locations',
       'divisions', 'countries'],
      dtype='object')

In [168]:
strain_dels['isin_SD'] = strain_dels['locations'].apply(is_in, args=('san diego',))
strain_dels['isin_CA'] = strain_dels['divisions'].apply(is_in, args=('california',))
strain_dels['isin_US'] = strain_dels['countries'].apply(is_in, args=('usa',))

In [169]:
strain_dels.to_csv('/home/al/analysis/mutations/S501Y/gisaid_strain_deletions.csv', index=False)
strain_subs.to_csv('/home/al/analysis/mutations/S501Y/gisaid_strain_substitutions.csv', index=False)

In [170]:
strain_subs

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,num_locations,location_counts,num_divisions,division_counts,num_countries,country_counts,locations,divisions,countries,isin_SD,isin_CA,isin_US
0,S,22812,K,417,R,1,2020-06-23,2020-06-23,1,[1],1,[1],1,[1],['unk'],['Israel'],['Israel'],False,False,False
1,S,22813,K,417,K,45,2020-07-24,2020-12-02,3,"[1, 1, 43]",4,"[1, 1, 41, 2]",3,"[1, 1, 43]","['Asotin County', 'Yakima County', 'unk']","['Japan', 'Nordjylland', 'Utah', 'Washington']","['Denmark', 'Japan', 'USA']",False,False,True
2,S,22813,K,417,N,215,2020-05-19,2020-12-18,16,"[17, 3, 59, 1, 1, 1, 47, 10, 2, 42, 2, 15, 2, ...",8,"[2, 60, 2, 12, 1, 1, 1, 136]",4,"[2, 1, 208, 4]","['Bitou', 'Chris Hani', 'George', 'Harry Gwala...","['Bavaria', 'Eastern Cape', 'England', 'KwaZul...","['Germany', 'Norway', 'South Africa', 'United ...",False,False,False
3,S,23012,E,484,K,254,2020-06-06,2020-12-18,25,"[16, 1, 1, 3, 59, 1, 1, 1, 46, 10, 2, 40, 1, 2...",28,"[6, 1, 1, 2, 1, 1, 57, 1, 8, 1, 1, 1, 14, 2, 1...",13,"[1, 6, 3, 1, 1, 2, 2, 209, 3, 2, 2, 13, 9]","['Bitou', 'Caceres', 'Castellon de la Plana', ...","['Bahrain', 'Basel-Land', 'Bouafle', 'Bouaké',...","['Australia', 'Bahrain', ""Côte d'Ivoire"", 'Ecu...",True,True,True
4,S,23012,E,484,Q,40,2020-05-28,2020-12-14,6,"[2, 1, 1, 1, 1, 34]",16,"[1, 1, 11, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 13...",12,"[1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 3, 24]","['Daskroi', 'Greater Houston Area', 'San Diego...","['Bahrain', 'California', 'England', 'Gujarat'...","['Australia', 'Bahrain', 'Belgium', 'Denmark',...",True,True,True
5,S,23012,E,484,R,3,2020-09-07,2020-12-10,2,"[1, 2]",2,"[1, 2]",2,"[1, 2]","['San Diego', 'unk']","['California', 'England']","['USA', 'United Kingdom']",True,True,True
6,S,23013,E,484,A,10,2020-05-17,2020-12-12,3,"[1, 1, 8]",9,"[1, 1, 1, 1, 1, 1, 1, 1, 2]",4,"[1, 3, 4, 2]","['Gandia', 'eThekwini', 'unk']","['Arizona', 'Canary Islands', 'Comunitat Valen...","['South Africa', 'Spain', 'USA', 'United Kingd...",False,False,True
7,S,23013,E,484,G,2,2020-11-24,2020-12-10,2,"[1, 1]",2,"[1, 1]",2,"[1, 1]","['Madrid', 'unk']","['Japan', 'Madrid']","['Japan', 'Spain']",False,False,False
8,S,23013,E,484,V,1,2020-12-01,2020-12-01,1,[1],1,[1],1,[1],['unk'],['Florida'],['USA'],False,False,True
9,S,23014,E,484,D,3,2020-06-01,2020-10-02,3,"[1, 1, 1]",3,"[1, 1, 1]",3,"[1, 1, 1]","['Marseille', 'Munich', 'unk']","['Bavaria', ""Provence-Alpes-Côte d'Azur"", 'Tra...","['France', 'Germany', 'Thailand']",False,False,False


In [3]:
in_dir = Path('/home/al/code/HCoV-19-Genomics/consensus_sequences/')
out_dir = Path('/home/al/analysis/mutations/alab_git')
meta_fp = Path('/home/al/code/HCoV-19-Genomics/metadata.csv')

In [111]:
fasta_fp = concat_fasta(in_dir, out_dir/'cns_seqs')

In [112]:
msa_fp = align_fasta_reference(fasta_fp, num_cpus=20, ref_fp='/home/gk/code/hCoV19/db/NC045512.fasta')

In [4]:
msa_fp = '/home/al/analysis/mutations/alab_git/cns_seqs_aligned.fa'

In [5]:
subs = identify_replacements(msa_fp, meta_fp)

Loading Alignment file at: /home/al/analysis/mutations/alab_git/cns_seqs_aligned.fa
Initial cleaning...
Creating a dataframe...
Identifying mutations...
Mapping Genes to mutations...
Compute codon numbers...
Fetch reference codon...
Fetch alternative codon...
Map amino acids...
Fuse with metadata...


In [117]:
subs.explode('samples')['samples'].unique().shape

(3252,)

In [9]:
subs.sort_values('num_samples', ascending=False).to_csv(out_dir/"substitutions_22-12-2020_orig.csv", index=False)

In [123]:
dels = identify_deletions(msa_fp, meta_fp, min_del_len=1)

In [125]:
dels.sort_values('num_samples', ascending=False).to_csv(out_dir/"deletions_22-12-2020_orig.csv", index=False)

In [None]:
align_fasta_reference(seqs_fp, num_cpus=25, ref_fp=ref_fp)

## CNS Mutations Report

In [6]:
analysis_folder = Path('/home/al/code/HCoV-19-Genomics/consensus_sequences/')
meta_fp = Path('/home/al/code/HCoV-19-Genomics/metadata.csv')
ref_path = Path('/home/gk/code/hCoV19/db/NC045512.fasta')
patient_zero = 'NC_045512.2'
in_fp = '/home/al/analysis/mutations/S501Y/msa_aligned.fa'

In [3]:
subs = identify_replacements(in_fp, meta_fp)

In [4]:
subs.head()

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,locations,location_counts,samples
0,3UTR,29679,S,2,F,2,2020-07-14,2020-10-07,USA/California/San Diego,2,"[SEARCH-3119-SAN, SEARCH-4245-SAN]"
1,3UTR,29681,L,3,L,2,2020-07-11,2020-07-21,"[USA/California/Los Angeles, USA/California/Sa...","[1, 1]","[SEARCH-2600-SAN, SEARCH-2692-LAX]"
2,3UTR,29688,S,5,I,4,2020-03-25,2020-10-26,USA/California/San Diego,4,"[SEARCH-0113-SAN, SEARCH-2855-SAN, SEARCH-3609..."
3,3UTR,29690,V,6,L,1,2020-08-13,2020-08-13,USA/California/San Diego,1,[SEARCH-4455-SAN]
4,3UTR,29692,V,6,V,1,2020-10-03,2020-10-03,Jordan/Amman,1,[SEARCH-4034-JOR]


In [5]:
dels = identify_deletions(in_fp, meta_fp, patient_zero)
dels

Unnamed: 0,type,gene,absolute_coords,del_len,pos,ref_aa,codon_num,num_samples,first_detected,last_detected,locations,location_counts,samples,ref_codon,prev_5nts,next_5nts
30,deletion,ORF7a,27538:27572,35,27538,L,49,1,2020-04-28,2020-04-28,MEX/Baja California/Tijuana,1,[hCoV-19/MEX/SEARCH-1480-TIJ/2020],CTA,tcctc,actca
27,deletion,ORF6,27264:27290,27,27264,F,22,1,2020-07-28,2020-07-28,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-3236-SAN/2020],TTT,ggact,tacat
28,deletion,ORF6,27266:27293,28,27266,F,22,1,2020-04-23,2020-04-23,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-1921-SAN/2020],TTT,acttt,atcat
29,deletion,ORF7a,27498:27531,34,27498,S,36,1,2020-05-12,2020-05-12,USA/California/Imperial,1,[hCoV-19/USA/SEARCH-0573-IPL/2020],TCT,cttgc,atcct
59,deletion,ORF1ab,6656:6679,24,6656,N,2131,1,2020-06-13,2020-06-13,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-3559-SAN/2020],AAT,tgtta,ctaat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,deletion,ORF3a,26158:26161,4,26158,V,256,9,2020-07-17,2020-09-14,USA/California/San Diego,9,"[hCoV-19/USA/SEARCH-2907-SAN/2020, hCoV-19/USA...",GTT,agttg,tccag
16,deletion,ORF1ab,516:518,3,516,V,84,10,2020-07-22,2020-08-01,USA/California/San Diego,10,"[hCoV-19/USA/SEARCH-4342-SAN/2020, hCoV-19/USA...",GTT,catgt,ggttg
56,deletion,ORF1ab,686:694,9,686,K,141,17,2020-03-31,2020-10-03,"[Jordan/Amman, USA/California/San Diego, USA/L...","[2, 13, 2]","[hCoV-19/USA/SEARCH-0264-NBG/2020, hCoV-19/USA...",AAG,tctaa,actta
51,deletion,N,28890:28901,12,28890,S,206,23,2020-06-16,2020-08-06,USA/California/San Diego,23,"[hCoV-19/USA/SEARCH-2285-SAN/2020, hCoV-19/USA...",TCT,acttc,ggctg


In [3]:
dels[dels['gene']=='S'].sort_values('num_samples', ascending=False)#.to_csv('S_deletions_consensus.csv', index=False)

NameError: name 'dels' is not defined

In [4]:
identify_insertions(in_fp, patient_zero).to_csv('test.csv', index=False)

## dev 

In [10]:
GENE2POS = {
            '5UTR': {'start': 0, 'end': 265},
            'ORF1ab': {'start': 265, 'end': 21555},
            'S': {'start': 21562, 'end': 25384},
            'ORF3a': {'start': 25392, 'end': 26220},
            'E': {'start': 26244, 'end': 26472},
            'M': {'start': 26522, 'end': 27191},
            'ORF6': {'start': 27201, 'end': 27387},
            'ORF7a': {'start': 27393, 'end': 27759},
            'ORF7b': {'start': 27755, 'end': 27887},
            'ORF8': {'start': 27893, 'end': 28259},
            'N': {'start': 28273, 'end': 29533},
            'ORF10': {'start': 29557, 'end': 29674},
            '3UTR': {'start': 29674, 'end': 29902}
           }

In [11]:
in_dir = '/home/al/analysis/mutations/fa/'
out_dir = '/home/al/analysis/mutations/msa/'

In [14]:
!rm -r /home/al/analysis/mutations
!mkdir /home/al/analysis/mutations
!mkdir /home/al/analysis/mutations/fa

In [15]:
for filename in analysis_folder.listdir():
    if (filename.endswith('fa') or filename.endswith('fasta')):
        copy(filename, '/home/al/analysis/mutations/fa/')
#         print(filename)

In [179]:
copy(ref_path, in_dir)

'/home/al/analysis/mutations/fa/NC045512.fasta'

In [180]:
in_dir = '/home/al/analysis/mutations/fa/'
out_dir = '/home/al/analysis/mutations/msa'
concat_fasta(in_dir, out_dir)

'/home/al/analysis/mutations/msa.fa'

In [17]:
align_fasta_reference('/home/al/analysis/mutations/msa.fa',  num_cpus=12, ref_fp=ref_path)

'/home/al/analysis/mutations/msa_aligned.fa'

In [14]:
cns = AlignIO.read('/home/al/analysis/mutations/msa_aligned.fa', 'fasta')

In [15]:
ref_seq = get_seq(cns, patient_zero)

In [16]:
len(ref_seq)

29903

In [17]:
seqs = get_seqs(cns, 0, 30000)

In [18]:
seqsdf = (pd.DataFrame(index=seqs.keys(), data=seqs.values(), columns=['sequence'])
                .reset_index().rename(columns={'index': 'idx'}))

In [19]:
# seqsdf

In [20]:
def find_replacements(x, ref):
    return [f'{i}:{n}' for i, n in enumerate(x) 
            if n!=ref[i] and n!='-' and n!='n']

In [21]:
seqsdf['replacements'] = seqsdf['sequence'].apply(find_replacements, args=(ref_seq,))

In [22]:
seqsdf = seqsdf.explode('replacements')
seqsdf['pos'] = -1
seqsdf.loc[~seqsdf['replacements'].isna(), 'pos'] = seqsdf.loc[~seqsdf['replacements'].isna(), 'replacements'].apply(lambda x: int(x.split(':')[0]))
seqsdf = seqsdf.loc[seqsdf['pos']!=-1]

In [23]:
def compute_codon_num(x, gene2pos: dict):
    pos = x['pos']
    ref_pos = gene2pos[x['gene']]['start']
    return math.ceil((pos - ref_pos + 1) / 3)

In [24]:
seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
seqsdf = seqsdf.loc[~seqsdf['gene'].isna()]
seqsdf = seqsdf.loc[seqsdf['gene']!='nan']
seqsdf['codon_num'] = seqsdf.apply(compute_codon_num, args=(GENE2POS,), axis=1)

In [25]:
def get_ref_codon(x, ref_seq, gene2pos: dict):
    ref_pos = gene2pos[x['gene']]['start']
    codon_start = ref_pos + ((x['codon_num'] - 1) * 3)
    return ref_seq[codon_start: codon_start+3].upper()
seqsdf['ref_codon'] = seqsdf.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)

In [26]:
def get_alt_codon(x, gene2pos: dict):
    ref_pos = gene2pos[x['gene']]['start']
    codon_start = ref_pos + ((x['codon_num'] - 1) * 3)
    return x['sequence'][codon_start: codon_start+3].upper()
seqsdf['alt_codon'] = seqsdf.apply(get_alt_codon, args=(GENE2POS,), axis=1)

In [27]:
def get_aa(codon: str):
    CODON2AA = { 
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',                  
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L', 
        'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_', 
        'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W', 
    } 
    return CODON2AA.get(codon, 'nan')
seqsdf['ref_aa'] = seqsdf['ref_codon'].apply(get_aa)
seqsdf['alt_aa'] = seqsdf['alt_codon'].apply(get_aa)
seqsdf = seqsdf.loc[seqsdf['alt_aa']!='nan']

In [28]:
seqsdf.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa'],
      dtype='object')

In [29]:
meta = pd.read_csv(meta_fp)
print(seqsdf['idx'].unique().shape)
seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='fasta_hdr')
print(seqsdf['idx'].unique().shape)

(2765,)
(2765,)


In [30]:
seqsdf = seqsdf.loc[(seqsdf['collection_date']!='Unknown') 
                   & (seqsdf['collection_date']!='1900-01-00')]
seqsdf.loc[seqsdf['collection_date'].str.contains('/'), 'collection_date'] = seqsdf['collection_date'].apply(lambda x: x.split('/')[0])
seqsdf['date'] = pd.to_datetime(seqsdf['collection_date'])

In [31]:
seqsdf['date'].min()

Timestamp('2020-03-04 00:00:00')

In [32]:
# (seqsdf.groupby(['gene', 'ref_aa', 'codon_num', 'alt_aa'])
# .agg(
#      num_samples=('ID', 'nunique')))

In [35]:
def uniq_locs(x):
    return np.unique(x)
def loc_counts(x):
    _, counts = np.unique(x, return_counts=True)
    return counts

In [37]:
subs = (seqsdf.groupby(['gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa'])
.agg(
     num_samples=('ID', 'nunique'),
     first_detected=('date', 'min'),
     last_detected=('date', 'max'),
     locations=('location', uniq_locs),
     location_counts=('location', loc_counts),
     samples=('ID', 'unique')
    )
.reset_index())
subs['pos'] = subs['pos'] + 1

In [175]:
(subs[subs['gene']=='S'].sort_values('num_samples', ascending=False)
 .to_csv('S_mutations_consensus.csv', index=False))

## Consolidate metadata ID and fasta headers

In [134]:
def fix_header(x):
    if 'Consensus' in x:
        return x.split('_')[1]
    else:
        return x.split('/')[2]
seqsdf['n_ID'] = seqsdf['idx'].apply(fix_header)

In [135]:
seqsdf['n_ID'] = seqsdf['n_ID'].str.replace('ALSR', 'SEARCH')

In [136]:
meta = pd.read_csv(meta_fp)
meta['n_ID'] = meta['ID'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [137]:
seqsdf['n_ID'] = seqsdf['n_ID'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [138]:
tmp = pd.merge(seqsdf, meta, on='n_ID')

In [122]:
# tmp[tmp['ID'].str.contains('2112')]

In [98]:
# seqsdf

In [139]:
set(meta['n_ID'].unique()) - set(tmp['n_ID'].unique())

{'SEARCH-1668'}

In [140]:
seqsdf['idx'].unique().shape

(2765,)

In [141]:
meta['ID'].unique().shape

(2766,)

In [147]:
s = seqsdf[['n_ID', 'idx']].drop_duplicates()

In [151]:
new_meta = pd.merge(meta, s, on='n_ID', how='left')
(new_meta.drop(columns=['n_ID'])
.rename(columns={'idx': 'fasta_hdr'})
.to_csv('metadata.csv', index=False))

In [152]:
new_meta.shape 

(2766, 11)

In [153]:
new_meta

Unnamed: 0,ID,gb_accession,gisaid_accession,collection_date,location,percent_coverage_cds,avg_depth,authors,originating_lab,n_ID,idx
0,MG0987,MT598172,EPI_ISL_416457,2020-03-18,USA/California/San Diego,99.5954,2465.60,SEARCH Alliance San Diego,Andersen lab at Scripps Research,MG0987,Consensus_MG0987
1,PC00101P,MT192765,EPI_ISL_414648,2020-03-11,USA/California/San Diego,99.7525,3516.14,SEARCH Alliance San Diego,Andersen lab at Scripps Research,PC00101P,Consensus_PC00101P_threshold_0_quality_20
2,SEARCH-0007-SAN,MT598171,EPI_ISL_429990,2020-03-21,USA/California/San Diego,100.0000,6215.17,SEARCH Alliance San Diego with Christina Clark...,Rady's Childrens Hospital,SEARCH-0007,Consensus_SEARCH-0007-SAN_L1_threshold_0_quali...
3,SEARCH-0016-SAN,MT598173,EPI_ISL_430016,2020-03-24,USA/California/San Diego,100.0000,6440.67,SEARCH Alliance San Diego,Andersen lab at Scripps Research,SEARCH-0016,Consensus_SEARCH-0016-SAN_L1_threshold_0_quali...
4,SEARCH-0017-SAN,MT598174,EPI_ISL_429991,2020-03-24,USA/California/San Diego,100.0000,4947.09,SEARCH Alliance San Diego,Andersen lab at Scripps Research,SEARCH-0017,Consensus_SEARCH-0017-SAN_L1_threshold_0_quali...
...,...,...,...,...,...,...,...,...,...,...,...
2761,SEARCH-4685-SAN,,,2020-11-02,USA/California/San Diego,100.0000,4831.37,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4685,hCoV-19/USA/SEARCH-4685-SAN/2020
2762,SEARCH-4686-SAN,,,2020-11-05,USA/California/San Diego,100.0000,3864.73,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4686,hCoV-19/USA/SEARCH-4686-SAN/2020
2763,SEARCH-4687-ORA,,,2020-11-02,USA/California/Orange,98.1400,3123.54,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4687,hCoV-19/USA/SEARCH-4687-ORA/2020
2764,SEARCH-4690-SAN,,,2020-05-28,USA/California/San Diego,98.7283,2625.16,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4690,hCoV-19/USA/SEARCH-4690-SAN/2020


In [81]:
len(ref_seq)

29903