## TODO
* `gb_accession` and `gisaid_accession` are not available for new sequences; how do we concatenate them to `metadata.csv` without these fields?
* metadata format for NCBI
* support tools for manual sanity checks

In [15]:
from bjorn import *
from bjorn_support import *
from onion_trees import *
import gffutils
import math
from mutations import *

In [16]:
# Alignment file (FASTA) loaded below with AlignIO.read
input_fasta = "/home/al/analysis/mutations/S501Y/msa_reference.fa"
# Tab-separated metadata table (read below with sep='\t')
meta_fp = "/home/al/analysis/mutations/S501Y/metadata_2020-12-20_12-24.tsv"
# NOTE(review): plain string here; later cells rebind out_dir to a Path before writing output
out_dir = "/home/al/analysis/mutations/S501Y/"
ref_fp = "/home/al/data/test_inputs/NC045512.fasta"
# Identifier handed to process_cns_seqs to pick the reference record out of the alignment
patient_zero = 'NC_045512.2'

In [17]:
## keep only seqs contained in meta_file and save to fasta file
## concat with internal SD file
## generate MSA
meta = pd.read_csv(meta_fp, sep='\t')
meta.columns

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Index(['strain', 'virus', 'gisaid_epi_isl', 'genbank_accession', 'date',
       'region', 'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'Nextstrain_clade', 'pangolin_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted'],
      dtype='object')

In [18]:
# consensus_data = SeqIO.to_dict(SeqIO.parse(seqs_fp, "fasta"))

In [19]:
strains = meta['strain'].unique().tolist()
len(strains)

273267

In [20]:
print(f"Loading Alignment file at: {input_fasta}")
cns = AlignIO.read(input_fasta, 'fasta')

Loading Alignment file at: /home/al/analysis/mutations/S501Y/msa_reference.fa


In [21]:
print(f"Initial cleaning...")
seqs, ref_seq = process_cns_seqs(cns, patient_zero,
                                 start_pos=0, end_pos=30000)

Initial cleaning...


In [156]:
print(f"Creating a dataframe...")
seqsdf = (pd.DataFrame(index=seqs.keys(), 
                       data=seqs.values(), 
                       columns=['sequence'])
            .reset_index()
            .rename(columns={'index': 'idx'}))

Creating a dataframe...


In [157]:
def find_replacements(x, ref):
    """List the substitutions of sequence `x` relative to `ref`.

    Both arguments are indexed position by position. A position is reported
    when the character in `x` differs from the one in `ref` and is neither a
    gap ('-') nor a lower-case 'n'. The comparison is case-sensitive.

    Returns a list of '<0-based index>:<character>' strings, in sequence order.
    """
    ignored = ('-', 'n')
    found = []
    for idx, base in enumerate(x):
        # the reference lookup happens first, exactly as in the one-line form
        if base == ref[idx]:
            continue
        if base in ignored:
            continue
        found.append(f'{idx}:{base}')
    return found

In [158]:
print(f"Identifying mutations...")
# for each sample, identify list of substitutions (position:alt)
seqsdf['replacements'] = seqsdf['sequence'].apply(find_replacements, args=(ref_seq,))

Identifying mutations...


In [159]:
# wide-to-long data manipulation
seqsdf = seqsdf.explode('replacements')

In [160]:
# seqsdf

In [161]:
# -1 is a sentinel for "no substitution on this row"
# (explode() leaves NaN in `replacements` for samples whose list was empty)
seqsdf['pos'] = -1
# populate position column
# each entry is formatted '<index>:<nucleotide>'; keep the integer index before the colon
seqsdf.loc[~seqsdf['replacements'].isna(), 'pos'] = (seqsdf.loc[~seqsdf['replacements'].isna(), 'replacements']
   .apply(lambda x: int(x.split(':')[0])))

In [162]:
# filter out non-substitutions
# (rows still carrying the -1 sentinel position)
seqsdf = seqsdf.loc[seqsdf['pos']!=-1]
print(f"Mapping Genes to mutations...")
# identify gene of each substitution
seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
# drop positions for which map_gene_to_pos returned a missing value
seqsdf = seqsdf.loc[~seqsdf['gene'].isna()]

Mapping Genes to mutations...


In [163]:
# seqsdf

In [164]:
# filter out substitutions in non-gene positions
# NOTE(review): compares against the literal string 'nan'; presumably map_gene_to_pos
# can return that string for unmapped positions — confirm
seqsdf = seqsdf.loc[seqsdf['gene']!='nan']
print(f"Compute codon numbers...")

# compute codon number of each substitution (row-wise; GENE2POS is passed through to compute_codon_num)
seqsdf['codon_num'] = seqsdf.apply(compute_codon_num, args=(GENE2POS,), axis=1)


Compute codon numbers...


In [165]:

print(f"Fetch reference codon...")
# fetch the reference codon for each substitution
seqsdf['ref_codon'] = seqsdf.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)

Fetch reference codon...


In [166]:

print(f"Fetch alternative codon...")
# fetch the alternative codon for each substitution
seqsdf['alt_codon'] = seqsdf.apply(get_alt_codon, args=(GENE2POS,), axis=1)

Fetch alternative codon...


In [167]:
print(f"Map amino acids...")
# fetch the reference and alternative amino acids
seqsdf['ref_aa'] = seqsdf['ref_codon'].apply(get_aa)
seqsdf['alt_aa'] = seqsdf['alt_codon'].apply(get_aa)


# filter out substitutions with non-amino acid alternates (bad consensus calls)
seqsdf = seqsdf.loc[seqsdf['alt_aa']!='nan']

Map amino acids...


In [168]:
print(f"Fuse with metadata...")
# load and join metadata
meta = pd.read_csv(meta_fp, sep='\t')
# pd.merge defaults to an inner join: rows whose `idx` has no matching `strain`
# in the metadata are dropped
seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='strain')

Fuse with metadata...


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [169]:
# NOTE(review): this overwrites the metadata `date` column with the parsed *submission* date,
# so every later min/max over `date` (first_detected / last_detected) is based on
# `date_submitted` — confirm this is intended
seqsdf['date'] = pd.to_datetime(seqsdf['date_submitted'])

In [170]:
# calendar month number of `date`; the year is not retained, so identical months of
# different years fall into the same bucket
seqsdf['month'] = seqsdf['date'].dt.month

In [171]:
seqsdf.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa', 'strain', 'virus',
       'gisaid_epi_isl', 'genbank_accession', 'date', 'region', 'country',
       'division', 'location', 'region_exposure', 'country_exposure',
       'division_exposure', 'segment', 'length', 'host', 'age', 'sex',
       'Nextstrain_clade', 'pangolin_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted', 'month'],
      dtype='object')

In [172]:
seqsdf.loc[seqsdf['location'].isna(), 'location'] = 'unk'

In [None]:
out_dir = Path('/home/al/analysis/mutations/gisaid')

In [204]:
seqsdf.drop(columns=['sequence']).to_csv(out_dir/'gisaid_replacements_19-12-2020.csv', index=False)

In [205]:
seqsdf[['idx', 'sequence']].to_csv(out_dir/'gisaid_sequences_19-12-2020.csv', index=False)

In [22]:
# Reload the per-sample substitutions table written earlier in this notebook.
# No parse_dates is given, so `date` comes back as plain strings rather than datetimes.
seqsdf = pd.read_csv('/home/al/analysis/mutations/gisaid/gisaid_replacements_19-12-2020.csv')

In [23]:
seqsdf = seqsdf[seqsdf['host']=='Human']

In [29]:
print(f"Aggregate final results...")
# aggregate on each substitution (gene, position, reference/alternate amino acid, codon number):
# number of distinct samples, date range, and geographic spread
subs = (seqsdf.groupby(['gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa'])
.agg(
 num_samples=('idx', 'nunique'),
 first_detected=('date', 'min'),
 last_detected=('date', 'max'),
 num_locations=('location', 'nunique'),
 # np.unique(..., return_counts=True) yields a (unique_values, counts) pair for each group
 location_counts=('location', lambda x: np.unique(x, return_counts=True)),
 num_divisions=('division', 'nunique'),
 division_counts=('division', lambda x: np.unique(x, return_counts=True)),
 num_countries=('country', 'nunique'),
 country_counts=('country', lambda x: np.unique(x, return_counts=True))
)
.reset_index())
# 1-based nucleotide position coordinate system
# (positions up to here are 0-based indices into the sequence)
subs['pos'] = subs['pos'] + 1

Aggregate final results...


In [32]:
# subs.sort_values('num_samples', ascending=False).iloc[0]['country_counts']

In [33]:
# Every *_counts column holds the (names, counts) pair produced by
# np.unique(..., return_counts=True). Split each pair into a list of names
# (new column) and a list of counts (replacing the *_counts column).
for names_col, counts_col in (('locations', 'location_counts'),
                              ('divisions', 'division_counts'),
                              ('countries', 'country_counts')):
    pairs = subs[counts_col]
    subs[names_col] = pairs.apply(lambda pair: list(pair[0]))
    subs[counts_col] = pairs.apply(lambda pair: list(pair[1]))

In [181]:
print(f"Aggregate final results...")
# aggregate on each substitution per calendar month: number of distinct samples,
# date range within the month, and geographic spread
subs_mnth = (seqsdf.groupby(['month', 'gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa'])
.agg(
 num_samples=('idx', 'nunique'),
 first_detected_mnth=('date', 'min'),
 last_detected_mnth=('date', 'max'),
 num_locations=('location', 'nunique'),
#  locations=('location', lambda x: list(np.unique(x))),
 # np.unique(..., return_counts=True) yields a (unique_values, counts) pair for each group
 location_counts=('location', lambda x: np.unique(x, return_counts=True)),
 num_divisions=('division', 'nunique'),
 division_counts=('division', lambda x: np.unique(x, return_counts=True)),
 num_countries=('country', 'nunique'),
#  countries=('country', lambda x: list(np.unique(x))),
 country_counts=('country', lambda x: np.unique(x, return_counts=True)),
)
.reset_index())
# 1-based nucleotide position coordinate system
subs_mnth['pos'] = subs_mnth['pos'] + 1
# attach the overall first/last detection dates from `subs`; the join key includes `pos`,
# so `subs` must already carry 1-based positions when this cell runs
subs_mnth = pd.merge(subs_mnth, subs[['gene', 'pos', 'alt_aa', 'first_detected', 'last_detected']], on=['gene', 'pos', 'alt_aa'])

Aggregate final results...


In [34]:
out_dir = Path('/home/al/analysis/mutations/gisaid')

In [41]:
subs.to_csv(out_dir/'gisaid_substitutions_aggregated_19-12-2020.csv', index=False)

In [183]:
top_s_mnthly = (subs_mnth[subs_mnth['gene']=='S'].sort_values('num_samples', ascending=False)
 .drop_duplicates(subset=['gene', 'codon_num', 'alt_aa'])
 .iloc[:50]
 .reset_index(drop=True))

In [192]:
# Row indices (into top_s_mnthly) of the mutations seen in at least one San Diego location.
# `location_counts` holds the (names, counts) pair returned by
# np.unique(..., return_counts=True); element 0 is the array of location names.
# any() records each row at most once, even when several of its location names
# contain 'san diego' (e.g. both 'San Diego' and 'San Diego County').
muts_of_interest = [
    i for i, mutation in top_s_mnthly.iterrows()
    if any('san diego' in l.lower() for l in mutation['location_counts'][0])
]
muts_of_interest

[0, 1, 2, 3, 10, 13, 21, 22, 33, 34, 35, 38, 40, 42, 43, 44, 45, 49]

In [197]:
def is_in(x, loc):
    """Tell whether `loc` occurs in any name of a (names, counts) pair.

    `x[0]` is iterated as the collection of names; each name is lower-cased
    before the substring test, so `loc` should be given in lower case.

    Returns True on the first match, False when nothing matches.
    """
    return any(loc in name.lower() for name in x[0])
top_s_mnthly['isin_SD'] = top_s_mnthly['location_counts'].apply(is_in, args=('san diego',))
top_s_mnthly['isin_CA'] = top_s_mnthly['division_counts'].apply(is_in, args=('california',))
top_s_mnthly['isin_US'] = top_s_mnthly['country_counts'].apply(is_in, args=('usa',))

top_s_mnthly.to_csv("/home/al/analysis/mutations/gisaid/top_S_mutations_monthly.csv", index=False)

### Integrate GISAID information with ALab variants table

In [42]:
out_dir = Path('/home/al/analysis/mutations/gisaid')
subs = pd.read_csv(out_dir/'gisaid_substitutions_aggregated_19-12-2020.csv')

In [43]:
gisaid_subs = (subs.rename(columns={'num_samples': 'gisaid_num_samples', 'first_detected': 'gisaid_1st_detected', 'last_detected': 'gisaid_last_detected',
                                    'num_locations': 'gisaid_num_locations', 'locations': 'gisaid_locations', 'location_counts': 'gisaid_location_counts',
                                   'num_divisions': 'gisaid_num_states','divisions': 'gisaid_states', 'division_counts': 'gisaid_state_counts',
                                   'num_countries': 'gisaid_num_countries', 'countries': 'gisaid_countries', 'country_counts': 'gisaid_country_counts'})
               .drop(columns=['ref_aa', 'pos']))

In [44]:
gisaid_subs.columns

Index(['gene', 'codon_num', 'alt_aa', 'gisaid_num_samples',
       'gisaid_1st_detected', 'gisaid_last_detected', 'gisaid_num_locations',
       'gisaid_location_counts', 'gisaid_num_states', 'gisaid_state_counts',
       'gisaid_num_countries', 'gisaid_country_counts', 'gisaid_locations',
       'gisaid_states', 'gisaid_countries'],
      dtype='object')

In [45]:
# gisaid_subs.sort_values('gisaid_num_samples', ascending=False).iloc[0]['gisaid_country_counts']

In [46]:
our_subs = pd.read_csv("/home/al/analysis/mutations/alab_git/substitutions_22-12-2020_orig.csv")
our_subs.shape

(4043, 13)

In [47]:
all_subs = pd.merge(our_subs, gisaid_subs, on=['gene', 'codon_num', 'alt_aa'], how='left').drop_duplicates(subset=['gene', 'codon_num', 'alt_aa'])

In [48]:
all_subs.columns

Index(['gene', 'ref_codon', 'alt_codon', 'pos', 'ref_aa', 'codon_num',
       'alt_aa', 'num_samples', 'first_detected', 'last_detected',
       'location_counts', 'samples', 'locations', 'gisaid_num_samples',
       'gisaid_1st_detected', 'gisaid_last_detected', 'gisaid_num_locations',
       'gisaid_location_counts', 'gisaid_num_states', 'gisaid_state_counts',
       'gisaid_num_countries', 'gisaid_country_counts', 'gisaid_locations',
       'gisaid_states', 'gisaid_countries'],
      dtype='object')

In [53]:
all_subs.sort_values('num_samples', ascending=False)

Unnamed: 0,gene,ref_codon,alt_codon,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,...,gisaid_last_detected,gisaid_num_locations,gisaid_location_counts,gisaid_num_states,gisaid_state_counts,gisaid_num_countries,gisaid_country_counts,gisaid_locations,gisaid_states,gisaid_countries
0,S,GAT,GGT,23403,D,614,G,3194,2020-03-04,2020-12-06,...,2020-12-19,2145.0,"[15, 1, 5, 2, 8, 1, 1, 10, 1, 1, 2, 12, 8, 196...",969.0,"[1, 155, 102, 831, 1, 2, 1, 1, 29, 11, 50, 1, ...",129.0,"[3, 1, 50, 1, 9, 15004, 692, 126, 480, 1, 1666...","['A Coruna', 'Aarschot', 'Abakan', 'Abondant',...","['Aalst', 'Aargau', 'Abruzzo', 'Abu Dhabi', 'A...","['Algeria', 'Andorra', 'Argentina', 'Armenia',..."
2,ORF1ab,CTA,TTA,14408,L,4715,L,3194,2020-03-04,2020-12-06,...,2020-12-19,2129.0,"[15, 1, 4, 2, 8, 1, 10, 1, 1, 2, 12, 8, 193, 1...",954.0,"[1, 155, 80, 805, 1, 2, 1, 1, 29, 11, 50, 1, 1...",128.0,"[3, 1, 48, 1, 9, 14966, 688, 81, 460, 1, 1669,...","['A Coruna', 'Aarschot', 'Abakan', 'Abondant',...","['Aalst', 'Aargau', 'Abruzzo', 'Abu Dhabi', 'A...","['Algeria', 'Andorra', 'Argentina', 'Armenia',..."
3,ORF1ab,TTC,TTT,3037,F,924,F,3185,2020-03-04,2020-12-06,...,2020-12-19,2136.0,"[15, 1, 5, 2, 8, 1, 10, 1, 1, 2, 12, 8, 197, 1...",960.0,"[1, 146, 102, 774, 1, 2, 1, 1, 29, 11, 50, 1, ...",129.0,"[3, 1, 50, 1, 9, 14992, 679, 124, 458, 1, 1668...","['A Coruna', 'Aarschot', 'Abakan', 'Abondant',...","['Aalst', 'Aargau', 'Abruzzo', 'Abu Dhabi', 'A...","['Algeria', 'Andorra', 'Argentina', 'Armenia',..."
4,5UTR,CGT,TGT,241,R,81,C,3158,2020-03-04,2020-12-06,...,2020-12-19,2117.0,"[15, 1, 5, 2, 8, 1, 10, 1, 1, 2, 10, 7, 197, 1...",952.0,"[1, 155, 99, 835, 1, 2, 1, 29, 1, 11, 50, 1, 2...",128.0,"[3, 50, 1, 9, 13405, 688, 124, 474, 1, 1665, 4...","['A Coruna', 'Aarschot', 'Abakan', 'Abondant',...","['Aalst', 'Aargau', 'Abruzzo', 'Abu Dhabi', 'A...","['Algeria', 'Argentina', 'Armenia', 'Aruba', '..."
6,N,TCA,TTA,28854,S,194,L,1603,2020-03-23,2020-12-06,...,2020-12-19,279.0,"[120, 5, 71, 1, 4, 20, 1, 1, 3, 1, 1, 1, 1, 8,...",259.0,"[8, 13, 10, 4, 1, 7, 18, 3, 4, 418, 4, 46, 2, ...",63.0,"[4, 122, 1, 3, 31, 28, 119, 17, 2, 3, 1, 1, 15...","['Ahmedabad', 'Alachua County', 'Alameda Count...","['Aargau', 'Abu Dhabi', 'Agder', 'Aguascalient...","['Aruba', 'Australia', 'Austria', 'Bahrain', '..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2959,N,AGC,ATC,28875,S,201,I,1,2020-03-16,2020-03-16,...,2020-12-17,2.0,"[1, 57]",8.0,"[1, 11, 1, 1, 1, 3, 3, 37]",6.0,"[1, 38, 4, 1, 3, 11]","['San Diego County', 'unk']","['California', 'England', 'Oslo', 'Osun State'...","['Nigeria', 'Norway', 'Switzerland', 'USA', 'U..."
2958,ORF6,GAT,TAT,27382,D,61,Y,1,2020-10-02,2020-10-02,...,2020-12-12,8.0,"[4, 1, 1, 1, 1, 1, 1, 88]",25.0,"[1, 1, 2, 1, 1, 33, 1, 4, 1, 21, 1, 6, 1, 1, 1...",12.0,"[1, 24, 2, 2, 1, 1, 1, 1, 2, 4, 12, 47]","['Gibraltar', 'Huescar', 'Milwaukee County', '...","['Amman', 'Andalusia', 'Arizona', 'Balear Isla...","['Australia', 'Denmark', 'France', 'India', 'J..."
2957,N,AGT,AAT,29511,S,413,N,1,2020-06-23,2020-06-23,...,2020-12-12,3.0,"[1, 1, 9]",5.0,"[2, 1, 3, 1, 4]",2.0,"[8, 3]","['Greater Houston Area', 'San Diego County', '...","['Arizona', 'California', 'England', 'Texas', ...","['USA', 'United Kingdom']"
2956,N,ATG,ATT,28903,M,210,I,1,2020-09-12,2020-09-12,...,2020-12-17,22.0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, ...",41.0,"[3, 2, 2, 1, 1, 1, 4, 1, 1, 567, 1, 1, 2, 1, 4...",21.0,"[1, 1, 3, 4, 4, 2, 1, 3, 10, 3, 5, 1, 1, 2, 3,...","['Alameda County', 'Badalona', 'Caruaru', 'Con...","['Abu Dhabi', 'Alajuela', 'Auckland', 'Auvergn...","['Australia', 'Bangladesh', 'Brazil', 'Costa R..."


In [28]:
subs.loc[(subs['gene']=='S')&(subs['alt_aa']=='L')&(subs['codon_num']==957)]

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,num_locations,locations,location_counts,num_countries,countries
23070,S,24432,Q,957,L,3,2020-10-27,2020-12-09,3,"[Counties Manukau, King County, Sydney]","[1, 1, 1]",3,"[USA, Australia, New Zealand]"


In [30]:
cols = ['month', 'ref_aa', 'codon_num', 'alt_aa', 'first_detected', 
        'last_detected', 'num_samples', 'num_countries', 
        'countries', 'country_counts', 'num_locations', 'locations', 'location_counts' ,
        'first_detected_mnth', 'last_detected_mnth']

In [72]:
# (subs_mnth[(subs_mnth['gene']=='S') & (subs_mnth['month']==12)]
#  .sort_values('num_samples', ascending=False)
#  .drop_duplicates(subset=['codon_num', 'alt_aa'], keep='first')
#  .iloc[:50]
#  .reset_index(drop=True))[cols]

In [132]:
# keys_df = seqsdf[['idx', 'sequence']]
# keys_df.to_csv('gisaid_replacements.csv', index=False)

In [133]:
# Distinct non-null location names that mention San Diego (case-insensitive)
sd = [d for d in seqsdf['location'].dropna().unique() if 'san diego' in d.lower()]

In [111]:
# Distinct division names containing 'cali' (case-insensitive).
# dropna() mirrors the San Diego lookup on `location`: a missing division comes back
# from unique() as a float NaN, which has no .lower() and would raise AttributeError.
ca = [d for d in seqsdf['division'].dropna().unique() if 'cali' in d.lower()]

In [73]:
# cols = ['idx', 'location', 'division', 'pos']
# seqsdf.loc[(seqsdf['codon_num']==681) & (seqsdf['gene']=='S')][cols]

## Deletions

In [285]:
# Alignment file (FASTA) read below with AlignIO.read
input_fasta = "/home/al/analysis/mutations/S501Y/msa_reference.fa"
# Tab-separated metadata table (read below with sep='\t')
meta_fp = "/home/al/analysis/mutations/S501Y/metadata_2020-12-20_12-24.tsv"
out_dir = "/home/al/analysis/mutations/S501Y/"
ref_fp = "/home/al/data/test_inputs/NC045512.fasta"
# Identifier handed to process_cns_seqs to pick the reference record out of the alignment
patient_zero = 'NC_045512.2'
# Deletions shorter than this many nucleotides are discarded (filter is `del_len >= min_del_len`)
min_del_len = 1
# Window passed to process_cns_seqs; the values equal the ORF1ab start and ORF10 end
# listed in the GENE2POS table of this notebook
start_pos = 265
end_pos = 29674

In [286]:
# read MSA file
consensus_data = AlignIO.read(input_fasta, 'fasta')
# process MSA to remove insertions and fix position coordinate systems
seqs, ref_seq = process_cns_seqs(consensus_data, patient_zero, start_pos=start_pos, end_pos=end_pos)
# load into dataframe: one row per sequence, with its id in `idx` and its bases in `sequence`
seqsdf = (pd.DataFrame(index=seqs.keys(), data=seqs.values(), 
                       columns=['sequence'])
            .reset_index().rename(columns={'index': 'idx'}))

In [287]:
# load and join metadata
meta = pd.read_csv(meta_fp, sep='\t')
print(seqsdf.shape)
seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='strain')
print(seqsdf.shape)
# # clean and process sample collection dates
# seqsdf = seqsdf.loc[(seqsdf['collection_date']!='Unknown') 
#                & (seqsdf['collection_date']!='1900-01-00')]
# seqsdf.loc[seqsdf['collection_date'].str.contains('/'), 'collection_date'] = seqsdf['collection_date'].apply(lambda x: x.split('/')[0])
seqsdf['date'] = pd.to_datetime(seqsdf['date_submitted'])

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


(273268, 2)
(273267, 29)


In [288]:
# compute length of each sequence
seqsdf['seq_len'] = seqsdf['sequence'].str.len()
# identify deletion positions
seqsdf['del_positions'] = seqsdf['sequence'].apply(find_deletions)

In [289]:
seqsdf.columns

Index(['idx', 'sequence', 'strain', 'virus', 'gisaid_epi_isl',
       'genbank_accession', 'date', 'region', 'country', 'division',
       'location', 'region_exposure', 'country_exposure', 'division_exposure',
       'segment', 'length', 'host', 'age', 'sex', 'Nextstrain_clade',
       'pangolin_lineage', 'GISAID_clade', 'originating_lab', 'submitting_lab',
       'authors', 'url', 'title', 'paper_url', 'date_submitted', 'seq_len',
       'del_positions'],
      dtype='object')

In [290]:
seqsdf = seqsdf[seqsdf['host']=='Human']

In [291]:
# sequences with one or more deletions
del_seqs = seqsdf.loc[seqsdf['del_positions'].str.len() > 0]
# one row per (sample, deletion)
del_seqs = del_seqs.explode('del_positions')
# compute length of each deletion
del_seqs['del_len'] = del_seqs['del_positions'].apply(len)
# keep only deletions of at least min_del_len nucleotides
del_seqs = del_seqs[del_seqs['del_len'] >= min_del_len]
# fetch coordinates of each deletion
del_seqs['relative_coords'] = del_seqs['del_positions'].apply(get_indel_coords)
# label missing locations as 'unk' before grouping
del_seqs.loc[del_seqs['location'].isna(), 'location'] = 'unk'
# group samples by the deletion they share
del_seqs = (del_seqs.groupby(['relative_coords', 'del_len'])
                    .agg(
                         samples=('idx', 'unique'),
                         num_samples=('idx', 'nunique'),
                         first_detected=('date', 'min'),
                         last_detected=('date', 'max'),
#                          locations=('location', lambda x: list(np.unique(x))),
                         location_counts=('location', lambda x: np.unique(x, return_counts=True)),
#                          divisions=('division', lambda x: list(np.unique(x))),
                         division_counts=('division', lambda x: np.unique(x, return_counts=True)),
#                          countries=('country', lambda x: list(np.unique(x))),
                         country_counts=('country', lambda x: np.unique(x, return_counts=True)),
                        )
                    .reset_index()
                    .sort_values('num_samples'))
del_seqs['type'] = 'deletion'
# adjust coordinates to account for the nts trimmed from beginning e.g. 265nts
del_seqs['absolute_coords'] = del_seqs['relative_coords'].apply(adjust_coords, args=(start_pos+1,))
# start coordinate of the deletion (the part of 'start:end' before the colon)
del_seqs['pos'] = del_seqs['absolute_coords'].apply(lambda x: int(x.split(':')[0]))
# approximate the gene where each deletion was identified
del_seqs['gene'] = del_seqs['pos'].apply(map_gene_to_pos)
del_seqs = del_seqs.loc[~del_seqs['gene'].isna()]
# filter out deletions in non-gene positions
del_seqs = del_seqs.loc[del_seqs['gene']!='nan']
# compute codon number of each deletion
del_seqs['codon_num'] = del_seqs.apply(compute_codon_num, args=(GENE2POS,), axis=1)
# fetch the reference codon for each deletion
del_seqs['ref_codon'] = del_seqs.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)
# translate the reference codon into its amino acid
del_seqs['ref_aa'] = del_seqs['ref_codon'].apply(get_aa)
# record the 5 nts before each deletion (based on reference seq)
del_seqs['prev_5nts'] = del_seqs['absolute_coords'].apply(lambda x: ref_seq[int(x.split(':')[0])-5:int(x.split(':')[0])])
# record the 5 nts after each deletion (based on reference seq)
del_seqs['next_5nts'] = del_seqs['absolute_coords'].apply(lambda x: ref_seq[int(x.split(':')[1])+1:int(x.split(':')[1])+6])
# split each (names, counts) pair from np.unique into a names column and a counts column
del_seqs['locations'] = del_seqs['location_counts'].apply(lambda x: list(x[0]))
del_seqs['location_counts'] = del_seqs['location_counts'].apply(lambda x: list(x[1]))
del_seqs['divisions'] = del_seqs['division_counts'].apply(lambda x: list(x[0]))
del_seqs['division_counts'] = del_seqs['division_counts'].apply(lambda x: list(x[1]))
del_seqs['countries'] = del_seqs['country_counts'].apply(lambda x: list(x[0]))
del_seqs['country_counts'] = del_seqs['country_counts'].apply(lambda x: list(x[1]))
# display, most frequent deletions first
del_seqs.sort_values('num_samples', ascending=False)

Unnamed: 0,relative_coords,del_len,samples,num_samples,first_detected,last_detected,location_counts,division_counts,country_counts,type,...,pos,gene,codon_num,ref_codon,ref_aa,prev_5nts,next_5nts,locations,divisions,countries
938,21499:21504,6,"[England/SHEF-C43D2/2020, Australia/VIC1606/20...",6353,2020-05-27,2020-12-19,"[1, 1, 3, 1, 1, 2, 3, 1, 6, 11, 1, 1, 1, 1, 1,...","[1, 1, 1, 10, 8, 6, 1, 1, 19, 1, 13, 1, 1, 325...","[8, 2, 1, 4, 79, 6, 2303, 1, 14, 8, 1, 3, 2, 2...",deletion,...,21765,S,68,ATA,I,gctat,ctctg,"[Benátky nad Jizerou, Bobo Dioulasso, Brno, Ch...","[Aargau, Agder, Auckland, Auvergne-Rhône-Alpes...","[Australia, Burkina Faso, Canada, Croatia, Cze..."
233,0:76,77,"[Guangdong/ZQ-S2-P0061/2020, USA/CT-Yale-001/2...",3295,2020-03-09,2020-12-18,"[1, 1, 2, 1, 1, 8, 297, 1, 1, 2, 2, 2, 3, 1, 2...","[5, 3, 2, 2, 2, 4, 3, 43, 62, 1, 1, 16, 2, 5, ...","[32, 4, 2, 3, 154, 52, 5, 8, 1, 1, 2, 1, 37, 1...",deletion,...,266,ORF1ab,1,ATG,M,taaga,ctcgt,"[Alameda County, Barcelona, Barranquilla, Brow...","[Alaska, Antioquia, Atlantico, Auckland, Austr...","[Australia, Austria, Bangladesh, Belgium, Braz..."
987,21725:21727,3,"[India/MH-1-27/2020, Netherlands/ZuidHolland_7...",2096,2020-03-06,2020-12-19,"[2, 1, 1, 4, 6, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[3, 1, 2, 1, 1, 1, 1, 2, 1, 1, 6, 1, 2, 5, 1, ...","[3, 1, 5, 1, 1, 20, 1, 1, 1, 2, 2, 4, 1, 3, 1,...",deletion,...,21991,S,144,TAT,Y,gtgtt,tacca,"[Alameda County, Allegheny County, Amadora, Ca...","[Abu Dhabi, Amman, Andalusia, Apulia, Aragon, ...","[Australia, Bangladesh, Canada, Côte d'Ivoire,..."
2539,29400:29408,9,"[Guangdong/DG-S9-P0045/2020, Guangdong/MM-S1-P...",2052,2020-03-09,2020-12-19,"[7, 1, 1, 1, 1, 33, 17, 1, 2, 1, 165, 10, 1, 1...","[26, 3, 1, 1, 2, 9, 19, 2, 1, 8, 1, 1, 28, 3, ...","[49, 27, 25, 11, 138, 43, 3, 6, 3, 45, 1, 2, 2...",deletion,...,29666,ORF10,37,CTC,L,taatc,aatct,"[A Coruna, Alameda County, Alaquas, Brown Coun...","[Alaska, Amazonas, Aragon, Arizona, Auckland, ...","[Australia, Austria, Bangladesh, Belgium, Braz..."
459,1339:1341,3,"[France/ARA-739/2020, Netherlands/Andel_136506...",2020,2020-02-14,2020-12-18,"[1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 2, ...","[1, 9, 2, 2, 1, 1, 1, 4, 4, 1, 6, 1, 3, 2, 1, ...","[27, 15, 5, 2, 2, 1, 1, 26, 2, 10, 1, 1, 2, 3,...",deletion,...,1605,ORF1ab,447,AAT,N,cttaa,caacc,"[Alcaniz, Algemesi, Alhaurin de la Torre, Alzi...","[Agder, Andalusia, Aragon, Athens, Auckland, A...","[Australia, Belgium, Canada, Chile, Curacao, C..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2203,29048:29408,361,[Australia/WA387/2020],1,2020-11-02,2020-11-02,[1],[1],[1],deletion,...,29314,N,348,GAT,D,tcaaa,aatct,[unk],[Western Australia],[Australia]
2202,29043:29408,366,[USA/UT-UPHL-05515/2020],1,2020-10-08,2020-10-08,[1],[1],[1],deletion,...,29309,N,346,TTC,F,aaatt,aatct,[unk],[Utah],[USA]
2201,29042:29408,367,[Panama/335568/2020],1,2020-07-24,2020-07-24,[1],[1],[1],deletion,...,29308,N,346,TTC,F,caaat,aatct,[unk],[Cocle],[Panama]
2200,29039:29408,370,[Panama/336247/2020],1,2020-07-24,2020-07-24,[1],[1],[1],deletion,...,29305,N,345,AAT,N,atcca,aatct,[unk],[Panama Center],[Panama]


In [293]:
del_seqs.to_csv('/home/al/analysis/mutations/gisaid/gisaid_deletions_aggregated_19-12-2020.csv', index=False)

In [294]:
del_seqs.columns

Index(['relative_coords', 'del_len', 'samples', 'num_samples',
       'first_detected', 'last_detected', 'location_counts', 'division_counts',
       'country_counts', 'type', 'absolute_coords', 'pos', 'gene', 'codon_num',
       'ref_codon', 'ref_aa', 'prev_5nts', 'next_5nts', 'locations',
       'divisions', 'countries'],
      dtype='object')

In [295]:
gisaid_dels = (del_seqs.rename(columns={'num_samples': 'gisaid_num_samples', 'first_detected': 'gisaid_1st_detected', 'last_detected': 'gisaid_last_detected',
                                   'locations': 'gisaid_locations', 'location_counts': 'gisaid_location_counts',
                                   'divisions': 'gisaid_states', 'division_counts': 'gisaid_state_counts',
                                   'countries': 'gisaid_countries', 'country_counts': 'gisaid_country_counts'})
               .drop(columns=['ref_aa', 'pos', 'type', 'samples', 'ref_codon', 'prev_5nts', 'next_5nts', 'relative_coords', 'del_len']))

In [296]:
our_dels = pd.read_csv("/home/al/analysis/mutations/alab_git/deletions_22-12-2020_orig.csv")
# our_dels

In [297]:
cols = ['type', 'gene', 'absolute_coords', 'del_len', 'pos', 
                 'ref_aa', 'codon_num', 'num_samples',
                 'first_detected', 'last_detected', 'locations',
                 'location_counts', 'gisaid_num_samples',
                 'gisaid_1st_detected', 'gisaid_last_detected', 'gisaid_countries', 'gisaid_country_counts',
                 'gisaid_states', 'gisaid_state_counts', 'gisaid_locations', 'gisaid_location_counts', 'samples',
                 'ref_codon', 'prev_5nts', 'next_5nts'
                 ]

In [298]:
our_dels = pd.merge(our_dels, gisaid_dels, on=['gene', 'codon_num', 'absolute_coords'], how='left')

In [299]:
our_dels[cols]

Unnamed: 0,type,gene,absolute_coords,del_len,pos,ref_aa,codon_num,num_samples,first_detected,last_detected,...,gisaid_countries,gisaid_country_counts,gisaid_states,gisaid_state_counts,gisaid_locations,gisaid_location_counts,samples,ref_codon,prev_5nts,next_5nts
0,deletion,ORF8,28090:28092,3,28090,G,66,49,2020-09-30,2020-11-06,...,"[Jordan, USA]","[27, 1]","[Amman, Michigan]","[27, 1]",[unk],[28],['hCoV-19/JOR/SEARCH-3903-JOR/2020' 'hCoV-19/J...,GGT,gctgg,taaat
1,deletion,N,28890:28901,12,28890,S,206,24,2020-06-16,2020-08-25,...,"[USA, United Kingdom]","[23, 1]","[California, England]","[23, 1]","[San Diego, unk]","[23, 1]",['hCoV-19/USA/SEARCH-2285-SAN/2020' 'hCoV-19/U...,TCT,acttc,ggctg
2,deletion,ORF1ab,686:694,9,686,K,141,17,2020-03-31,2020-10-03,...,"[Australia, Belgium, Belize, Brazil, Burkina F...","[107, 21, 1, 4, 2, 43, 2, 1, 2, 1, 207, 1, 2, ...","[Abu Dhabi, Aichi, Amazonas, Amman, Andalusia,...","[4, 1, 3, 2, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 1, ...","[Angers, Bee County, Bobo Dioulasso, Brooklyn,...","[1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 1, 5, 1, 3, 1, ...",['hCoV-19/USA/SEARCH-0264-NBG/2020' 'hCoV-19/U...,AAG,tctaa,actta
3,deletion,ORF7b,27879:27891,13,27879,H,42,10,2020-10-20,2020-11-06,...,,,,,,,['hCoV-19/USA/SEARCH-4415-SAN/2020' 'hCoV-19/U...,CAC,ttgtc,catga
4,deletion,ORF1ab,516:518,3,516,V,84,10,2020-07-22,2020-08-01,...,"[Australia, Belgium, Brazil, Denmark, Germany,...","[8, 4, 1, 771, 1, 1, 1, 23, 26, 1]","[Arizona, Baden-Wuerttemberg, California, Dela...","[1, 1, 10, 1, 4, 21, 1, 1, 39, 3, 1, 1, 222, 1...","[Faroe Islands, Hanoi, Kent County, Madrid, Sa...","[1, 1, 1, 1, 10, 1, 1, 821]",['hCoV-19/USA/SEARCH-4342-SAN/2020' 'hCoV-19/U...,GTT,catgt,ggttg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81,deletion,N,29426:29441,16,29426,R,385,1,2020-06-06,2020-06-06,...,,,,,,,['hCoV-19/MEX/SEARCH-2505-TIJ/2020'],AGA,gcaga,aactg
82,deletion,ORF10,29574:29588,15,29574,V,6,1,2020-06-30,2020-06-30,...,[USA],[1],[California],[1],[San Diego],[1],['hCoV-19/USA/SEARCH-2405-SAN/2020'],GTT,aacgt,tacga
83,deletion,ORF1ab,3716:3716,1,3716,A,1151,1,2020-03-16,2020-03-16,...,,,,,,,['hCoV-19/USA/SEARCH-0116-SAN/2020'],GCT,atcag,tggta
84,deletion,E,26306:26306,1,26306,L,21,1,2020-07-08,2020-07-08,...,,,,,,,['hCoV-19/USA/SEARCH-4104-SAN/2020'],CTT,tttct,gcttt


In [300]:
our_dels[cols].sort_values('num_samples', ascending=False).to_csv("/home/al/analysis/mutations/alab_git/deletions_22-12-2020.csv", index=False)

In [None]:
# NOTE(review): `seqs_fp` is not assigned anywhere in the visible notebook (its only other
# mention is inside a commented-out line), so unless a star-import provides it this call
# raises NameError on a fresh kernel — define the input FASTA path before running
align_fasta_reference(seqs_fp, num_cpus=25, ref_fp=ref_fp)

## CNS Mutations Report

In [6]:
analysis_folder = Path('/home/al/code/HCoV-19-Genomics/consensus_sequences/')
meta_fp = Path('/home/al/code/HCoV-19-Genomics/metadata.csv')
ref_path = Path('/home/gk/code/hCoV19/db/NC045512.fasta')
patient_zero = 'NC_045512.2'
in_fp = '/home/al/analysis/mutations/S501Y/msa_aligned.fa'

In [3]:
subs = identify_replacements(in_fp, meta_fp)

In [4]:
subs.head()

Unnamed: 0,gene,pos,ref_aa,codon_num,alt_aa,num_samples,first_detected,last_detected,locations,location_counts,samples
0,3UTR,29679,S,2,F,2,2020-07-14,2020-10-07,USA/California/San Diego,2,"[SEARCH-3119-SAN, SEARCH-4245-SAN]"
1,3UTR,29681,L,3,L,2,2020-07-11,2020-07-21,"[USA/California/Los Angeles, USA/California/Sa...","[1, 1]","[SEARCH-2600-SAN, SEARCH-2692-LAX]"
2,3UTR,29688,S,5,I,4,2020-03-25,2020-10-26,USA/California/San Diego,4,"[SEARCH-0113-SAN, SEARCH-2855-SAN, SEARCH-3609..."
3,3UTR,29690,V,6,L,1,2020-08-13,2020-08-13,USA/California/San Diego,1,[SEARCH-4455-SAN]
4,3UTR,29692,V,6,V,1,2020-10-03,2020-10-03,Jordan/Amman,1,[SEARCH-4034-JOR]


In [5]:
dels = identify_deletions(in_fp, meta_fp, patient_zero)
dels

Unnamed: 0,type,gene,absolute_coords,del_len,pos,ref_aa,codon_num,num_samples,first_detected,last_detected,locations,location_counts,samples,ref_codon,prev_5nts,next_5nts
30,deletion,ORF7a,27538:27572,35,27538,L,49,1,2020-04-28,2020-04-28,MEX/Baja California/Tijuana,1,[hCoV-19/MEX/SEARCH-1480-TIJ/2020],CTA,tcctc,actca
27,deletion,ORF6,27264:27290,27,27264,F,22,1,2020-07-28,2020-07-28,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-3236-SAN/2020],TTT,ggact,tacat
28,deletion,ORF6,27266:27293,28,27266,F,22,1,2020-04-23,2020-04-23,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-1921-SAN/2020],TTT,acttt,atcat
29,deletion,ORF7a,27498:27531,34,27498,S,36,1,2020-05-12,2020-05-12,USA/California/Imperial,1,[hCoV-19/USA/SEARCH-0573-IPL/2020],TCT,cttgc,atcct
59,deletion,ORF1ab,6656:6679,24,6656,N,2131,1,2020-06-13,2020-06-13,USA/California/San Diego,1,[hCoV-19/USA/SEARCH-3559-SAN/2020],AAT,tgtta,ctaat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24,deletion,ORF3a,26158:26161,4,26158,V,256,9,2020-07-17,2020-09-14,USA/California/San Diego,9,"[hCoV-19/USA/SEARCH-2907-SAN/2020, hCoV-19/USA...",GTT,agttg,tccag
16,deletion,ORF1ab,516:518,3,516,V,84,10,2020-07-22,2020-08-01,USA/California/San Diego,10,"[hCoV-19/USA/SEARCH-4342-SAN/2020, hCoV-19/USA...",GTT,catgt,ggttg
56,deletion,ORF1ab,686:694,9,686,K,141,17,2020-03-31,2020-10-03,"[Jordan/Amman, USA/California/San Diego, USA/L...","[2, 13, 2]","[hCoV-19/USA/SEARCH-0264-NBG/2020, hCoV-19/USA...",AAG,tctaa,actta
51,deletion,N,28890:28901,12,28890,S,206,23,2020-06-16,2020-08-06,USA/California/San Diego,23,"[hCoV-19/USA/SEARCH-2285-SAN/2020, hCoV-19/USA...",TCT,acttc,ggctg


In [3]:
dels[dels['gene']=='S'].sort_values('num_samples', ascending=False)#.to_csv('S_deletions_consensus.csv', index=False)

NameError: name 'dels' is not defined

In [4]:
identify_insertions(in_fp, patient_zero).to_csv('test.csv', index=False)

## dev 

In [10]:
# Nucleotide start/end coordinates of each annotated region of the reference genome.
# NOTE(review): starts look 0-based (5UTR starts at 0) and adjacent regions share a boundary
# value (5UTR end == ORF1ab start == 265), which suggests half-open intervals — confirm
# against map_gene_to_pos.
# NOTE(review): ORF7a (end 27759) and ORF7b (start 27755) overlap.
# NOTE(review): cells earlier in the notebook already pass GENE2POS around, presumably the one
# exposed by the star-imports; this assignment rebinds that name.
GENE2POS = {
            '5UTR': {'start': 0, 'end': 265},
            'ORF1ab': {'start': 265, 'end': 21555},
            'S': {'start': 21562, 'end': 25384},
            'ORF3a': {'start': 25392, 'end': 26220},
            'E': {'start': 26244, 'end': 26472},
            'M': {'start': 26522, 'end': 27191},
            'ORF6': {'start': 27201, 'end': 27387},
            'ORF7a': {'start': 27393, 'end': 27759},
            'ORF7b': {'start': 27755, 'end': 27887},
            'ORF8': {'start': 27893, 'end': 28259},
            'N': {'start': 28273, 'end': 29533},
            'ORF10': {'start': 29557, 'end': 29674},
            '3UTR': {'start': 29674, 'end': 29902}
           }

In [11]:
in_dir = '/home/al/analysis/mutations/fa/'
out_dir = '/home/al/analysis/mutations/msa/'

In [14]:
!rm -r /home/al/analysis/mutations
!mkdir /home/al/analysis/mutations
!mkdir /home/al/analysis/mutations/fa

In [15]:
# Copy every consensus file whose name ends in 'fa' or 'fasta' into the staging directory.
# NOTE(review): relies on `Path` offering listdir() and string methods on its entries
# (path.Path does; pathlib.Path does not) — confirm which Path the star-imports expose.
for filename in analysis_folder.listdir():
    if filename.endswith(('fa', 'fasta')):
        copy(filename, '/home/al/analysis/mutations/fa/')

In [179]:
copy(ref_path, in_dir)

'/home/al/analysis/mutations/fa/NC045512.fasta'

In [180]:
in_dir = '/home/al/analysis/mutations/fa/'
out_dir = '/home/al/analysis/mutations/msa'
concat_fasta(in_dir, out_dir)

'/home/al/analysis/mutations/msa.fa'

In [17]:
align_fasta_reference('/home/al/analysis/mutations/msa.fa',  num_cpus=12, ref_fp=ref_path)

'/home/al/analysis/mutations/msa_aligned.fa'

In [14]:
cns = AlignIO.read('/home/al/analysis/mutations/msa_aligned.fa', 'fasta')

In [15]:
ref_seq = get_seq(cns, patient_zero)

In [16]:
len(ref_seq)

29903

In [17]:
seqs = get_seqs(cns, 0, 30000)

In [18]:
seqsdf = (pd.DataFrame(index=seqs.keys(), data=seqs.values(), columns=['sequence'])
                .reset_index().rename(columns={'index': 'idx'}))

In [19]:
# seqsdf

In [20]:
def find_replacements(x, ref):
    """List the positions where sequence `x` differs from the reference `ref`.

    Parameters
    ----------
    x : str
        Query sequence, in the same alignment coordinates as `ref`.
    ref : str
        Reference sequence.

    Returns
    -------
    list of str
        One ``'<0-based position>:<query base>'`` entry per mismatch, in
        positional order. Gaps (``-``) and ambiguous ``n``/``N`` calls in the
        query are never reported. The query base is emitted exactly as it
        appears in `x`.
    """
    skipped = {'-', 'n'}
    # zip() stops at the shorter sequence, so a query longer than the reference
    # no longer raises IndexError. Bases are compared case-insensitively so a
    # mixed-case pair does not produce a spurious mismatch at every position.
    return [f'{i}:{n}' for i, (n, r) in enumerate(zip(x, ref))
            if n.lower() != r.lower() and n.lower() not in skipped]

In [21]:
seqsdf['replacements'] = seqsdf['sequence'].apply(find_replacements, args=(ref_seq,))

In [22]:
# One row per replacement: explode() repeats the other columns for every list
# element (keeping the original index label on each copy); a sequence with an
# empty replacement list yields a single row whose 'replacements' is NaN.
seqsdf = seqsdf.explode('replacements')
# -1 is a sentinel for "no replacement on this row".
seqsdf['pos'] = -1
# For rows that do carry a replacement ('<pos>:<base>'), parse the integer before the ':'.
seqsdf.loc[~seqsdf['replacements'].isna(), 'pos'] = seqsdf.loc[~seqsdf['replacements'].isna(), 'replacements'].apply(lambda x: int(x.split(':')[0]))
# Drop the sentinel rows, i.e. sequences that had no replacement at all.
seqsdf = seqsdf.loc[seqsdf['pos']!=-1]

In [23]:
def compute_codon_num(x, gene2pos: dict):
    """Return the 1-based codon number, within its gene, of the position in `x`.

    `x` must provide ``x['pos']`` (nucleotide index) and ``x['gene']`` (a key
    of `gene2pos`); ``gene2pos[gene]['start']`` is the index of the gene's
    first nucleotide in the same coordinate system as ``pos``.
    """
    nt_index = x['pos']
    gene_start = gene2pos[x['gene']]['start']
    # 1-based offset of the nucleotide inside the gene; every 3 offsets form one codon.
    offset_in_gene = nt_index - gene_start + 1
    return math.ceil(offset_in_gene / 3)

In [24]:
# Label every replacement position with a gene via map_gene_to_pos (defined
# outside this section; presumably returns NaN or the string 'nan' when the
# position falls in no gene -- verify).
seqsdf['gene'] = seqsdf['pos'].apply(map_gene_to_pos)
# Discard positions without a gene: real missing values first, then the literal string 'nan'.
seqsdf = seqsdf.loc[~seqsdf['gene'].isna()]
seqsdf = seqsdf.loc[seqsdf['gene']!='nan']
# Codon number of each position within its gene, computed row by row.
seqsdf['codon_num'] = seqsdf.apply(compute_codon_num, args=(GENE2POS,), axis=1)

In [25]:
def get_ref_codon(x, ref_seq, gene2pos: dict):
    """Return the upper-cased reference codon for the row `x`.

    The codon is the 3-character slice of `ref_seq` that starts at
    ``gene2pos[x['gene']]['start']`` plus three times the number of codons
    preceding ``x['codon_num']`` (which is 1-based).
    """
    gene_start = gene2pos[x['gene']]['start']
    first_nt = gene_start + ((x['codon_num'] - 1) * 3)
    codon = ref_seq[first_nt:first_nt + 3]
    return codon.upper()
seqsdf['ref_codon'] = seqsdf.apply(get_ref_codon, args=(ref_seq, GENE2POS), axis=1)

In [26]:
def get_alt_codon(x, gene2pos: dict):
    """Return the upper-cased codon read from the sample's own sequence.

    Same slicing as the reference-codon lookup, but applied to
    ``x['sequence']``: three characters starting at the gene's ``start`` plus
    three times the number of codons preceding ``x['codon_num']`` (1-based).
    """
    gene_start = gene2pos[x['gene']]['start']
    first_nt = gene_start + ((x['codon_num'] - 1) * 3)
    codon = x['sequence'][first_nt:first_nt + 3]
    return codon.upper()
seqsdf['alt_codon'] = seqsdf.apply(get_alt_codon, args=(GENE2POS,), axis=1)

In [27]:
# Codon -> one-letter amino acid table; stop codons map to '_'.
# Built once at definition time instead of on every get_aa() call, since the
# function is applied to every row of the substitutions frame.
_CODON2AA = {
    'ATA': 'I', 'ATC': 'I', 'ATT': 'I', 'ATG': 'M',
    'ACA': 'T', 'ACC': 'T', 'ACG': 'T', 'ACT': 'T',
    'AAC': 'N', 'AAT': 'N', 'AAA': 'K', 'AAG': 'K',
    'AGC': 'S', 'AGT': 'S', 'AGA': 'R', 'AGG': 'R',
    'CTA': 'L', 'CTC': 'L', 'CTG': 'L', 'CTT': 'L',
    'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P',
    'CAC': 'H', 'CAT': 'H', 'CAA': 'Q', 'CAG': 'Q',
    'CGA': 'R', 'CGC': 'R', 'CGG': 'R', 'CGT': 'R',
    'GTA': 'V', 'GTC': 'V', 'GTG': 'V', 'GTT': 'V',
    'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
    'GAC': 'D', 'GAT': 'D', 'GAA': 'E', 'GAG': 'E',
    'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G',
    'TCA': 'S', 'TCC': 'S', 'TCG': 'S', 'TCT': 'S',
    'TTC': 'F', 'TTT': 'F', 'TTA': 'L', 'TTG': 'L',
    'TAC': 'Y', 'TAT': 'Y', 'TAA': '_', 'TAG': '_',
    'TGC': 'C', 'TGT': 'C', 'TGA': '_', 'TGG': 'W',
}


def get_aa(codon: str):
    """Translate one codon into its one-letter amino acid code.

    Parameters
    ----------
    codon : str
        Three upper-case DNA letters (A/C/G/T).

    Returns
    -------
    str
        The amino acid letter, '_' for a stop codon, or the string 'nan' when
        the codon is not in the table (gaps, ambiguous bases, lower-case or
        truncated codons).
    """
    return _CODON2AA.get(codon, 'nan')
# Translate the reference and the sample codon of every replacement.
seqsdf['ref_aa'] = seqsdf['ref_codon'].apply(get_aa)
seqsdf['alt_aa'] = seqsdf['alt_codon'].apply(get_aa)
# get_aa returns the string 'nan' for codons missing from its table; drop those rows.
# NOTE(review): rows where ref_aa == alt_aa are kept here -- confirm that is intended.
seqsdf = seqsdf.loc[seqsdf['alt_aa']!='nan']

In [28]:
seqsdf.columns

Index(['idx', 'sequence', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa'],
      dtype='object')

In [29]:
# Attach per-sample metadata to every replacement row.
# NOTE(review): meta_fp is read as comma-separated here, whereas the cell at
# the top of the notebook reads it with sep='\t' -- presumably meta_fp was
# re-pointed in between; confirm which file this cell expects.
meta = pd.read_csv(meta_fp)
# Number of distinct sequence headers before the merge ...
print(seqsdf['idx'].unique().shape)
# pd.merge defaults to an inner join: headers without a 'fasta_hdr' match are dropped.
seqsdf = pd.merge(seqsdf, meta, left_on='idx', right_on='fasta_hdr')
# ... and after it; a smaller count means some headers had no metadata row.
print(seqsdf['idx'].unique().shape)

(2765,)
(2765,)


In [30]:
# Discard samples whose collection date is a known placeholder. `.copy()`
# yields an independent frame, so the column assignments below do not write
# into a slice of the previous one (pandas SettingWithCopyWarning).
placeholder_dates = ['Unknown', '1900-01-00']
seqsdf = seqsdf.loc[~seqsdf['collection_date'].isin(placeholder_dates)].copy()
# Values written as 'first/second' are reduced to the part before the first '/'.
# The vectorised `.str` accessor leaves values without '/' unchanged and
# propagates a missing date as NaN, whereas a `.str.contains` boolean mask
# raises when the column contains NaN.
seqsdf['collection_date'] = seqsdf['collection_date'].str.split('/').str[0]
# Missing dates become NaT here.
seqsdf['date'] = pd.to_datetime(seqsdf['collection_date'])

In [31]:
seqsdf['date'].min()

Timestamp('2020-03-04 00:00:00')

In [32]:
# (seqsdf.groupby(['gene', 'ref_aa', 'codon_num', 'alt_aa'])
# .agg(
#      num_samples=('ID', 'nunique')))

In [35]:
def uniq_locs(x):
    """Return the sorted distinct values of `x` as a NumPy array."""
    distinct_values = np.unique(x)
    return distinct_values
def loc_counts(x):
    """Return how often each distinct value occurs in `x`.

    The counts are ordered like the sorted distinct values from ``np.unique(x)``.
    """
    return np.unique(x, return_counts=True)[1]

In [37]:
# Summarise substitutions: one output row per distinct
# (gene, pos, ref_aa, codon_num, alt_aa) combination, with
#   num_samples      - number of distinct sample IDs carrying it
#   first_detected   - earliest collection date
#   last_detected    - latest collection date
#   locations        - distinct locations (via uniq_locs)
#   location_counts  - row counts per distinct location (via loc_counts)
#   samples          - the distinct sample IDs
subs = (seqsdf.groupby(['gene', 'pos', 'ref_aa', 'codon_num', 'alt_aa'])
.agg(
     num_samples=('ID', 'nunique'),
     first_detected=('date', 'min'),
     last_detected=('date', 'max'),
     locations=('location', uniq_locs),
     location_counts=('location', loc_counts),
     samples=('ID', 'unique')
    )
.reset_index())
# 'pos' was derived from a 0-based enumerate() index; shift it to 1-based for reporting.
subs['pos'] = subs['pos'] + 1

In [175]:
(subs[subs['gene']=='S'].sort_values('num_samples', ascending=False)
 .to_csv('S_mutations_consensus.csv', index=False))

## Consolidate metadata ID and fasta headers

In [134]:
def fix_header(x):
    """Extract the sample identifier from a FASTA header.

    Two header layouts are recognised:

    * ``Consensus_<ID>_...`` -> the field after the first underscore
    * ``<a>/<b>/<ID>/...``   -> the third slash-separated field

    A header that matches neither layout (too few fields) is returned
    unchanged instead of raising IndexError.
    """
    if 'Consensus' in x:
        fields = x.split('_')
        return fields[1] if len(fields) > 1 else x
    fields = x.split('/')
    return fields[2] if len(fields) > 2 else x
seqsdf['n_ID'] = seqsdf['idx'].apply(fix_header)

In [135]:
seqsdf['n_ID'] = seqsdf['n_ID'].str.replace('ALSR', 'SEARCH')

In [136]:
# Reload the metadata and derive the join key 'n_ID': the first two
# '-'-separated fields of ID (e.g. 'SEARCH-0007-SAN' -> 'SEARCH-0007');
# an ID without '-' is kept whole.
meta = pd.read_csv(meta_fp)
meta['n_ID'] = meta['ID'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [137]:
seqsdf['n_ID'] = seqsdf['n_ID'].apply(lambda x: '-'.join(x.split('-')[:2]))

In [138]:
tmp = pd.merge(seqsdf, meta, on='n_ID')

In [122]:
# tmp[tmp['ID'].str.contains('2112')]

In [98]:
# seqsdf

In [139]:
set(meta['n_ID'].unique()) - set(tmp['n_ID'].unique())

{'SEARCH-1668'}

In [140]:
seqsdf['idx'].unique().shape

(2765,)

In [141]:
meta['ID'].unique().shape

(2766,)

In [147]:
s = seqsdf[['n_ID', 'idx']].drop_duplicates()

In [151]:
# Left join: every metadata row is kept, and 'idx' is NaN where no sequence
# header shares its n_ID.
new_meta = pd.merge(meta, s, on='n_ID', how='left')
# Export without the helper key, exposing the sequence header as 'fasta_hdr'.
# NOTE(review): 'metadata.csv' is written relative to the current working
# directory -- confirm it does not overwrite the file that meta_fp points to.
(new_meta.drop(columns=['n_ID'])
.rename(columns={'idx': 'fasta_hdr'})
.to_csv('metadata.csv', index=False))

In [152]:
new_meta.shape 

(2766, 11)

In [153]:
new_meta

Unnamed: 0,ID,gb_accession,gisaid_accession,collection_date,location,percent_coverage_cds,avg_depth,authors,originating_lab,n_ID,idx
0,MG0987,MT598172,EPI_ISL_416457,2020-03-18,USA/California/San Diego,99.5954,2465.60,SEARCH Alliance San Diego,Andersen lab at Scripps Research,MG0987,Consensus_MG0987
1,PC00101P,MT192765,EPI_ISL_414648,2020-03-11,USA/California/San Diego,99.7525,3516.14,SEARCH Alliance San Diego,Andersen lab at Scripps Research,PC00101P,Consensus_PC00101P_threshold_0_quality_20
2,SEARCH-0007-SAN,MT598171,EPI_ISL_429990,2020-03-21,USA/California/San Diego,100.0000,6215.17,SEARCH Alliance San Diego with Christina Clark...,Rady's Childrens Hospital,SEARCH-0007,Consensus_SEARCH-0007-SAN_L1_threshold_0_quali...
3,SEARCH-0016-SAN,MT598173,EPI_ISL_430016,2020-03-24,USA/California/San Diego,100.0000,6440.67,SEARCH Alliance San Diego,Andersen lab at Scripps Research,SEARCH-0016,Consensus_SEARCH-0016-SAN_L1_threshold_0_quali...
4,SEARCH-0017-SAN,MT598174,EPI_ISL_429991,2020-03-24,USA/California/San Diego,100.0000,4947.09,SEARCH Alliance San Diego,Andersen lab at Scripps Research,SEARCH-0017,Consensus_SEARCH-0017-SAN_L1_threshold_0_quali...
...,...,...,...,...,...,...,...,...,...,...,...
2761,SEARCH-4685-SAN,,,2020-11-02,USA/California/San Diego,100.0000,4831.37,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4685,hCoV-19/USA/SEARCH-4685-SAN/2020
2762,SEARCH-4686-SAN,,,2020-11-05,USA/California/San Diego,100.0000,3864.73,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4686,hCoV-19/USA/SEARCH-4686-SAN/2020
2763,SEARCH-4687-ORA,,,2020-11-02,USA/California/Orange,98.1400,3123.54,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4687,hCoV-19/USA/SEARCH-4687-ORA/2020
2764,SEARCH-4690-SAN,,,2020-05-28,USA/California/San Diego,98.7283,2625.16,"SEARCH Alliance San Diego with Tracy Basler, J...",San Diego County Public Health Laboratory,SEARCH-4690,hCoV-19/USA/SEARCH-4690-SAN/2020


In [81]:
len(ref_seq)

29903