In [1]:
import sys
import time
import json
import numpy as np
import pandas as pd

import bjorn_support as bs
import onion_trees as ot
import mutations as bm
import visualize as bv
import reports as br
import data as bd

In [2]:
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
date = '2021-01-26'
countries_fp = '/home/al/data/geojsons/countries.geo.json'
states_fp = '/home/al/data/geojsons/us-states.json'
subs = pd.read_csv('/home/al/analysis/gisaid/subs_long_2021-01-25.csv.gz', 
                   compression='gzip')
dels = pd.read_csv('/home/al/analysis/gisaid/dels_long_2021-01-25.csv.gz', 
                   compression='gzip')

In [None]:
(dels.groupby(['mutation', 'absolute_coords', 'del_len', 'del_seq'])
     .agg(num_samples=('idx', 'nunique'))
     .reset_index()
     .nlargest(50, 'num_samples'))

In [5]:
cols = ['mutation', 'strain', 'country', 'division', 'location', 'date', 'absolute_coords', 'del_len']

In [6]:
dels['pos'] = dels['absolute_coords'].apply(lambda x: int(x.split(':')[0]))
dels['ref_codon'] = dels['del_seq'].copy()

In [7]:
print(subs.shape)
print(dels.shape)
subs['type'] = 'substitution'
muts = pd.concat([subs, dels])
print(muts.shape)

(6328749, 38)
(117950, 44)
(6446699, 47)


In [8]:
with open(countries_fp) as f:
    countries = json.load(f)
country_map = {x['properties']['name']: x['id'] for x in countries['features']}
muts['country_id'] = muts['country'].apply(lambda x: country_map.get(x, 'NA'))
with open(states_fp) as f:
    states = json.load(f)
state_map = {x['properties']['name']: x['id'] for x in states['features']}
muts['division_id'] = muts['division'].apply(lambda x: state_map.get(x, 'NA'))

In [9]:
muts.rename(columns={
    'date': 'date_collected',
    'GISAID_clade': 'gisaid_clade',
    'Nextstrain_clade': 'nextstrain_clade',
    'del_len': 'change_length_nt'
    }, inplace=True)

In [10]:
muts.columns

Index(['idx', 'replacements', 'pos', 'gene', 'codon_num', 'ref_codon',
       'alt_codon', 'ref_aa', 'alt_aa', 'mutation', 'strain', 'virus',
       'gisaid_epi_isl', 'genbank_accession', 'date_collected', 'region',
       'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'nextstrain_clade', 'pangolin_lineage', 'gisaid_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted', 'purpose_of_sequencing', 'type',
       'seq_len', 'del_positions', 'change_length_nt', 'relative_coords',
       'absolute_coords', 'del_seq', 'prev_5nts', 'next_5nts', 'country_id',
       'division_id'],
      dtype='object')

In [11]:
def compute_acc_nt_pos(x, gene2pos):
    s = gene2pos.get(x['gene'], 0)
    return s + x['pos']
muts['nt_map_coords'] = muts[['gene', 'pos']].apply(compute_acc_nt_pos, 
                            args=(bd.GENE2NTCOORDS,), 
                            axis=1)

In [12]:
def compute_acc_aa_pos(x, gene2pos):
    s = gene2pos.get(x['gene'], 0)
    return s + x['codon_num']
muts['aa_map_coords'] = muts[['gene', 'codon_num']].apply(compute_acc_aa_pos, 
                            args=(bd.GENE2AACOORDS,), 
                            axis=1)

In [13]:
muts['date_modified'] = date

In [14]:
muts.columns

Index(['idx', 'replacements', 'pos', 'gene', 'codon_num', 'ref_codon',
       'alt_codon', 'ref_aa', 'alt_aa', 'mutation', 'strain', 'virus',
       'gisaid_epi_isl', 'genbank_accession', 'date_collected', 'region',
       'country', 'division', 'location', 'region_exposure',
       'country_exposure', 'division_exposure', 'segment', 'length', 'host',
       'age', 'sex', 'nextstrain_clade', 'pangolin_lineage', 'gisaid_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted', 'purpose_of_sequencing', 'type',
       'seq_len', 'del_positions', 'change_length_nt', 'relative_coords',
       'absolute_coords', 'del_seq', 'prev_5nts', 'next_5nts', 'country_id',
       'division_id', 'nt_map_coords', 'aa_map_coords', 'date_modified'],
      dtype='object')

In [15]:
muts['is_synonymous'] = False
muts.loc[muts['ref_aa']==muts['alt_aa'], 'is_synonymous'] = True

In [16]:
meta_info = ['strain', 'date_modified',
        'date_collected','date_submitted',
        'country_id', 'country', 
        'division_id', 'division', 'location', 
        'submitting_lab', 'originating_lab',
        'authors', 'pangolin_lineage', 
        'gisaid_clade', 'nextstrain_clade',
        'gisaid_epi_isl', 'genbank_accession',
        'purpose_of_sequencing']

muts_info = ['type', 'mutation', 'gene', 
             'ref_codon', 'pos', 'alt_codon', 
             'is_synonymous', 
             'ref_aa', 'codon_num', 'alt_aa', 
             'absolute_coords', 
             'change_length_nt', 
             'nt_map_coords', 'aa_map_coords']

In [17]:
muts.loc[muts['location']=='unk', 'location'] = 'NA'
muts.loc[muts['purpose_of_sequencing']=='?', 'purpose_of_sequencing'] = 'NA'
muts.loc[muts['genbank_accession']=='?', 'genbank_accession'] = 'NA'

In [18]:
muts.fillna('NA', inplace=True)

In [19]:
sample_ids = muts[['strain']].drop_duplicates().sample(10)['strain'].unique()
test = muts[muts['strain'].isin(sample_ids)]

In [20]:
# test['genbank_accession']

In [21]:
# test

In [23]:
start = time.time()
(muts.groupby(meta_info, as_index=True)
             .apply(lambda x: x[muts_info].to_dict('records'))
             .reset_index()
             .rename(columns={0:'mutations'})
             .to_json('test_data/data_model_2021-01-26.json.gz', 
                      orient='records',
                      compression='gzip'))
end = time.time()
print(f'Execution time: {end - start} seconds')

Execution time: 674.5737869739532 seconds


In [42]:
cois = [141, 142, 143, 144, 145]
my_dels = dels[(dels['codon_num'].isin(cois)) & (dels['gene']=='S')][cols]

In [43]:
my_dels.shape

(25872, 8)

In [44]:
(my_dels.groupby(['mutation', 'country'])
        .agg(num_samples=('strain', 'nunique'),
             first_detected=('date', 'min'),
             last_detected=('date', 'max'))
#         .reset_index()
        .sort_values('num_samples', ascending=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,num_samples,first_detected,last_detected
mutation,country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
S:DEL144/144.0,United Kingdom,23596,2020,2021-01-15
S:DEL144/144.0,Denmark,450,2020-04-20,2021-01-11
S:DEL144/144.0,United States of America,275,2020-03-30,2021-01-09
S:DEL144/144.0,Spain,116,2020-03-19,2021-01-12
S:DEL143/143.0,United Kingdom,110,2020,2021-01-09
S:DEL143/143.0,...,...,...,...
S:DEL143/143.0,Switzerland,1,2020-12-30,2020-12-30
S:DEL142/144.0,Peru,1,2020-12-11,2020-12-11
S:DEL142/144.0,Denmark,1,2020-12-14,2020-12-14
S:DEL142/144.0,Costa Rica,1,2020-11-06,2020-11-06


In [45]:
my_dels['mutation'].value_counts()

S:DEL144/144.0                 25466
S:DEL141/144.0                   139
S:DEL143/143.0                   114
S:DEL144/145.0                    57
S:DEL141/143.0                    41
S:DEL142/144.0                    17
S:DEL142/143.0                     6
S:DEL143/145.0                     6
S:DEL145/146.0                     5
S:DEL145/145.0                     5
S:DEL141/141.0                     4
S:DEL141/142.0                     3
S:DEL143/144.0                     2
S:DEL141/145.0                     2
S:DEL143/168.33333333333334        1
S:DEL141/143.66666666666666        1
S:DEL141/142.33333333333334        1
S:DEL145/148.0                     1
S:DEL145/164.0                     1
Name: mutation, dtype: int64

In [46]:
my_dels['mutation'] = my_dels['mutation'].apply(lambda x: x.split('.')[0])

In [47]:
counts = (my_dels['mutation']
                  .value_counts()
                  .to_frame()
                  .reset_index()
                  .rename(columns={'index': 'deletion', 'mutation': 'num_samples'}))
counts['pct_samples'] = counts['num_samples'] / counts['num_samples'].sum()
fig = go.Figure(go.Bar(
        y=counts['deletion'], x=counts['num_samples'], orientation='h',
        text=counts['pct_samples'],
        textposition='outside'
    ))
#     fig.for_each_xaxis(lambda axis: axis.title.update(font=dict(color = 'blue', size=8)))
fig.update_traces(texttemplate='%{text:.2p}')
fig.update_yaxes(title_text="Deletion")
fig.update_xaxes(title_text="Number of Sequences")
fig.update_layout(title=f"[Undesignated]", 
                  template='plotly_white', showlegend=False,
                  margin={"r":0})
fig.write_html('s14x_deletion_histogram.html')
fig.show()

In [49]:
(my_dels.loc[(my_dels['mutation']=='S:DEL141/144') 
             & (my_dels['country'].str.contains('America'))
             & (my_dels['division']=='California')]
        .groupby(['country', 'division', 'location'])
        .agg(num_samples=('strain', 'nunique'),
             first_detected=('date', 'min'),
             last_detected=('date', 'max'))
#         .reset_index()
        .sort_values('num_samples', ascending=False))

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,num_samples,first_detected,last_detected
country,division,location,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
United States of America,California,Los Angeles County,2,2020-11-02,2020-12-17
United States of America,California,San Francisco County,2,2020-03-27,2020-04-28
United States of America,California,San Diego,1,2020-08-18,2020-08-18
United States of America,California,San Diego County,1,2020-03-25,2020-03-25


In [31]:
test['absolute_coords']

252571     NaN
252572     NaN
252573     NaN
252574     NaN
252575     NaN
          ... 
5645297    NaN
5645298    NaN
5645299    NaN
5645300    NaN
5645301    NaN
Name: absolute_coords, Length: 124, dtype: object

In [32]:
test.loc[test['location']=='NA', 'location'] = np.nan
test.loc[test['location'].isna(), 'location']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


252571     NaN
252572     NaN
252573     NaN
252574     NaN
252575     NaN
          ... 
5645297    NaN
5645298    NaN
5645299    NaN
5645300    NaN
5645301    NaN
Name: location, Length: 84, dtype: object

In [27]:
test[meta_info]

Unnamed: 0,strain,date_modified,date_collected,date_submitted,country,division,location,submitting_lab,originating_lab,authors,pangolin_lineage,gisaid_clade,nextstrain_clade,gisaid_epi_isl,genbank_accession,purpose_of_sequencing
22187,USA/WA-UW129/2020,2021-01-22,2020-03-12,2020-03-23,United States of America,Washington,,UW Virology Lab,UW Virology Lab,Pavitra Roychoudhury et al,A.1,S,19B,EPI_ISL_416667,?,
22188,USA/WA-UW129/2020,2021-01-22,2020-03-12,2020-03-23,United States of America,Washington,,UW Virology Lab,UW Virology Lab,Pavitra Roychoudhury et al,A.1,S,19B,EPI_ISL_416667,?,
22189,USA/WA-UW129/2020,2021-01-22,2020-03-12,2020-03-23,United States of America,Washington,,UW Virology Lab,UW Virology Lab,Pavitra Roychoudhury et al,A.1,S,19B,EPI_ISL_416667,?,
22190,USA/WA-UW129/2020,2021-01-22,2020-03-12,2020-03-23,United States of America,Washington,,UW Virology Lab,UW Virology Lab,Pavitra Roychoudhury et al,A.1,S,19B,EPI_ISL_416667,?,
22191,USA/WA-UW129/2020,2021-01-22,2020-03-12,2020-03-23,United States of America,Washington,,UW Virology Lab,UW Virology Lab,Pavitra Roychoudhury et al,A.1,S,19B,EPI_ISL_416667,?,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4619022,Luxembourg/LNS9352817/2020,2021-01-22,2020-11-29,2021-01-06,Luxembourg,Luxembourg,,"Laboratoire national de santé, Microbiology, M...","Laboratoire national de santé, Microbiology, V...",Anke Wienecke-Baldacchino et al,B.1.160,GH,20A.EU2,EPI_ISL_770918,?,
4619023,Luxembourg/LNS9352817/2020,2021-01-22,2020-11-29,2021-01-06,Luxembourg,Luxembourg,,"Laboratoire national de santé, Microbiology, M...","Laboratoire national de santé, Microbiology, V...",Anke Wienecke-Baldacchino et al,B.1.160,GH,20A.EU2,EPI_ISL_770918,?,
4619024,Luxembourg/LNS9352817/2020,2021-01-22,2020-11-29,2021-01-06,Luxembourg,Luxembourg,,"Laboratoire national de santé, Microbiology, M...","Laboratoire national de santé, Microbiology, V...",Anke Wienecke-Baldacchino et al,B.1.160,GH,20A.EU2,EPI_ISL_770918,?,
6710,SouthAfrica/R12686-20/2020,2021-01-22,2020-06-16,2020-08-01,South Africa,North-West,,National Institute for Communicable Diseases o...,National Institute for Communicable Diseases o...,Allam M et al,C.1,GR,20D,EPI_ISL_504227,?,
