In [1]:
import pandas as pd
import os
from path import Path
import plotly
import plotly.express as px
import plotly.graph_objects as go
from urllib.request import urlopen
import json
import statsmodels as sm
from statsmodels.formula.api import ols
from Bio import Seq, SeqIO, AlignIO, Phylo, Align
from jinja2 import Environment, FileSystemLoader  # html template engine
import cv2
import numpy as np
import skimage as sk
import matplotlib.pylab as plt

In [2]:
import bjorn_support as bs
import onion_trees as ot
import mutations as bm
import visualize as bv
import reports as br

In [3]:
msa_fp = '/home/al/analysis/gisaid/sequences_2021-01-11_09-53_aligned.fasta'
meta_fp = '/home/al/analysis/gisaid/metadata_concat_2021-01-11_16-49.tsv'

In [4]:
cols = {'Virus name': 'strain', 'Collection date': 'date' , 'Additional host information': 'purpose_of_sequencing',
        'Lineage': 'pangolin_lineage', 'Host': 'host'}

In [5]:
# gisaid, _ = bm.identify_replacements_per_sample(msa_fp, meta_fp, bm.GENE2POS, data_src='gisaid')

In [6]:
# gisaid.loc[gisaid['country']=='USA', 'country'] = 'United States of America'

In [7]:
# gisaid.to_csv('/home/al/analysis/gisaid/subs_long_2021-01-11.csv.gz', index=False, compression='gzip')

In [8]:
# gisaid.loc[(gisaid['pangolin_lineage']=='B.1.1.7') & (gisaid['country']=='USA'), 'strain'].unique()

In [9]:
vnum='v11'
feature = 'pangolin_lineage'
values = ['B.1.1.7']
input_params = {
    'strain': 'B117',
    'date': '01/11/2021',
    'msa_fp': Path('/home/al/analysis/gisaid/sequences_2021-01-11_09-53_aligned.fasta'),
    'meta_fp' : Path('/home/al/code/HCoV-19-Genomics/metadata.csv'),
    'tree_fp' : Path('/home/al/analysis/alab_mutations_01-01-2021/alab/seqs_aligned.fa.treefile'),
    'subs_fp' : '/home/al/analysis/alab_mutations_01-01-2021/alab_substitutions_long_01-01-2021.csv',
    'countries_fp' : '/home/al/data/geojsons/countries.geo.json',
    'states_fp' : "/home/al/data/geojsons/us-states.json",
    'counties_fp' : '/home/al/data/geojsons/us-counties.json',
    'patient_zero' : 'NC_045512.2',
    'gisaid_data_fp' : '/home/al/analysis/gisaid/subs_long_2021-01-11.csv.gz',
    'b117_meta' : '/home/al/analysis/b117/nextstrain_groups_neherlab_ncov_S.N501_metadata.tsv',
    'b117_tree': 'test_data/b117_seqs_aligned.fasta.treefile',
    'sample_sz': 400,
    'sampling_img_fp' : "/home/al/analysis/b117/figs/sars-cov-2_EM_v3.png"
}

In [10]:
results = br.generate_voc_data(feature, values, input_params)

  if (await self.run_code(code, result,  async_=asy)):

Columns (11) have mixed types.Specify dtype option on import or set low_memory=False.


Columns (11) have mixed types.Specify dtype option on import or set low_memory=False.



In [25]:
html = br.generate_voc_html(feature, values, results)
br.save_html(html, f'test_data/b117_report_draftv2.html')

In [21]:
gisaid.loc[(gisaid['codon_num']==452) & (gisaid['gene']=='S') & (gisaid['alt_aa']=='R'), 'strain'].unique().shape

(404,)

In [24]:
gisaid.loc[(gisaid['codon_num']==452) & (gisaid['gene']=='S') & (gisaid['alt_aa']=='R'), 'date'].min()

'2020-03-17'

In [12]:
gisaid = pd.read_csv(input_params['gisaid_data_fp'], compression='gzip')
# gisaid.columns


Columns (37,38,40,41,42,43,44,45) have mixed types.Specify dtype option on import or set low_memory=False.



In [16]:
def b117_genetic_distance(gisaid_data, msa_fp, b117_meta, patient_zero, sample_sz=250, clock_rate=8e-4):
    # nabla_symbol = u"\u2207"
    croft_meta = pd.read_csv(b117_meta, sep='\t')
    croft_meta = croft_meta[croft_meta['Country']!='USA'].copy()
    # extract B117 samples from Emma Croft's build
    b117_ids = croft_meta[croft_meta['Pangolin Lineage']=='B.1.1.7'].sample(sample_sz)['Strain'].unique().tolist()
    # extract outgroup samples from Emma Croft's build
    outgrp_ids = croft_meta[~croft_meta['Pangolin Lineage'].isin(['B.1.1.7', 'B.1.1.70', 'B.1.351'])].sample(sample_sz)['Strain'].unique().tolist()
    # extract B117 US samples from GISAID
    us_ids = gisaid_data[(gisaid_data['country']=='United States of America')
                      & (gisaid_data['pangolin_lineage']=='B.1.1.7')]['strain'].unique().tolist()
    sois = us_ids+outgrp_ids+b117_ids+[patient_zero]
    tree_fp = Path('tmp/b117_seqs_aligned.fasta' + '.treefile')
    if not Path.isfile(tree_fp):
        bs.fetch_seqs(msa_fp, 'tmp/b117_seqs_aligned.fasta', sois, is_aligned=True)
        tree_fp = bs.compute_tree('tmp/b117_seqs_aligned.fasta', num_cpus=20, redo=True)
    tree = ot.load_tree(tree_fp, patient_zero)
    dists = {n.name: tree.distance(n.name, patient_zero) for n in tree.get_terminals()}
    dists_df = (pd.DataFrame(index=dists.keys(), data=dists.values(), 
                      columns=['genetic_distance'])
         .reset_index()
         .rename(columns={'index': 'strain'}))
    b117_meta = gisaid_data[(gisaid_data['strain'].isin(sois))].drop_duplicates(subset=['strain']).copy()
    dists_df = pd.merge(dists_df, b117_meta, on='strain')
    dists_df.loc[:, 'group'] = 'outgroup'
    dists_df.loc[dists_df['strain'].isin(b117_ids), 'group'] = 'B.1.1.7 (non-US)'
    dists_df.loc[dists_df['strain'].isin(us_ids), 'group'] = 'B.1.1.7 (US)'
#     dists_df = dists_df.loc[~((dists_df['group']=='outgroup') & (dists_df['num_nt_subs']>=0.001))]
    dists_df = dists_df[~dists_df['date'].isna()]
    dists_df.loc[:, 'date'] = pd.to_datetime(dists_df['date'], errors='coerce')
    dists_df['time'] = dists_df['date'].astype(str).apply(bv.decimal_date)
    b117_model = ols('genetic_distance ~ time', data=dists_df[dists_df['group']!='outgroup']).fit()
    b117_model.params['time'] = clock_rate
    b117_preds = dists_df[dists_df['group']!='outgroup'].copy()
    b117_model.params['Intercept'] = np.mean(b117_preds['genetic_distance'] - (clock_rate*b117_preds['time']))
    b117_preds.loc[:, 'predictions'] = b117_model.predict(b117_preds['time'])
    b117_n = int(b117_preds.shape[0] / 2)
    outgrp_model = ols('genetic_distance ~ time', 
                       data=dists_df[dists_df['group']=='outgroup']).fit()
    outgrp_model.params['time'] = clock_rate
    outgrp_preds = dists_df[dists_df['group']=='outgroup'].copy()
    outgrp_model.params['Intercept'] = np.mean(outgrp_preds['genetic_distance'] - (clock_rate*outgrp_preds['time']))
    outgrp_preds.loc[:, 'predictions'] = outgrp_model.predict(outgrp_preds['time'])
    outgrp_n = int(outgrp_preds.shape[0] / 2)
    fig = go.Figure(
        data=go.Scatter(y=dists_df[dists_df['group']=='B.1.1.7 (US)']['genetic_distance'], 
                                    x=dists_df[dists_df['group']=='B.1.1.7 (US)']['date'],
                                    name='B.1.1.7 (US)', mode='markers',
                                    text=dists_df[dists_df['group']=='B.1.1.7 (US)']['strain'],
                                    hovertemplate =
                                    'Sample: %{text}',
                                    marker_color='rgba(220,20,60,.6)'))
    fig.add_trace(
        go.Scatter(y=dists_df[dists_df['group']=='B.1.1.7 (non-US)']['genetic_distance'], 
                   x=dists_df[dists_df['group']=='B.1.1.7 (non-US)']['date'],
                   mode='markers', marker_color='rgba(30,144,255,.6)', 
                   name='B.1.1.7 (non-US)'
                 ))
    fig.add_trace(go.Scatter(y=b117_preds['predictions'], 
                             x=b117_preds['date'], name='OLS (B.1.1.7)', 
                             mode='lines', line_color='rgba(0,0,0,1.)'))
    fig.add_annotation(x=b117_preds.iloc[b117_n]['date'], 
                       y=b117_preds.iloc[b117_n]['predictions'],
            text=f"B117 Lineage",
            showarrow=True,
            arrowhead=1, yshift=10, arrowsize=2, ay=-80)
    fig.add_trace(
        go.Scatter(y=dists_df[dists_df['group']=='outgroup']['genetic_distance'], 
                   x=dists_df[dists_df['group']=='outgroup']['date'],
                   mode='markers', marker_color='rgb(211,211,211, .6)', 
                   name='outgroup'
                 ))
    fig.add_trace(go.Scatter(y=outgrp_preds['predictions'], 
                             x=outgrp_preds['date'], name='OLS (outgroup)', 
                             mode='lines', line_color='rgba(0,0,0,1.)'))
    fig.add_annotation(x=outgrp_preds.iloc[outgrp_n]['date'], 
                       y=outgrp_preds.iloc[outgrp_n]['predictions'],
            text=f"outgroup",
            showarrow=True,
            arrowhead=1, yshift=10, arrowsize=2, ay=-80)
    fig.update_layout(yaxis_title='Genetic Distance (root-to-tip)',
                      xaxis_title='Collection Date',
                      template='plotly_white', autosize=True)#, height=850,
    return fig, dists_df

fig, dists = b117_genetic_distance(gisaid, input_params['msa_fp'], 
                            input_params['b117_meta'], input_params['patient_zero'])
fig.show()

In [None]:
gisaid[gisaid['pangolin_lineage']=='B.1.1.70']

In [15]:
dists.loc[(dists['group']=='outgroup') & (dists['genetic_distance']>0.0008), 'pangolin_lineage'].value_counts()

B.1.1.70     48
B.1.351      19
B.1.160       1
B.1.1.31      1
B.1.160.6     1
B.1.1.1       1
B.1.367       1
C.14          1
Name: pangolin_lineage, dtype: int64

In [2]:
def b117_nt_distance(gisaid_data, tree_fp, b117_meta, sample_sz=250, clock_rate=8e-4):
    # nabla_symbol = u"\u2207"
    croft_meta = pd.read_csv(b117_meta, sep='\t')
    croft_meta = croft_meta[croft_meta['Country']!='USA'].copy()
    # extract B117 samples from Emma Croft's build
    b117_meta = croft_meta[croft_meta['Pangolin Lineage']=='B.1.1.7'].sample(sample_sz)
    # extract outgroup samples from Emma Croft's build
    outgrp_meta = croft_meta[croft_meta['Pangolin Lineage']!='B.1.1.7'].sample(sample_sz)
    # extract B117 US samples from GISAID
    us_b117 = gisaid_data[(gisaid_data['country']=='United States of America')
                      & (gisaid_data['pangolin_lineage']=='B.1.1.7')].copy()
    # consolidate data and analyze
    b117_data = gisaid_data[(gisaid_data['strain'].isin(b117_meta['Strain'].unique()))
                       |(gisaid_data['strain'].isin(outgrp_meta['Strain'].unique()))
                       |(gisaid_data['strain'].isin(us_b117['strain'].unique()))].copy()
    b117_data.drop_duplicates(subset=['strain', 'pos', 'alt_codon'], inplace=True)
#     b117_data = b117_data[b117_data['gene']=='S']
    dists_df = (b117_data.groupby(['strain', 'date'])
                .agg(num_nt_subs=('strain', 'count'))
                .reset_index())
    dists_df['num_nt_subs'] = dists_df['num_nt_subs'] / 29903
    dists_df = dists_df[~dists_df['date'].isna()]           
    dists_df.loc[:, 'group'] = 'outgroup'
    dists_df.loc[dists_df['strain'].isin(b117_meta['Strain'].unique()), 'group'] = 'B.1.1.7 (non-US)'
    dists_df.loc[dists_df['strain'].isin(us_b117['strain'].unique()), 'group'] = 'B.1.1.7 (US)'
    dists_df = dists_df.loc[~((dists_df['group']=='outgroup') & (dists_df['num_nt_subs']>=0.001))]
    dists_df.loc[:, 'date'] = pd.to_datetime(dists_df['date'], errors='coerce')
    dists_df['time'] = dists_df['date'].astype(str).apply(bv.decimal_date)
    b117_model = ols('num_nt_subs ~ time', data=dists_df[dists_df['group']!='outgroup']).fit()
    b117_model.params['time'] = clock_rate
    b117_preds = dists_df[dists_df['group']!='outgroup'].copy()
    b117_model.params['Intercept'] = np.mean(b117_preds['num_nt_subs'] - (clock_rate*b117_preds['time']))
    b117_preds.loc[:, 'predictions'] = b117_model.predict(b117_preds['time'])
    b117_n = int(b117_preds.shape[0] / 2)
    outgrp_model = ols('num_nt_subs ~ time', 
                       data=dists_df[dists_df['group']=='outgroup']).fit()
    outgrp_model.params['time'] = clock_rate
    outgrp_preds = dists_df[dists_df['group']=='outgroup'].copy()
    outgrp_model.params['Intercept'] = np.mean(outgrp_preds['num_nt_subs'] - (clock_rate*outgrp_preds['time']))
    outgrp_preds.loc[:, 'predictions'] = outgrp_model.predict(outgrp_preds['time'])
    outgrp_n = int(outgrp_preds.shape[0] / 2)
    fig = go.Figure(
        data=go.Scatter(y=dists_df[dists_df['group']=='B.1.1.7 (US)']['num_nt_subs'], 
                                    x=dists_df[dists_df['group']=='B.1.1.7 (US)']['date'],
                                    name='B.1.1.7 (US)', mode='markers',
                                    text=dists_df[dists_df['group']=='B.1.1.7 (US)']['strain'],
                                    hovertemplate =
                                    'Sample: %{text}',
                                    marker_color='rgba(220,20,60,.6)'))
    fig.add_trace(
        go.Scatter(y=dists_df[dists_df['group']=='B.1.1.7 (non-US)']['num_nt_subs'], 
                   x=dists_df[dists_df['group']=='B.1.1.7 (non-US)']['date'],
                   mode='markers', marker_color='rgba(30,144,255,.6)', 
                   name='B.1.1.7 (non-US)'
                 ))
    fig.add_trace(go.Scatter(y=b117_preds['predictions'], 
                             x=b117_preds['date'], name='OLS (B.1.1.7)', 
                             mode='lines', line_color='rgba(0,0,0,1.)'))
    fig.add_annotation(x=b117_preds.iloc[b117_n]['date'], 
                       y=b117_preds.iloc[b117_n]['predictions'],
            text=f"B117 Lineage",
            showarrow=True,
            arrowhead=1, yshift=10, arrowsize=2, ay=-80)
    fig.add_trace(
        go.Scatter(y=dists_df[dists_df['group']=='outgroup']['num_nt_subs'], 
                   x=dists_df[dists_df['group']=='outgroup']['date'],
                   mode='markers', marker_color='rgb(211,211,211, .6)', 
                   name='outgroup'
                 ))
    fig.add_trace(go.Scatter(y=outgrp_preds['predictions'], 
                             x=outgrp_preds['date'], name='OLS (outgroup)', 
                             mode='lines', line_color='rgba(0,0,0,1.)'))
    fig.add_annotation(x=outgrp_preds.iloc[outgrp_n]['date'], 
                       y=outgrp_preds.iloc[outgrp_n]['predictions'],
            text=f"outgroup",
            showarrow=True,
            arrowhead=1, yshift=10, arrowsize=2, ay=-80)
    fig.update_layout(yaxis_title='Genetic Distance (root-to-tip)',
                      xaxis_title='Collection Date',
                      template='plotly_white', autosize=True)#, height=850,
    return fig


# fig = b117_nt_distance(gisaid, input_params['tree_fp'], input_params['b117_meta'])
# fig.show()

In [24]:
np.log10(1)

0.0

In [30]:
def map_by_state(data: pd.DataFrame, feature: str, values: list, states_fp: str, strain: str='B117'):
    with open(states_fp) as f:
        states = json.load(f)
    state_map = {x['properties']['name']: x['id'] for x in states['features']}
    total_samples_by_state = data.groupby('division').agg(total_samples=('strain', 'nunique')).reset_index()
    results = data.loc[(data[feature].isin(values)) & (data['country']=='United States of America')]
    results_by_state = results.groupby('division').agg(num_samples=('idx', 'nunique')).reset_index()
    results_by_state = pd.merge(total_samples_by_state, results_by_state, on='division', how='left')
    results_by_state['num_samples'].fillna(0, inplace=True)
    results_by_state['id'] = results_by_state['division'].apply(lambda x: state_map.get(x, 'unk'))
    results_by_state = results_by_state[results_by_state['num_samples']>0]
#     fig = px.choropleth(results_by_state, geojson=states, scope="usa",
#                                locations='id', color='num_samples',# locationmode='USA-states',
#                                color_continuous_scale="bluered",
#                                range_color=(0, results_by_state['num_samples'].max()),
# #                                labels={'num_samples': f'Number of samples with {values}: ', 'division': 'loc:'},
#                                hover_data=['division', 'num_samples']
#                               )
    results_by_state['log_num_samples'] = results_by_state['num_samples'].apply(lambda x: np.log10(x))
    fig = px.choropleth_mapbox(results_by_state, geojson=states, 
                               locations='id', color='log_num_samples',
                               color_continuous_scale='Bluyl', center={"lat": 37.0902, "lon": -95.7129},
                               range_color=(0, results_by_state['log_num_samples'].max()),
                               mapbox_style="carto-positron", zoom=2,
                               opacity=0.5,
                               hover_data=['division', 'num_samples', 'total_samples'],
                               labels={'num_samples':f'Sequences with {strain}', 'total_samples': 'Total Sequences'}
                              )
    fig.update_coloraxes(colorbar=dict(showticklabels=False, title="B117 Cases (logarithmic)"))
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    return fig, state_map, results_by_state

fig, _, df = map_by_state(gisaid, feature, values, input_params['states_fp'])
fig.show()

In [29]:
gisaid['country'].unique()

array(['China', 'United States of America', 'Taiwan', 'France',
       'Australia', 'Germany', 'Singapore', 'United Kingdom', 'Japan',
       'South Korea', 'Belgium', 'Vietnam', 'Nepal', 'Italy', 'Cambodia',
       'Sweden', 'Hong Kong', 'Brazil', 'Canada', 'Finland', 'Mexico',
       'Switzerland', 'New Zealand', 'India', 'Nigeria', 'Netherlands',
       'Luxembourg', 'Portugal', 'Czech Republic', 'Spain', 'Chile',
       'Ireland', 'Democratic Republic of the Congo', 'Panama', 'Georgia',
       'Denmark', 'Russia', 'Peru', 'Hungary', 'Saudi Arabia', 'Kuwait',
       'Poland', 'Malaysia', 'South Africa', 'Turkey', 'Pakistan',
       'Iceland', 'Ecuador', 'Norway', 'Slovakia', 'Colombia', 'Senegal',
       'Algeria', 'Greece', 'Israel', 'Austria', 'Slovenia', 'Argentina',
       'Latvia', 'Ghana', 'Thailand', 'Iran', 'Uruguay', 'Sri Lanka',
       'Gambia', 'Ukraine', 'Jordan', 'Philippines', 'Egypt',
       'Costa Rica', 'Kazakhstan', 'United Arab Emirates', 'Indonesia',
       'Brun

In [33]:
def strain_nt_distance(data, feature, values, strain='B117', sample_sz=250):
    clock_rate = 8e-4
    data['mutation'] = data['gene'] + ':' + data['ref_aa'] + data['codon_num'].astype(str) + data['alt_aa']
    if feature=='pangolin_lineage':
        dists_df = create_lineage_data(data, feature, values, strain, sample_sz)
    elif feature=='mutation':
        dists_df = create_distance_data(data, set(values), name=strain, sample_sz=sample_sz)
    dists_df['num_subs'] = dists_df['mutations'].str.len() / 29904
    dists_df = dists_df[~dists_df['date'].isna()]
    dists_df.loc[:, 'date'] = pd.to_datetime(dists_df['date'], errors='coerce')
    dists_df['time'] = dists_df['date'].astype(str).apply(bv.decimal_date)
    b117_model = ols('num_subs ~ time', data=dists_df[dists_df['group']!='outgroup']).fit()
    b117_model.params['time'] = clock_rate
    b117_preds = dists_df[dists_df['group']!='outgroup'].copy()
    b117_model.params['Intercept'] = np.mean(b117_preds['num_subs'] - (clock_rate*b117_preds['time']))
    b117_preds.loc[:, 'predictions'] = b117_model.predict(b117_preds['time'])
    b117_n = int(b117_preds.shape[0] / 2)
    outgrp_model = ols('num_subs ~ time', 
                       data=dists_df[dists_df['group']=='outgroup']).fit()
    outgrp_model.params['time'] = clock_rate
    outgrp_preds = dists_df[dists_df['group']=='outgroup'].copy()
    outgrp_model.params['Intercept'] = np.mean(outgrp_preds['num_subs'] - (clock_rate*outgrp_preds['time']))
    outgrp_preds.loc[:, 'predictions'] = outgrp_model.predict(outgrp_preds['time'])
    outgrp_n = int(outgrp_preds.shape[0] / 3)
    fig = go.Figure(
        data=go.Scatter(y=dists_df[dists_df['group']==f'Lineage {strain} in US']['num_subs'], 
                                    x=dists_df[dists_df['group']==f'Lineage {strain} in US']['date'],
                                    name=f'{strain} (US)', mode='markers',
                                    hovertemplate =
                                    'Sample: %{text}',
                                    marker_color='rgba(220,20,60,.6)'))
    fig.add_trace(
        go.Scatter(y=dists_df[dists_df['group']==f'Lineage {strain}']['num_subs'], 
                   x=dists_df[dists_df['group']==f'Lineage {strain}']['date'],
                   mode='markers', marker_color='rgba(30,144,255,.6)', 
                   name=f'{strain} (non-US)'
                 ))
    fig.add_trace(go.Scatter(y=b117_preds['predictions'], 
                             x=b117_preds['date'], name='OLS (B.1.1.7)', 
                             mode='lines', line_color='rgba(0,0,0,1.)'))
    fig.add_annotation(x=b117_preds.iloc[b117_n]['date'], 
                       y=b117_preds.iloc[b117_n]['predictions'],
            text=f"{strain} Lineage",
            showarrow=True,
            arrowhead=1, yshift=10, arrowsize=2, ay=-80)
    fig.add_trace(
        go.Scatter(y=dists_df[dists_df['group']=='outgroup']['num_subs'], 
                   x=dists_df[dists_df['group']=='outgroup']['date'],
                   mode='markers', marker_color='rgb(211,211,211, .6)', 
                   name='outgroup'
                 ))
    fig.add_trace(go.Scatter(y=outgrp_preds['predictions'], 
                             x=outgrp_preds['date'], name='OLS (outgroup)', 
                             mode='lines', line_color='rgba(0,0,0,1.)'))
    fig.add_annotation(x=outgrp_preds.iloc[outgrp_n]['date'], 
                       y=outgrp_preds.iloc[outgrp_n]['predictions'],
            text=f"outgroup",
            showarrow=True,
            arrowhead=1, yshift=10, arrowsize=2, ay=-80)
    fig.update_layout(yaxis_title='Genetic Distance (root-to-tip)',
                      xaxis_title='Collection Date',
                      template='plotly_white', autosize=True)#, height=850,
    return fig
feature = 'mutation'
values=['S:N501Y']
fig = strain_nt_distance(gisaid, feature, values)
fig.show()

In [30]:
def create_lineage_data(data, feature, values, strain, sample_sz=250):
    data = (data.groupby(['date', 'country', 'division', 
                          'location', 'pangolin_lineage', 'idx'])
                .agg(mutations=('mutation', 'unique')).reset_index())
    first_detected = data.loc[data[feature].isin(values), 'date'].min()
    mutations = set(data.loc[(data[feature].isin(values)) 
                     &(data['date']==first_detected), 'mutations'].explode().unique())
    data['d_w'] = data['mutations'].apply(compute_similarity, args=(mutations,))
    outgroup = data[~data[feature].isin(values)].nlargest(sample_sz, 'd_w')['idx'].unique()
    ingroup = data.loc[(data[feature].isin(values))].sample(sample_sz)['idx'].unique()
    usgroup = data.loc[(data[feature].isin(values)) & (data['country']=='United States of America'), 'idx'].unique()
    data = data.loc[(data['idx'].isin(ingroup)) | (data['idx'].isin(outgroup)) | (data['idx'].isin(usgroup))]
    data['group'] = 'outgroup'
    data.loc[(data['idx'].isin(ingroup)), 'group'] = f'Lineage {strain}'
    data.loc[(data['idx'].isin(usgroup)), 'group'] = f'Lineage {strain} in US'
    return data

In [14]:
def create_distance_data(data: pd.DataFrame, mutations: set, name: str, sample_sz: int=250):
    data['mutation'] = data['gene'] + ':' + data['ref_aa'] + data['codon_num'].astype(str) + data['alt_aa']
    data = (data.groupby(['date', 'country', 'division', 
                          'location', 'pangolin_lineage', 'idx'])
                .agg(mutations=('mutation', 'unique')).reset_index())
    data['is_vui'] = data['mutations'].apply(is_vui, args=(mutations,))
    ref_muts = extract_mutations(data)
    data['d_w'] = data['mutations'].apply(compute_similarity, args=(ref_muts,))
    outgroup = data[data['is_vui']==False].nlargest(sample_sz, 'd_w')['idx'].unique()
    ingroup = data.loc[(data['is_vui']==True)].sample(sample_sz)['idx'].unique()
    usgroup = data.loc[(data['is_vui']==True) & (data['country']=='United States of America')]
    data = data.loc[(data['idx'].isin(ingroup)) | (data['idx'].isin(outgroup)) | (data['idx'].isin(usgroup))]
    data['group'] = 'outgroup'
    data.loc[(data['idx'].isin(ingroup)), 'group'] = f'Lineage {name}'
    data.loc[(data['idx'].isin(usgroup)), 'group'] = f'Lineage {name} in US'
    return data

def is_vui(x, mutations: set):
    return mutations.issubset(set(x))

    
def extract_mutations(data: pd.DataFrame):
    first_detected = data.loc[data['is_vui']==True, 'date'].min()
    mutations = data.loc[(data['is_vui']==True) 
                         &(data['date']==first_detected), 'mutations'].explode().unique()
    return set(mutations)
        

def compute_similarity(x, reference_mutations: set):
    common_mutations = set(x) & reference_mutations
    return len(common_mutations)

In [26]:
mutations = {'S:K417N', 'S:E484K'}
dists_df = create_distance_data(gisaid, mutations, 'test')

In [27]:
dists_df['group'].value_counts()

Lineage test    250
outgroup        250
Name: group, dtype: int64

In [233]:
import re
import datetime as dt
def decimal_date(date,fmt="%Y-%m-%d",variable=False):
#     date = str(date)
    """ Converts calendar dates in specified format to decimal date. """
    if fmt == "":
        return date
    delimiter=re.search('[^0-9A-Za-z%]',fmt) ## search for non-alphanumeric symbols in fmt (should be field delimiter)
    delimit=None
    if delimiter is not None:
        delimit=delimiter.group()

    if variable==True: ## if date is variable - extract what is available
        if delimit is not None:
            dateL=len(date.split(delimit)) ## split date based on symbol
        else:
            dateL=1 ## no non-alphanumeric characters in date, assume dealing with an imprecise date (something like just year)

        if dateL==2:
            fmt=delimit.join(fmt.split(delimit)[:-1]) ## reduce fmt down to what's available
        elif dateL==1:
            fmt=delimit.join(fmt.split(delimit)[:-2])

    adatetime=dt.datetime.strptime(date,fmt) ## convert to datetime object
    year = adatetime.year ## get year
    boy = dt.datetime(year, 1, 1) ## get beginning of the year
    eoy = dt.datetime(year + 1, 1, 1) ## get beginning of next year
    return year + ((adatetime - boy).total_seconds() / ((eoy - boy).total_seconds())) ## return fractional year

In [19]:
meta = pd.read_csv(meta_fp, sep='\t')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [30]:
meta.iloc[-1]['location']

'unk'

In [23]:
cns = AlignIO.read(msa_fp, 'fasta')

In [25]:
patient_zero = 'NC_045512.2'
seqs, ref_seq = bm.process_cns_seqs(cns, patient_zero,
                                     start_pos=0, end_pos=30000)
seqsdf = (pd.DataFrame(index=seqs.keys(), 
                       data=seqs.values(), 
                       columns=['sequence'])
            .reset_index()
            .rename(columns={'index': 'idx'}))

In [28]:
seqsdf[seqsdf['idx']=='hCoV-19/USA/FL-BPHL-2270/2020|EPI_ISL_802725|2020-12-29']

Unnamed: 0,idx,sequence
345588,hCoV-19/USA/FL-BPHL-2270/2020|EPI_ISL_802725|2...,---aaaggtttataccttcccaggtaacaaaccaaccaactttcga...


In [33]:
df = pd.merge(seqsdf, meta, left_on='idx', right_on='strain')

In [36]:
df[df['idx'] == 'hCoV-19/USA/FL-BPHL-2270/2020|EPI_ISL_802725|2020-12-29']['pangolin_lineage']

345587    B.1.1.7
Name: pangolin_lineage, dtype: object

In [8]:
meta_fp

'/home/al/analysis/gisaid/metadata_concat_2021-01-11_16-49.tsv'

In [40]:
# gisaid.loc[(gisaid['pangolin_lineage']=='B.1.1.7'), 'location'].value_counts()

In [32]:
xtra_fp = '/home/al/analysis/gisaid/gisaid_hcov-19_2021_01_11_23.tsv'
xtra = pd.read_csv(xtra_fp, sep='\t')
xtra.rename(columns=cols, inplace=True)

xtra.columns

xtra['country'] = xtra['Location'].apply(lambda x: x.split('/')[1].strip())
xtra['division'] = xtra['Location'].apply(lambda x: x.split('/')[2].strip())
def clean_locs(x):
    if len(x.split('/')) > 3:
        return x.split('/')[3].strip()
    return 'unk'
xtra['location'] = xtra['Location'].apply(clean_locs)

sois = xtra['strain'].unique().tolist()

xtra.shape

meta_fp = '/home/al/analysis/gisaid/metadata_2021-01-11_16-49.tsv.gz'
meta = pd.read_csv(meta_fp, sep='\t', compression='gzip')
meta.columns

meta = pd.concat([meta, xtra])

meta.shape[0]

sois = dict.fromkeys(sois)

cns = AlignIO.read(msa_fp, 'fasta')
for rec in cns:
    if rec.id.split('|')[0] in sois.keys():
        sois[rec.id.split('|')[0]] = rec.id

xtra['strain'] = xtra['strain'].apply(lambda x: sois[x])

meta_fp = '/home/al/analysis/gisaid/metadata_2021-01-11_16-49.tsv.gz'
meta = pd.read_csv(meta_fp, sep='\t', compression='gzip')
# meta.columns

meta = pd.concat([meta, xtra])

In [39]:
meta.to_csv('/home/al/analysis/gisaid/metadata_concat_2021-01-11_16-49.tsv', index=False, sep='\t')

In [257]:
gisaid['mutation'] = gisaid['gene'] + ':' + gisaid['ref_aa'] + gisaid['codon_num'].astype(str) + gisaid['alt_aa']
gisaid = (gisaid.groupby(['date', 'country', 'division', 'location', 'pangolin_lineage', 'idx'])
                .agg(mutations=('mutation', 'unique')).reset_index())
gisaid[gisaid['pangolin_lineage']=='B.1.1.7']['mutations'].unique()

TypeError: unhashable type: 'numpy.ndarray'

In [269]:
first_detected = gisaid.loc[gisaid['pangolin_lineage']=='B.1.1.7', 'date'].min()
muts = gisaid.loc[(gisaid['pangolin_lineage']=='B.1.1.7')
             &(gisaid['date']==first_detected), 'mutations'].explode().unique()

In [271]:
muts

array(['5UTR:R81C', 'ORF1ab:S216S', 'ORF1ab:F924F', 'ORF1ab:T1001I',
       'ORF1ab:A1708D', 'ORF1ab:F1907F', 'ORF1ab:I2230T', 'ORF1ab:L4715L',
       'ORF1ab:P4804L', 'ORF1ab:T5005I', 'ORF1ab:L5304P', 'S:N501Y',
       'S:A570D', 'S:D614G', 'S:P681H', 'S:T716I', 'S:S982A', 'S:D1118H',
       'ORF8:Q27_', 'ORF8:R52I', 'ORF8:Y73C', 'N:D3L', 'N:R203K',
       'N:G204R', 'N:S235F'], dtype=object)

In [None]:
column_rename = {'Virus name': 'strain', 'Collection date': 'date'
                ,'Location'}

In [249]:
meta1 = '/home/al/analysis/gisaid/metadata_2021-01-08_18-19.tsv'
meta2 = '/home/al/analysis/gisaid/gisaid_hcov-19_2021_01_11_23.tsv'
df1 = pd.read_csv(meta1, sep='\t')
df2 = pd.read_csv(meta2, sep='\t')
df2.columns


Columns (8) have mixed types.Specify dtype option on import or set low_memory=False.



Index(['Virus name', 'Accession ID', 'Collection date', 'Location', 'Host',
       'Additional location information', 'Gender', 'Patient age',
       'Patient status', 'Passage', 'Specimen', 'Additional host information',
       'Lineage', 'Clade'],
      dtype='object')

In [4]:
gisaid_msa_fp = Path('/home/al/analysis/gisaid/sequences_2021-01-08_08-46_aligned.fasta')
gisaid_meta_fp = Path('/home/al/analysis/gisaid/metadata_2021-01-08_18-19.tsv')

In [213]:
df = pd.read_csv(input_params['b117_meta'], sep='\t')
df['Strain'].unique().shape

sample_sz = 300
df = df[df['Country']!='USA'].copy()
b117_strains = df[df['Pangolin Lineage']=='B.1.1.7'].sample(sample_sz)['Strain'].unique().tolist()
outgrp_strains = df[df['Pangolin Lineage']!='B.1.1.7'].sample(sample_sz)['Strain'].unique().tolist()
us_strains = gisaid[(gisaid['country']=='United States of America')
                   &(gisaid['pangolin_lineage']=='B.1.1.7')]['strain'].unique().tolist()
sois = us_strains+outgrp_strains+b117_strains+[input_params['patient_zero']]

In [214]:
def fetch_seqs(msa_filepath, out_fp, sample_idxs: list):
    cns = AlignIO.read(msa_filepath, 'fasta')
    # TODO: filter out samples, write to file
    my_cns = Align.MultipleSeqAlignment([rec for rec in cns if rec.id in sample_idxs])
    return AlignIO.write(my_cns, out_fp, 'fasta')

fetch_seqs(gisaid_msa_fp, 'test_data/b117_seqs_aligned.fasta', sois)

1

In [215]:
import bjorn_support as bs
bs.compute_tree('test_data/b117_seqs_aligned.fasta', num_cpus=20)

OMP: Info #270: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


'test_data/b117_seqs_aligned.fasta.treefile'

In [216]:
import onion_trees as bv
tree_fp = 'test_data/b117_seqs_aligned.fasta.treefile'
tree = bv.load_tree(tree_fp, input_params['patient_zero'])

In [227]:
dists = {n.name: tree.distance(n.name, input_params['patient_zero']) for n in tree.get_terminals()}
dists_df = (pd.DataFrame(index=dists.keys(), data=dists.values(), 
                      columns=['genetic_distance'])
         .reset_index()
         .rename(columns={'index': 'strain'}))

dists_df['strain'].unique().shape

# dists_df['genetic_distance']

dists_df = dists_df[dists_df['strain']!=input_params['patient_zero']].copy()
dists_df['group'] = 'outgroup'
dists_df.loc[dists_df['strain'].isin(b117_strains), 'group'] = 'B.1.1.7 (non-US)'
dists_df.loc[dists_df['strain'].isin(us_strains), 'group'] = 'B.1.1.7 (US)'
dists_df = dists_df.loc[~((dists_df['group']=='outgroup') & (dists_df['genetic_distance']>=0.0008))]
dists_df = pd.merge(gisaid, dists_df, on='strain').drop_duplicates(subset=['strain'])

In [231]:
dists_df['group'].value_counts()

B.1.1.7 (non-US)    300
outgroup            210
B.1.1.7 (US)         11
Name: group, dtype: int64

In [232]:
dists_df['date'] = pd.to_datetime(dists_df['date'])
dists_df = dists_df.loc[~dists_df['date'].isna()].copy()

In [234]:
dists_df['time'] = dists_df['date'].astype(str).apply(decimal_date)

In [235]:
dists_df['group'].value_counts()

B.1.1.7 (non-US)    300
outgroup            210
B.1.1.7 (US)         11
Name: group, dtype: int64

In [239]:
outgrp_model.params['time'] = 8e-4
outgrp_model.params

Intercept   -1.349371
time         0.000800
dtype: float64

In [244]:
outgrp_model.params

Intercept   -1.349371
time         0.000800
dtype: float64

In [None]:
a <- mean(y - b1 * x)

In [246]:

fig.show()

In [None]:
def b117_nt_distance(gisaid_data, b117_meta, sample_sz=250):
    nabla_symbol = u"\u2207"
    croft_meta = pd.read_csv(b117_meta, sep='\t')
    croft_meta = croft_meta[croft_meta['Country']!='USA'].copy()
    # extract B117 samples from Emma Croft's build
    b117_meta = croft_meta[croft_meta['Pangolin Lineage']=='B.1.1.7'].sample(sample_sz)
    # extract outgroup samples from Emma Croft's build
    outgrp_meta = croft_meta[croft_meta['Pangolin Lineage']!='B.1.1.7'].sample(sample_sz)
    # extract B117 US samples from GISAID
    us_b117 = gisaid_data[(gisaid_data['country']=='United States of America')
                      & (gisaid_data['pangolin_lineage']=='B.1.1.7')].copy()
    # consolidate data and analyze
    b117_data = gisaid_data[(gisaid_data['strain'].isin(b117_meta['Strain'].unique()))
                       |(gisaid_data['strain'].isin(outgrp_meta['Strain'].unique()))
                       |(gisaid_data['strain'].isin(us_b117['strain'].unique()))].copy()
    b117_data.drop_duplicates(subset=['strain', 'pos', 'alt_codon'], inplace=True)
#     b117_data = b117_data[b117_data['gene']=='S']
    dists_df = (b117_data.groupby(['strain', 'date'])
                .agg(num_nt_subs=('strain', 'count'))
                .reset_index())
    dists_df['num_nt_subs'] = dists_df['num_nt_subs'] / 29903
    dists_df = dists_df[~dists_df['date'].isna()]           
    dists_df.loc[:, 'group'] = 'outgroup'
    dists_df.loc[dists_df['strain'].isin(b117_meta['Strain'].unique()), 'group'] = 'B.1.1.7 (non-US)'
    dists_df.loc[dists_df['strain'].isin(us_b117['strain'].unique()), 'group'] = 'B.1.1.7 (US)'
    dists_df.loc[:, 'date'] = pd.to_datetime(dists_df['date'], errors='coerce')
    dists_df.loc[:, 'month'] = dists_df['date'].dt.month
    dists_df.loc[:, 'doy'] = dists_df['date'].dt.dayofyear
    dists_df.loc[:, 'time'] = dists_df['date'].astype(int)/1e12
    dists_df = dists_df.loc[~dists_df['doy'].isna()].copy()
    b117_model = ols('num_nt_subs ~ time', data=dists_df[dists_df['group']!='outgroup']).fit()
    b117_preds = dists_df[dists_df['group']!='outgroup'].copy()
    b117_preds.loc[:, 'predictions'] = b117_model.predict(b117_preds['time'])
    b117_n = int(b117_preds.shape[0] / 2)
    outgrp_model = ols('num_nt_subs ~ time', 
                       data=dists_df[dists_df['group']=='outgroup']).fit()
    outgrp_preds = dists_df[dists_df['group']=='outgroup'].copy()
    outgrp_preds.loc[:, 'predictions'] = outgrp_model.predict(outgrp_preds['time'])
    outgrp_n = int(outgrp_preds.shape[0] / 2)
    fig = go.Figure(
        data=go.Scatter(y=dists_df[dists_df['group']=='B.1.1.7 (US)']['num_nt_subs'], 
                                    x=dists_df[dists_df['group']=='B.1.1.7 (US)']['date'],
                                    name='B.1.1.7 (US)', mode='markers',
                                    text=dists_df[dists_df['group']=='B.1.1.7 (US)']['strain'],
                                    hovertemplate =
                                    'Sample: %{text}',
                                    marker_color='rgba(220,20,60,.6)'))
    fig.add_trace(
        go.Scatter(y=dists_df[dists_df['group']=='B.1.1.7 (non-US)']['num_nt_subs'], 
                   x=dists_df[dists_df['group']=='B.1.1.7 (non-US)']['date'],
                   mode='markers', marker_color='rgba(30,144,255,.6)', 
                   name='B.1.1.7 (non-US)'
                 ))
    fig.add_trace(go.Scatter(y=b117_preds['predictions'], 
                             x=b117_preds['date'], name='OLS (B.1.1.7)', 
                             mode='lines', line_color='rgba(0,0,0,1.)'))
    fig.add_annotation(x=b117_preds.iloc[b117_n]['date'], 
                       y=b117_preds.iloc[b117_n]['predictions'],
            text=f"{nabla_symbol} = {b117_model.params['time']:.3e}",
            showarrow=True,
            arrowhead=1, yshift=10, arrowsize=2, ay=-80)
    fig.add_trace(
        go.Scatter(y=dists_df[dists_df['group']=='outgroup']['num_nt_subs'], 
                   x=dists_df[dists_df['group']=='outgroup']['date'],
                   mode='markers', marker_color='rgb(211,211,211, .6)', 
                   name='outgroup'
                 ))
    fig.add_trace(go.Scatter(y=outgrp_preds['predictions'], 
                             x=outgrp_preds['date'], name='OLS (outgroup)', 
                             mode='lines', line_color='rgba(0,0,0,1.)'))
    fig.add_annotation(x=outgrp_preds.iloc[outgrp_n]['date'], 
                       y=outgrp_preds.iloc[outgrp_n]['predictions'],
            text=f"{nabla_symbol} = {outgrp_model.params['time']:.3e}",
            showarrow=True,
            arrowhead=1, yshift=10, arrowsize=2, ay=-80)
    fig.update_layout(yaxis_title='Nucleotide Substitutions per site (root-to-tip)',
                      xaxis_title='Collection Date',
                      template='plotly_white', autosize=True)#, height=850,
    return fig

In [5]:
# gisaid_subs_long, _ = bm.identify_replacements_per_sample(gisaid_msa_fp, gisaid_meta_fp,
#                                                        bm.GENE2POS, data_src='gisaid')

In [6]:
# gisaid.drop(columns=['sequence'], inplace=True)

In [7]:
# gisaid.to_csv('/home/al/analysis/gisaid/subs_long_2021-01-08.csv.gz', compression='gzip')

In [8]:
# gisaid['country'].unique()

In [69]:
gisaid.columns

Index(['Unnamed: 0', 'idx', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa', 'strain', 'virus',
       'gisaid_epi_isl', 'genbank_accession', 'date', 'region', 'country',
       'division', 'location', 'region_exposure', 'country_exposure',
       'division_exposure', 'segment', 'length', 'host', 'age', 'sex',
       'Nextstrain_clade', 'pangolin_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted', 'tmp', 'month', 'mutation'],
      dtype='object')

In [73]:
cns.get_alignment_length

<bound method MultipleSeqAlignment.get_alignment_length of <<class 'Bio.Align.MultipleSeqAlignment'> instance (330133 records of length 29903, SingleLetterAlphabet()) at 7f4482276280>>

In [81]:
# pd.read_csv(input_params['b117_meta'], sep='\t').columns

In [83]:
gisaid.loc[gisaid['country']=='USA', 'country'] = 'United States of America'

In [127]:
def b117_distance(gisaid_data, b117_meta, sample_sz=250):
    nabla_symbol = u"\u2207"
    croft_meta = pd.read_csv(b117_meta, sep='\t')
    croft_meta = croft_meta[croft_meta['Country']!='USA'].copy()
    # extract B117 samples from Emma Croft's build
    b117_meta = croft_meta[croft_meta['Pangolin Lineage']=='B.1.1.7'].sample(sample_sz)
    # extract outgroup samples from Emma Croft's build
    outgrp_meta = croft_meta[croft_meta['Pangolin Lineage']!='B.1.1.7'].sample(sample_sz)
    # extract B117 US samples from GISAID
    us_b117 = gisaid_data[(gisaid_data['country']=='United States of America')
                      & (gisaid_data['pangolin_lineage']=='B.1.1.7')].copy()
    # consolidate data and analyze
    b117_data = gisaid_data[(gisaid_data['strain'].isin(b117_meta['Strain'].unique()))
                       |(gisaid_data['strain'].isin(outgrp_meta['Strain'].unique()))
                       |(gisaid_data['strain'].isin(us_b117['strain'].unique()))].copy()
    b117_data.drop_duplicates(subset=['strain', 'pos', 'alt_codon'], inplace=True)
#     b117_data = b117_data[b117_data['gene']=='S']
    dists_df = (b117_data.groupby(['strain', 'date'])
                .agg(num_nt_subs=('strain', 'count'))
                .reset_index())
    dists_df['num_nt_subs'] = dists_df['num_nt_subs'] / 29903
    dists_df = dists_df[~dists_df['date'].isna()]           
    dists_df.loc[:, 'group'] = 'outgroup'
    dists_df.loc[dists_df['strain'].isin(b117_meta['Strain'].unique()), 'group'] = 'B.1.1.7 (non-US)'
    dists_df.loc[dists_df['strain'].isin(us_b117['strain'].unique()), 'group'] = 'B.1.1.7 (US)'
    dists_df.loc[:, 'date'] = pd.to_datetime(dists_df['date'], errors='coerce')
    dists_df.loc[:, 'month'] = dists_df['date'].dt.month
    dists_df.loc[:, 'doy'] = dists_df['date'].dt.dayofyear
    dists_df.loc[:, 'time'] = dists_df['date'].astype(int)/1e12
    dists_df = dists_df.loc[~dists_df['doy'].isna()].copy()
    b117_model = ols('num_nt_subs ~ time', data=dists_df[dists_df['group']!='outgroup']).fit()
    b117_preds = dists_df[dists_df['group']!='outgroup'].copy()
    b117_preds.loc[:, 'predictions'] = b117_model.predict(b117_preds['time'])
    b117_n = int(b117_preds.shape[0] / 2)
    outgrp_model = ols('num_nt_subs ~ time', 
                       data=dists_df[dists_df['group']=='outgroup']).fit()
    outgrp_preds = dists_df[dists_df['group']=='outgroup'].copy()
    outgrp_preds.loc[:, 'predictions'] = outgrp_model.predict(outgrp_preds['time'])
    outgrp_n = int(outgrp_preds.shape[0] / 2)
    fig = go.Figure(
        data=go.Scatter(y=dists_df[dists_df['group']=='B.1.1.7 (US)']['num_nt_subs'], 
                                    x=dists_df[dists_df['group']=='B.1.1.7 (US)']['date'],
                                    name='B.1.1.7 (US)', mode='markers',
                                    text=dists_df[dists_df['group']=='B.1.1.7 (US)']['strain'],
                                    hovertemplate =
                                    'Sample: %{text}',
                                    marker_color='rgba(220,20,60,.6)'))
    fig.add_trace(
        go.Scatter(y=dists_df[dists_df['group']=='B.1.1.7 (non-US)']['num_nt_subs'], 
                   x=dists_df[dists_df['group']=='B.1.1.7 (non-US)']['date'],
                   mode='markers', marker_color='rgba(30,144,255,.6)', 
                   name='B.1.1.7 (non-US)'
                 ))
    fig.add_trace(go.Scatter(y=b117_preds['predictions'], 
                             x=b117_preds['date'], name='OLS (B.1.1.7)', 
                             mode='lines', line_color='rgba(0,0,0,1.)'))
    fig.add_annotation(x=b117_preds.iloc[b117_n]['date'], 
                       y=b117_preds.iloc[b117_n]['predictions'],
            text=f"{nabla_symbol} = {b117_model.params['time']:.3e}",
            showarrow=True,
            arrowhead=1, yshift=10, arrowsize=2, ay=-80)
    fig.add_trace(
        go.Scatter(y=dists_df[dists_df['group']=='outgroup']['num_nt_subs'], 
                   x=dists_df[dists_df['group']=='outgroup']['date'],
                   mode='markers', marker_color='rgb(211,211,211, .6)', 
                   name='outgroup'
                 ))
    fig.add_trace(go.Scatter(y=outgrp_preds['predictions'], 
                             x=outgrp_preds['date'], name='OLS (outgroup)', 
                             mode='lines', line_color='rgba(0,0,0,1.)'))
    fig.add_annotation(x=outgrp_preds.iloc[outgrp_n]['date'], 
                       y=outgrp_preds.iloc[outgrp_n]['predictions'],
            text=f"{nabla_symbol} = {outgrp_model.params['time']:.3e}",
            showarrow=True,
            arrowhead=1, yshift=10, arrowsize=2, ay=-80)
    fig.update_layout(yaxis_title='Nucleotide Substitutions per site (root-to-tip)',
                      xaxis_title='Collection Date',
                      template='plotly_white', autosize=True)#, height=850,
    return fig

fig, _, m = b117_distance(gisaid, input_params['b117_meta'], sample_sz=150)
fig.show()

In [120]:
m.params

Intercept   -4.251383e-02
time         2.695783e-08
dtype: float64

In [117]:
m.iloc[int(m.shape[0]/2)]['predictions']

0.00018413149686254127

In [118]:
m.iloc[int(m.shape[0]/2)]['date']

Timestamp('2020-03-23 00:00:00')

In [108]:
m.loc[m['predictions']==m['predictions'].median(), 'time']

Series([], Name: time, dtype: float64)

In [9]:
gisaid = pd.read_csv(input_params['gisaid_data_fp'], compression='gzip')

In [10]:
gisaid.loc[(gisaid['pangolin_lineage']=='B.1.1.7'), 'country'].unique()

array(['United Kingdom', 'Denmark', 'Australia', 'Italy', 'Singapore',
       'Netherlands', 'Israel', 'Hong Kong', 'Ireland', 'France', 'Spain',
       'Portugal', 'Sweden', 'Finland', 'South Korea', 'Norway', 'India',
       'Canada', 'Switzerland', 'Germany', 'USA', 'Brazil', 'Luxembourg',
       'New Zealand', 'Jamaica', 'Pakistan', 'Oman', 'Lebanon'],
      dtype=object)

In [11]:
gisaid.loc[(gisaid['pangolin_lineage']=='B.1.1.7') & (gisaid['country']=='USA'), 'strain'].unique().shape

(11,)

In [12]:
gisaid.loc[(gisaid['pangolin_lineage']=='B.1.1.7'), 'strain'].unique().shape

(9102,)

In [13]:
gisaid.columns

Index(['Unnamed: 0', 'idx', 'replacements', 'pos', 'gene', 'codon_num',
       'ref_codon', 'alt_codon', 'ref_aa', 'alt_aa', 'strain', 'virus',
       'gisaid_epi_isl', 'genbank_accession', 'date', 'region', 'country',
       'division', 'location', 'region_exposure', 'country_exposure',
       'division_exposure', 'segment', 'length', 'host', 'age', 'sex',
       'Nextstrain_clade', 'pangolin_lineage', 'GISAID_clade',
       'originating_lab', 'submitting_lab', 'authors', 'url', 'title',
       'paper_url', 'date_submitted', 'tmp', 'month'],
      dtype='object')

In [14]:
gisaid['mutation'] = gisaid['gene'] + ':' + gisaid['ref_aa'] + gisaid['codon_num'].astype(str) + gisaid['alt_aa']

N:D3L - G28279C

N:S235F - C28976T

ORF1ab:T1001I - C3266T

ORF1ab:I2230T - T6953C

ORF1ab:A1708D - C5387A

ORF8:R52I - G28047T

ORF8:Q27_ - C27971T

ORF8:Y73C - G28110A

S:N501Y - A23062T

S:T716I - C23708A

S:P681H - C23603A

S:D1118H - G24913C

S:A570D - C23270A

S:S982A - T24505G

ORF1ab:deletion - 11288:11296

S:deletion - 21765:21770

S: deletion - 21991:21993

In [16]:
# gisaid.loc[(gisaid['mutation'].isin(muts)) & (gisaid['country']=='USA'), 'strain'].unique().shape

In [32]:
ans = gisaid.groupby(['idx', 'date', 'country']).agg(muts=('mutation', 'unique')).reset_index()

In [33]:
def check_mutations(x, muts: list):
    return set(muts).issubset(x)

In [63]:
muts = ['S:N501Y', 'S:T716I', 'S:P681H', 
        'S:D1118H', 'S:A570D', 'S:S982A']

In [64]:
ans['is_b117'] = ans['muts'].apply(check_mutations, args=(muts,))

In [65]:
ans.loc[ans['is_b117']==True, 'idx'].unique().shape

(8993,)

In [66]:
ans.loc[(ans['is_b117']==True) & (ans['country']=='USA'),'idx'].unique().tolist()

['USA/CA-CDC-STM-P017/2020',
 'USA/CA-CDC-STM-P019/2020',
 'USA/CA-CDC-STM-P025/2020',
 'USA/CA-CDPH-UC301/2020',
 'USA/CA-SEARCH-5574/2020',
 'USA/CO-CDPHE-2100156850/2020',
 'USA/CT-Yale-S018/2021',
 'USA/CT-Yale-S019/2021',
 'USA/FL-CDC-STM-P012/2020',
 'USA/NY-Wadsworth-291673-01/2020']

In [68]:
gisaid[gisaid['mutation']=='S:S982A']

Unnamed: 0.1,Unnamed: 0,idx,replacements,pos,gene,codon_num,ref_codon,alt_codon,ref_aa,alt_aa,...,originating_lab,submitting_lab,authors,url,title,paper_url,date_submitted,tmp,month,mutation
1471976,1486083,England/MILK-9E2FE0/2020,24505:g,24505,S,982,TCA,GCA,S,A,...,Lighthouse Lab in Milton Keynes,Wellcome Sanger Institute for the COVID-19 Gen...,The Lighthouse Lab in Milton Keynes et al,https://www.gisaid.org,?,?,2020-10-14,"['2020', '09', '21']",9.0,S:S982A
1692252,1706472,England/CAMC-A58BA4/2020,24505:g,24505,S,982,TCA,GCA,S,A,...,Lighthouse Lab in Cambridge,Wellcome Sanger Institute for the COVID-19 Gen...,Rob Howes et al,https://www.gisaid.org,?,?,2020-10-27,"['2020', '10', '08']",10.0,S:S982A
1743404,1757624,England/MILK-A06B0F/2020,24505:g,24505,S,982,TCA,GCA,S,A,...,Lighthouse Lab in Milton Keynes,Wellcome Sanger Institute for the COVID-19 Gen...,The Lighthouse Lab in Milton Keynes et al,https://www.gisaid.org,?,?,2020-10-27,"['2020', '10', '01']",10.0,S:S982A
1745762,1759982,England/MILK-A06237/2020,24505:g,24505,S,982,TCA,GCA,S,A,...,Lighthouse Lab in Milton Keynes,Wellcome Sanger Institute for the COVID-19 Gen...,The Lighthouse Lab in Milton Keynes et al,https://www.gisaid.org,?,?,2020-10-27,"['2020', '09', '30']",9.0,S:S982A
1767805,1782025,England/MILK-9E05B3/2020,24505:g,24505,S,982,TCA,GCA,S,A,...,Lighthouse Lab in Milton Keynes,Wellcome Sanger Institute for the COVID-19 Gen...,The Lighthouse Lab in Milton Keynes et al,https://www.gisaid.org,?,?,2020-10-27,"['2020', '09', '20']",9.0,S:S982A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4504119,4529631,Ireland/WX-NVRL-AIIDW008v1/2020,24505:g,24505,S,982,TCA,GCA,S,A,...,AIID,Irish Coronavirus Sequencing Consortium-Teagas...,Matthew McCabe et al,https://www.gisaid.org,?,?,2021-01-07,"['2020', '12', '21']",12.0,S:S982A
4504315,4529827,Italy/MAR-UNIVPM101-75155/2020,24505:g,24505,S,982,TCA,GCA,S,A,...,"Virology Lab, Ospedali Riuniti di Ancona","Dipartimento di Scienze Biomediche e Cliniche,...",Alessia Lai et al,https://www.gisaid.org,?,?,2021-01-07,"['2020', '12', '18']",12.0,S:S982A
4504351,4529863,Italy/MAR-UNIVPM100-77710/2020,24505:g,24505,S,982,TCA,GCA,S,A,...,"Virology Lab, Ospedali Riuniti, Ancona","Dipartimento di Scenze Biomediche e Cliniche, ...",Alessia Lai et al,https://www.gisaid.org,?,?,2021-01-07,"['2020', '12', '18']",12.0,S:S982A
4510063,4535575,USA/CT-Yale-S018/2021,24505:g,24505,S,982,TCA,GCA,S,A,...,Yale Pathology Lab,Grubaugh Lab - Yale School of Public Health,Tara Alpert et al,https://www.gisaid.org,?,?,2021-01-07,"['2021', '01', '02']",1.0,S:S982A


In [49]:
ans.loc[(ans['is_b117']==True) & (ans['country']=='USA'),'idx'].unique().tolist()

['USA/CA-CDC-STM-P017/2020',
 'USA/CA-CDC-STM-P019/2020',
 'USA/CA-CDC-STM-P025/2020',
 'USA/CA-CDPH-UC301/2020',
 'USA/CA-CDPH-UC302/2020',
 'USA/CA-SEARCH-5574/2020',
 'USA/CO-CDPHE-2100156850/2020',
 'USA/CT-Yale-S018/2021',
 'USA/CT-Yale-S019/2021',
 'USA/FL-CDC-STM-P012/2020',
 'USA/NY-Wadsworth-291673-01/2020']

In [40]:
us_b117_ids = ans.loc[(ans['is_b117']==True) & (ans['country']=='USA'),'idx'].unique().tolist()

In [41]:
cns = AlignIO.read(gisaid_msa_fp, 'fasta')

In [42]:
for rec in cns:
    print(rec.id)
    break

NC_045512.2


In [43]:
input_params['patient_zero']

'NC_045512.2'

In [46]:
my_cns = Align.MultipleSeqAlignment([rec for rec in cns if rec.id in us_b117_strains or rec.id==input_params['patient_zero']])

In [48]:
AlignIO.write(my_cns, 'us_b117.fa', 'fasta')

1

In [10]:
# pd.to_datetime(gisaid['date']).dt.day

In [11]:
# gisaid['date'] = pd.to_datetime(gisaid['date'])

In [12]:
# gisaid[~gisaid["date"].isna()]['date'].astype(int)/1e12

In [13]:
# np.arange(1, len(df) + 1)

In [14]:
results = br.generate_voc_data(feature, values, input_params)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Columns (11) have mixed types.Specify dtype option on import or set low_memory=False.


Columns (11) have mixed types.Specify dtype option on import or set low_memory=False.



In [39]:
html = br.generate_voc_html(feature, values, results)
br.save_html(html, f'test_data/voc_test_b117{vnum}.html')

In [38]:
# codon_num = 70
# gene = 'S'
# (gisaid[
# #     (gisaid['pangolin_lineage']=='B.1.1.7')
#       (gisaid['pos']==21993)
#       &(gisaid['gene']==gene)]
#  .drop_duplicates(subset=['gene', 'ref_aa', 'alt_aa']))

Unnamed: 0.1,Unnamed: 0,idx,replacements,pos,gene,codon_num,ref_codon,alt_codon,ref_aa,alt_aa,...,GISAID_clade,originating_lab,submitting_lab,authors,url,title,paper_url,date_submitted,tmp,month
103921,114302,Sweden/20-07572/2020,21993:c,21993,S,144,TAT,TAC,Y,Y,...,GH,Narhalsan Backa vardcentral,The Public Health Agency of Sweden,Mats Olsson et al,https://www.gisaid.org,?,?,2020-04-30,"['2020', '04', '21']",4.0
1330709,1343409,USA/WA-UW-12158/2020,21993:a,21993,S,144,TAT,TAA,Y,_,...,GH,UW Virology Lab,UW Virology Lab,Pavitra Roychoudhury et al,https://www.gisaid.org,?,?,2020-10-04,"['2020', '06', '26']",6.0
4087309,4112813,Georgia/Tb-72720/2020,21993:a,21993,S,144,TAT,AAA,Y,K,...,O,"Department for Virology, Molecular Biology and...","Department for Virology, Molecular Biology and...",Gvantsa Brachveli et al,https://www.gisaid.org,?,?,2020-12-31,"['2020', '09', '02']",9.0


In [None]:
N:D3L - G28279C
N:S235F - C28976T
ORF1ab:T1001I - C3266T
ORF1ab:I2230T - T6953C
ORF1ab:A1708D - C5387A
ORF8:R52I - G28047T
ORF8:Q27_ - C27971T
ORF8:Y73C - G28110A
S:N501Y - A23062T
S:T716I - C23708A
S:P681H - C23603A
S:D1118H - G24913C
S:A570D - C23270A
S:S982A - T24505G
ORF1ab:deletion - 11288:11296
S:deletion - 21765:21770
S: deletion - 21991:21993