In [1]:
import sys
import pandas as pd
import os
from path import Path
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from urllib.request import urlopen
import json
import statsmodels as sm
from statsmodels.formula.api import ols
from Bio import Seq, SeqIO, AlignIO, Phylo, Align
from jinja2 import Environment, FileSystemLoader  # html template engine
import cv2
import numpy as np
import skimage as sk
import matplotlib.pylab as plt
import datetime as dt

In [2]:
# Put the parent directory on the import path for the project-local imports in
# the next cell (presumably where bjorn_support, mutations, reports, ... live).
# Guarded so that re-running this cell does not keep appending duplicates.
if '../' not in sys.path:
    sys.path.append('../')

In [3]:
import bjorn_support as bs
import onion_trees as ot
import mutations as bm
import visualize as bv
import reports as br

# Data Priming

In [4]:
# Input files: aligned sequences (FASTA) and the matching metadata table.
msa_fp = '/home/al/analysis/gisaid/sequences_2021-01-15_12-46_aligned.fasta'
meta_fp = '/home/al/analysis/gisaid/metadata_2021-01-15_14-55.tsv.gz'
# The alignment file is named `sequences_<date>_<time>_aligned.fasta`; take the
# date token from the file name only, so an underscore anywhere in the parent
# directories cannot shift which token is picked.
date = os.path.basename(msa_fp).split('_')[1]

In [5]:
# Echo the run date and both input paths, one per line, as a sanity check.
print(date, msa_fp, meta_fp, sep='\n')

2021-01-15
/home/al/analysis/gisaid/sequences_2021-01-15_12-46_aligned.fasta
/home/al/analysis/gisaid/metadata_2021-01-15_14-55.tsv.gz


In [6]:
# subs, _ = bm.identify_replacements_per_sample(msa_fp, meta_fp,  
#                                            bm.GENE2POS, data_src='gisaid', 
#                                            test=True)
# subs.loc[subs['country']=='USA', 'country'] = 'United States of America'

In [7]:
# subs_fp = f'/home/al/analysis/gisaid/subs_long_{date}.csv.gz'
# subs.to_csv(subs_fp, index=False, compression='gzip')

In [8]:
# Identify deletions per sample from the alignment and metadata files; the call
# returns two values and only the first is kept.
# NOTE(review): test=True is passed through unchanged -- its effect is defined
# inside bm.identify_deletions_per_sample; confirm it is intended for a full run.
dels, _ = bm.identify_deletions_per_sample(
    msa_fp,
    meta_fp,
    bm.GENE2POS,
    data_src='gisaid',
    min_del_len=3,
    test=True,
)
# Rows labelled 'USA' are rewritten to the long-form country name.
dels.loc[dels['country'] == 'USA', 'country'] = 'United States of America'

Loading Alignment file at: /home/al/analysis/gisaid/sequences_2021-01-15_12-46_aligned.fasta
Initial cleaning...
Identifying deletions...
Mapping Genes to mutations...
Computing codon numbers...
Fetching reference codon...
Mapping amino acids...
Naming deletions
Fuse with metadata...


  if (await self.run_code(code, result,  async_=asy)):


In [9]:
# Save the deletions table as a gzip-compressed CSV (no index column); the file
# name carries the date parsed from the alignment file name.
dels_fp = f'/home/al/analysis/gisaid/dels_long_{date}.csv.gz'
dels.to_csv(dels_fp, compression='gzip', index=False)

# Analysis Report Generation

In [13]:
# Report selection: the column to filter on and the values of interest.
# Earlier runs used feature='mutations' and other value sets, e.g.
# ['S:S13I', 'S:W152C', 'S:L452R'] or 'ORF1ab:I4205V'.
feature = 'mutation'
values = ['S:Q677H', 'M:A85S', 'N:D377Y']

# Settings handed to the report builders in `reports` (imported as `br`); the
# meaning of each key is defined by those functions.
input_params = dict(
    vocs=['B.1.1.7', 'B.1.1.70'],
    strain='OHVUI1',
    date='01/18/2021',
    msa_fp=Path('/home/al/analysis/gisaid/sequences_2021-01-15_12-46_aligned.fasta'),
    meta_fp=Path('/home/al/code/HCoV-19-Genomics/metadata.csv'),
    tree_fp=Path('/home/al/analysis/alab_mutations_01-01-2021/alab/seqs_aligned.fa.treefile'),
    subs_fp='/home/al/analysis/alab_mutations_01-01-2021/alab_substitutions_long_01-01-2021.csv',
    countries_fp='/home/al/data/geojsons/countries.geo.json',
    states_fp='/home/al/data/geojsons/us-states.json',
    counties_fp='/home/al/data/geojsons/us-counties.json',
    patient_zero='NC_045512.2',
    gisaid_data_fp='/home/al/analysis/gisaid/subs_long_2021-01-15_14-55v2.csv.gz',
    gisaid_meta_fp='/home/al/analysis/gisaid/metadata_2021-01-15_14-55.tsv.gz',
    b117_meta='/home/al/analysis/b117/nextstrain_groups_neherlab_ncov_S.N501_metadata.tsv',
    b117_tree='test_data/b117_seqs_aligned.fasta.treefile',
    sample_sz=150,
    sampling_img_fp='/home/al/analysis/b117/figs/sars-cov-2_EM_v3.png',
)

In [14]:
# Build the report data for the selected `feature`/`values` using the paths and
# settings in `input_params`; the structure of `results` is defined by
# reports.generate_voc_data.
results = br.generate_voc_data(feature, values, input_params)

In [15]:
# Render the report data through the 'mut.html' template and write the page to
# disk. The output path is a plain literal (it has no placeholders, so it needs
# no f-prefix).
html = br.generate_voc_html(feature, values, results, template_name='mut.html')
br.save_html(html, 'test_data/ohvui1_report.html')

In [20]:
# Report selection: the column to filter on and the values of interest.
# Other runs used feature='mutations' and other value sets, e.g.
# ['S:Q677H', 'M:A85S', 'N:D377Y'] or 'ORF1ab:I4205V'.
feature = 'mutation'
values = ['S:S13I', 'S:W152C', 'S:L452R']

# Settings handed to the report builders in `reports` (imported as `br`); the
# meaning of each key is defined by those functions.
input_params = dict(
    vocs=['B.1.1.7', 'B.1.1.70'],
    strain='CAVUI1S',
    date='01/16/2021',
    msa_fp=Path('/home/al/analysis/gisaid/sequences_2021-01-15_12-46_aligned.fasta'),
    meta_fp=Path('/home/al/code/HCoV-19-Genomics/metadata.csv'),
    tree_fp=Path('/home/al/analysis/alab_mutations_01-01-2021/alab/seqs_aligned.fa.treefile'),
    subs_fp='/home/al/analysis/alab_mutations_01-01-2021/alab_substitutions_long_01-01-2021.csv',
    countries_fp='/home/al/data/geojsons/countries.geo.json',
    states_fp='/home/al/data/geojsons/us-states.json',
    counties_fp='/home/al/data/geojsons/us-counties.json',
    patient_zero='NC_045512.2',
    gisaid_data_fp='/home/al/analysis/gisaid/subs_long_2021-01-15_14-55v2.csv.gz',
    gisaid_meta_fp='/home/al/analysis/gisaid/metadata_2021-01-15_14-55.tsv.gz',
    b117_meta='/home/al/analysis/b117/nextstrain_groups_neherlab_ncov_S.N501_metadata.tsv',
    b117_tree='test_data/b117_seqs_aligned.fasta.treefile',
    sample_sz=150,
    sampling_img_fp='/home/al/analysis/b117/figs/sars-cov-2_EM_v3.png',
)

In [21]:
# Build the report data for the selected `feature`/`values` using the paths and
# settings in `input_params`; the structure of `results` is defined by
# reports.generate_voc_data.
results = br.generate_voc_data(feature, values, input_params)

In [22]:
# Render the report data through the 'mut.html' template and write the page to
# disk. The output path is a plain literal (it has no placeholders, so it needs
# no f-prefix).
html = br.generate_voc_html(feature, values, results, template_name='mut.html')
br.save_html(html, 'test_data/cavui1s_report.html')

In [19]:
# One single-mutation report per entry: output file stem -> mutation name.
runs = {'orf1a_i4205v': 'ORF1a:I4205V', 'orf1b_d1183y': 'ORF1b:D1183Y',
        's_s13i': 'S:S13I', 's_w152c': 'S:W152C', 's_l452r': 'S:L452R'}
for name, mut in runs.items():
    # Use a per-run copy with 'strain' overridden, so the shared `input_params`
    # dict is not left holding the last mutation as its 'strain' for any cell
    # executed afterwards.
    run_params = {**input_params, 'strain': mut}
    results = br.generate_voc_data(feature, [mut], run_params)
    html = br.generate_voc_html(feature, [mut], results, template_name='mut.html')
    br.save_html(html, f'test_data/{name}_report.html')

In [17]:
# Load the gzip-compressed CSV at input_params['gisaid_data_fp'] into a
# DataFrame (compression is stated explicitly rather than inferred from the
# file extension).
gisaid = pd.read_csv(input_params['gisaid_data_fp'], compression='gzip')

In [16]:
# res = (gisaid.groupby(['date', 'country', 'division', 
#                                         'purpose_of_sequencing',
#                                         'location', 'pangolin_lineage', 'strain'])
#                        .agg(mutations=('mutation', 'unique')).reset_index())
# res['is_vui'] = res['mutations'].apply(bv.is_vui, args=(set(values),))

In [12]:

def get_mutations(data: pd.DataFrame, lineage: str='B.1.1.7') -> set:
    """Return the set of distinct mutations recorded for one lineage.

    Parameters
    ----------
    data : pd.DataFrame
        Table with a 'pangolin_lineage' column and a 'mutation' column.
    lineage : str
        Value matched exactly against 'pangolin_lineage'.

    Returns
    -------
    set
        Unique 'mutation' values from the matching rows; empty when no row
        matches.
    """
    in_lineage = data['pangolin_lineage'] == lineage
    lineage_mutations = data.loc[in_lineage, 'mutation']
    return set(lineage_mutations.unique().tolist())