In [1]:
from plotly.figure_factory import create_dendrogram
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
import os, pickle

from lib import built, plot_data,utils

# Load data

In [2]:
# Load the pickled run arguments; `args.preprocess_dir` is used by the cells below.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# A context manager is used so the file handle is closed deterministically.
with open('arguments.pickle', 'rb') as args_file:
    args = pickle.load(args_file)

In [3]:
# Read clade_kmean.csv from the preprocessing directory and show its dimensions.
clade_csv_path = os.path.join(args.preprocess_dir, 'clade_kmean.csv')
clade_df = pd.read_csv(clade_csv_path)
clade_df.shape

(3406, 9)

In [4]:
# Parse the collection dates into datetime64 values.
# Plain column assignment is used instead of `.loc[:, col] = ...`: since
# pandas 2.0 a `.loc[:, col]` assignment writes into the existing
# object-dtype column in place, so the dtype would stay `object` and the
# `.dt` accessor used later in this notebook would raise.
clade_df['Collection date'] = pd.to_datetime(clade_df['Collection date'])
clade_df.head()

Unnamed: 0,Accession ID,Clade,Lineage,Collection date,Year,Month,Year-Month,WHO name,k_mean
0,406973,L,B.1.36.10,2020-01-23,2020,1,2020-01,Others,4
1,407987,L,B,2020-01-25,2020,1,2020-01,Others,4
2,407988,O,B,2020-02-01,2020,2,2020-02,Others,4
3,410535,S,A,2020-02-03,2020,2,2020-02,Others,4
4,410536,O,B,2020-02-06,2020,2,2020-02,Others,4


In [5]:
# Add a 'Day' column holding the zero-padded day of month ('%d') as a string.
collection_dates = clade_df['Collection date']
clade_df['Day'] = collection_dates.dt.strftime('%d')

In [6]:
# Preview the first rows to confirm the new 'Day' column is present.
clade_df.head()

Unnamed: 0,Accession ID,Clade,Lineage,Collection date,Year,Month,Year-Month,WHO name,k_mean,Day
0,406973,L,B.1.36.10,2020-01-23,2020,1,2020-01,Others,4,23
1,407987,L,B,2020-01-25,2020,1,2020-01,Others,4,25
2,407988,O,B,2020-02-01,2020,2,2020-02,Others,4,1
3,410535,S,A,2020-02-03,2020,2,2020-02,Others,4,3
4,410536,O,B,2020-02-06,2020,2,2020-02,Others,4,6


## Load X

In [7]:
# Load the pickled X table from the preprocessing directory.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# A context manager is used so the file handle is closed deterministically.
data_pickle_path = os.path.join(args.preprocess_dir, "data_X_df.pickle")
with open(data_pickle_path, "rb") as data_file:
    data = pickle.load(data_file)

In [8]:
# X is a second name for the loaded object (no copy is made here).
X = data

In [9]:
# Show the dimensions of X (rows, columns).
X.shape

(3406, 54)

In [10]:
# Encode symbols as integers: A/T/G/C -> 1..4, and each of the other
# symbols listed below -> 0.  Entries that match none of the keys are
# left unchanged.
# NOTE(review): presumably X holds one nucleotide character per cell, and
# the zero-coded symbols are gaps / IUPAC ambiguity codes -- confirm.
symbol_codes = {
    'A': 1, 'T': 2, 'G': 3, 'C': 4,
    'N': 0, '-': 0, 'Y': 0, 'R': 0,
    'S': 0, 'K': 0, 'M': 0, 'W': 0,
}
# One np.where pass per symbol, applied in the insertion order of the dict;
# the first pass already turns X into a NumPy array.
for symbol, code in symbol_codes.items():
    X = np.where(X == symbol, code, X)

In [11]:
# Number of rows in the encoded array.
len(X)

3406

# Sampling data for the dendrogram

In [12]:
# Load the pickled row selection used to sample / reorder the data.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
# A context manager is used so the file handle is closed deterministically.
rand_pos_path = os.path.join(args.preprocess_dir, "rand_pos.pickle")
with open(rand_pos_path, "rb") as rand_pos_file:
    rand_pos = pickle.load(rand_pos_file)

In [13]:
# Apply the same row selection to the metadata table and to X, then record
# the accession ids in the selected order.
# NOTE(review): `.loc` selects by index label while `X[rand_pos]` selects by
# position; the two agree only while clade_df still has its default 0..n-1
# index -- confirm.
clade_df = clade_df.loc[rand_pos]
X = X[rand_pos]
X_id_list = clade_df['Accession ID'].tolist()

In [14]:
# Order the metadata by accession id, then renumber the rows.  reset_index()
# keeps the previous index as a new leading 'index' column.
# NOTE(review): only clade_df is re-sorted here; X and X_id_list keep the
# rand_pos order, so rows may no longer line up positionally -- confirm that
# downstream code matches them by id.
clade_df = clade_df.sort_values(['Accession ID']).reset_index()

In [15]:
# Compare the row counts of X, the id list and the metadata table.
X.shape, len(X_id_list), clade_df.shape

((3406, 54), 3406, (3406, 11))

In [16]:
# Display the sorted metadata table, including the 'index' column added by reset_index().
clade_df

Unnamed: 0,index,Accession ID,Clade,Lineage,Collection date,Year,Month,Year-Month,WHO name,k_mean,Day
0,0,406973,L,B.1.36.10,2020-01-23,2020,1,2020-01,Others,4,23
1,1,407987,L,B,2020-01-25,2020,1,2020-01,Others,4,25
2,2,407988,O,B,2020-02-01,2020,2,2020-02,Others,4,01
3,3,410535,S,A,2020-02-03,2020,2,2020-02,Others,4,03
4,4,410536,O,B,2020-02-06,2020,2,2020-02,Others,4,06
...,...,...,...,...,...,...,...,...,...,...,...
3401,3401,2820284,G,B.1.617.2,2021-06-04,2021,6,2021-06,Delta,1,04
3402,3402,2820285,G,B.1.617.2,2021-06-03,2021,6,2021-06,Delta,6,03
3403,3403,2820286,G,B.1.617.2,2021-06-02,2021,6,2021-06,Delta,1,02
3404,3404,2820287,GRY,B.1.1.7,2021-06-01,2021,6,2021-06,Alpha,5,01


In [17]:
# Rename the columns to snake_case.  An explicit old -> new mapping is used
# instead of assigning a positional list, so a change in column order cannot
# silently attach the wrong label to a column; errors='raise' makes a missing
# source column fail loudly here instead of further downstream.
clade_df = clade_df.rename(
    columns={
        'Accession ID': 'id',
        'Clade': 'clade',
        'Lineage': 'lineage',
        'Collection date': 'date',
        'Year': 'year',
        'Month': 'month',
        'Year-Month': 'year_month',
        'WHO name': 'who_name',
        'Day': 'day',
    },
    errors='raise',
)

In [18]:
# Persist the renamed metadata table for the dendrogram step.
# A context manager guarantees the file is flushed and closed after writing.
clade_dendro_path = os.path.join(args.preprocess_dir, 'clade_dendro.pickle')
with open(clade_dendro_path, 'wb') as clade_dendro_file:
    pickle.dump(clade_df, clade_dendro_file)

In [19]:
# Persist the encoded, sampled X array for the dendrogram step.
# A context manager guarantees the file is flushed and closed after writing.
x_dendro_path = os.path.join(args.preprocess_dir, 'X_dendro.pickle')
with open(x_dendro_path, 'wb') as x_dendro_file:
    pickle.dump(X, x_dendro_file)

In [20]:
# Persist the accession ids (in the same order as X) for the dendrogram step.
# A context manager guarantees the file is flushed and closed after writing.
x_id_list_path = os.path.join(args.preprocess_dir, 'X_id_list_dendro.pickle')
with open(x_id_list_path, 'wb') as x_id_list_file:
    pickle.dump(X_id_list, x_id_list_file)

# Dendrogram

In [21]:
# Read the protein-change summary, then keep only the rows whose id is in
# the sampled id list and only the four columns listed below.
summary_csv_path = os.path.join(args.preprocess_dir, 'summary_change_protein_df.csv')
summary_change_protein_df = pd.read_csv(summary_csv_path)
is_sampled_id = summary_change_protein_df['id'].isin(X_id_list)
summary_change_protein_df = summary_change_protein_df.loc[
    is_sampled_id, ['id', 'change_protein', 'gene', 'check']
]

In [22]:
# Display the filtered protein-change table.
summary_change_protein_df

Unnamed: 0,id,change_protein,gene,check
0,407987,D209H,M,False
1,407988,L37F,NSP6,False
2,410535,L84S,ORF8,False
3,410535,S202N,N,False
4,410536,G251V,ORF3a,False
...,...,...,...,...
77886,2887795,A394V,NSP14,False
77887,2887795,G671S,NSP12,False
77888,2887795,P323L,NSP12,False
77889,2887795,F120-,ORF8,False


In [23]:
# Read snps_max_df.csv from the preprocessing directory and preview the first rows.
snps_csv_path = os.path.join(args.preprocess_dir, 'snps_max_df.csv')
snps_max_df = pd.read_csv(snps_csv_path)
snps_max_df.head()

Unnamed: 0,id,query,position,sbjct,start_end,length,change
0,1034260,C,241,T,18_29891,29874,C241T
1,1034260,C,27286,N,18_29891,29874,C27286N
2,1034260,T,27285,N,18_29891,29874,T27285N
3,1034260,A,27284,N,18_29891,29874,A27284N
4,1034260,A,27283,N,18_29891,29874,A27283N


In [24]:
# Build the dendrogram figure for the 'k_mean' column.
# fig_dendro returns three values; only the first is kept
# (presumably the figure object -- confirm against lib.plot_data).
(fig_dendro_kmean, _, _) = plot_data.fig_dendro(
    args.preprocess_dir,
    'k_mean',
)

In [26]:
# fig_dendro_kmean

In [27]:
# Build the dendrogram figure for the 'who_name' column.
# fig_dendro returns three values; only the first is kept
# (presumably the figure object -- confirm against lib.plot_data).
(fig_dendro_who, _, _) = plot_data.fig_dendro(
    args.preprocess_dir,
    'who_name',
)

In [29]:
# fig_dendro_who

In [30]:
# fig_dendro_kmean.write_image("../pic/fig_dendro_kmean.png", scale=3)
# fig_dendro_who.write_image("../pic/fig_dendro_who.png", scale=3)