In [1]:
from os import listdir
from os.path import isfile, join, split
import os
from tqdm.notebook import trange
import re

import numpy as np
import pandas as pd

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import statistics



from src.vcfs_parser import parser

# Input locations, all relative to the directory the notebook is run from.
# Directory scanned for per-run metadata files (only referenced by the disabled
# "first attempt" loader further down).
metadata_files_path = "data/ENA_metadata"
# Directory holding one or more CSV files per lineage, named "<lineage>_...".
lineages_files_path = "data/lineages"

# Directory handed to the VCF parser.
clinical_data_path = "data/clinical_variant_files"
# CSV mapping samples (indexed by 'INAB sample ID') to their lineage.
lineage_metadata_path = "data/SARS-CoV-2 lineage meta data.csv"
# Tab-separated gene annotation table for the reference genome.
genes_coordinates_path = "data/NC_045512.2_annot.gff3"

In [None]:
# Gene names (column 8 of the annotation table) of the individual NSP entries
# that are collapsed into a single ORF1a / ORF1b record below.
orf1a_sub_names = ['NSP' + str(i) for i in range(1, 11)]
orf1b_sub_names = ['NSP12a', 'NSP12b'] + ['NSP' + str(i) for i in range(13, 17)]


def _merged_orf_row(annotations, sub_names, orf_name):
    """Build one annotation row that spans every entry listed in ``sub_names``.

    Parameters
    ----------
    annotations : pandas.DataFrame
        Header-less annotation table; column 3 holds the start coordinate,
        column 4 the end coordinate and column 8 the gene name of each entry.
    sub_names : list of str
        Gene names (column 8) of the entries to merge.
    orf_name : str
        Gene name given to the merged row.

    Returns
    -------
    dict
        Row keyed by column position, running from the smallest start to the
        largest end among the selected entries.
    """
    sub_units = annotations.loc[annotations[8].isin(sub_names)]
    return {
        0: 'NC_045512.2',
        1: 'Giorgi',
        2: 'CDS',
        3: sub_units[3].min(),
        4: sub_units[4].max(),
        5: '.',
        6: '+',
        7: '.',
        8: orf_name,
        9: 'desc',
    }


genes_coordinates_raw = pd.read_csv(genes_coordinates_path, delimiter='\t', comment='#', header=None)

# DataFrame.append was removed in pandas 2.0, so both merged rows are built
# up front and added with a single pd.concat.
merged_orf_rows = pd.DataFrame([
    _merged_orf_row(genes_coordinates_raw, orf1a_sub_names, 'ORF1a'),
    _merged_orf_row(genes_coordinates_raw, orf1b_sub_names, 'ORF1b'),
])
genes_coordinates = pd.concat([genes_coordinates_raw, merged_orf_rows], ignore_index=True)

# The individual NSP entries are now represented by ORF1a / ORF1b: drop them
# and keep only the gene name (as index) with its start / end coordinates.
is_sub_unit = genes_coordinates[8].isin(orf1a_sub_names + orf1b_sub_names)
genes_coordinates = (
    genes_coordinates.loc[~is_sub_unit]
    .rename(columns={3: 'gene_start', 4: 'gene_end', 8: 'gene'})
    [['gene_start', 'gene_end', 'gene']]
    .set_index('gene')
)

print(len(genes_coordinates), 'genes coordinates have been loaded.')

In [None]:
lineage_metadata = pd.read_csv(lineage_metadata_path, index_col = 'INAB sample ID', skiprows = [1,2])
print(len(lineage_metadata), 'samples with known lineages have been loaded.')


def _clean_lineage(raw_value):
    """Normalise one raw lineage cell.

    The value is converted to text, every parenthesised annotation (together
    with the single space that may precede it) is removed and surrounding
    whitespace is stripped. Cells that end up empty, or that were missing to
    begin with (their text form is 'nan'), are returned as NaN.
    """
    label = re.sub(r" ?\([^)]+\)", "", str(raw_value)).strip()
    if label in ('nan', ''):
        return np.nan
    return label


lineage_metadata['lineage'] = lineage_metadata['lineage'].apply(_clean_lineage)

In [None]:
# First attempt (disabled): uncomment these 4 lines to load the metadata from the separate per-run files instead
# metadata_files = [join(metadata_files_path, f) for f in listdir(metadata_files_path) if isfile(join(metadata_files_path, f)) if 'metadata' in f]
# metadata = pd.concat([pd.read_csv(f, sep = '\t', index_col = 'sample_alias', comment = '#') for f in metadata_files])
# metadata.drop(axis = 1, labels = 'Unnamed: 18', inplace = True)
# print(len(metadata_files), 'metadata files with', len(metadata), 'lineages have been loaded.')

# Second attempt - load the metadata from the downloaded xml file after its conversion using the xmp_to_csv python script
# The 'Title' column becomes the index; the next cell aligns it with the sample IDs of `lineage_metadata`.
metadata = pd.read_csv('data/biosample_result.csv', index_col = 'Title')
print(len(metadata), 'metadata samples have been loaded.')

In [None]:
# Put the lineage columns and the biosample columns side by side. With axis=1
# the rows are aligned on the index labels ('INAB sample ID' on one side,
# 'Title' on the other); labels present in only one frame get NaN in the
# columns of the other.
# NOTE(review): this overwrites `metadata` with a frame built from itself, so
# running the cell a second time without re-running the loading cell above
# duplicates the lineage columns.
metadata = pd.concat([lineage_metadata, metadata], axis = 1)

In [None]:
# Distinct lineages observed in the sample metadata (missing values excluded).
lineages = [lineage for lineage in metadata['lineage'].unique().tolist() if pd.notna(lineage)]

# Uncomment to download the files
# for idx, lineage in enumerate(lineages):
#     os.system('python scripts/scraper.py --lineage={}'.format(lineage))
# print(idx+1, 'files have been downloaded')

# Keep only the files named "<lineage>_..." for a lineage present in the metadata.
lineage_prefixes = tuple(lineage + '_' for lineage in lineages)
lineages_files = [
    join(lineages_files_path, f)
    for f in listdir(lineages_files_path)
    if isfile(join(lineages_files_path, f)) and f.startswith(lineage_prefixes)
]

lineages_data = pd.concat([
    pd.read_csv(f, usecols = ['lineage','gene', 'ref_aa', 'alt_aa', 'codon_num', 'codon_end'])
    for f in lineages_files
])

# Missing end codons are stored as the text "None". The cleaned column is
# assigned back explicitly: calling replace(..., inplace=True) on a selected
# column is not guaranteed to reach the parent frame (it is a no-op under
# pandas Copy-on-Write), which would leave "None" in place for to_numeric.
lineages_data['codon_end'] = pd.to_numeric(lineages_data['codon_end'].replace({"None": np.nan}))

# Attach gene_start / gene_end to every mutation; each gene may match at most
# one row of genes_coordinates.
lineages_data = pd.merge(lineages_data, genes_coordinates, how='left', on = 'gene', validate = 'many_to_one')

# Calculate mutation start-end coordinates
# Codon k is mapped to the three positions gene_start + 3*k - 3 .. gene_start + 3*k - 1.
lineages_data['mut_start'] = (lineages_data['codon_num'] * 3) + lineages_data['gene_start'] - 3

# A mutation without an explicit end codon ends in the codon it starts in.
last_codon = lineages_data['codon_end'].fillna(lineages_data['codon_num'])
lineages_data['mut_end'] = ((last_codon * 3) + lineages_data['gene_start'] - 1).astype('int64')

In [None]:
# Hand the clinical variant directory, the combined sample metadata and the
# lineage mutation table to the project's VCF parser.
p = parser(clinical_data_path, metadata, lineages_data)
# NOTE(review): presumably encodes the parsed variants as a sample x mutation
# indicator matrix - confirm against src/vcfs_parser.
p.convert_to_bin()
# The parser exposes its result through the `data` attribute.
data = p.data

In [None]:
# Attach the sample metadata to the parsed data by index label; the left join
# keeps every row of `data`, whether or not metadata exists for it.
result = pd.merge(
    left=data,
    right=metadata,
    how='left',
    left_index=True,
    right_index=True,
)
# Flags the columns containing only zeros; currently only referenced by the
# disabled filtering line below and the disabled plotting cell.
msk = result.eq(0).all()
# result = result.loc[:,~msk].copy()
result.to_csv('data/dataset.csv')

In [None]:
# fig, ax = plt.subplots()

# for item in [fig, ax]:
#     item.patch.set_visible(False)

# # define the colors
# cmap = matplotlib.colors.ListedColormap(['w', 'k', 'r'])

# # create a Normalize object that describes the limits of
# # each color
# bounds = [0., 0.5, 1.1, 2.1]
# norm = matplotlib.colors.BoundaryNorm(bounds, cmap.N)

# fig.colorbar(matplotlib.cm.ScalarMappable(norm=norm, cmap=cmap), ax=ax)

# ax.xaxis.set_tick_params(pad=5, labelsize=4)
# ax.yaxis.set_tick_params(pad=5, labelsize=4)

# # plot it
# ax.imshow(data.loc[:,~msk].astype(float), interpolation='none', cmap=cmap, norm=norm, )
# fig.savefig('../new-test-mutations.pdf')  

In [None]:
# # this erases labels for any blank plots on the last page
# sns.set(font_scale=0.0)
# m, n = 3, 2
# datasize = len(cols) # 39 % (m*n) = 15, 24 (m*n) - 15 thus should result in 9 blank subplots on final page
# ctheme = [
#     "k", "gray", "magenta", "fuchsia", "#be03fd", "#1e488f",
#     (0.44313725490196076, 0.44313725490196076, 0.88627450980392153), "#75bbfd",
#     "teal", "lime", "g", (0.6666674, 0.6666663, 0.29078014184397138), "y",
#     "#f1da7a", "tan", "orange", "maroon", "r", ] # pick whatever colors you wish
# colors = sns.blend_palette(ctheme, datasize)
# fz = 5  # labels fontsize


# def new_page(m, n):
#     global splot_index
#     splot_index = 0
#     fig, axarr = plt.subplots(m, n, sharey="row")
#     plt.subplots_adjust(hspace=0.5, wspace=0.15)
#     arr_ij = [(x, y) for x, y in np.ndindex(axarr.shape)]
#     subplots = [axarr[index] for index in arr_ij]
#     for s, splot in enumerate(subplots):
#         splot.grid(
#             b=True,
#             which="major",
#             color="gray",
#             linestyle="-",
#             alpha=0.25,
#             zorder=1,
#             lw=0.5,
#         )
#         last_row = m * n - s < n + 1
#         first_in_row = s % n == 0
#         if last_row:
#             splot.set_xlabel("X-axis label", labelpad=8, fontsize=fz)
#         if first_in_row:
#             splot.set_ylabel("Y-axis label", labelpad=8, fontsize=fz)
#     return (fig, subplots)


# with PdfPages("auto_subplotting_colors.pdf") as pdf:
#     fig, subplots = new_page(m, n)

#     for sample in trange(datasize):
#         splot = subplots[splot_index]
#         splot_index += 1
#         splot.scatter(
#             range(len(data[:,1])),
#             data[:,cols[sample]],
#             s=5
#         )
#         splot.set_title("Sample {}".format(cols[sample]), fontsize=fz)
#         # tick fontsize & spacing
#         splot.xaxis.set_tick_params(pad=4, labelsize=6)
#         splot.yaxis.set_tick_params(pad=4, labelsize=6)

#         # make new page:
#         if splot_index == m * n:
#             pdf.savefig()
#             plt.close(fig)
#             fig, subplots = new_page(m, n)

#     if splot_index > 0:
#         pdf.savefig()
#         plt.close(fig)