In [1]:
from pysam import VariantFile
from os import listdir
from os.path import isfile, join, split
from tqdm.notebook import trange

import numpy as np

import matplotlib
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns
import statistics
import pandas as pd

# Directory whose regular files are later opened one by one with pysam's VariantFile.
clinical_data_path = "clinical_variant_files"
# Directory later scanned for files whose name contains 'metadata'.
metadata_files_path = "ENA_metadata"

In [2]:
# Read the lineage table, indexed by the 'INAB sample ID' column.
# skiprows uses 0-based file line numbers, so the two lines that follow the
# header line are skipped.
lineage_meta_data = pd.read_csv(
    'SARS-CoV-2 lineage meta data.csv',
    skiprows=[1, 2],
    index_col='INAB sample ID',
)
print(lineage_meta_data.shape[0], 'known lineages have been loaded.')

3200 known lineages have been loaded.


In [3]:
# Collect every regular file in the metadata directory whose name contains
# 'metadata'. listdir() returns entries in arbitrary order, so the paths are
# sorted to make the row order of the concatenated table reproducible.
metadata_files = sorted(
    join(metadata_files_path, f)
    for f in listdir(metadata_files_path)
    if 'metadata' in f and isfile(join(metadata_files_path, f))
)
print(len(metadata_files), 'metadata files have been loaded.')

# Each file is read as tab-separated text indexed by 'sample_alias';
# anything following a '#' on a line is ignored by the parser.
metadata = pd.concat(
    [pd.read_csv(f, sep='\t', index_col='sample_alias', comment='#') for f in metadata_files]
)

# 'Unnamed: 18' is the placeholder name pandas assigns to a column without a
# header (presumably caused by a trailing tab in the files -- TODO confirm).
# errors='ignore' keeps the cell from raising KeyError when the column is
# absent, and the non-inplace form avoids mutating the concatenated frame.
metadata = metadata.drop(columns='Unnamed: 18', errors='ignore')

print(len(metadata), 'lineages have been loaded.')

20 metadata files have been loaded.
3155 lineages have been loaded.


In [4]:
# Put the lineage table and the ENA metadata side by side, aligned on their
# row index. concat's default outer join keeps rows that appear in only one
# of the two tables.
metadata_lineages = pd.concat(
    objs=[lineage_meta_data, metadata],
    axis='columns',
)

In [5]:
# Number of position columns in the presence/absence table.
# NOTE(review): presumably the length of the SARS-CoV-2 reference genome -- confirm.
GENOME_LENGTH = 29903

# listdir() order is arbitrary; sorting makes the row order of `data`
# (and of everything derived from it) reproducible between runs.
vcfs = sorted(f for f in listdir(clinical_data_path) if isfile(join(clinical_data_path, f)))
# Row label for each file: the part of the file name before the first '_'.
vcfs_prefixes = [k.split('_', 1)[0] for k in vcfs]

# One row per variant file, one boolean column per position.
data = np.zeros((len(vcfs), GENOME_LENGTH), dtype=bool)
print(len(vcfs), 'vcfs have been loaded.')

# subs[i] is the number of variant records read from vcfs[i].
subs = []
for i, file in enumerate(vcfs):
    # The context manager closes the underlying file handle as soon as the
    # file has been consumed instead of leaving it to garbage collection.
    with VariantFile(join(clinical_data_path, file)) as bcf_in:  # auto-detect input format
        count = 0
        for rec in bcf_in.fetch():
            # Flag every position in the record's [start, stop) span.
            data[i, rec.start:rec.stop] = True
            count += 1
    subs.append(count)

# Convert the boolean table to a 0/1 int8 frame labelled by file-name prefix.
data = pd.DataFrame(data=data, index=vcfs_prefixes, dtype='int8')

3698 vcfs have been loaded.


In [6]:
# Attach lineage/metadata columns to every row of the presence table.
# A left join keeps all rows of `data`, matched on the row index.
result = pd.merge(data, metadata_lineages, how='left', left_index=True, right_index=True)

# Drop only position columns that are 0 in every row. The mask is limited to
# columns that originate from `data`, so a metadata column that happens to be
# all zeros is not silently removed from the exported table.
is_position = result.columns.isin(data.columns)
msk = (result == 0).all() & is_position
result.loc[:, ~msk].to_csv('dataset.csv')

In [7]:
# fig, ax = plt.subplots()

# for item in [fig, ax]:
#     item.patch.set_visible(False)

# # define the colors
# cmap = matplotlib.colors.ListedColormap(['w', 'k'])

# # create a normalize object the describes the limits of
# # each color
# bounds = [0., 0.5, 1.]
# norm = matplotlib.colors.BoundaryNorm(bounds, cmap.N)

# ax.xaxis.set_tick_params(pad=5, labelsize=4)
# ax.yaxis.set_tick_params(pad=5, labelsize=4)

# # plot it
# ax.imshow(data[:,cols], interpolation='none', cmap=cmap, norm=norm)
# fig.savefig('test-mutations.pdf')  

In [8]:
# # this erases labels for any blank plots on the last page
# sns.set(font_scale=0.0)
# m, n = 3, 2
# datasize = len(cols) # 39 % (m*n) = 15, 24 (m*n) - 15 thus should result in 9 blank subplots on final page
# ctheme = [
#     "k", "gray", "magenta", "fuchsia", "#be03fd", "#1e488f",
#     (0.44313725490196076, 0.44313725490196076, 0.88627450980392153), "#75bbfd",
#     "teal", "lime", "g", (0.6666674, 0.6666663, 0.29078014184397138), "y",
#     "#f1da7a", "tan", "orange", "maroon", "r", ] # pick whatever colors you wish
# colors = sns.blend_palette(ctheme, datasize)
# fz = 5  # labels fontsize


# def new_page(m, n):
#     global splot_index
#     splot_index = 0
#     fig, axarr = plt.subplots(m, n, sharey="row")
#     plt.subplots_adjust(hspace=0.5, wspace=0.15)
#     arr_ij = [(x, y) for x, y in np.ndindex(axarr.shape)]
#     subplots = [axarr[index] for index in arr_ij]
#     for s, splot in enumerate(subplots):
#         splot.grid(
#             b=True,
#             which="major",
#             color="gray",
#             linestyle="-",
#             alpha=0.25,
#             zorder=1,
#             lw=0.5,
#         )
#         last_row = m * n - s < n + 1
#         first_in_row = s % n == 0
#         if last_row:
#             splot.set_xlabel("X-axis label", labelpad=8, fontsize=fz)
#         if first_in_row:
#             splot.set_ylabel("Y-axis label", labelpad=8, fontsize=fz)
#     return (fig, subplots)


# with PdfPages("auto_subplotting_colors.pdf") as pdf:
#     fig, subplots = new_page(m, n)

#     for sample in trange(datasize):
#         splot = subplots[splot_index]
#         splot_index += 1
#         splot.scatter(
#             range(len(data[:,1])),
#             data[:,cols[sample]],
#             s=5
#         )
#         splot.set_title("Sample {}".format(cols[sample]), fontsize=fz)
#         # tick fontsize & spacing
#         splot.xaxis.set_tick_params(pad=4, labelsize=6)
#         splot.yaxis.set_tick_params(pad=4, labelsize=6)

#         # make new page:
#         if splot_index == m * n:
#             pdf.savefig()
#             plt.close(fig)
#             fig, subplots = new_page(m, n)

#     if splot_index > 0:
#         pdf.savefig()
#         plt.close(fig)