In [2]:
# Report the GPU model, driver/CUDA versions and current GPU memory usage before any data is loaded.
!nvidia-smi

Fri Sep 16 12:02:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    Off  | 00000000:00:05.0 Off |                  Off |
| 30%   41C    P8    20W / 230W |   1525MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install umap-learn

[0m

In [3]:
#import cudf
import cuml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import time
from cuml.decomposition import PCA as PCA_CUDA
from cuml import UMAP as UMAP_CUDA
from cuml import IncrementalPCA as IncPCA_CUDA
import cupy as cp
from cuml.preprocessing import MaxAbsScaler
import numba
from cuml.preprocessing import OneHotEncoder
from cuml import TSNE as TSNE_CUDA
from sklearn.decomposition import PCA as PCA_CPU
from sklearn.manifold import TSNE as TSNE_CPU
from umap import UMAP as UMAP_CPU
from sklearn.preprocessing import OneHotEncoder as CPU_ohe
from sklearn.decomposition import IncrementalPCA as IncPCA

In [4]:
import gc
#import plotly.express as px

In [5]:
# Runtime registry: dataset name -> device ("GPU" / "CPU") -> {algorithm name: elapsed seconds}.
# Every dataset starts with its own empty GPU and CPU dictionaries.
runtimes = {
    dataset_name: {device: {} for device in ("GPU", "CPU")}
    for dataset_name in ("1000Genomes", "AdaptMAP")
}

In [6]:
class GenomicVisualizer():
    """Reduce one-hot encoded genotype data to 2 or 3 dimensions on the GPU.

    The constructor reads two tables from an HDF5 file: the genotype table
    (key ``Genotypes_variants_per_sample``) and the phenotype table (key
    ``phenotype_data``).  ``one_hot_encode`` turns the genotype values into a
    float32 CuPy array, ``reduce_dimension`` projects that array with cuML
    PCA / t-SNE / UMAP and stores the result in ``df_reduced``, and
    ``garbage_collect`` moves the arrays back to host memory.
    """

    # Above this many encoded feature columns `pca_cuda` uses IncrementalPCA instead of PCA.
    PCA_MAX_FEATURES = 3000
    # Output dimensionalities accepted by `reduce_dimension`.
    ALLOWED_COMPONENTS = (2, 3)

    def __init__(self, file_path):
        """Load the genotype and phenotype tables from the HDF5 file at ``file_path``."""
        # True while the encoded matrix lives only in host memory (see `garbage_collect`).
        self.memory_cleared = False
        self.df_genetic_data = pd.read_hdf(path_or_buf=file_path, key="Genotypes_variants_per_sample")
        self.df_phenotype_data = pd.read_hdf(path_or_buf=file_path, key="phenotype_data")
        # One-hot encoded genotypes as a CuPy array (None until encoded or after being cleared).
        self.X = None
        # Algorithm name -> elapsed seconds of the most recent run of that algorithm.
        self.performance_dict = {}
        self.components = 0
        self.columns = []
        # Host copies written by `garbage_collect`.
        self.X_CPU = None
        self.X_transformed_GPU = None
        self.X_transformed_CPU = None
        # Latest embedding as a DataFrame (component columns plus the optional label column).
        self.df_reduced = None
        self.label = None

    def one_hot_encode(self):
        """Make sure the one-hot encoded genotype matrix is on the GPU in ``self.X``.

        * Already on the GPU: print a notice and do nothing.
        * Cleared by ``garbage_collect``: re-upload the host copy ``self.X_CPU``.
        * Otherwise: one-hot encode ``self.df_genetic_data.values`` with the
          scikit-learn encoder and upload the dense float32 result.
        """
        if self.X is not None:
            print("The values of the genotype data frame self.df_genetic_data.values are already encoded")
            return
        if self.memory_cleared and self.X_CPU is not None:
            self.X = cp.asarray(self.X_CPU)
        else:
            # Encoding directly as float32 avoids building a dense float64
            # matrix that would be downcast immediately afterwards.
            encoded = CPU_ohe(dtype=np.float32).fit_transform(self.df_genetic_data.values)
            self.X = cp.asarray(encoded.toarray())
        self.memory_cleared = False

    def reduce_dimension(self, algorithm='PCA_CUDA', n_components=3, label=None):
        """Project the encoded genotypes and store the result in ``self.df_reduced``.

        Args:
            algorithm: ``'PCA_CUDA'``, ``'TSNE_CUDA'`` or ``'UMAP_CUDA'``.
            n_components: output dimensionality, 2 or 3.
            label: phenotype column to attach to the embedding, or ``None``
                to return the component columns only.

        Returns:
            The reduced ``DataFrame`` (also available as ``self.df_reduced``),
            or ``None`` when an argument is rejected; a message is printed in
            that case and no instance state is modified.
        """
        # NOTE(review): the cuML TSNE documentation states that only
        # n_components=2 is supported -- confirm before requesting 3 with TSNE_CUDA.
        reducers = {
            'PCA_CUDA': self.pca_cuda,
            'TSNE_CUDA': self.tsne_cuda,
            'UMAP_CUDA': self.umap_cuda,
        }
        # Validate everything up front so a rejected call leaves the object untouched.
        if label is not None and label not in self.df_phenotype_data.columns:
            print(f"{label} is not a phenotype. Check the columns of df_phenotype_data from the class GenomicVisualizer()")
            return None
        if n_components not in self.ALLOWED_COMPONENTS:
            print("The only number of n_components / dimensions allowed are 2 or 3")
            return None
        if algorithm not in reducers:
            print(f"{algorithm} is not a supported algorithm. Choose one of: {', '.join(reducers)}")
            return None

        self.label = label
        if self.X is None:
            # Encodes on first use, or restores the host copy after `garbage_collect`.
            self.one_hot_encode()
        reducers[algorithm](n_components=n_components, algorithm=algorithm)
        return self.df_reduced

    def _run_reducer(self, estimator_cls, n_components, algorithm):
        """Fit ``estimator_cls(n_components=...)`` on ``self.X``, time it and rebuild ``df_reduced``."""
        self.components = n_components
        self.columns = [f'component{i}' for i in range(1, n_components + 1)]
        start_time = time.perf_counter()
        self.X_transformed_GPU = estimator_cls(n_components=n_components).fit_transform(self.X)
        self.performance_dict[algorithm] = time.perf_counter() - start_time
        self.create_reduced_dataframe()

    def pca_cuda(self, n_components, algorithm):
        """Run cuML PCA, or IncrementalPCA when ``self.X`` has more than ``PCA_MAX_FEATURES`` columns.

        The elapsed seconds are stored in ``self.performance_dict[algorithm]``.
        """
        estimator_cls = PCA_CUDA if self.X.shape[1] <= self.PCA_MAX_FEATURES else IncPCA_CUDA
        self._run_reducer(estimator_cls, n_components, algorithm)

    def tsne_cuda(self, n_components, algorithm):
        """Run cuML t-SNE on ``self.X`` and record its runtime under ``algorithm``."""
        self._run_reducer(TSNE_CUDA, n_components, algorithm)

    def umap_cuda(self, n_components, algorithm):
        """Run cuML UMAP on ``self.X`` and record its runtime under ``algorithm``."""
        self._run_reducer(UMAP_CUDA, n_components, algorithm)

    def create_reduced_dataframe(self):
        """Build ``self.df_reduced`` from the latest GPU embedding.

        The embedding is copied to the host with ``.get()`` and indexed like
        ``self.df_genetic_data``.  When a label is set, that phenotype column
        is merged in on the index; ``merge`` defaults to an inner join, so
        rows without a phenotype entry are dropped.
        """
        self.df_reduced = pd.DataFrame(self.X_transformed_GPU.get(),
                                       index=self.df_genetic_data.index,
                                       columns=self.columns)
        if self.label is not None:
            self.df_reduced = self.df_reduced.merge(self.df_phenotype_data[self.label],
                                                    left_index=True, right_index=True)

    def garbage_collect(self):
        """Move the encoded matrix and latest embedding to host memory and drop the GPU references.

        Safe to call repeatedly and before any reduction has been run: arrays
        that are not on the GPU are skipped.  The attributes are reset to
        ``None`` rather than deleted, so later ``self.X is None`` checks keep
        working.
        """
        if self.X is not None:
            self.X_CPU = cp.asnumpy(self.X).astype(np.float32, copy=False)
            # Only flag the matrix as cleared when a host copy exists to restore from.
            self.memory_cleared = True
        if self.X_transformed_GPU is not None:
            self.X_transformed_CPU = cp.asnumpy(self.X_transformed_GPU).astype(np.float32, copy=False)
        self.X = None
        self.X_transformed_GPU = None
        gc.collect()
        # Ask CuPy's default pool to release its unused blocks; memory held
        # by other allocators is not affected by this call.
        cp.get_default_memory_pool().free_all_blocks()

    def generate_figure_image(self, save=False):
        """Placeholder for figure generation; currently does nothing and returns ``None``."""
        pass


In [7]:
!pip install tables
#!pip install -U kaleido

Collecting tables
  Downloading tables-3.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting numexpr>=2.6.2
  Downloading numexpr-2.8.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (381 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.5/381.5 kB[0m [31m74.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: numexpr, tables
Successfully installed numexpr-2.8.3 tables-3.7.0
[0m

In [8]:
# Build a visualizer for the AdaptMap dataset; the constructor reads the
# genotype and phenotype tables from this HDF5 file.
genomic_visualizer = GenomicVisualizer("AdaptMap-Goat-Project.h5")

In [9]:
# One-hot encode the genotype values and upload them to the GPU as float32 (stored in genomic_visualizer.X).
genomic_visualizer.one_hot_encode()

In [10]:
# Check GPU memory usage now that the encoded matrix has been uploaded.
!nvidia-smi

Fri Sep 16 12:03:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A5000    Off  | 00000000:00:05.0 Off |                  Off |
| 30%   48C    P2    64W / 230W |   5296MiB / 24564MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [12]:
# Project the AdaptMap genotypes to 2 components with GPU PCA, labelled by breed.
# The embedding is stored on the visualizer, so read it from `df_reduced`
# instead of relying on the call's return value.
genomic_visualizer.reduce_dimension(algorithm='PCA_CUDA', n_components=2, label="Breeds")
df_reduced_dimensions = genomic_visualizer.df_reduced

In [23]:
# Show the current reduced DataFrame (component columns plus the phenotype label column).
display(genomic_visualizer.df_reduced)

Unnamed: 0_level_0,component1,component2,Breeds
iid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ET_ABR0001,29.230209,-5.742365,Abergelle
ET_ABR0002,26.097027,-5.110254,Abergelle
ET_ABR0003,29.976587,-5.457181,Abergelle
ET_ABR0004,27.303400,-5.334871,Abergelle
ET_ABR0005,27.061237,-0.924010,Abergelle
...,...,...,...
ET_WYG0042,49.261333,-8.354962,Woyito Guji
ET_WYG0043,48.686619,-10.296255,Woyito Guji
ET_WYG0044,46.896328,-13.967939,Woyito Guji
ET_WYG0045,51.091370,-13.467413,Woyito Guji


In [14]:
# Check GPU memory usage after the reduction.
!nvidia-smi

Thu Sep 15 14:53:34 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   38C    P0    68W / 500W |  10063MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [15]:
#genomic_visualizer.df_reduced.to_csv("IncPCA_adaptmap.csv")

In [22]:
#np.save("Encoded Genotype Data", genomic_visualizer.X.get())

In [15]:
# Copy the encoded matrix and the latest embedding to host memory, drop the GPU
# references, then check how much GPU memory is still in use.
genomic_visualizer.garbage_collect()
!nvidia-smi

Thu Sep 15 14:54:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   39C    P0    77W / 500W |   2013MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
# Project the genotypes to 2 components with GPU t-SNE, labelled by breed.
# (The two commented-out calls below are earlier variants without a label.)
#df_reduced_dimensions = genomic_visualizer.reduce_dimension(algorithm='PCA_CUDA', n_components=2)
#df_reduced_dimensions = genomic_visualizer.reduce_dimension(algorithm='TSNE_CUDA', n_components=2)
genomic_visualizer.reduce_dimension(algorithm='TSNE_CUDA', n_components=2, label="Breeds")

In [12]:
# Show the current reduced DataFrame (component columns plus the phenotype label column).
display(genomic_visualizer.df_reduced)

Unnamed: 0_level_0,component1,component2,Breeds
iid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ET_ABR0001,42.045727,26.956886,Abergelle
ET_ABR0002,45.466545,23.352505,Abergelle
ET_ABR0003,39.130085,26.954721,Abergelle
ET_ABR0004,42.875393,26.871880,Abergelle
ET_ABR0005,38.830101,21.693975,Abergelle
...,...,...,...
ET_WYG0042,27.106670,29.790258,Woyito Guji
ET_WYG0043,29.572624,28.359650,Woyito Guji
ET_WYG0044,28.590889,30.493013,Woyito Guji
ET_WYG0045,29.639254,31.097387,Woyito Guji


In [21]:
# Check GPU memory usage after the reduction.
!nvidia-smi

Thu Sep 15 14:56:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   39C    P0    69W / 500W |   4761MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [13]:
# Write the current reduced DataFrame (index, components and label) to CSV.
genomic_visualizer.df_reduced.to_csv("TSNE_GPU_adaptmap.csv")

In [25]:
# Move the arrays back to host memory, drop the GPU references and re-check GPU memory usage.
genomic_visualizer.garbage_collect()
!nvidia-smi

Thu Sep 15 14:58:43 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   39C    P0    70W / 500W |   2079MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [26]:
# Project the genotypes to 2 components with GPU UMAP, labelled by breed.
genomic_visualizer.reduce_dimension(algorithm='UMAP_CUDA', n_components=2, label="Breeds")

In [27]:
# Show the current reduced DataFrame (component columns plus the phenotype label column).
display(genomic_visualizer.df_reduced)

Unnamed: 0_level_0,component1,component2,Breeds
iid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ET_ABR0001,-14.081036,-0.519868,Abergelle
ET_ABR0002,-14.108814,-0.538992,Abergelle
ET_ABR0003,-14.070383,-0.557611,Abergelle
ET_ABR0004,-14.107050,-0.545480,Abergelle
ET_ABR0005,-14.121336,-0.549765,Abergelle
...,...,...,...
ET_WYG0042,-5.932293,-5.257528,Woyito Guji
ET_WYG0043,-5.960082,-5.342309,Woyito Guji
ET_WYG0044,-5.958420,-5.390850,Woyito Guji
ET_WYG0045,-5.982540,-5.471337,Woyito Guji


In [46]:
# Check GPU memory usage after the reduction.
!nvidia-smi

Thu Sep 15 15:46:13 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   40C    P0    68W / 500W |   2245MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [29]:
# Write the current reduced DataFrame (index, components and label) to CSV.
genomic_visualizer.df_reduced.to_csv("UMAP_GPU_adaptmap.csv")

In [38]:
# Record the AdaptMap GPU timings. Store a copy rather than the visualizer's own
# dictionary, so later reduce_dimension() calls cannot silently rewrite the
# figures already recorded here.
runtimes["AdaptMAP"]["GPU"] = dict(genomic_visualizer.performance_dict)
runtimes

{'1000Genomes': {'GPU': {'PCA_CUDA': 15.263635396957397,
   'TSNE_CUDA': 0.9569859504699707,
   'UMAP_CUDA': 1.5256545543670654},
  'CPU': {}},
 'AdaptMAP': {'GPU': {'PCA_CUDA': 15.263635396957397,
   'TSNE_CUDA': 0.9569859504699707,
   'UMAP_CUDA': 1.5256545543670654},
  'CPU': {}}}

In [41]:
# Move the arrays back to host memory, drop the GPU references and re-check GPU memory usage.
genomic_visualizer.garbage_collect()
!nvidia-smi

Thu Sep 15 15:42:22 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   40C    P0    71W / 500W |   2171MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [42]:
# Switch to the 1000 Genomes dataset: load its HDF5 tables and one-hot encode the genotypes onto the GPU.
# NOTE: this rebinds `genomic_visualizer`, so the AdaptMap instance is no longer reachable through that name.
genomic_visualizer = GenomicVisualizer("1000_genomes_project.h5")
genomic_visualizer.one_hot_encode()

In [44]:
# Peek at the phenotype table to see which columns can be passed as `label`.
genomic_visualizer.df_phenotype_data.head()

Unnamed: 0_level_0,Superpopulation code,Superpopulation name
Sample name,Unnamed: 1_level_1,Unnamed: 2_level_1
HG00105,EUR,European Ancestry
HG00112,EUR,European Ancestry
HG00117,EUR,European Ancestry
HG00124,EUR,European Ancestry
HG00129,EUR,European Ancestry


In [47]:
# Project the genotypes to 2 components with GPU PCA, labelled by superpopulation name.
genomic_visualizer.reduce_dimension(algorithm='PCA_CUDA', n_components=2, label="Superpopulation name")

In [48]:
# Preview the first rows of the current reduced DataFrame.
genomic_visualizer.df_reduced.head()

Unnamed: 0,component1,component2,Superpopulation name
HG00096,-1.552997,1.867352,European Ancestry
HG00097,-1.730807,1.883513,European Ancestry
HG00099,-1.972736,2.060777,European Ancestry
HG00100,-2.437779,2.292182,European Ancestry
HG00101,-2.180666,2.179227,European Ancestry


In [49]:
# Check GPU memory usage after the reduction.
!nvidia-smi

Thu Sep 15 15:47:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   39C    P0    68W / 500W |   2245MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [50]:
# Write the current reduced DataFrame (index, components and label) to CSV.
genomic_visualizer.df_reduced.to_csv("PCA_GPU_1000genomes.csv")

In [51]:
# Project the genotypes to 2 components with GPU t-SNE, labelled by superpopulation name.
genomic_visualizer.reduce_dimension(algorithm='TSNE_CUDA', n_components=2, label="Superpopulation name")

In [52]:
# Preview the first rows of the current reduced DataFrame.
genomic_visualizer.df_reduced.head()

Unnamed: 0,component1,component2,Superpopulation name
HG00096,-19.458225,33.937195,European Ancestry
HG00097,2.362793,61.716125,European Ancestry
HG00099,13.721524,47.530949,European Ancestry
HG00100,-12.949867,60.266125,European Ancestry
HG00101,-24.531895,33.519676,European Ancestry


In [53]:
# Write the current reduced DataFrame (index, components and label) to CSV.
genomic_visualizer.df_reduced.to_csv("TSNE_GPU_1000genomes.csv")

In [55]:
# Project the genotypes to 2 components with GPU UMAP, labelled by superpopulation name.
genomic_visualizer.reduce_dimension(algorithm='UMAP_CUDA', n_components=2, label="Superpopulation name")

In [56]:
# Preview the first rows of the current reduced DataFrame.
genomic_visualizer.df_reduced.head()

Unnamed: 0,component1,component2,Superpopulation name
HG00096,-0.315826,6.784998,European Ancestry
HG00097,0.126371,7.973272,European Ancestry
HG00099,0.411083,7.316635,European Ancestry
HG00100,-0.461454,8.295925,European Ancestry
HG00101,-0.857195,6.983219,European Ancestry


In [57]:
# Write the current reduced DataFrame (index, components and label) to CSV.
genomic_visualizer.df_reduced.to_csv("UMAP_GPU_1000genomes.csv")

In [58]:
# Record the 1000 Genomes GPU timings. Store a copy rather than the visualizer's
# own dictionary, so later reduce_dimension() calls cannot silently rewrite the
# figures already recorded here.
runtimes["1000Genomes"]["GPU"] = dict(genomic_visualizer.performance_dict)
runtimes

{'1000Genomes': {'GPU': {'PCA_CUDA': 0.01383209228515625,
   'TSNE_CUDA': 0.510061502456665,
   'UMAP_CUDA': 0.04385709762573242},
  'CPU': {}},
 'AdaptMAP': {'GPU': {'PCA_CUDA': 15.263635396957397,
   'TSNE_CUDA': 0.9569859504699707,
   'UMAP_CUDA': 1.5256545543670654},
  'CPU': {'PCA': 196.69504523277283}}}

In [59]:
# Drop the CPU entry so that only GPU timings are exported.
# The default makes the cell safe to re-run once the key is already gone.
runtimes["AdaptMAP"].pop("CPU", None)
runtimes

{'1000Genomes': {'GPU': {'PCA_CUDA': 0.01383209228515625,
   'TSNE_CUDA': 0.510061502456665,
   'UMAP_CUDA': 0.04385709762573242},
  'CPU': {}},
 'AdaptMAP': {'GPU': {'PCA_CUDA': 15.263635396957397,
   'TSNE_CUDA': 0.9569859504699707,
   'UMAP_CUDA': 1.5256545543670654}}}

In [60]:
# Drop the CPU entry so that only GPU timings are exported.
# The default makes the cell safe to re-run once the key is already gone.
runtimes["1000Genomes"].pop("CPU", None)
runtimes

{'1000Genomes': {'GPU': {'PCA_CUDA': 0.01383209228515625,
   'TSNE_CUDA': 0.510061502456665,
   'UMAP_CUDA': 0.04385709762573242}},
 'AdaptMAP': {'GPU': {'PCA_CUDA': 15.263635396957397,
   'TSNE_CUDA': 0.9569859504699707,
   'UMAP_CUDA': 1.5256545543670654}}}

In [61]:
import json

# Serialise the collected runtimes and write them to disk as a single JSON document.
serialised_runtimes = json.dumps(runtimes)
with open('runtimes_GPU_.json', 'w') as json_out:
    json_out.write(serialised_runtimes)

In [62]:
# Check GPU memory usage before the final clean-up.
!nvidia-smi

Thu Sep 15 16:00:20 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   39C    P0    67W / 500W |   2253MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [63]:
# Move the arrays back to host memory, drop the GPU references and re-check GPU memory usage.
genomic_visualizer.garbage_collect()
!nvidia-smi

Thu Sep 15 16:00:43 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.73.05    Driver Version: 510.73.05    CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A100-SXM...  Off  | 00000000:00:05.0 Off |                    0 |
| N/A   39C    P0    68W / 500W |   2201MiB / 81920MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces