In [27]:
!nvidia-smi

Fri Sep  9 16:13:38 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  RTX A6000           Off  | 00000000:00:05.0 Off |                  Off |
| 30%   38C    P8    23W / 300W |  10036MiB / 48685MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
import cudf
import cuml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#!pip install plotly
#import plotly.graph_objs as go
import requests
#import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import time
from cuml.decomposition import PCA as PCA_CUDA
from cuml import UMAP as UMAP_CUDA
from cuml import IncrementalPCA as IncPCA_CUDA
import cupy as cp
from cuml.preprocessing import MaxAbsScaler
import numba
from cuml.preprocessing import OneHotEncoder
from cuml import TSNE as TSNE_CUDA
#from plotly.offline import init_notebook_mode, iplot
#init_notebook_mode(connected=True) #plotly
#from tsne_torch import TorchTSNE 
#import h5py
from sklearn.decomposition import PCA as PCA_CPU
from sklearn.manifold import TSNE as TSNE_CPU
#from umap import UMAP as UMAP_CPU
from sklearn.preprocessing import OneHotEncoder as CPU_ohe

In [4]:
class GenomicVisualizerDataPrep():
    
    
    def __init__(self, file_path):
        start = time.time()
        self.file_paths = file_path
        self.df_genetic_data = cudf.read_hdf(path_or_buf=file_path, key="Genotypes_variants_per_sample")
        self.df_phenotype_data = cudf.read_hdf(path_or_buf=file_path, key="phenotype_data")
        self.X = None
        self.performance_dict = {}
        self.components = 0
        self.columns = []
        self.performance_dict["HDF5_Read_GPU"] = time.time() - start        
        
        
    def one_hot_encode(self):
        start = time.time()
        self.X = OneHotEncoder().fit_transform(self.df_genetic_data)
        self.X = self.X.toarray().astype(cp.float64)
        self.performance_dict["One_Hot_Encoding_GPU"] = time.time() - start 
        

class CPU_GenomicVisualizerDataPrep():
    
    def __init__(self, file_path):
        start = time.time()
        self.df_genetic_data = pd.read_hdf(path_or_buf=file_path, key="Genotypes_variants_per_sample")
        self.df_phenotype_data = pd.read_hdf(path_or_buf=file_path, key="phenotype_data")
        self.X = None
        self.performance_dict = {}
        self.components = 0
        self.columns = []
        self.performance_dict["HDF5_Read_CPU"] = time.time() - start
        
        
    def one_hot_encode(self):
        start = time.time()
        self.X = CPU_ohe().fit_transform(self.df_genetic_data.values).toarray().astype(np.float64)  
        self.performance_dict["One_Hot_Encoding_CPU"] = time.time() - start


In [5]:
!pip install tables

Collecting tables
  Downloading tables-3.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.9/5.9 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting numexpr>=2.6.2
  Downloading numexpr-2.8.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (381 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.5/381.5 kB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: numexpr, tables
Successfully installed numexpr-2.8.3 tables-3.7.0
[0m

In [20]:
genomic_visualizer = GenomicVisualizerDataPrep("AdaptMap-Goat-Project.h5")

In [21]:
print(genomic_visualizer.performance_dict)

{'HDF5_Read_GPU': 12.901121854782104}


In [12]:
genomic_visualizer.one_hot_encode()

In [13]:
print(genomic_visualizer.performance_dict)

{'HDF5_Read_GPU': 13.290412425994873, 'One_Hot_Encoding_GPU': 309.4436366558075}


In [15]:
genomic_visualizer_cpu = CPU_GenomicVisualizerDataPrep("AdaptMap-Goat-Project.h5")

In [16]:
print(genomic_visualizer_cpu.performance_dict)

{'HDF5_Read_CPU': 0.9782798290252686}


In [18]:
genomic_visualizer_cpu.one_hot_encode()

In [19]:
print(genomic_visualizer_cpu.performance_dict)

{'HDF5_Read_CPU': 0.9782798290252686, 'One_Hot_Encoding_CPU': 33.14959168434143}


In [23]:
start = time.time()
genomic_visualizer.df_genetic_data = genomic_visualizer.df_genetic_data.to_pandas
print(f"Transfer of dataframe from GPU to CPU: {time.time()-start}")

Transfer of dataframe from GPU to CPU: 0.00011777877807617188


In [25]:
start = time.time()
genomic_visualizer.X = cp.asnumpy(genomic_visualizer.X).astype(np.float32)
print(f"Transfer of array from GPU to CPU: {time.time()-start}")

Transfer of array from GPU to CPU: 0.00011205673217773438


In [26]:
print(type(genomic_visualizer.X))

<class 'numpy.ndarray'>
