# Demonstrating the embedding of new biosamples into the coordinate space.
Based on the 733-biosample x 3.5+ million DHS presence/absence matrix.

# 1. Import libraries & load datasets

In [1]:
# Print the version of the Python interpreter running this notebook, so the
# environment the results were produced in is recorded alongside them.
from platform import python_version
print(python_version())

3.6.4


In [2]:
import sys
import numpy as np
import pandas as pd
import gzip
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, non_negative_factorization

In [3]:
# Add the parent directory to the module search path before importing the
# project modules below (presumably OONMF/OONMFhelpers live one level up from
# this notebook -- confirm against the repository layout).
sys.path.append('..')
import OONMFhelpers
import OONMF

#### Fetch the 733-biosample presence/absence Index
This will take at least a few minutes.
These data can be obtained from here: https://doi.org/10.5281/zenodo.3752359

In [4]:
# Read the header-less, tab-separated presence/absence table and transpose it
# so that each row is a biosample and each column is a DHS Index element.
A = pd.read_csv(
    '../data/dat_bin_FDR01_hg38.txt.gz',
    sep='\t',
    header=None,
).transpose()
A.shape

(733, 3591898)

#### Fetch the new-to-be-added immune data, mapped to the 733-biosample Index
This is done using an external script (see `../scripts/get_new_peak_data.sh`) that calls `bedmap` to determine whether hotspot2 0.1% FDR variable-width peak calls for the new-to-be-added datasets overlap the 3.5M+ DHS Index elements.
It then pastes these results together to form a 3591898 x N matrix, where N is the number of new-to-be-added datasets.
Alternatively, it is possible, but less straightforward, to include data this way without explicitly having peak calls. This is demonstrated elsewhere (see `../scripts/get_new_signal_data.sh`).

In [7]:
# Read the header-less, tab-separated table of new datasets mapped onto the
# DHS Index, then transpose it so that rows are the new datasets and columns
# are Index elements (same orientation as A).
dat_immune = pd.read_csv(
    '../data/dat_immunopedia.txt.gz',
    sep='\t',
    header=None,
).transpose()
dat_immune.shape

(38, 3591898)

#### Combine the two datasets

In [8]:
# Stack the new datasets underneath the original biosamples (row-wise), giving
# a single samples x DHS NumPy array covering both sets.
A_immune = np.concatenate((A, dat_immune), axis=0)
A_immune.shape

(771, 3591898)

#### Load in NMF decomposition based on 733 biosamples
These data can also be obtained from here: https://doi.org/10.5281/zenodo.3752359

In [9]:
# Load the previously computed NMF matrix (components x DHSs) from the
# gzip-compressed .npy file. The context manager closes the gzip handle as
# soon as the array has been read; the original inline GzipFile(...) was
# never closed.
with gzip.GzipFile('../data/2018-06-08NC16_NNDSVD_Mixture.npy.gz', "r") as fh:
    B = np.load(fh)
B.shape

(16, 3591898)

# 2. Solve the model for a fixed matrix H
(H represents the DHS-wise decomposition of the original 733 biosamples.)

In [10]:
# Solve for W only: H is supplied as the pre-computed matrix B and held fixed
# (update_H=False), so the combined samples are expressed in terms of the
# existing 16 components. verbose=1 prints the per-iteration progress.
W, H, n_iter = non_negative_factorization(
    A_immune,
    H=B,
    update_H=False,
    init='custom',
    n_components=16,
    random_state=3,
    verbose=1,
)

violation: 1.0
violation: 0.49078091705465937
violation: 0.18456859657810026
violation: 0.08228335564482
violation: 0.042609737196511795
violation: 0.025079357702181126
violation: 0.016286164046059516
violation: 0.010908663213629759
violation: 0.007755537427980669
violation: 0.005585662753447762
violation: 0.004038211289276709
violation: 0.002935039120549785
violation: 0.00210858647902221
violation: 0.0015394798110338018
violation: 0.0011504768972838112
violation: 0.0008592764207627129
violation: 0.0006511876399618211
violation: 0.0005004296659783482
violation: 0.00038875881594693535
violation: 0.0003028347850570793
violation: 0.00023704594233261848
violation: 0.0001841462970127696
violation: 0.00014212936372233226
violation: 0.00011053179547323599
violation: 8.64181689412054e-05
Converged at iteration 25


#### Save the results to disk for later use

In [11]:
# Write W to a gzip-compressed .npy file. Using a context manager guarantees
# the gzip stream is flushed and closed even if np.save raises, instead of
# relying on a manual f.close() that would be skipped on error.
with gzip.GzipFile('../data/2020-04-14ImmuneSpecialNC16seed20.npy.gz', "w") as f:
    np.save(f, W)