# Demonstrating the embedding of new biosamples into the coordinate space.
Based on a 733-sample x 3.5+ million DHS presence/absence matrix.

# 1. Import libraries & load datasets

In [1]:
# Record which interpreter produced the outputs shown in this notebook.
from platform import python_version

interpreter_version = python_version()
print(interpreter_version)

3.6.4


In [2]:
import sys
import numpy as np
import pandas as pd
import gzip
import matplotlib.pyplot as plt
from sklearn.decomposition import NMF, non_negative_factorization

In [3]:
# Extend the module search path with the parent directory before importing the
# project-local modules below (presumably OONMFhelpers/OONMF live there — confirm).
sys.path.append('..')
import OONMFhelpers
import OONMF

#### Fetch the 733-biosample presence/absence Index
This will take at least a few minutes.
These data can be obtained from here: https://drive.google.com/open?id=1Nel7wWOWhWn40Yv7eaQFwvpMcQHBNtJ2

In [4]:
# Read the header-less presence/absence Index and transpose it; the resulting
# shape is displayed as the cell output.
index_path = '../data/dat_bin_FDR01_hg38.txt.gz'
A = pd.read_table(index_path, header=None).transpose()
A.shape

(733, 3591898)

#### Fetch the pancreatic (new) data, mapped to the 733-biosample Index
This is done using an external script (see `../scripts/get_new_data.sh`) that calls `bedmap` to determine whether hotspot2 0.1% FDR variable-width peak calls for the new-to-be-added datasets overlap the 3.5M+ DHS Index elements.
It then pastes these results together to form a 3591898 x N matrix, where N is the number of new-to-be-added datasets.

In [5]:
# Read the header-less pancreatic matrix and transpose it so that it has the
# same orientation as A; the resulting shape is displayed as the cell output.
pancreatic_path = '../data/dat_pancreatic.txt.gz'
dat_panc = pd.read_table(pancreatic_path, header=None).transpose()
dat_panc.shape

(13, 3591898)

#### Combine the two datasets

In [6]:
# Stack the new pancreatic rows underneath the original Index rows
# (row-wise concatenation; both inputs share the same columns).
A_panc = np.concatenate((A, dat_panc), axis=0)
A_panc.shape

(746, 3591898)

#### Load in NMF decomposition based on 733 biosamples

In [7]:
# Load the precomputed NMF matrix from its gzip-compressed .npy file.
# The context manager closes the gzip handle as soon as the array has been
# read, instead of leaving an anonymous open file object behind.
with gzip.GzipFile('../data/2018-06-08NC16_NNDSVD_Mixture.npy.gz', "r") as basis_file:
    B = np.load(basis_file)
B.shape

(16, 3591898)

# 2. Solve the model for a fixed matrix H
(H represents the DHS-wise decomposition of the original 733 biosamples.)

In [8]:
# Solve for W only: H is supplied as the existing decomposition B and is not
# updated (update_H=False), so the combined samples are embedded in the same
# component space.
nmf_settings = dict(
    n_components=16,
    init='custom',
    random_state=3,
    update_H=False,
    H=B,
    verbose=1,
)
W, H, n_iter = non_negative_factorization(A_panc, **nmf_settings)

violation: 1.0
violation: 0.4929621744510417
violation: 0.1853323330745837
violation: 0.08331321927403668
violation: 0.043617396007525136
violation: 0.025852198932186866
violation: 0.016842854453874376
violation: 0.011298137150839724
violation: 0.008033248578312117
violation: 0.005779695162327012
violation: 0.004171513997047539
violation: 0.0030354465412899888
violation: 0.0021863463495587158
violation: 0.0015979261519840889
violation: 0.0011930990429652511
violation: 0.0008902162131922194
violation: 0.0006741234513121173
violation: 0.000516968860628029
violation: 0.0004006243327516972
violation: 0.0003120934212809746
violation: 0.0002445470343324248
violation: 0.0001899730978228717
violation: 0.0001464879477680397
violation: 0.00011381632473706886
violation: 8.892023343986581e-05
Converged at iteration 25


#### Save the results to disk for later use

In [9]:
# Write W to a gzip-compressed .npy file. Using a context manager guarantees
# the gzip stream is flushed and closed even if np.save raises part-way.
# NOTE(review): the file name says "seed20" while the factorization above was
# run with random_state=3 — confirm the intended naming.
with gzip.GzipFile('../data/2018-11-09PancSpecialNC16seed20.npy.gz', "w") as f:
    np.save(f, W)