In [1]:
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import time
import torch
import sys
sys.path.append('../scripts/src')
from multihead_admixture import AdmixtureMultiHead
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mutual_info_score, rand_score, adjusted_rand_score, fowlkes_mallows_score, adjusted_mutual_info_score, v_measure_score, homogeneity_completeness_v_measure

In [2]:
def get_model_preds(model, data, only_time=False, use_gpu=True):
    if use_gpu:
        assert torch.cuda.is_available(), 'GPU not available. Set use_gpu to False.'
    model.to(torch.device('cuda:0' if use_gpu else 'cpu'))
    if not only_time:
        outs = torch.tensor([])
    t0 = time.time()
    model.eval()
    with torch.no_grad():
        for i, (X, _) in enumerate(model._batch_generator(data, 200, shuffle=False)):
            X = X.to(torch.device('cuda:0' if use_gpu else 'cpu'))
            out = model(X, True)
            if not only_time:
                outs = torch.cat((outs, out[0].detach().cpu()), axis=0)
    te = time.time()
    print(te-t0, 'seconds ellapsed in {}'.format('GPU' if use_gpu else 'CPU'))
    return time.time()-t0 if only_time else outs.cpu().numpy()

In [3]:
def output_metrics(gt, preds_class, preds_neur):
    assert len(gt) == len(preds_class) == len(preds_neur), 'GT and predictions do not have same number of samples'
    class_assign = np.argmax(preds_class, axis=1)
    neur_assign = np.argmax(preds_neur, axis=1)
    gt_oh = OneHotEncoder().fit_transform(gt.reshape(-1,1))
    print('--------- METRICS (FUZZY) ---------')
    print('Mean Squared Second Order Difference (MSSOD)')
    print('Classical', ((preds_class@preds_class.T-gt_oh@gt_oh.T)**2).mean())
    print('Neural', ((preds_neur@preds_neur.T-gt_oh@gt_oh.T)**2).mean())
    print('---------------------------')
    print('--------- METRICS (HARD) ---------')
    print('--------------------------------------------------')
    print('|Will use argmax of predictions to compute scores|')
    print('--------------------------------------------------')
    print('Fowlkes Mallows Score')
    print('Classical', fowlkes_mallows_score(gt, class_assign))
    print('Neural', fowlkes_mallows_score(gt, neur_assign))
    print('---------------------------')
    print('V-Measure')
    print('Classical', v_measure_score(gt, class_assign))
    print('Neural', v_measure_score(gt, neur_assign))
    print('---------------------------')
    print('Adjusted mutual information score')
    print('Classical', adjusted_mutual_info_score(gt, class_assign))
    print('Neural', adjusted_mutual_info_score(gt, neur_assign))
    print('---------------------------')

In [4]:
tr_file_0 = h5py.File('/mnt/gpid08/users/albert.dominguez/data/chr22/windowed/train_found_avg.h5', 'r')
val_file_0 = h5py.File('/mnt/gpid08/users/albert.dominguez/data/chr22/windowed/valid_found_avg.h5', 'r')
tr_file_0_2 = h5py.File('/mnt/gpid08/users/albert.dominguez/data/chr22/windowed/train_2gen_avg.h5', 'r')
val_file_0_2 = h5py.File('/mnt/gpid08/users/albert.dominguez/data/chr22/windowed/valid_2gen_avg.h5', 'r')

trX_0, trY_0 = tr_file_0['snps'], tr_file_0['populations']
valX_0, valY_0 = val_file_0['snps'], val_file_0['populations']
trX_0_2, trY_0_2 = tr_file_0_2['snps'], tr_file_0_2['populations']
valX_0_2, valY_0_2 = val_file_0_2['snps'], val_file_0_2['populations']

## Chr22 - Unsupervised, Gens 0 and 2 (207b03a28f894906912a5397acd3ffd4)

In [14]:
model_path = '/mnt/gpid08/users/albert.dominguez/weights/chr22/207b03a28f894906912a5397acd3ffd4.pt'
model = AdmixtureMultiHead([7], 317408, batch_norm=True,
                           batch_norm_hidden=False, dropout=0, pooling=1, linear=1).to(torch.device('cuda:0'))
model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda:0')), strict=False)

<All keys matched successfully>

In [15]:
Qs_class = pd.read_csv('/home/usuaris/imatge/albert.dominguez/gpid08/classical/gen0_2_chr22.7.Q',sep=' ', names=np.array(range(7))).to_numpy()
Qs_class_valid = pd.read_csv('/home/usuaris/imatge/albert.dominguez/gpid08/classical/gen0_2_chr22_valid.7.Q',sep=' ', names=np.array(range(7))).to_numpy()

In [16]:
assert Qs_class.shape[0] == trX_0_2.shape[0], 'Not same data'
assert Qs_class_valid.shape[0] == valX_0_2.shape[0], 'Not same data'
Qs_neur = get_model_preds(model, trX_0_2)
Qs_neur_valid = get_model_preds(model, valX_0_2)

13.044774293899536 seconds ellapsed in GPU
2.2285995483398438 seconds ellapsed in GPU


In [17]:
print('TRAIN')
output_metrics(trY_0_2[:], Qs_class, Qs_neur)

TRAIN
--------- METRICS (FUZZY) ---------
Mean Squared Second Order Difference (MSSOD)
Classical 15.011621800366477
Neural 1.77631949832968
---------------------------
--------- METRICS (HARD) ---------
--------------------------------------------------
|Will use argmax of predictions to compute scores|
--------------------------------------------------
Fowlkes Mallows Score
Classical 0.8640602350942466
Neural 0.9342663309158413
---------------------------
V-Measure
Classical 0.8999268699146612
Neural 0.9252862151148682
---------------------------
Adjusted mutual information score
Classical 0.8997227527818956
Neural 0.9251401371275143
---------------------------


In [18]:
print('VALID')
output_metrics(valY_0_2[:], Qs_class_valid, Qs_neur_valid)

VALID
--------- METRICS (FUZZY) ---------
Mean Squared Second Order Difference (MSSOD)
Classical 2.9632264020637216
Neural 0.6219446060366675
---------------------------
--------- METRICS (HARD) ---------
--------------------------------------------------
|Will use argmax of predictions to compute scores|
--------------------------------------------------
Fowlkes Mallows Score
Classical 0.8291473850220182
Neural 0.8864775389428563
---------------------------
V-Measure
Classical 0.8463014084748981
Neural 0.8821732393340607
---------------------------
Adjusted mutual information score
Classical 0.8442532851510034
Neural 0.8806709026796845
---------------------------


## Chr22 - Unsupervised Gen 0 (376b06410f424d2d91516cb3f52d61ba)

In [11]:
model_path = '/mnt/gpid08/users/albert.dominguez/weights/chr22/376b06410f424d2d91516cb3f52d61ba.pt'
model = AdmixtureMultiHead([7], 317408, batch_norm=True,
                           batch_norm_hidden=False, dropout=0, pooling=1, linear=1).to(torch.device('cuda:0'))
model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda:0')), strict=True)

<All keys matched successfully>

In [12]:
Qs_class = pd.read_csv('/home/usuaris/imatge/albert.dominguez/ADMIXTURE/gen0_chr22_train.7.Q',sep=' ', names=np.array(range(7))).to_numpy()
Qs_class_valid = pd.read_csv('/home/usuaris/imatge/albert.dominguez/ADMIXTURE/gen0_chr22_valid.7.Q',sep=' ', names=np.array(range(7))).to_numpy()

In [13]:
assert Qs_class.shape[0] == trX_0.shape[0], 'Not same training data'
assert Qs_class_valid.shape[0] == valX_0.shape[0], 'Not same validation data'
Qs_neur = get_model_preds(model, trX_0)
Qs_neur_valid = get_model_preds(model, valX_0)

5.6363115310668945 seconds ellapsed in GPU
1.1841380596160889 seconds ellapsed in GPU


In [24]:
print('TRAIN')
output_metrics(trY_0[:], Qs_class, Qs_neur)

TRAIN
--------- METRICS (FUZZY) ---------
Mean Squared Second Order Difference (MSSOD)
Classical 6.707298310219567
Neural 2.437341182741671
---------------------------
--------- METRICS (HARD) ---------
--------------------------------------------------
|Will use argmax of predictions to compute scores|
--------------------------------------------------
Fowlkes Mallows Score
Classical 0.8716743608620509
Neural 0.9269361397359324
---------------------------
V-Measure
Classical 0.8267612512619568
Neural 0.8800007015886288
---------------------------
Adjusted mutual information score
Classical 0.8258333889614003
Neural 0.879443724896991
---------------------------


In [25]:
print('VALID')
output_metrics(valY_0[:], Qs_class_valid, Qs_neur_valid)

VALID
--------- METRICS (FUZZY) ---------
Mean Squared Second Order Difference (MSSOD)
Classical 1.551405133282568
Neural 0.6677955358847198
---------------------------
--------- METRICS (HARD) ---------
--------------------------------------------------
|Will use argmax of predictions to compute scores|
--------------------------------------------------
Fowlkes Mallows Score
Classical 0.8361400491655274
Neural 0.9115387626490555
---------------------------
V-Measure
Classical 0.7947127155466417
Neural 0.8710457569604863
---------------------------
Adjusted mutual information score
Classical 0.7893895576547093
Neural 0.8681440098071712
---------------------------


In [10]:
print('VALID')
_ = get_model_preds(model, valX_0, use_gpu=True, only_time=True)
_ = get_model_preds(model, valX_0, use_gpu=False, only_time=True)

VALID
1.0049283504486084 seconds ellapsed in GPU
4.798365831375122 seconds ellapsed in CPU


## Chr22 - ADMIXTURE reusage, Gen 0 (0b5468c41e714147844e45a0813865d7)

In [6]:
model_path = '/mnt/gpid08/users/albert.dominguez/weights/chr22/0b5468c41e714147844e45a0813865d7.pt'
model = AdmixtureMultiHead([7], 317408, batch_norm=True,
                           batch_norm_hidden=False, dropout=0, pooling=1, linear=1).to(torch.device('cuda:0'))
model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda:0')), strict=False)

<All keys matched successfully>

In [35]:
Qs_class = pd.read_csv('/home/usuaris/imatge/albert.dominguez/ADMIXTURE/gen0_chr22_train.7.Q',sep=' ', names=np.array(range(7))).to_numpy()
Qs_class_valid = pd.read_csv('/home/usuaris/imatge/albert.dominguez/ADMIXTURE/gen0_chr22_valid.7.Q',sep=' ', names=np.array(range(7))).to_numpy()

In [36]:
assert Qs_class.shape[0] == trX_0.shape[0], 'Not same training data'
assert Qs_class_valid.shape[0] == valX_0.shape[0], 'Not same validation data'
Qs_neur = get_model_preds(model, trX_0)
Qs_neur_valid = get_model_preds(model, valX_0)

5.588572978973389 seconds ellapsed
1.133047103881836 seconds ellapsed


In [37]:
print('TRAIN')
output_metrics(trY_0[:], Qs_class, Qs_neur)

TRAIN
--------- METRICS (FUZZY) ---------
Mean Squared Second Order Difference (MSSOD)
Classical 6.707298310219567
Neural 6.06827458116481
---------------------------
--------- METRICS (HARD) ---------
--------------------------------------------------
|Will use argmax of predictions to compute scores|
--------------------------------------------------
Fowlkes Mallows Score
Classical 0.8716743608620509
Neural 0.8420178574500671
---------------------------
V-Measure
Classical 0.8267612512619568
Neural 0.7927823813066228
---------------------------
Adjusted mutual information score
Classical 0.8258333889614003
Neural 0.7916737947827464
---------------------------


In [38]:
print('VALID')
output_metrics(valY_0[:], Qs_class_valid, Qs_neur_valid)

VALID
--------- METRICS (FUZZY) ---------
Mean Squared Second Order Difference (MSSOD)
Classical 1.551405133282568
Neural 1.5426310529053393
---------------------------
--------- METRICS (HARD) ---------
--------------------------------------------------
|Will use argmax of predictions to compute scores|
--------------------------------------------------
Fowlkes Mallows Score
Classical 0.8361400491655274
Neural 0.8283815180725387
---------------------------
V-Measure
Classical 0.7947127155466417
Neural 0.78709299888713
---------------------------
Adjusted mutual information score
Classical 0.7893895576547093
Neural 0.7815571388679269
---------------------------


In [7]:
print('VALID')
_ = get_model_preds(model, valX_0, use_gpu=True, only_time=True)
_ = get_model_preds(model, valX_0, use_gpu=False, only_time=True)

VALID
1.0814101696014404 seconds ellapsed in GPU
4.681434392929077 seconds ellapsed in CPU


## Chr22 - Supervised, Gen 0 (e28a10422a9e4b62af8572bc54363851)

### Metrics

In [45]:
model_path = '/mnt/gpid08/users/albert.dominguez/weights/chr22/e28a10422a9e4b62af8572bc54363851.pt'
model = AdmixtureMultiHead([7], 317408, batch_norm=True,
                           batch_norm_hidden=False, dropout=0, pooling=1, linear=1).to(torch.device('cuda:0'))
model.load_state_dict(torch.load(model_path, map_location=torch.device('cuda:0')), strict=False)

<All keys matched successfully>

In [46]:
Qs_class = pd.read_csv('/home/usuaris/imatge/albert.dominguez/ADMIXTURE/gen0_chr22_train_supervised.7.Q',sep=' ', names=np.array(range(7))).to_numpy()
Qs_class_valid = pd.read_csv('/home/usuaris/imatge/albert.dominguez/ADMIXTURE/gen0_chr22_valid_supervised.7.Q',sep=' ', names=np.array(range(7))).to_numpy()

In [47]:
assert Qs_class.shape[0] == trX_0.shape[0], 'Not same training data'
assert Qs_class_valid.shape[0] == valX_0.shape[0], 'Not same validation data'
Qs_neur = get_model_preds(model, trX_0)
Qs_neur_valid = get_model_preds(model, valX_0)

5.179257154464722 seconds ellapsed
1.1776306629180908 seconds ellapsed


In [48]:
print('TRAIN')
output_metrics(trY_0[:], Qs_class, Qs_neur)

TRAIN
--------- METRICS (FUZZY) ---------
Mean Squared Second Order Difference (MSSOD)
Classical 3.358932046499114e-07
Neural 1.1184102656377742e-05
---------------------------
--------- METRICS (HARD) ---------
--------------------------------------------------
|Will use argmax of predictions to compute scores|
--------------------------------------------------
Fowlkes Mallows Score
Classical 1.0
Neural 1.0
---------------------------
V-Measure
Classical 1.0
Neural 1.0
---------------------------
Adjusted mutual information score
Classical 1.0
Neural 1.0
---------------------------


In [49]:
print('VALID')
output_metrics(valY_0[:], Qs_class_valid, Qs_neur_valid)

VALID
--------- METRICS (FUZZY) ---------
Mean Squared Second Order Difference (MSSOD)
Classical 0.4651858439708802
Neural 0.2640352250473474
---------------------------
--------- METRICS (HARD) ---------
--------------------------------------------------
|Will use argmax of predictions to compute scores|
--------------------------------------------------
Fowlkes Mallows Score
Classical 0.9247131693469138
Neural 0.9375786750641184
---------------------------
V-Measure
Classical 0.8835567334847422
Neural 0.9002929354258082
---------------------------
Adjusted mutual information score
Classical 0.8805300871814828
Neural 0.8976951184058082
---------------------------


### Time

In [56]:
print('TRAIN')
_ = get_model_preds(model, trX_0, use_gpu=True, only_time=True)
print('VALID')
_ = get_model_preds(model, valX_0, use_gpu=True, only_time=True)

TRAIN
4.699835300445557 seconds ellapsed in GPU
VALID
1.0670619010925293 seconds ellapsed in GPU


In [57]:
print('VALID')
_ = get_model_preds(model, valX_0, use_gpu=False, only_time=True)

VALID
6.23095440864563 seconds ellapsed in CPU
