In [1]:
import aaanalysis as aa
import pandas as pd
import numpy as np
# Turn off the package-wide "verbose" option for the rest of the notebook
# (presumably silences progress/log output — confirm in the aaanalysis docs)
aa.options["verbose"] = False
# Toy feature matrix: four samples (rows), two features (columns)
X = np.array([
    [0.20, 0.1],
    [0.25, 0.2],
    [0.20, 0.3],
    [0.50, 0.7],
])
# One label per row of X: 1 = positive sample, 2 = unlabeled sample
labels = np.array([1, 2, 2, 2])

Use ``dPULearn`` with the default Principal Component Analysis (PCA) to obtain a defined number of reliable negative samples (0) by specifying the ``n_unl_to_neg`` parameter:

In [2]:
# Fit dPULearn on the toy data; n_unl_to_neg=1 requests one reliable negative (0)
dpul = aa.dPULearn()
dpul.fit(X=X, labels=labels, n_unl_to_neg=1)
df_pu = dpul.df_pu_
# Keep the updated labels under their own name. Overwriting `labels` would make
# this cell non-idempotent: a second run would pass the already relabeled array
# (now containing 0) back into fit() instead of the original 1/2 labels.
new_labels = dpul.labels_
aa.display_df(df_pu)

Unnamed: 0,selection_via,PC1 (100.0%),PC1 (100.0%)_abs_dif
1,,-0.4,0.0
2,,-0.2,0.2
3,,0.4,0.8
4,PC1,0.8,1.2


As a real-world example, you can load our γ-secretase substrate prediction dataset containing substrates (positive samples, 1) and a redundancy-reduced set of single-span type I transmembrane proteins with unknown substrate status (unlabeled samples, 2):

In [3]:
# Load the positive-unlabeled (PU) dataset and pull out its label column
df_seq = aa.load_dataset(name="DOM_GSEC_PU")
labels = df_seq["label"].to_numpy()
# Number of positive samples (label == 1), counted with a vectorized comparison
n_pos = (labels == 1).sum()
aa.display_df(df=df_seq, show_shape=True)

DataFrame shape: (694, 8)


Unnamed: 0,entry,sequence,label,tmd_start,tmd_stop,jmd_n,tmd,jmd_c
1,P05067,MLPGLALLLLAA...NPTYKFFEQMQN,1,701,723,FAEDVGSNKG,AIIGLMVGGVVIATVIVITLVML,KKKQYTSIHH
2,P14925,MAGRARSGLLLL...YSAPLPKPAPSS,1,868,890,KLSTEPGSGV,SVVLITTLLVIPVLVLLAIVMFI,RWKKSRAFGD
3,P70180,MRSLLLFTFSAC...REDSIRSHFSVA,1,477,499,PCKSSGGLEE,SAVTGIVVGALLGAGLLMAFYFF,RKKYRITIER
4,Q03157,MGPTSPAARGQG...ENPTYRFLEERP,1,585,607,APSGTGVSRE,ALSGLLIMGAGGGSLIVLSLLLL,RKKKPYGTIS
5,Q06481,MAATGTAAAAAT...NPTYKYLEQMQI,1,694,716,LREDFSLSSS,ALIGLLVIAVAIATVIVISLVML,RKRQYGTISH
6,P35613,MAAALFVLLGFA...DKGKNVRQRNSS,1,323,345,IITLRVRSHL,AALWPFLGIVAEVLVLVTIIFIY,EKRRKPEDVL
7,P35070,MDRAARCSGASS...PINEDIEETNIA,1,119,141,LFYLRGDRGQ,ILVICLIAVMVVFIILVIGVCTC,CHPLRKRRKR
8,P09803,MGARCRSFSALL...KLADMYGGGEDD,1,711,733,GIVAAGLQVP,AILGILGGILALLILILLLLLFL,RRRTVVKEPL
9,P19022,MCRIAGALRTLL...KKLADMYGGGDD,1,724,746,RIVGAGLGTG,AIIAILLCIIILLILVLMFVVWM,KRRDKERQAK
10,P16070,MDKFWWHAAWGL...RNLQNVDMKIGV,1,650,672,GPIRTPQIPE,WLIILASLLALALILAVCIAVNS,RRRCGQKKKL


In [4]:
# Build the feature matrix X from the sequence parts of df_seq and the
# "feature" column of the DOM_GSEC feature set
sf = aa.SequenceFeature()
df_parts = sf.get_df_parts(df_seq=df_seq)
df_feat = aa.load_features(name="DOM_GSEC")
X = sf.feature_matrix(features=df_feat["feature"], df_parts=df_parts)

# Label counts before fitting: positive (1) and unlabeled (2) samples
counts_before = pd.Series(labels).value_counts()
print(counts_before)

# Request as many reliable negatives as there are positive samples
dpul.fit(X=X, labels=labels, n_unl_to_neg=n_pos)
new_labels = dpul.labels_
df_pu = dpul.df_pu_

# Label counts after fitting, now including the reliable negatives (0)
counts_after = pd.Series(new_labels).value_counts()
print(counts_after)
aa.display_df(df=df_pu, show_shape=True)

2    631
1     63
Name: count, dtype: int64
2    568
1     63
0     63
Name: count, dtype: int64
DataFrame shape: (694, 15)


Unnamed: 0,selection_via,PC1 (56.2%),PC2 (7.4%),PC3 (2.9%),PC4 (2.8%),PC5 (2.1%),PC6 (1.7%),PC7 (1.6%),PC1 (56.2%)_abs_dif,PC2 (7.4%)_abs_dif,PC3 (2.9%)_abs_dif,PC4 (2.8%)_abs_dif,PC5 (2.1%)_abs_dif,PC6 (1.7%)_abs_dif,PC7 (1.6%)_abs_dif
1,,0.052,-0.039,0.066,-0.021,-0.003,-0.023,0.041,0.007,0.01,0.036,0.003,0.002,0.019,0.043
2,,0.052,-0.019,0.046,-0.046,0.015,0.004,0.02,0.007,0.011,0.015,0.028,0.016,0.008,0.021
3,,0.045,-0.058,0.0,-0.087,-0.013,-0.007,0.032,0.001,0.028,0.031,0.069,0.012,0.003,0.034
4,,0.052,-0.043,0.042,-0.012,0.002,0.006,0.025,0.006,0.013,0.012,0.006,0.003,0.01,0.027
5,,0.052,-0.051,0.062,0.004,-0.028,0.009,0.035,0.007,0.022,0.031,0.022,0.027,0.013,0.037
6,,0.049,-0.046,0.03,0.086,0.036,0.029,0.001,0.004,0.016,0.001,0.104,0.037,0.032,0.003
7,,0.036,0.04,0.035,-0.014,0.048,-0.062,-0.073,0.01,0.069,0.004,0.003,0.048,0.058,0.072
8,,0.05,-0.06,0.043,-0.014,-0.023,-0.015,0.034,0.005,0.03,0.013,0.004,0.022,0.011,0.036
9,,0.054,-0.065,0.028,0.001,0.049,0.002,-0.007,0.008,0.035,0.002,0.019,0.05,0.005,0.005
10,,0.044,-0.056,0.011,0.015,0.025,-0.022,-0.016,0.002,0.027,0.02,0.033,0.025,0.018,0.014


Since ``dPULearn().fit()`` returns the fitted model, a list comprehension can be used to create results for various settings of ``n_components``. If given as a float > 0 and < 1, this parameter represents the percentage of total variance to be retained by principal component analysis (PCA).

In [5]:
# Fractions of total variance to be retained by the PCA; one fit per value
list_n_components = [0.6, 0.7, 0.8, 0.9, 0.95]
# fit() returns the fitted model, so the updated labels can be read off directly
list_labels = [
    dpul.fit(X=X, labels=labels, n_unl_to_neg=n_pos, n_components=n_comp).labels_
    for n_comp in list_n_components
]

As an alternative to ``PCA-based identification`` of negatives, ``distance-based identification`` can be performed using distance metrics including 'euclidean', 'manhattan', or 'cosine' distance. A DataFrame with the resulting distance values and the selection criterion is provided by the ``df_pu_`` attribute:

In [8]:
# Distance-based identification of reliable negatives using the euclidean metric
df_pu = dpul.fit(
    X=X, labels=labels, n_unl_to_neg=n_pos, metric="euclidean"
).df_pu_
# Sort by the selection criterion (pandas places missing values last by default)
# and keep at most the first 100 rows for display
df_pu_sorted = df_pu.sort_values(by="selection_via").head(100)
aa.display_df(df_pu_sorted)

Unnamed: 0,selection_via,euclidean_dif,euclidean_abs_dif
84,euclidean,3.481,3.481
505,euclidean,3.233,3.233
509,euclidean,3.336,3.336
526,euclidean,3.39,3.39
533,euclidean,3.364,3.364
542,euclidean,3.075,3.075
546,euclidean,3.162,3.162
548,euclidean,3.112,3.112
552,euclidean,3.289,3.289
553,euclidean,3.621,3.621


Using ``PCA-based identification``, ``df_pu`` provides the principal component (PC) values for all PCs used and a label ('selection_via') indicating the PC on which the respective negative sample was identified:

In [9]:
# PCA-based identification with n_components=0.8
df_pu = dpul.fit(
    X=X, labels=labels, n_unl_to_neg=n_pos, n_components=0.8
).df_pu_
# Sort by the PC each negative was selected on (pandas places missing values
# last by default) and keep at most the first 100 rows for display
df_pu_sorted = df_pu.sort_values(by="selection_via").head(100)
aa.display_df(df_pu_sorted)

Unnamed: 0,selection_via,PC1 (56.2%),PC2 (7.4%),PC3 (2.9%),PC4 (2.8%),PC5 (2.1%),PC6 (1.7%),PC7 (1.6%),PC1 (56.2%)_abs_dif,PC2 (7.4%)_abs_dif,PC3 (2.9%)_abs_dif,PC4 (2.8%)_abs_dif,PC5 (2.1%)_abs_dif,PC6 (1.7%)_abs_dif,PC7 (1.6%)_abs_dif
497,PC1,0.022,0.051,-0.013,0.016,-0.06,0.064,-0.07,0.023,0.081,0.044,0.034,0.06,0.067,0.068
615,PC1,0.026,0.053,-0.099,0.002,0.001,0.027,0.005,0.02,0.083,0.13,0.019,0.001,0.031,0.007
406,PC1,0.025,0.031,-0.027,-0.01,0.052,-0.002,-0.009,0.02,0.061,0.058,0.007,0.052,0.002,0.007
446,PC1,0.026,0.014,-0.054,-0.062,0.024,-0.019,-0.054,0.019,0.044,0.085,0.044,0.024,0.015,0.052
455,PC1,0.027,0.052,-0.09,-0.045,-0.073,0.029,0.061,0.019,0.082,0.12,0.027,0.072,0.033,0.063
468,PC1,0.026,0.069,-0.012,-0.003,-0.027,0.006,-0.078,0.02,0.099,0.042,0.015,0.027,0.01,0.076
471,PC1,0.025,0.006,-0.084,-0.016,-0.064,-0.056,-0.007,0.02,0.035,0.114,0.002,0.063,0.052,0.005
668,PC1,0.023,0.017,-0.076,0.058,-0.023,0.011,0.042,0.022,0.047,0.107,0.076,0.023,0.015,0.044
605,PC1,0.026,0.054,-0.007,0.019,-0.015,-0.002,0.021,0.02,0.084,0.037,0.037,0.015,0.002,0.023
505,PC1,0.023,0.048,-0.034,0.047,-0.002,0.016,-0.031,0.022,0.078,0.065,0.065,0.001,0.019,0.029
