In [53]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from tqdm import tqdm

from parse_features import read_pssm

In [54]:
def pca_features(features, n_components=2):
    pca = PCA(n_components=n_components)
    pca.fit(features)
    return pca


In [55]:
def train_pca(pca, features):
    pca.fit(features)
    return pca

In [64]:
pca = PCA(n_components=4)
pssm_files = Path('data/features/pssm').glob('*.pssm')
disorder_train = set(pd.read_csv('data/splits/disorder_train.tsv', sep='\t', header=None).iloc[:, 0])
pssm_train_files = [p for p in pssm_files if p.stem in disorder_train]
for file in tqdm(pssm_train_files):
    pca = train_pca(pca, read_pssm(file))


100%|██████████| 1585/1585 [00:18<00:00, 87.68it/s] 


In [65]:
pca.explained_variance_ratio_

array([0.41447671, 0.15028737, 0.09884377, 0.08248491])

In [66]:
import pickle

In [67]:
from joblib import dump

dump(pca, 'data/features/pssm_pca/pssm.joblib')


['data/features/pssm_pca/pssm.joblib']

In [68]:
from joblib import load

pca = load('data/features/pssm_pca/pssm.joblib')


In [74]:
pssm = read_pssm('data/features/pssm/A6Q0K5.pssm')


In [77]:
pca.transform(pssm)


array([[ 6.60800175e+00, -5.40845427e+00, -1.97750923e+00,
        -2.45207381e+00],
       [ 5.66764405e+00, -4.87086835e+00, -2.63889938e+00,
        -3.94992249e-01],
       [-2.64260333e-02, -3.47695836e+00, -3.61231188e+00,
         2.49387583e+00],
       [ 1.55803789e+00, -4.14164428e+00, -2.89909575e+00,
         4.30499174e+00],
       [-5.18806840e+00, -1.24517472e+00, -1.19916070e+00,
        -7.42672389e-01],
       [-3.76182379e+00, -2.59633091e+00, -2.67912198e+00,
         1.60834712e+00],
       [-2.95213613e+00, -3.67185988e+00,  3.04440240e-01,
         6.12481410e-03],
       [ 1.11611079e+00, -4.46643541e+00, -4.02665322e+00,
         1.16175261e+00],
       [ 8.02963219e+00, -4.50334083e+00, -2.32406765e+00,
         5.87948370e-01],
       [-5.90242961e-01, -5.91103583e+00, -1.70004209e+00,
         2.71289731e+00],
       [-2.82093380e+00, -1.47160704e+00,  2.50591820e+00,
        -2.11129826e+00],
       [-1.95497052e+00, -2.26476651e+00, -2.90146043e+00,
      