# Protein Protein Interactions Prediction

## Feature extraction:

Now that we have the protein sequences and their corresponding PSSM matrices, we can start extarcting the features used to train our model.

First we start by loading our JSON parsed data into numpy arrays so that we can perform mathematical computations on them more easily.

The result is a dictionnary in the format `{seq : pssm}` that we can use to extract features from either sequence or the pssm matrix.

In [15]:
import psycopg2
import os
import numpy as np
from tqdm import tqdm
from dotenv.main import load_dotenv
from urllib.parse import urlparse

In [16]:
# Reading the DB API info from the .env file
load_dotenv()
URI = urlparse(os.getenv("DB_URI"))

## First feature vector: Estimation of the distribution of the protein

In [3]:
# connecting to the database
with psycopg2.connect(URI.geturl()) as conn:
    with conn.cursor() as cur:

        # add a new column to the table
        # cur.execute("ALTER TABLE PSSMS ADD COLUMN pssm_sum NUMERIC[]")
        
        # get all the proteins in the database where the pssm column does not have a value
        print("Fetching all the proteins...")
        cur.execute("SELECT sequence, pssm FROM PSSMS WHERE pssm_sum IS NULL")

        # fetch all the proteins
        proteins = cur.fetchall()
        print("Done!")

        # list to save the pssm sums temporarily for normalizing
        pssm_sums_temp = []

        # for each protein in the DB, get the sum column wise
        print("Getting the pssm sums...")
        for protein in tqdm(proteins):
            seq, pssm = protein

            # converting the pssm to a numpy array for easier manipulation
            pssm_np = np.array(pssm)

            # summing the columns
            pssm_sum = np.sum(pssm_np, axis=0)

            # adding the sum to the list
            pssm_sums_temp.append(pssm_sum)
        
        # converting the list to a numpy array
        pssm_sums_temp_numpy = np.array(pssm_sums_temp)

        # get the max and min values for each position
        pssm_sum_max = np.max(pssm_sums_temp_numpy, axis=0)
        pssm_sum_min = np.min(pssm_sums_temp_numpy, axis=0)

        # for each protein in the DB, normalize the pssm sum
        print("Normalizing the pssm sums...")
        for i, protein in enumerate(tqdm(proteins)):

            seq, _ = protein

            # get the sum
            pssm_sum = pssm_sums_temp[i]

            # normalize the sum
            pssm_sum_norm = (pssm_sum - pssm_sum_min) / (pssm_sum_max * len(seq))

            # update the database
            cur.execute("UPDATE PSSMS SET pssm_sum = %s WHERE sequence = %s", (pssm_sum_norm.tolist(), protein[0]))


        

Fetching all the proteins...
Done!
Getting the pssm sums...


100%|██████████| 16114/16114 [00:09<00:00, 1633.08it/s]


Normalizing the pssm sums...


100%|██████████| 16114/16114 [36:55<00:00,  7.27it/s]  


### Normalizing the data:

To avoid biases, we can normalize the 20-length vectors that we generated using the following formula: `d_i = (d_i - min)/(L * max)`

In [15]:
# connecting to the database
with psycopg2.connect(URI.geturl()) as conn:
    with conn.cursor() as cur:

        # get all the proteins in the database
        print("Fetching all the proteins...")
        cur.execute("SELECT sequence, pssm_sum FROM PSSMS")

        # fetch all the proteins
        proteins = cur.fetchall()
        print("Done!")
        
        # get the max and min values for each position
        pssm_sum_matrix = np.array([protein[1] for protein in proteins])
        pssm_sum_max = np.max(pssm_sum_matrix, axis=0)
        pssm_sum_min = np.min(pssm_sum_matrix, axis=0)
        
        # normalize the pssm_sum vector
        for protein in tqdm(proteins):
            seq, pssm_sum = protein
            L = len(seq)

            # converting the pssm to a numpy array for easier manipulation
            pssm_sum = np.array(pssm_sum)

            # normalizing the values
            pssm_sum_norm = (pssm_sum - pssm_sum_min) / (pssm_sum_max * L)

            # converting the numpy array to a list
            pssm_sum_norm = pssm_sum_norm.tolist()

            # updating the database
            cur.execute("UPDATE PSSMS SET pssm_sum = %s WHERE sequence = %s", (pssm_sum_norm, seq))

Fetching all the proteins...
Done!


100%|██████████| 10037/10037 [13:17<00:00, 12.58it/s]


In [16]:
# connecting to the database
with psycopg2.connect(URI.geturl()) as conn:
    with conn.cursor() as cur:

        # get a random protein and show it's pssm_sum
        cur.execute("SELECT sequence, pssm_sum FROM PSSMS LIMIT 1")

        for protein in cur.fetchall():
            
            # fetch the protein
            seq, pssm_sum = protein

            # print the protein
            print(seq)
            print()
            print(pssm_sum)


MEEVVIAGMSGKLPESENLQEFWDNLIGGVDMVTDDDRRWKAGLYGLPRRSGKLKDLSRFDASFFGVHPKQAHTMDPQLRLLLEVTYEAIVDGGINPDSLRGTHTGVWVGVSGSETSEALSRDPETLVGYSMVGCQRAMMANRLSFFFDFRGPSIALDTACSSSLMALQNAYQAIHSGQCPAAIVGGINVLLKPNTSVQFLRLGMLSPEGTCKAFDTAGNGYCRSEGVVAVLLTKKSLARRVYATILNAGTNTDGFKEQGVTFPSGDIQEQLIRSLYQSAGVAPESFEYIEAHGTGTKVGDPQELNGITRALCATRQEPLLIGSTKSNMGHPEPASGLAALAKVLLSLEHGLWAPNLHFHSPNPEIPALLDGRLQVVDQPLPVRGGNVGINSFGFGGSNVHIILRPNTQPPPAPAPHATLPRLLRASGRTPEAVQKLLEQGLRHSQDLAFLSMLNDIAAVPATAMPFRGYAVLGGERGGPEVQQVPAGERPLWFICSGMGTQWRGMGLSLMRLDRFRDSILRSDEAVKPFGLKVSQLLLSTDESTFDDIVHSFVSLTAIQIGLIDLLSCMGLRPDGIVGHSLGEVACGYADGCLSQEEAVLAAYWRGQCIKEAHLPPGAMAAVGLSWEECKQRCPPGVVPACHNSKDTVTISGPQAPVFEFVEQLRKEGVFAKEVRTGGMAFHSYFMEAIAPPLLQELKKVIREPKPRSARWLSTSIPEAQWHSSLARTSSAEYNVNNLVSPVLFQEALWHVPEHAVVLEIAPHALLQAVLKRGLKPSCTIIPLMKKDHRDNLEFFLAGIGRLHLSGIDANPNALFPPVEFPAPRGTPLISPLIKWDHSLAWDVPAAEDFPNGSGSPSAAIYNIDTSSESPDHYLVDHTLDGRVLFPATGYLSIVWKTLARALGLGVEQLPVVFEDVVLHQATILPKTGTVSLEVRLLEASRAFEVSENGNLVVSGKVYQWDDPDPRLFDHPESPTPNPTEPLFLAQAEVYKELRLRGYD

## Second feature vector: (Idk fih hhhhhhhh)

In [5]:
# Calculate the FPSSM from PSSM
def get_fpssm(pssm):
    column_sums = pssm.sum(axis=0)
    fpssm = pssm / column_sums
    return fpssm

# Calculate the feature vector S
def get_feature_vector_s(fpssm):
    S = []
    for i in range(20):
        for j in range(20):
            sij = fpssm[:, i].sum() - fpssm[:, j].sum()
            S.append(sij)
    return np.array(S)

In [6]:
# connecting to the database
with psycopg2.connect(URI.geturl()) as conn:
    # get all the proteins in the database and their pssm matrix
    with conn.cursor() as cur:
        
        # # add new column to the table for the feature vector S named S_vector
        # cur.execute("ALTER TABLE PSSMS ADD COLUMN S_vector NUMERIC[]")
        
        # get all the proteins in the database
        print("Fetching all the proteins...")
        cur.execute("SELECT sequence, pssm FROM PSSMS where S_vector IS NULL")
        
        # fetch all the proteins
        proteins = cur.fetchall()
        print("Done!")
        
        # get vector S for each protein
        for protein in tqdm(proteins):
            seq, pssm = protein

            # converting the pssm to a numpy array for easier manipulation
            pssm_np = np.array(pssm)

            # get the fpssm
            fpssm = get_fpssm(pssm_np)
            
            # get the feature vector S
            S = get_feature_vector_s(fpssm)
            
            # converting the numpy array to a list
            S = S.tolist()
            
            # updating the database
            cur.execute("UPDATE PSSMS SET S_vector = %s WHERE sequence = %s", (S, seq))

Fetching all the proteins...
Done!


  fpssm = pssm / column_sums
  fpssm = pssm / column_sums
  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
100%|██████████| 16114/16114 [40:46<00:00,  6.59it/s] 


In [3]:
# get all data from database and save it to a csv file
with psycopg2.connect(URI.geturl()) as conn:
    with conn.cursor() as cur:
        
        # get all the proteins in the database
        print("Fetching all the proteins...")
        cur.execute("SELECT sequence, S_vector FROM PSSMS")
        
        # fetch all the proteins
        proteins = cur.fetchall()
        print("Done!")
    

Fetching all the proteins...
Done!


In [4]:
len(proteins)

26151

: 