# Generating the final database

This step's aim is to generate the final database that we're going to use to train our model. This database contains the features of both interacting proteins concatenated into one line

In [1]:
import psycopg2
import pandas as pd
import os
import itertools
import numpy as np
from tqdm import tqdm
from dotenv.main import load_dotenv
from urllib.parse import urlparse

In [2]:
# Reading the DB API info from the .env file
load_dotenv()
URI = urlparse(os.getenv("DB_URI"))

## Reading the the two datasets

The first step is to read the two seperated datasets and clean them by removing self-interacting proteins, then the Refseq ID of each interactor is used to find the corresponding AA sequence which is then used to query the Render DB for the features

In [3]:
# read the data from the two csv files
print("Reading the data from the two csv files...")
df1 = pd.read_csv("../data/data1.csv")
df2 = pd.read_csv("../data/data2.csv")
print("Reading done!")

Reading the data from the two csv files...
Reading done!


In [4]:
# remove the lines where Interactor A is the same as Interactor B
print("Removing the lines where Interactor A is the same as Interactor B...")
data1_old_size = df1.shape[0]
data2_old_size = df2.shape[0]

df1 = df1[df1["Interactor A"] != df1["Interactor B"]]
df2 = df2[df2["Interactor A"] != df2["Interactor B"]]

data1_new_size = df1.shape[0]
data2_new_size = df2.shape[0]

print("Removed {} lines from data1.csv".format(data1_old_size - data1_new_size))
print("Removed {} lines from data2.csv".format(data2_old_size - data2_new_size))

Removing the lines where Interactor A is the same as Interactor B...
Removed 2160 lines from data1.csv
Removed 0 lines from data2.csv


In [5]:
# concatenate the two dataframes
print("Concatenating the two dataframes...")
df = pd.concat([df1, df2], ignore_index=True)
print("Concatenation done!")

# save the dataframe as a csv file
print("Saving the dataframe as a csv file...")
df.to_csv("../data/data_conc.csv", index=False)
print("Saving done!")

Concatenating the two dataframes...
Concatenation done!
Saving the dataframe as a csv file...
Saving done!


In [6]:
# print the new size of the dataframe
print(f"Number of rows in the dataframe: {df.shape[0]}")

Number of rows in the dataframe: 2551888


In [7]:
# limit the dataframe df to the first 100000 rows because HOLY SHIT IT'S TAKING TOO LONG!!!
df = df.iloc[:100000, :]

# save the dataframe as a csv file
print("Saving the dataframe as a csv file...")
df.to_csv("../data/data_conc_100k.csv", index=False)

Saving the dataframe as a csv file...


In [8]:
# sequences dic
sequences = {}

# read two lines from the sequences.fasta file at a time
with open("../dbs/generated/sequences.fasta", "r") as f:
    for id, seq in itertools.zip_longest(*[f]*2):
        id = id.strip()[1:]
        seq = seq.strip()
        sequences[id] = seq

# print the length of the dictionary
print(len(sequences))

10277


In [9]:
# create a new dataframe with two columns: Interactor A and Interactor B, this time the values are going to be AA sequences instead of RefSeq IDs
print("Creating a new dataframe with two columns: Interactor A and Interactor B...")
df_new = pd.DataFrame(columns=["Interactor A", "Interactor B"])

# read the dictionary containing the 100k examples
print("Reading the dictionary containing the 100k examples...")
df = pd.read_csv("../data/data_conc_100k.csv")

Creating a new dataframe with two columns: Interactor A and Interactor B...
Reading the dictionary containing the 100k examples...


In [11]:
# for each row in the dataframe, get the AA sequence of Interactor A and Interactor B
print("For each row in the dataframe, get the AA sequence of Interactor A and Interactor B...")
for i in tqdm(range(df.shape[0])):

    # get the RefSeq IDs of Interactor A and Interactor B
    interactor_a = df.loc[i, "Interactor A"]
    interactor_b = df.loc[i, "Interactor B"]

    # get the AA sequences of Interactor A and Interactor B if they exist
    if interactor_a in sequences and interactor_b in sequences:
        seq_a = sequences[interactor_a]
        seq_b = sequences[interactor_b]

        # add the row to the new dataframe
        df_new = pd.concat([df_new, pd.DataFrame([[seq_a, seq_b]], columns=["Interactor A", "Interactor B"])])
print("Done!")

# print the shape of the new dataframe
print(f"Shape of the new dataframe: {df_new.shape}")

# save the dataframe as a csv file
print("Saving the dataframe as a csv file...")
df_new.to_csv("../data/data_conc_100k_aa.csv", index=False)

For each row in the dataframe, get the AA sequence of Interactor A and Interactor B...


  0%|          | 0/100000 [00:00<?, ?it/s]


TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [3]:
# ferch all the rows from the database
print("Fetching all the rows from the database...")
conn = psycopg2.connect(
    database=URI.path[1:],
    user=URI.username,
    password=URI.password,
    host=URI.hostname,
    port=URI.port
)

cur = conn.cursor()
cur.execute("SELECT sequence, pssm FROM PSSMS")
rows = cur.fetchall()
print("Fetching done!")

Fetching all the rows from the database...
Fetching done!


In [4]:
len(rows)

26151

In [5]:
# read data1.csv
print("Reading data1.csv...")
df1 = pd.read_csv("data1.csv")

Reading data1.csv...


In [6]:
def read_sequences_from_fasta(fasta_file):
        protein_sequences = {}
        with open(fasta_file, "r") as file:
            for line in file:
                if line.startswith(">"):
                    protein_id = line.strip().split(">")[1]
                else:
                    protein_sequences[protein_id] = line.strip()
        return protein_sequences

In [7]:
sequences = read_sequences_from_fasta("sequences.fasta")

In [8]:
# get set of all the sequences and check if they are in rows from the database
print("Getting set of all the sequences and checking if they are in rows from the database...")
sequences_set = set(sequences.values())
db_sequences_set = set()
for row in tqdm(rows):
    db_sequences_set.add(row[0])
print("Done!")

Getting set of all the sequences and checking if they are in rows from the database...


100%|██████████| 26151/26151 [00:00<00:00, 1325982.16it/s]

Done!





In [9]:
len(db_sequences_set)

26151

In [10]:
len(sequences_set)

10071

In [11]:
db_sequences_set

{'MNINGYTRMAAVVANPIKHSLSPFIHNLAFDLMNENGVYLAWEVEAEKLPAIVENVRTLDMYGLNISMPYKGEIIKFMDELSPAAELIGVVNTVVNQSGKLIGHNTDGIGFFNSLEKYHFNIQNKQMLILGGGGAAIAIIAQAALSGAKKIVVAARKSASYIPLKEKLEKLSVKTGIEILLTDLSEADRLQKELKQTDLLVNATSVGMDGESLPLEKSLVLPEKLLVVDAIYKVRETPFLRWAKEQGAQTENGLGMLIGQAAESFYLWTGKKMPVAEITLEMEKEA',
 'MCSLASGATGGRGAVENEEDLPELSDSGDEAAWEDEDDADLPHGKQQTPCLFCNRLFTSAEETFSHCKSEHQFNIDSMVHKHGLEFYGYIKLINFIRLKNPTVEYMNSIYNPVPWEKEEYLKPVLEDDLLLQFDVEDLYEPVSVPFSYPNGLSENTSVVEKLKHMEARALSAEAALARAREDLQKMKQFAQDFVMHTDVRTCSSSTSVIADLQEDEDGVYFSSYGHYGIHEEMLKDKIRTESYRDFIYQNPHIFKDKVVLDVGCGTGILSMFAAKAGAKKVLGVDQSEILYQAMDIIRLNKLEDTITLIKGKIEEVHLPVEKVDVIISEWMGYFLLFESMLDSVLYAKNKYLAKGGSVYPDICTISLVAVSDVNKHADRIAFWDDVYGFKMSCMKKAVIPEAVVEVLDPKTLISEPCGIKHIDCHTTSISDLEFSSDFTLKITRTSMCTAIAGYFDIYFEKNCHNRVVFSTGPQSTKTHWKQTVFLLEKPFSVKAGEALKGKVTVHKNKKDPRSLTVTLTLNNSTQTYGLQ',
 'MPSIKFDDFYHKYTESPERKAAVEQFEEQLKASVLLSELREREDYTQKELAELAGTSQSTVARIESGTMNVTFDTLAHIVNAMGYKLEFNITHL',
 'MSEQTNNLLNLFSKLLHNPSVLFALRADGISKQMKNRGNRNGAQGLLVELWNKDGLTNAEIAELLDIKPSSV

In [12]:
sequences_set

{'MFADLDYDIEEDKLGIPTVPGKVTLQKDAQNLIGISIGGGAQYCPCLYIVQVFDNTPAALDGTVAAGDEITGVNGRSIKGKTKVEVAKMIQEVKGEVTIHYNKLQADPKQGMSLDIVLKKVKHRLVENMSSGTADALGLSRAILCNDGLVKRLEELERTAELYKGMTEHTKNLLRAFYELSQTHRAFGDVFSVIGVREPQPAASEAFVKFADAHRSIEKFGIRLLKTIKPMLTDLNTYLNKAIPDTRLTIKKYLDVKFEYLSYCLKVKEMDDEEYSCIALGEPLYRVSTGNYEYRLILRCRQEARARFSQMRKDVLEKMELLDQKHVQDIVFQLQRLVSTMSKYYNDCYAVLRDADVFPIEVDLAHTTLAYGLNQEEFTDGEEEEEEEDTAAGEPSRDTRGAAGPLDKGGSWCDS',
 'MAGSPTCLTLIYILWQLTGSAASGPVKELVGSVGGAVTFPLKSKVKQVDSIVWTFNTTPLVTIQPEGGTIIVTQNRNRERVDFPDGGYSLKLSKLKKNDSGIYYVGIYSSSLQQPSTQEYVLHVYEHLSKPKVTMGLQSNKNGTCVTNLTCCMEHGEEDVIYTWKALGQAANESHNGSILPISWRWGESDMTFICVARNPVSRNFSSPILARKLCEGAADDPDSSMVLLCLLLVPLLLSLFVLGLFLWFLKRERQEEYIEEKKRVDICRETPNICPHSGENTEYDTIPHTNRTILKEDPANTVYSTVEIPKKMENPHSLLTMPDTPRLFAYENVI',
 'MSPSLQEGAQLGENKPSTCSFSIERILGLDQKKDCVPLMKPHRPWADTCSSSGKDGNLCLHVPNPPSGISFPSVVDHPMPEERASKYENYFSASERLSLKRELSWYRGRRPRTAFTQNQIEVLENVFRVNCYPGIDIREDLAQKLNLEEDRIQIWFQNRRAKLKRSHRESQFLMAKKNFNTNLLE',
 'MERMSDSADKPIDNDAEGVWSPDIEQSFQEALAIYPPCGRRKIILSDE

In [13]:
# check if all sequences in sequences_set are in db_sequences_set
print("Checking if all sequences in sequences_set are in db_sequences_set...")
c = 0
for seq in tqdm(sequences_set):
    if seq not in db_sequences_set:
        print(seq)
        c += 1
        
print("Done!")

Checking if all sequences in sequences_set are in db_sequences_set...


100%|██████████| 10071/10071 [00:00<00:00, 1006165.39it/s]

MDVHGEAAGEAGQEGGEGLQGGAGQSEEGPSAEKCRVCPCVCRERHPQEERRCELASDGVPRRRSGLQGADSCDYEGGDQEYGPGDQSPGQGPEHHGPAEGLLSDGQVRAAGAEPGRPYIGDGGLHELGHHPDHAAGAGGQPHHADRRGEWPGGAGPAQPAARGRLCRGRELCAQPGGPAVTEVGRLEELAVPRRCAPPLPRDVLEGSCPLPTASCLCADPAGLRPAATLRLSPARPAWP
MTSRGFQRSCNNPPCSSMTGRRANQIHHLTPDFSLRELLPPKKAGTWADCVSPPCGERDRCEGWADRHTACSSPASTCQVHTQDCDSLNNMRSRHIHCGRLCHANKAVSSSKRDTAFFLPHFSPGKPGNQNSKNEPPKKRERERSSHCYPAAPAAQAEAPLVPLSRQNKSTVETSNLKMLISFPKTLLRGPQEGWWHQGINPGSGAATLGPGSSERPQSIEASCSMARRTFFAVSSNSFFLLLVSFAILFLALSLSSFKNSPRVNSSNCFLTERKAQPDECFLCSSMGSSSGSQPSSSLKQKKHWAKSGSFSVGQWMKPASAIRSGVQRSPPRRASS
MGSDVTGIGLFESDPVADPVLRENSERILSKVQTLARMYSSKASTMKVPLHHKRTGAIRTQAWVSARLPARPAQNRDKVLSLREQETGGGAQTRSEARTERGACKQPGQNRAVVQEELRIHGGRPEGQTDDPQEEGPAPCEPALLGHVTPLGHVTPLGHVTPLDHVTPALSRPRDFLFTLTCSAPGEGPSRQNQACSTGAALSSASPGPAHRAGAQPEPRGSRWSSSSPSVSPAALQGRGSPGEKHQDVPADAESEVQQAAGGPGRDEGCCVYSSSSASLSTLPADPEACRAPSEEEHDPDVAAAPDKQENVGPRQDRGVTETDGSLQSGSRTTWGSRPGLREEPAQGEPRAEEPLGSMGRQKIAEPGRENEGSSAETCAEKASRELCVVPTDPVDPENPVDDGTVTQTSQHLHKVKAAPS


