In [2]:
# NOTE: Import libraries

from Bio import Entrez
from Bio import SeqIO
from os import path
from utils import *

import pandas as pd

In [16]:
# NOTE: Preprocessing dataset

df_path = path.join(DATA_PATH, 'ids.csv')
dataset = pd.read_csv(df_path, delimiter='\t')

# NOTE: Remove zero values

dataset = dataset[dataset['Tm_(C)'] != 0]
dataset.to_csv(path.join(DATA_PATH, 'ids.csv'), sep='\t', index=False)

In [37]:
# NOTE: Read dataset of IDs

df_path = path.join(DATA_PATH, 'ids.csv')
dataset = pd.read_csv(df_path, delimiter='\t')

IDs = dataset['UNIPROT_ID'].tolist()
TMs = dataset['Tm_(C)'].tolist()

print(len(TMs))
print(len(IDs))

4271
4271


In [39]:
Entrez.email = 'gabriel.loayza@utec.edu.pe'

# NOTE: Send request to NCBI as chunks of 200 IDs

sequences_ids = []
sequences = []
sequences_tms = []
chunk_size = 200
for i in range(0, len(IDs), chunk_size):
    with Entrez.efetch(db='protein', id=IDs[i : i + chunk_size], rettype='fasta', retmode='text') as handle:
        seeker = 0
        for record in SeqIO.parse(handle, 'fasta'):
            while IDs[i + seeker] not in record.id and seeker < chunk_size:
                seeker += 1
            if seeker < chunk_size:
                # print(f'{IDs[i + seeker]} - {record.id}')
                sequences_ids.append(IDs[i + seeker])
                sequences.append(record.seq)
                sequences_tms.append(TMs[i + seeker])

print(len(sequences_ids))
print(len(sequences))

4265
4265


In [41]:
# NOTE: Save sequences

sequences_path = path.join(DATA_PATH, 'sequences.csv')
sequences_df = pd.DataFrame({
    'UNIPROT_ID' : sequences_ids,
    'SEQUENCE' : sequences,
    'TM' : sequences_tms
})

print(sequences_df.head())
pd.DataFrame(sequences_df).to_csv(sequences_path, index=False, header=True, sep='\t')

  UNIPROT_ID                                           SEQUENCE     TM
0     P00350  (M, S, K, Q, Q, I, G, V, V, G, M, A, V, M, G, ...  57.83
1     P00363  (M, Q, T, F, Q, A, D, L, A, I, V, G, A, G, G, ...  46.77
2     P00370  (M, D, Q, T, Y, S, L, E, S, F, L, N, H, V, Q, ...  58.78
3     P00448  (M, S, Y, T, L, P, S, L, P, Y, A, Y, D, A, L, ...  66.59
4     P00452  (M, N, Q, N, L, L, V, T, K, R, D, G, S, T, E, ...  44.79


In [42]:
# NOTE: Read sequences

df_path = path.join(DATA_PATH, 'sequences.csv')
dataset = pd.read_csv(df_path, delimiter='\t')

print(dataset.head())

  UNIPROT_ID                                           SEQUENCE     TM
0     P00350  MSKQQIGVVGMAVMGRNLALNIESRGYTVSIFNRSREKTEEVIAEN...  57.83
1     P00363  MQTFQADLAIVGAGGAGLRAAIAAAQANPNAKIALISKVYPMRSHT...  46.77
2     P00370  MDQTYSLESFLNHVQKRDPNQTEFAQAVREVMTTLWPFLEQNPKYR...  58.78
3     P00448  MSYTLPSLPYAYDALEPHFDKQTMEIHHTKHHQTYVNNANAALESL...  66.59
4     P00452  MNQNLLVTKRDGSTERINLDKIHRVLDWAAEGLHNVSISQVELRSH...  44.79
