In [1]:
# NOTE: Import libraries

from Bio import Entrez
from Bio import SeqIO
from os import path
from utils import *

import pandas as pd

In [16]:
# NOTE: Preprocessing dataset
#
# Loads the tab-separated ID table, keeps only the rows whose 'Tm_(C)' value is
# not 0, and writes the filtered table back over the same file.
# DATA_PATH is not defined in this notebook; presumably it comes from
# `from utils import *` - verify.

df_path = path.join(DATA_PATH, 'ids.csv')
dataset = pd.read_csv(df_path, sep='\t')

# NOTE: Remove zero values

has_nonzero_tm = dataset['Tm_(C)'].ne(0)
dataset = dataset.loc[has_nonzero_tm]
dataset.to_csv(df_path, sep='\t', index=False)

In [3]:
# NOTE: Read dataset of IDs
#
# Reloads the ID table and exposes two parallel lists: IDs (the 'UNIPROT_ID'
# column) and TMs (the 'Tm_(C)' column). Position k in one list corresponds to
# position k in the other, since both come from the same rows.

df_path = path.join(DATA_PATH, 'ids.csv')
dataset = pd.read_csv(df_path, sep='\t')

IDs, TMs = (dataset[column].tolist() for column in ('UNIPROT_ID', 'Tm_(C)'))

# Both lengths are printed (TMs first) as a quick sanity check.
print(len(TMs), len(IDs), sep='\n')

4271
4271


In [39]:
Entrez.email = 'gabriel.loayza@utec.edu.pe'

# NOTE: Send request to NCBI as chunks of 200 IDs
#
# For every chunk the FASTA records returned by NCBI are matched back to the
# requested IDs by scanning forward through the chunk: a record is attributed
# to the first not-yet-passed ID that appears as a substring of `record.id`.
# This relies on records arriving in the same order as the requested IDs
# (NOTE(review): assumed, confirm against the efetch behaviour); requested IDs
# that get no record are simply skipped over.
#
# Results are collected in three parallel lists:
#   sequences_ids - requested ID that matched the record
#   sequences     - the record's sequence (`record.seq`)
#   sequences_tms - the TMs entry belonging to that ID

sequences_ids = []
sequences = []
sequences_tms = []
chunk_size = 200
for i in range(0, len(IDs), chunk_size):
    # Slice once so the last (possibly shorter) chunk carries its real length.
    chunk_ids = IDs[i : i + chunk_size]
    chunk_tms = TMs[i : i + chunk_size]
    with Entrez.efetch(db='protein', id=chunk_ids, rettype='fasta', retmode='text') as handle:
        seeker = 0
        for record in SeqIO.parse(handle, 'fasta'):
            # Scan with a temporary cursor and test the bound BEFORE indexing,
            # so the lookup can never run past the end of the chunk (or of IDs).
            match = seeker
            while match < len(chunk_ids) and chunk_ids[match] not in record.id:
                match += 1
            if match == len(chunk_ids):
                # No remaining requested ID matches this record: skip only this
                # record and leave `seeker` untouched, so later records of the
                # chunk can still be matched.
                continue
            seeker = match
            # print(f'{chunk_ids[seeker]} - {record.id}')
            sequences_ids.append(chunk_ids[seeker])
            sequences.append(record.seq)
            sequences_tms.append(chunk_tms[seeker])

print(len(sequences_ids))
print(len(sequences))

4265
4265


In [2]:
# NOTE: Save sequences
#
# Packs the three parallel lists built by the fetch cell (sequences_ids,
# sequences, sequences_tms) into one table and writes it as a tab-separated
# file. This cell needs those lists to exist in the kernel, so the fetch cell
# must have been run first in the same session.

sequences_path = path.join(DATA_PATH, 'sequences.csv')
columns = {
    'UNIPROT_ID' : sequences_ids,
    'SEQUENCE' : sequences,
    'TM' : sequences_tms
}
sequences_df = pd.DataFrame(columns)

print(sequences_df.head())
sequences_df.to_csv(sequences_path, sep='\t', header=True, index=False)

NameError: name 'sequences_ids' is not defined

In [3]:
# NOTE: Read sequences
#
# Loads the tab-separated sequences table written by the save cell and prints
# its first rows as a preview.

df_path = path.join(DATA_PATH, 'sequences.csv')
dataset = pd.read_csv(df_path, sep='\t')

preview = dataset.head()
print(preview)

  UNIPROT_ID                                           SEQUENCE     TM
0     P00350  MSKQQIGVVGMAVMGRNLALNIESRGYTVSIFNRSREKTEEVIAEN...  57.83
1     P00363  MQTFQADLAIVGAGGAGLRAAIAAAQANPNAKIALISKVYPMRSHT...  46.77
2     P00370  MDQTYSLESFLNHVQKRDPNQTEFAQAVREVMTTLWPFLEQNPKYR...  58.78
3     P00448  MSYTLPSLPYAYDALEPHFDKQTMEIHHTKHHQTYVNNANAALESL...  66.59
4     P00452  MNQNLLVTKRDGSTERINLDKIHRVLDWAAEGLHNVSISQVELRSH...  44.79


In [4]:
# NOTE: Get labels for sequences
#
# Binary label derived from the 'TM' column: 0 when TM <= 70, 1 when TM > 70.
# Rows matching neither comparison (e.g. a missing TM) keep no label, and the
# integer cast below then fails instead of labelling them silently.

tm_values = dataset['TM']
for label_value, mask in ((0, tm_values <= 70), (1, tm_values > 70)):
    dataset.loc[mask, 'LABEL'] = label_value
dataset['LABEL'] = dataset['LABEL'].astype(int)

print(dataset.head())

# NOTE: Count labels

label_counts = dataset['LABEL'].value_counts()
print(label_counts)

# NOTE: Save sequences labels

sequences_path = path.join(DATA_PATH, 'sequences_labeled.csv')
dataset.to_csv(sequences_path, sep='\t', header=True, index=False)

  UNIPROT_ID                                           SEQUENCE     TM  LABEL
0     P00350  MSKQQIGVVGMAVMGRNLALNIESRGYTVSIFNRSREKTEEVIAEN...  57.83      0
1     P00363  MQTFQADLAIVGAGGAGLRAAIAAAQANPNAKIALISKVYPMRSHT...  46.77      0
2     P00370  MDQTYSLESFLNHVQKRDPNQTEFAQAVREVMTTLWPFLEQNPKYR...  58.78      0
3     P00448  MSYTLPSLPYAYDALEPHFDKQTMEIHHTKHHQTYVNNANAALESL...  66.59      0
4     P00452  MNQNLLVTKRDGSTERINLDKIHRVLDWAAEGLHNVSISQVELRSH...  44.79      0
0    3359
1     906
Name: LABEL, dtype: int64


In [6]:
# NOTE: Get min and max temperatures

tm_column = dataset['TM']
print(tm_column.min(), tm_column.max())

26.57 90.99
