## Sobre la base de datos

Esta base de datos está relacionada con el trabajo:
[CalcAMP: A New Machine Learning Model for the Accurate Prediction of Antimicrobial Activity of Peptides](https://pmc.ncbi.nlm.nih.gov/articles/PMC10135148/)

Contiene secuencias de péptidos etiquetadas como positivas (con actividad antifúngica, `AFP`) y negativas (sin actividad antifúngica, `Non-AFP`), en formato FASTA.

Usaremos la biblioteca `Biopython`, que es muy útil para trabajar con secuencias biológicas.

In [1]:
%pip install biopython

Collecting biopython
  Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (13 kB)
Downloading biopython-1.86-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.86


In [2]:
from Bio import SeqIO
import pandas as pd

In [3]:
# Input FASTA files: one for the training sequences, one for the hold-out
# (test) sequences. Both are expected under /content.
fasta_train = "/content/training_fungi.fasta"
fasta_test = "/content/Holdout_AFP.fasta"

## Secuencias de entrenamiento

In [4]:
# Parse the training FASTA file into one dict per record (ID, description and
# the sequence as a plain string).
try:
    data = [
        {
            "ID": record.id,
            "Description": record.description,
            "Sequence": str(record.seq),
        }
        for record in SeqIO.parse(fasta_train, "fasta")
    ]
except FileNotFoundError:
    print(f"Error: El archivo no se encuentra en la ruta especificada: {fasta_train}")
    # Re-raise: every later cell needs df_train, so stopping here is clearer
    # than a NameError (or a stale df_train from an earlier run) further down.
    raise
except Exception as e:
    print(f"Ocurrió un error al parsear el archivo fasta: {e}")
    raise

# Explicit columns keep the schema stable even when the file has no records.
df_train = pd.DataFrame(data, columns=["ID", "Description", "Sequence"])

# Displayed outside the try block so a display problem is not reported as a
# FASTA parsing error.
display(df_train.head())

Unnamed: 0,ID,Description,Sequence
0,Non-AFP_0001,Non-AFP_0001,SGTSEKERESERLLGVVNPLIKCFRSPCP
1,AFP_0001,AFP_0001,FVPWFSKFLKRIL
2,AFP_0002,AFP_0002,GFKMALKLLKKVL
3,AFP_0003,AFP_0003,SLLSLFRKLIT
4,Non-AFP_0002,Non-AFP_0002,GWGSIFKHIFKAGKFIHGAIQAHNDG


In [16]:
# Split the training records by class using the prefix of 'Description'.
# Negative records start with 'Non-AFP_'; positive ones start with 'AFP_'
# (a 'Non-AFP_' description does not start with 'AFP_', so the sets are disjoint).
df_train_neg = df_train.loc[
    lambda frame: frame['Description'].str.startswith('Non-AFP_')
].copy()

df_train_pos = df_train.loc[
    lambda frame: frame['Description'].str.startswith('AFP_')
].copy()

In [17]:
# Preview the first rows of the negative (Non-AFP) training records.
print("Training data - Non-AFP_:")
display(df_train_neg.head())

Training data - Non-AFP_:


Unnamed: 0,ID,Description,Sequence
0,Non-AFP_0001,Non-AFP_0001,SGTSEKERESERLLGVVNPLIKCFRSPCP
4,Non-AFP_0002,Non-AFP_0002,GWGSIFKHIFKAGKFIHGAIQAHNDG
7,Non-AFP_0003,Non-AFP_0003,FLPLIAGLAAKCAITKKC
8,Non-AFP_0004,Non-AFP_0004,SLWSSIKDMAAAAGRAALNAVNGILNP
9,Non-AFP_0005,Non-AFP_0005,RLWRRWRRWLR


In [18]:
# Preview the first rows of the positive (AFP) training records.
print("\nTraining data - AFP_:")
display(df_train_pos.head())


Training data - AFP_:


Unnamed: 0,ID,Description,Sequence
1,AFP_0001,AFP_0001,FVPWFSKFLKRIL
2,AFP_0002,AFP_0002,GFKMALKLLKKVL
3,AFP_0003,AFP_0003,SLLSLFRKLIT
5,AFP_0004,AFP_0004,ALWKSILKNVGKAAGKAVLNAVTDMVNQ
6,AFP_0005,AFP_0005,GIINTLQKYYSRVRGGR


## Secuencias de prueba

In [5]:
# Parse the test (hold-out) FASTA file into one dict per record (ID,
# description and the sequence as a plain string).
try:
    data = [
        {
            "ID": record.id,
            "Description": record.description,
            "Sequence": str(record.seq),
        }
        for record in SeqIO.parse(fasta_test, "fasta")
    ]
except FileNotFoundError:
    print(f"Error: El archivo no se encuentra en la ruta especificada: {fasta_test}")
    # Re-raise: every later cell needs df_test, so stopping here is clearer
    # than a NameError (or a stale df_test from an earlier run) further down.
    raise
except Exception as e:
    print(f"Ocurrió un error al parsear el archivo fasta: {e}")
    raise

# Explicit columns keep the schema stable even when the file has no records.
df_test = pd.DataFrame(data, columns=["ID", "Description", "Sequence"])

# Displayed outside the try block so a display problem is not reported as a
# FASTA parsing error.
display(df_test.head())

Unnamed: 0,ID,Description,Sequence
0,Non-AFP_00,Non-AFP_00,GLWSTIKNVGKEAAIAAGKAVLGSL
1,Non-AFP_01,Non-AFP_01,VKRFKKFFRKLKKSVKKL
2,Non-AFP_02,Non-AFP_02,GAKLAKKQVRALGKFFSF
3,Non-AFP_03,Non-AFP_03,HFLGKLVNLAKKIL
4,Non-AFP_04,Non-AFP_04,WRWRWR


In [14]:
# Split the test records by class using the prefix of 'Description'.
# Negative records start with 'Non-AFP_'; positive ones start with 'AFP_'
# (a 'Non-AFP_' description does not start with 'AFP_', so the sets are disjoint).
df_test_neg = df_test.loc[
    lambda frame: frame['Description'].str.startswith('Non-AFP_')
].copy()

df_test_pos = df_test.loc[
    lambda frame: frame['Description'].str.startswith('AFP_')
].copy()

In [19]:
# Preview the first rows of the negative (Non-AFP) test records.
# This cell belongs to the test-data section, so it must show df_test_neg
# under a "Test data" heading (it previously re-displayed the training frame).
print("Test data - Non-AFP_:")
display(df_test_neg.head())

Test data - Non-AFP_:


Unnamed: 0,ID,Description,Sequence
0,Non-AFP_00,Non-AFP_00,GLWSTIKNVGKEAAIAAGKAVLGSL
1,Non-AFP_01,Non-AFP_01,VKRFKKFFRKLKKSVKKL
2,Non-AFP_02,Non-AFP_02,GAKLAKKQVRALGKFFSF
3,Non-AFP_03,Non-AFP_03,HFLGKLVNLAKKIL
4,Non-AFP_04,Non-AFP_04,WRWRWR


In [None]:
# Preview the first rows of the positive (AFP) test records.
# The heading announces test data, so display df_test_pos rather than the
# training frame.
print("\nTest data - AFP_:")
display(df_test_pos.head())

## Concatenar los conjuntos de entrenamiento y de prueba

In [None]:
# Keep only the sequences of the positive (AFP) records and add a `label`
# column set to 1 (has antifungal activity).
# `.assign` returns a new frame, which avoids the SettingWithCopyWarning that
# assigning a column on a column-sliced frame can raise.
train_pos = df_train_pos[['Sequence']].assign(label=1)
test_pos = df_test_pos[['Sequence']].assign(label=1)

In [None]:
# Keep only the sequences of the negative (Non-AFP) records and add a `label`
# column set to 0 (no antifungal activity).
# `.assign` returns a new frame, which avoids the SettingWithCopyWarning that
# assigning a column on a column-sliced frame can raise.
train_neg = df_train_neg[['Sequence']].assign(label=0)
test_neg = df_test_neg[['Sequence']].assign(label=0)

In [24]:
# Stack the four labelled subsets (train/test x positive/negative) into a
# single frame. The original row labels are kept, so index values repeat
# across subsets; pass ignore_index=True if a unique index is ever needed.
df_combined = pd.concat(
    objs=[train_pos, train_neg, test_pos, test_neg],
)

In [25]:
# Quick look at the first rows of the combined frame (Sequence, label).
df_combined.head()

Unnamed: 0,Sequence,label
1,FVPWFSKFLKRIL,1
2,GFKMALKLLKKVL,1
3,SLLSLFRKLIT,1
5,ALWKSILKNVGKAAGKAVLNAVTDMVNQ,1
6,GIINTLQKYYSRVRGGR,1


In [26]:
# Summary statistics of the numeric columns; the mean of `label` is the
# fraction of positive records.
df_combined.describe()

Unnamed: 0,label
count,2248.0
mean,0.407918
std,0.491557
min,0.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [27]:
# Write the combined dataset to CSV with a header row and without the
# DataFrame index.
df_combined.to_csv(
    'CalcAMP_combinado.csv',
    header=True,
    index=False,
)