## Cargar y parsear archivo

In [3]:
import pandas as pd

In [None]:
# Raw GitHub URL of the PlantPepDB activity table (activity.csv).
url = 'https://raw.githubusercontent.com/AnaLuisaIA/antifungal-peptides/refs/heads/main/databases/PlantPepDB/activity.csv'

In [None]:
# The file is read as tab-separated (sep='\t') rather than with the default comma separator.
df = pd.read_csv(url, sep='\t')

In [None]:
# List the column names of the loaded table.
df.columns

Index(['PPepDB-ID', 'Peptide Name', 'Plant Source', 'Peptide Family',
       'Peptide Activity', 'Sequence Length', 'Molecular Weight', 'Validation',
       'PMID/DOI'],
      dtype='object')

In [None]:
# Preview the first rows of the loaded table.
display(df.head())

Unnamed: 0,PPepDB-ID,Peptide Name,Plant Source,Peptide Family,Peptide Activity,Sequence Length,Molecular Weight,Validation,PMID/DOI
0,PPepDB_245,Tomato Snakin-2,Solanum lycopersicum,Snakin,"Antibacterial, Antifungal",66,7012.09,Experimental evidence at protein level,--NA--
1,PPepDB_246,PaSn,Persea americana var. drymifolia,Snakin,Antibacterial,79,8565.8,Experimental evidence at protein level,--NA--
2,PPepDB_252,MsSN-1,Medicago sativa,Snakin,"Antibacterial, Antifungal",66,7167.46,Experimental evidence at protein level,--NA--
3,PPepDB_256,CaThi,Capsicum annuum,Thionin,Antifungal,59,6702.08,Experimental evidence at protein level,--NA--
4,PPepDB_257,Thionin-like peptide 1,Capsicum annuum,Thionin,"Antibacterial, Antifungal",66,7490.75,Experimental evidence at protein level,--NA--


In [None]:
# (rows, columns) of the loaded table.
df.shape

(3212, 9)

In [None]:
# Keep the rows whose activity mentions "antifungal" (case-insensitive).
# na=False treats a missing activity as "no match"; without it a missing value
# puts NaN in the mask and the boolean indexing raises.
df_afp = df[df['Peptide Activity'].str.contains('antifungal', case=False, na=False)].copy()

In [None]:
# (rows, columns) of the antifungal subset.
df_afp.shape

(122, 9)

## Cargar segundo archivo

In [22]:
# Raw GitHub URL of the second PlantPepDB file (plantpepdb.csv).
# Note: this rebinds `url`, which an earlier cell pointed at activity.csv.
url = 'https://raw.githubusercontent.com/AnaLuisaIA/antifungal-peptides/refs/heads/main/databases/PlantPepDB/plantpepdb.csv'

In [23]:
# Read with the default comma separator. This rebinds `df`, so the table loaded
# by the earlier read_csv cell is no longer reachable under that name.
df = pd.read_csv(url)

In [24]:
# List the column names of the second table.
df.columns

Index(['PPepDB_ID', 'sequence', 'label'], dtype='object')

In [25]:
# Preview the first rows of the second table.
display(df.head())

Unnamed: 0,PPepDB_ID,sequence,label
0,PPepDB_245,DSYKKIDCGGACAARCRLSSRPRLCHRACGTCCARCNCVPPGTSGN...,1
1,PPepDB_252,GTDSGRFCSSICGQRCSKAGMKDRCMKFCGICCGKCKCVPSGTYGN...,1
2,PPepDB_256,KEICCKELTKPVKCSSDPLCQKLCMEKEKYEDGHCFTILSKCLCMK...,1
3,PPepDB_257,KEICCKVPTTPFLCTNDPQCKTLCSKVNYEDGHCFDILSKCVCMNR...,1
4,PPepDB_258,KLCERPSGTWSGVCGNNNACKNQCINLEKARHGSCNYVFPAHKCIC...,1


In [26]:
# (rows, columns) of the second table.
df.shape

(122, 3)

## Verificar duplicados

In [27]:
# Count the distinct values in the "sequence" column; a count lower than the
# number of rows would mean some sequence appears more than once.
df["sequence"].unique().shape

(122,)

In [28]:
# Deduplicate on the sequence itself, keeping the first occurrence.
# A plain drop_duplicates() only removes rows that are identical in every
# column, so the same sequence stored under two different IDs would survive.
df = df.drop_duplicates(subset="sequence").copy()

In [29]:
# (rows, columns) after removing duplicates.
df.shape

(122, 3)

## Verificar canónicos

In [30]:
# One-letter codes of the 20 canonical amino acids
aminoacidos = {*"ACDEFGHIKLMNPQRSTVWY"}

In [31]:
# Flag every sequence made up only of canonical amino-acid letters
df["is_valid"] = [aminoacidos.issuperset(seq) for seq in df["sequence"]]

In [32]:
# Rows whose sequence contains at least one non-canonical character
no_validas = df.loc[~df["is_valid"]]
print("Secuencias inválidas:", no_validas.shape[0])

Secuencias inválidas: 7


In [33]:
# Keep only the valid rows and discard the helper flag column
df_validas = df.loc[df["is_valid"]].drop(columns=["is_valid"])
print("Secuencias válidas restantes:", df_validas.shape[0])

Secuencias válidas restantes: 115


In [34]:
# Show the validated table (the rendered output is truncated for long frames).
display(df_validas)

Unnamed: 0,PPepDB_ID,sequence,label
0,PPepDB_245,DSYKKIDCGGACAARCRLSSRPRLCHRACGTCCARCNCVPPGTSGN...,1
1,PPepDB_252,GTDSGRFCSSICGQRCSKAGMKDRCMKFCGICCGKCKCVPSGTYGN...,1
2,PPepDB_256,KEICCKELTKPVKCSSDPLCQKLCMEKEKYEDGHCFTILSKCLCMK...,1
3,PPepDB_257,KEICCKVPTTPFLCTNDPQCKTLCSKVNYEDGHCFDILSKCVCMNR...,1
4,PPepDB_258,KLCERPSGTWSGVCGNNNACKNQCINLEKARHGSCNYVFPAHKCIC...,1
...,...,...,...
117,PPepDB_3950,ITCGLVASKLAPCIGYLQGAPGPSAACCGGIKSLNSAAASPADRKT...,1
118,PPepDB_3952,QKLCERPSGTWSGVCGNSNACKNQCINLEKARHGSCNYVFPAHKCI...,1
119,PPepDB_3959,AISCGQVSSALSPCISYARGNGAKPPAACCSGVKRLAGAAQSTADK...,1
120,PPepDB_3960,AITCGQVSSALGPCAAYAKGSGTSPSAGCCSGVKRLAGLARSTADK...,1


In [37]:
# Independent copy holding only the two columns that get exported
df_filtered = df_validas.loc[:, ['sequence', 'label']].copy()
display(df_filtered.head())

Unnamed: 0,sequence,label
0,DSYKKIDCGGACAARCRLSSRPRLCHRACGTCCARCNCVPPGTSGN...,1
1,GTDSGRFCSSICGQRCSKAGMKDRCMKFCGICCGKCKCVPSGTYGN...,1
2,KEICCKELTKPVKCSSDPLCQKLCMEKEKYEDGHCFTILSKCLCMK...,1
3,KEICCKVPTTPFLCTNDPQCKTLCSKVNYEDGHCFDILSKCVCMNR...,1
4,KLCERPSGTWSGVCGNNNACKNQCINLEKARHGSCNYVFPAHKCIC...,1


In [39]:
# Write the sequence/label table to CSV with a header row and without the index column.
df_filtered.to_csv('PlantPepDB_labeled.csv', index=False, header=True)

## Obtener metadatos

In [40]:
import json

In [41]:
def export_json(path_to_export, data_to_export):
    """Write ``data_to_export`` to ``path_to_export`` as indented JSON.

    Args:
        path_to_export: Destination file path; an existing file is overwritten.
        data_to_export: JSON-serializable object. Values the ``json`` module
            cannot encode natively are converted with ``str``.

    The output keeps non-ASCII characters unescaped (``ensure_ascii=False``),
    so the file is opened explicitly as UTF-8 instead of relying on the
    platform's default encoding.
    """
    with open(path_to_export, 'w', encoding='utf-8') as doc_export:
        json.dump(
            data_to_export,
            doc_export,
            indent=4,
            default=str,
            ensure_ascii=False)

In [42]:
def create_metada_with_multiple_values(df_metada_filter, full_df):
    """Collapse a metadata table into a flat dictionary, one entry per column.

    Args:
        df_metada_filter: DataFrame whose columns are the metadata fields.
        full_df: Sized object (e.g. the sequence DataFrame); its length is
            stored under ``"number_of_sequences"``.

    Returns:
        dict mapping each column name to:
          * ``None`` when the column has no values (the table has no rows),
          * the single value unchanged when the column has one distinct value,
          * the distinct values converted to ``str`` and joined with ``";"``
            otherwise,
        plus the ``"number_of_sequences"`` entry.
    """
    dict_metadata = {}

    for column in df_metada_filter.columns:
        values = df_metada_filter[column].unique().tolist()

        if not values:
            # An empty table (e.g. no row matched the requested source) has
            # nothing to report; indexing values[0] here would raise IndexError.
            dict_metadata[column] = None
        elif len(values) > 1:
            dict_metadata[column] = ";".join(str(value) for value in values)
        else:
            dict_metadata[column] = values[0]

    dict_metadata["number_of_sequences"] = len(full_df)

    return dict_metadata

In [43]:
def read_metadata(path_data, name_source):
    """Read the metadata spreadsheet and return the descriptive columns of one source.

    Args:
        path_data: Path of the Excel file, passed to ``pd.read_excel``.
        name_source: Value the ``"name source"`` column must equal exactly
            for a row to be kept.

    Returns:
        DataFrame with the matching rows restricted to the selected metadata columns.
    """
    selected_columns = [
        'type source',
        'estatico-dinamico',
        'licencia',
        'año de publicación',
        'fecha ultima actualizacion',
        'download date',
        'formato',
        'peptide property',
        'informacion del dataset',
        'unidad de medida',
        'Construccion de dataset negativos',
        'repositorio o servidor',
        'Publicacion',
    ]
    sheet = pd.read_excel(path_data)
    matches_source = sheet["name source"] == name_source
    return sheet.loc[matches_source, selected_columns]

In [45]:
# NOTE(review): absolute path (looks like a Colab session directory — confirm);
# the spreadsheet must exist there before this cell runs.
# read_metadata matches the source name exactly, so its spelling and case must
# agree with the "name source" column of the spreadsheet.
df_metada_filter = read_metadata("/content/description_raw_data.xlsx", "Plantpepdb")

In [46]:
# Build the metadata dictionary; the sequence count is taken from df_validas.
dict_metadata = create_metada_with_multiple_values(df_metada_filter, df_validas)

In [47]:
# Write the metadata dictionary to metadata.json (relative to the working directory).
export_json("metadata.json", dict_metadata)