## Cargar y parsear archivo

In [15]:
import pandas as pd

In [16]:
# Load the raw BIOPEP-UWM spreadsheet into a DataFrame.
# NOTE(review): absolute path under /content (looks like a Colab upload
# location) — confirm the file is present there before running.
df = pd.read_excel("/content/biopep-umw.xlsx")

In [17]:
display(df.head())

Unnamed: 0,ID,Name,Sequence,Chem. mass,Monois. mass,Activity,InChlKey
0,2566,regulating cell-permeability peptide,NYKKPKL,890.078,889.5369,regulating,RQFNGWRPSKHMKK-NXBWRCJVSA-N
1,2567,regulating cell-permeability peptide,NYKKPKLAAAPALLALLVAPLLAVAA,2601.2117,2599.6143,regulating,SLFAVMFXXFBIEI-PHOGLEJUSA-N
2,2568,regulating cell-permeability peptide,AAVALLPAVLLALLAPAAANYKKPKL,2601.2117,2599.6143,regulating,SCGYKWYBPYHFEB-PHOGLEJUSA-N
3,2569,regulating cell-permeability peptide,NYKKPKLAAAAAVALLPAVLLALLAP,2601.2117,2599.6143,regulating,FCRVOEOQSNIOPS-PHOGLEJUSA-N
4,2570,VV-hemorphin-7,VVYPWTQRF,1195.3651,1194.6167,opioid,FFBBHLKDMHCFTH-AKYHLAPZSA-N


In [18]:
df.shape

(5360, 7)

In [19]:
# Keep only the rows annotated as antifungal. The activity label is stored
# with stray padding in this file (it only matched as 'antifungal '), so the
# column is stripped before comparing instead of relying on the exact padded
# literal; rows whose label has different surrounding whitespace are no
# longer dropped silently. Missing activities compare as False and are
# excluded.
df_afp = df[df["Activity"].str.strip() == "antifungal"]

In [20]:
df_afp.shape

(63, 7)

In [21]:
# Reduce the antifungal subset to the two columns used downstream, give them
# lowercase names, and overwrite the renamed activity column with the
# constant 1 so every row carries the same label value.
df_afp = (
    df_afp.loc[:, ["Sequence", "Activity"]]
    .rename(columns={"Sequence": "sequence", "Activity": "label"})
    .assign(label=1)
)

In [32]:
# Remove leading/trailing whitespace from every sequence string so the
# character-level validity check further down is not affected by padding.
df_afp["sequence"] = df_afp["sequence"].str.strip()

In [33]:
display(df_afp.head())

Unnamed: 0,sequence,label,is_valid
1257,FLSFPTTKTYFPHFDLSHGSAQVKGHGAK,1,False
1434,FKCRRWQWRW,1,False
2041,GWGSFFKKAAHVGKHVGKAALTHYL~,1,False
2042,GWGSFFKKAAHVGKHVGKAALTHYLG,1,False
2045,RWRSFFKKAAHRGKHVGKRARTHYL~,1,False


## Verificar duplicados

In [34]:
df_afp["sequence"].unique().shape

(63,)

In [35]:
# Drop repeated peptides, keeping the first occurrence of each one.
# Duplicates are decided on the "sequence" column only (the same column the
# uniqueness check above counts), so any other column present in the frame
# cannot make two rows with the same sequence look distinct.
df_afp = df_afp.drop_duplicates(subset="sequence").copy()

In [36]:
df_afp.shape

(63, 3)

## Verificar canónicos

In [37]:
# Define the canonical amino acids: the 20 one-letter codes a sequence may
# contain to be accepted by the validity check.
aminoacidos = set("ACDEFGHIKLMNPQRSTVWY")

In [38]:
# Flag which sequences are valid: a sequence passes only if it is a
# non-empty string made up exclusively of canonical residues.
# - Non-string entries (e.g. a missing value, which is not iterable) are
#   marked invalid instead of raising inside set().
# - Empty strings are marked invalid; an empty set is a subset of anything,
#   so without the length check they would pass vacuously.
df_afp["is_valid"] = [
    isinstance(seq, str) and len(seq) > 0 and set(seq).issubset(aminoacidos)
    for seq in df_afp["sequence"]
]

In [39]:
# Collect the rows whose sequence failed the validity check and report how
# many there are.
no_validas = df_afp.loc[~df_afp["is_valid"]]
print("Secuencias inválidas:", no_validas.shape[0])

Secuencias inválidas: 14


In [40]:
# Keep only the rows flagged as valid and discard the helper flag column,
# then report how many sequences remain.
df_validas = df_afp.loc[df_afp["is_valid"]].drop(labels=["is_valid"], axis="columns")
print("Secuencias válidas restantes:", df_validas.shape[0])

Secuencias válidas restantes: 49


In [41]:
# Persist the validated, labelled sequences as CSV; index=False leaves the
# DataFrame index out of the file and header=True writes the column names.
df_validas.to_csv('BIOPEP-UWM_labeled.csv', index=False, header=True)

## Obtener metadatos

In [42]:
import json

In [43]:
def export_json(path_to_export, data_to_export):
    """Serialize ``data_to_export`` as pretty-printed JSON at ``path_to_export``.

    The file is opened with an explicit UTF-8 encoding. Because
    ``ensure_ascii=False`` makes ``json.dump`` emit non-ASCII characters
    verbatim, relying on the platform's default text encoding could raise
    ``UnicodeEncodeError`` or produce a file that is not valid UTF-8 JSON.

    Args:
        path_to_export: Destination file path. An existing file is
            overwritten.
        data_to_export: Object to serialize. Values that ``json`` cannot
            encode natively are converted with ``str`` (``default=str``).
    """
    with open(path_to_export, 'w', encoding='utf-8') as doc_export:
        json.dump(
            data_to_export,
            doc_export,
            indent=4,        # human-readable, 4-space indentation
            default=str,     # fall back to str() for non-JSON types (dates, etc.)
            ensure_ascii=False)

In [44]:
def create_metada_with_multiple_values(df_metada_filter, full_df):
    """Collapse a metadata table into a flat dictionary, one entry per column.

    For every column of ``df_metada_filter`` the distinct values are taken in
    order of first appearance:

    * several distinct values -> a single string with the values converted
      to ``str`` and joined by ``";"``;
    * exactly one distinct value -> that value, unchanged;
    * no values at all (the table has no rows) -> ``None``, instead of
      failing with ``IndexError`` on the empty list.

    Args:
        df_metada_filter: DataFrame holding the metadata rows to summarize.
        full_df: Sized object (here the sequence DataFrame) whose length is
            stored under ``"number_of_sequences"``.

    Returns:
        dict mapping each column name to its collapsed value, plus the
        ``"number_of_sequences"`` entry (which is written last and therefore
        wins over a column of the same name).
    """
    dict_metadata = {}

    for column in df_metada_filter.columns:
        values = df_metada_filter[column].unique().tolist()

        if len(values) > 1:
            dict_metadata[column] = ";".join(str(value) for value in values)
        elif values:
            dict_metadata[column] = values[0]
        else:
            # Empty table (e.g. the requested source was not found): there is
            # no value to report for this column.
            dict_metadata[column] = None

    dict_metadata["number_of_sequences"] = len(full_df)

    return dict_metadata

In [45]:
def read_metadata(path_data, name_source):
    """Read the dataset-description workbook and keep one source's metadata.

    Args:
        path_data: Path of the Excel file passed to ``pd.read_excel``.
        name_source: Value to match exactly against the ``"name source"``
            column.

    Returns:
        DataFrame with the rows whose ``"name source"`` equals
        ``name_source``, restricted to the descriptive columns listed below
        (in that order). It has no rows when nothing matches.
    """
    wanted_columns = [
        'type source',
        'estatico-dinamico',
        'licencia',
        'año de publicación',
        'fecha ultima actualizacion',
        'download date',
        'formato',
        'peptide property',
        'informacion del dataset',
        'unidad de medida',
        'Construccion de dataset negativos',
        'repositorio o servidor',
        'Publicacion',
    ]

    catalog = pd.read_excel(path_data)
    is_requested_source = catalog["name source"] == name_source
    return catalog.loc[is_requested_source, wanted_columns]

In [46]:
# Load the metadata rows describing the "BIOPEP-UWM" source from the
# dataset-description workbook.
# NOTE(review): absolute /content path — confirm it exists in the runtime.
df_metada_filter = read_metadata("/content/description_raw_data.xlsx", "BIOPEP-UWM")

In [47]:
# Flatten the metadata to one value per column; number_of_sequences is the
# row count of df_validas (the sequences kept after validation).
dict_metadata = create_metada_with_multiple_values(df_metada_filter, df_validas)

In [48]:
# Write the metadata dictionary to metadata.json (relative path, so it lands
# in the current working directory).
export_json("metadata.json", dict_metadata)