## Carga y parseo de conjunto de datos

In [1]:
import pandas as pd

In [2]:
def remove_alternate_lines(input_file, output_file):
    """
    Copy every second line of a text file into a new file.

    Lines are counted from zero and only the odd-numbered ones (the 2nd,
    4th, 6th, ... line of the file) are written; the 1st, 3rd, 5th, ...
    lines are dropped. `output_file` is created or overwritten.
    """
    with open(input_file, 'r') as source, open(output_file, 'w') as target:
        # `position % 2` is truthy exactly for the odd zero-based positions.
        target.writelines(
            text for position, text in enumerate(source) if position % 2
        )

In [3]:
# Source text file and the destination file for the retained lines.
input_filename = 'AFP.txt'
output_filename = 'PrMFTP.csv'

# Write only every second line of the input file to the output file.
remove_alternate_lines(input_file=input_filename, output_file=output_filename)

print(f"Alternate lines removed. Output saved to {output_filename}")

Alternate lines removed. Output saved to PrMFTP.csv


In [4]:
# Load the one-sequence-per-line file written by the previous cell.
# - Reading `output_filename` (instead of a hard-coded '/content/...' path) keeps
#   this cell pointed at the file that was actually produced.
# - keep_default_na=False stops pandas from turning literal sequences such as
#   "NA" into NaN, which would later break the per-residue validity check.
df = pd.read_csv(
    output_filename,
    header=None,
    names=['Sequence'],
    dtype=str,
    keep_default_na=False,
)

In [5]:
# Preview the first five parsed sequences (five is the default row count).
df.head()

Unnamed: 0,Sequence
0,AAGMGFFGAR
1,AAHGACHVRNGKHMCFCYF
2,AAKKSFIIKQKLAKAKNQNRPLPQWFRLKTNNTIRYNAKRRHWRRT...
3,AALKGCWTKSIPPKPCSGKR
4,AALRGCWTKSIPPKPCPGKR


In [6]:
# Add a label column (whether the peptide has antifungal activity or not);
# every row in this frame is assigned 1.
df['label'] = 1

In [8]:
# Frame dimensions as (rows, columns).
df.shape

(2324, 2)

## Verificar duplicados

In [9]:
# Number of distinct sequences; if it equals the row count, no sequence is repeated.
df["Sequence"].unique().shape

(2324,)

In [12]:
# Deduplicate on the sequence itself (the column the uniqueness check above
# inspects) rather than on every column, keeping the first occurrence.
# .copy() gives later cells an independent frame to add columns to.
df = df.drop_duplicates(subset="Sequence").copy()

In [13]:
# Frame dimensions after the duplicate-removal step, for comparison with the earlier shape.
df.shape

(2324, 2)

## Verificar canónicos

In [14]:
# One-letter codes of the canonical amino acids.
aminoacidos = {residue for residue in "ACDEFGHIKLMNPQRSTVWY"}

In [15]:
# Flag each sequence as valid when every character is a canonical residue.
df["is_valid"] = [aminoacidos.issuperset(peptide) for peptide in df["Sequence"]]

In [16]:
# Rows whose sequence failed the canonical-residue check.
no_validas = df.loc[~df["is_valid"]]
print("Secuencias inválidas:", len(no_validas))

Secuencias inválidas: 0


In [17]:
# Keep only the rows that passed the check and discard the helper flag column.
df_validas = df.loc[df["is_valid"]].drop("is_valid", axis="columns")
print("Secuencias válidas restantes:", len(df_validas))

Secuencias válidas restantes: 2324


In [18]:
# Render the validated sequences together with their labels.
display(df_validas)

Unnamed: 0,Sequence,label
0,AAGMGFFGAR,1
1,AAHGACHVRNGKHMCFCYF,1
2,AAKKSFIIKQKLAKAKNQNRPLPQWFRLKTNNTIRYNAKRRHWRRT...,1
3,AALKGCWTKSIPPKPCSGKR,1
4,AALRGCWTKSIPPKPCPGKR,1
...,...,...
2319,YRGGYTGPIPRPPPIGRPPLRLVVCACYRLSVSDARNCCIKFGSCC...,1
2320,YSRCQLQGFNCVVRSYGLPTIPCCRGLTCRSYFPGSTYGRCQRY,1
2321,YSYKKIDCGGACAARCRLSSRPRLCNRACGTCCARCNCVPPGTSGN...,1
2322,YVPKIPKPQPNKPNFPSFPGHGPFNPHASRFPRSPKDNGKIVFDAK...,1


In [19]:
# Use a lowercase column name for the exported table, then preview the result.
df_validas = df_validas.rename({'Sequence': 'sequence'}, axis='columns')
display(df_validas.head(5))

Unnamed: 0,sequence,label
0,AAGMGFFGAR,1
1,AAHGACHVRNGKHMCFCYF,1
2,AAKKSFIIKQKLAKAKNQNRPLPQWFRLKTNNTIRYNAKRRHWRRT...,1
3,AALKGCWTKSIPPKPCSGKR,1
4,AALRGCWTKSIPPKPCPGKR,1


In [20]:
# Persist the labeled sequences without the index; the header row is written by default.
df_validas.to_csv('PrMFTP_labeled.csv', index=False)

## Obtener metadatos

In [21]:
import json

In [22]:
def export_json(path_to_export, data_to_export):
    """
    Serialize `data_to_export` as pretty-printed JSON at `path_to_export`.

    The file is created or overwritten. Output uses a 4-space indent, keeps
    non-ASCII characters unescaped, and falls back to `str()` for values the
    JSON encoder cannot serialize natively.
    """
    # ensure_ascii=False writes raw non-ASCII characters, so the file encoding
    # is pinned to UTF-8 instead of being left to the platform default.
    with open(path_to_export, 'w', encoding='utf-8') as doc_export:
        json.dump(
            data_to_export,
            doc_export,
            indent=4,
            default=str,
            ensure_ascii=False)

In [23]:
def create_metada_with_multiple_values(df_metada_filter, full_df):
    """
    Collapse a metadata table into a flat dictionary with one entry per column.

    Parameters:
        df_metada_filter: DataFrame whose columns are the metadata fields.
        full_df: sized object (e.g. the sequence DataFrame) whose length is
            recorded under "number_of_sequences".

    Returns:
        dict mapping each column name to:
          - None when the column has no values (the table has no rows),
          - the single distinct value when there is exactly one,
          - the distinct values converted to str and joined with ";"
            when there are several,
        plus the key "number_of_sequences" holding len(full_df).
    """
    dict_metadata = {}

    for column in df_metada_filter.columns:
        values = df_metada_filter[column].unique().tolist()

        if not values:
            # An empty table previously failed on values[0]; record the
            # field as missing instead.
            dict_metadata[column] = None
        elif len(values) == 1:
            dict_metadata[column] = values[0]
        else:
            dict_metadata[column] = ";".join(str(value) for value in values)

    dict_metadata["number_of_sequences"] = len(full_df)

    return dict_metadata

In [24]:
def read_metadata(path_data, name_source):
    """
    Load the metadata spreadsheet and return the rows for one source.

    Parameters:
        path_data: path of the Excel file, passed to `pd.read_excel`.
        name_source: value to match against the "name source" column.

    Returns:
        DataFrame with the rows whose "name source" equals `name_source`,
        restricted to the descriptive columns listed below, in that order.
    """
    wanted_columns = [
        'type source',
        'estatico-dinamico',
        'licencia',
        'año de publicación',
        'fecha ultima actualizacion',
        'download date',
        'formato',
        'peptide property',
        'informacion del dataset',
        'unidad de medida',
        'Construccion de dataset negativos',
        'repositorio o servidor',
        'Publicacion',
    ]

    sheet = pd.read_excel(path_data)
    matches_source = sheet["name source"] == name_source
    return sheet[matches_source][wanted_columns]

In [25]:
# Metadata rows describing the "PrMFTP" source.
df_metada_filter = read_metadata(
    path_data="/content/description_raw_data.xlsx",
    name_source="PrMFTP",
)

In [26]:
# Flatten the metadata rows into a dict and record how many sequences were kept.
dict_metadata = create_metada_with_multiple_values(
    df_metada_filter=df_metada_filter,
    full_df=df_validas,
)

In [27]:
# Write the collected metadata next to the labeled CSV.
export_json(path_to_export="metadata.json", data_to_export=dict_metadata)