In [2]:
import pandas as pd
import numpy as np
import os 
import sys
import json
import random
from Bio import SeqIO
from sklearn.model_selection import train_test_split

## Loading data

In [9]:
# File names: the raw TSV to read and the two processed JSON splits to write
FILENAME = "antimicrobial_peptides_dataset_raw.tsv"
OUTPUT_NAME = "amp_train.json"
OUTPUT_NAME_2 = "amp_test.json"

# Every path is resolved against the current working directory
_data_dir = os.path.join(os.getcwd(), 'data')
_processed_dir = os.path.join(_data_dir, 'processed')

PATH = os.path.join(_data_dir, 'raw', FILENAME)
OUTPUT_PATH = os.path.join(_processed_dir, OUTPUT_NAME)
OUTPUT_PATH_2 = os.path.join(_processed_dir, OUTPUT_NAME_2)
print(OUTPUT_PATH)

/home/developer/Projects/novo_dpo/data/processed/amp_train.json


In [5]:
# Read the raw tab-separated table located at PATH
df_raw = pd.read_csv(PATH, sep="\t")


## Functions for dataset processing

In [10]:
# Module-level list of removal counts; in the visible notebook it is only
# referenced by the commented-out draft of remove_last_n_chars.
REMOVALS = []


def parse_dictionaries(raw_dataset):
    """
    Build one {'prompt', 'completion'} record per entry of raw_dataset['Sequence'].

    Both keys of a record hold the same, unmodified sequence value.

    Args:
        raw_dataset: Object indexable by 'Sequence' that yields the sequences
            when iterated (e.g. a DataFrame column).

    Returns:
        list: One dictionary per sequence, in input order.
    """
    records = []
    for seq in raw_dataset['Sequence']:
        records.append({'prompt': seq, 'completion': seq})

    return records

# def  remove_last_n_chars(sequence, max_removals = None):
#     '''
#     Randomly removes chars from og sequence for prompting to llm
#     seq: str sequence of AAs
#     max_removals: maximum number of trailing characters that may be removed
#     '''
#     if len(sequence) <= 1:
#         return sequence  # Avoid removing everything

#     if max_removals is None:
#         max_removals = len(sequence) - 1  # Allow max possible removals

#     num_removals = random.randint(0,
#                                   min(max_removals, len(sequence)-1))
#     REMOVALS.append(num_removals)
    
#     shortened_sequence = sequence[: -num_removals] if num_removals > 0 else sequence

    

#     return shortened_sequence


def remove_last_n_chars(sequence, max_removals=None):
    '''
    Randomly cut a suffix off a sequence and return both pieces.

    At least one character is always left in the shortened sequence. When a
    cut is possible, between 1 and min(max_removals, len(sequence) - 1)
    trailing characters are removed, chosen with random.randint.

    Args:
        sequence (str): Sequence of characters (e.g., AAs)
        max_removals (int, optional): Maximum number of characters to remove.
            Defaults to None, meaning up to len(sequence) - 1. A value below 1
            disables removal instead of raising.

    Returns:
        tuple: (shortened_sequence, removed_part). removed_part is '' when no
        cut is possible (sequence of length <= 1 or max_removals < 1);
        otherwise shortened_sequence + removed_part == sequence.
    '''
    # Largest cut that still leaves one character in the prompt
    longest_cut = len(sequence) - 1
    if max_removals is not None:
        longest_cut = min(longest_cut, max_removals)

    # random.randint(1, 0) would raise ValueError, so bail out when nothing
    # can be removed (empty/one-char sequence, or a non-positive cap)
    if longest_cut < 1:
        return sequence, ''

    num_removals = random.randint(1, longest_cut)

    return sequence[:-num_removals], sequence[-num_removals:]


def parse_dataset_dictionaries( raw_df, max_removals = None):
    """
    Build records whose prompt is a truncated sequence and whose completion
    is the full, original sequence.

    NOTE: a later definition with the same name in this cell replaces this
    one when the cell runs, so this variant is not the one callers get.

    Args:
        raw_df: DataFrame (or mapping) with a 'Sequence' column
        max_removals: Maximum number of characters to remove per sequence

    Returns:
        list: One {'prompt', 'completion'} dictionary per sequence.
    """
    dictionaries = []

    for seq in raw_df['Sequence']:
        # remove_last_n_chars returns (shortened, removed); only the
        # shortened string belongs in 'prompt', not the whole tuple
        shortened, _removed = remove_last_n_chars(seq, max_removals)
        dictionaries.append({'prompt': shortened,
                             'completion': seq})

    return dictionaries


def parse_dataset_dictionaries(raw_df, max_removals=None):
    """
    Turn every sequence into a prompt/completion record.

    Each sequence is split by remove_last_n_chars; the kept prefix is stored
    under 'prompt' and the removed suffix under 'completion'.

    Args:
        raw_df: DataFrame containing a 'Sequence' column
        max_removals: Upper bound on removed characters, forwarded unchanged
            to remove_last_n_chars

    Returns:
        List of dictionaries with prompt-completion pairs, in input order
    """
    # Lazily split each sequence, then wrap every (prefix, suffix) pair
    splits = (remove_last_n_chars(seq, max_removals) for seq in raw_df['Sequence'])

    return [{'prompt': prefix, 'completion': suffix} for prefix, suffix in splits]




def save_dataset(dictionaries, output_path):
    """
    Serialize a dataset to a JSON file with 4-space indentation.

    The parent directory of output_path is created when it does not exist,
    so the function also works before the output folder has been set up.

    Args:
        dictionaries: JSON-serializable object (list of records or dict of lists)
        output_path: Destination file path; an existing file is overwritten
    """
    parent_dir = os.path.dirname(output_path)
    if parent_dir:  # '' means a bare file name in the current directory
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as handle:
        json.dump(dictionaries,
                  handle,
                  indent=4)
        

def fasta_to_dataframe(fasta_file):
    """
    Load a FASTA file into a two-column pandas DataFrame.

    Columns:
    - 'entry': the `id` attribute of each parsed record (presumably the first
      token of the header line, without '>' - confirm against Biopython docs)
    - 'sequence': the record's sequence converted to a plain string

    Parameters:
        fasta_file (str): Path to the FASTA file

    Returns:
        pd.DataFrame: DataFrame with columns ['entry', 'sequence'], one row
        per record in file order
    """
    # Parse everything up front so both columns come from the same records
    records = list(SeqIO.parse(fasta_file, "fasta"))

    return pd.DataFrame({
        "entry": [rec.id for rec in records],
        "sequence": [str(rec.seq) for rec in records],
    })


import pandas as pd
import random

def parse_sequences_to_dict(df, max_removals=None):
    """
    Parses sequences from a DataFrame into prompt-completion pairs.

    Each valid sequence is cut at a random position near its end: the kept
    prefix becomes the prompt and the removed suffix becomes the completion.
    Missing values, non-string values and blank strings are skipped.

    Args:
        df (pd.DataFrame): DataFrame containing a 'Sequence' column
        max_removals (int, optional): Max characters to remove from each
            sequence. None (default) allows up to len(sequence) - 1 for every
            sequence individually. A value below 1 disables removal.

    Returns:
        dict: Dictionary with 'prompt' and 'completion' lists of equal length.
        For every kept sequence, prompt + completion equals the original.
        Sequences that cannot be cut (length 1, or max_removals < 1) are kept
        with the whole sequence as prompt and '' as completion.

    Raises:
        ValueError: If df has no 'Sequence' column.
    """
    if 'Sequence' not in df.columns:
        raise ValueError("DataFrame must contain 'Sequence' column")

    result = {
        "prompt": [],
        "completion": []
    }

    for seq in df['Sequence']:
        # NaN/None are not str instances, so this one check also covers
        # missing values; blank strings are skipped as well
        if not isinstance(seq, str) or len(seq.strip()) == 0:
            continue

        # Compute the cap per sequence in a local variable. Overwriting
        # max_removals here would freeze the first sequence's length as the
        # cap for all later sequences.
        longest_cut = len(seq) - 1
        if max_removals is not None:
            longest_cut = min(longest_cut, max_removals)

        # random.randint(1, 0) raises ValueError, so handle sequences that
        # cannot be cut explicitly
        if longest_cut < 1:
            result["prompt"].append(seq)
            result["completion"].append("")
            continue

        num_removals = random.randint(1, longest_cut)
        result["prompt"].append(seq[:-num_removals])
        result["completion"].append(seq[-num_removals:])

    return result

## Execution

In [12]:
# One seed for both sources of randomness: train_test_split takes it via
# random_state, while the truncation points come from Python's `random`
# module, which must be seeded separately for reproducible JSON files.
SEED = 42
random.seed(SEED)

df_train, df_test = train_test_split(df_raw, test_size=0.2, random_state=SEED)

# Parse each split
train_dict = parse_sequences_to_dict(df_train)
test_dict = parse_sequences_to_dict(df_test)

save_dataset(dictionaries = train_dict,
             output_path = OUTPUT_PATH)

save_dataset(dictionaries = test_dict,
             output_path = OUTPUT_PATH_2)

#dataset_dictionary = parse_sequences_to_dict(df_raw)



In [13]:
# Load a separate parquet file (path relative to the working directory) and
# preview its first rows; the preview output shows `prompt` and `completion`
# text columns.
df = pd.read_parquet("data/train-00000-of-00001.parquet")
df.head()

Unnamed: 0,prompt,completion
0,SUBREDDIT: r/relationships\n\nTITLE: I (f/22) ...,I still have contact with an old ex's friends...
1,SUBREDDIT: r/loseit\n\nTITLE: SV & NSV! Keepin...,"Progress is still happening, even when you th..."
2,SUBREDDIT: r/relationships\n\nTITLE: Me [19F] ...,My skin is scarred badly; what could I do/say...
3,SUBREDDIT: r/personalfinance\n\nTITLE: Priorit...,$14k in student debt (all <5%) and need to sa...
4,SUBREDDIT: r/relationships\n\nTITLE: My[25m] g...,"GF is a meanie-bo-beanie when I'm nice, and a..."


In [16]:
# Write the frame loaded from parquet to a JSON file with 4-space indentation
df.to_json("data/train.json", indent=4)