In [2]:
import pandas as pd
from Bio import SeqIO
import csv
from sklearn.model_selection import train_test_split
import numpy as np

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


# New merge toxicity

In [13]:

# Specify the input FASTA file and output CSV file
fasta_files = ['../data/tox/merged_nontoxic_peptides.fa', '../data/tox/merged_toxic_peptides.fa']
for label, input_fasta in enumerate(fasta_files):
    output_csv = input_fasta.replace('.fa', '.tsv')

    # Open the output CSV file
    with open(output_csv, "w", newline='') as csvfile:
        # Create a CSV writer object
        csv_writer = csv.writer(csvfile, delimiter='\t')

        # Write the header row
#         csv_writer.writerow(["ID", "Length", "Sequence", "Target"])

        # Parse the FASTA file and write each record to the CSV
        for record in SeqIO.parse(input_fasta, "fasta"):
            seq_len = len(str(record.seq))
            seq_id = record.id.split(',')[0].split(';')[0]
            csv_writer.writerow([seq_id, seq_len, str(record.seq).upper(), str(label)])

    print(f"FASTA file '{input_fasta}' has been converted to CSV file '{output_csv}'")
list_fasta = ' '.join([input_fasta.replace('.fa', '.tsv') for input_fasta in fasta_files])
out_fname = '../data/tox/tox_merged_all.tsv'
!cat {list_fasta} > {out_fname}

FASTA file '../data/tox/merged_nontoxic_peptides.fa' has been converted to CSV file '../data/tox/merged_nontoxic_peptides.tsv'
FASTA file '../data/tox/merged_toxic_peptides.fa' has been converted to CSV file '../data/tox/merged_toxic_peptides.tsv'


In [14]:
# Split the merged toxicity TSV into stratified train / validation / test sets
# (80% / 10% / 10%) and write each split as a headerless TSV.
#
# keep_default_na=False: by default pandas turns string fields such as "NA" or
# "NULL" into NaN. Both are valid amino-acid strings (e.g. the dipeptide Asn-Ala),
# and this dataset has no length filter, so such sequences would silently be
# written back out as empty fields.
df = pd.read_csv(out_fname, sep='\t', header=None, keep_default_na=False)

# The last column is the target variable; everything before it is kept as features.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Perform the 80-20 split first
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Split the remaining 20% into validation and test sets (10% each of the original data)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Combine X and y for each set so every split has the same column layout as the input
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Sanity check: the three splits must partition the input rows exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

# Write to TSV files
train_df.to_csv('../data/tox/tox_train.tsv', sep='\t', index=False, header=False)
val_df.to_csv('../data/tox/tox_validation.tsv', sep='\t', index=False, header=False)
test_df.to_csv('../data/tox/tox_test.tsv', sep='\t', index=False, header=False)

# AMP New 2024_08_09

In [40]:
# Build one headerless TSV per AMP dataset. The index of a file in `fasta_files`
# is its label: 0 = non-AMP background, 1 = all AMPs, 2-5 = AMPs whose FASTA
# description contains the activity name stored in `all_labels`.
fasta_files = ['../data/AMP_new/final_uniprotkb_nonAMP_2024_08_09.fa', '../data/AMP_new/final_AMP.fa']
all_labels = {2: 'anti_Gram-', 3: 'anti_Gram+', 4: 'antifungal', 5: 'antiviral'}

# The AMP FASTA is re-read once per activity, so repeat its path once for
# every entry of `all_labels`.
fasta_files.extend([fasta_files[-1]] * len(all_labels))

for label, input_fasta in enumerate(fasta_files):
    if label < 2:
        output_csv = input_fasta.replace('.fa', '.tsv')
    else:
        output_csv = input_fasta.replace('.fa', f'_{all_labels[label]}.tsv')

    # Every file except the non-AMP background (label 0) is a positive class.
    target = str(1 if label > 0 else 0)

    num_lines = 0
    with open(output_csv, "w", newline='') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter='\t')

        for record in SeqIO.parse(input_fasta, "fasta"):
            sequence = str(record.seq)
            seq_len = len(sequence)

            # Keep only peptides of 8 to 100 residues.
            if not 8 <= seq_len <= 100:
                continue
            # Activity-specific files keep only records annotated with that activity.
            if label >= 2 and all_labels[label] not in record.description:
                continue

            seq_id = record.id.split(',')[0].split(';')[0]
            csv_writer.writerow([seq_id, seq_len, sequence.upper(), target])
            num_lines += 1

    print(f"FASTA file '{input_fasta}'' has been converted to TSV file '{output_csv}' with {num_lines} lines.")


FASTA file '../data/AMP_new/final_uniprotkb_nonAMP_2024_08_09.fa'' has been converted to TSV file '../data/AMP_new/final_uniprotkb_nonAMP_2024_08_09.tsv' with 30205 lines.
FASTA file '../data/AMP_new/final_AMP.fa'' has been converted to TSV file '../data/AMP_new/final_AMP.tsv' with 25004 lines.
FASTA file '../data/AMP_new/final_AMP.fa'' has been converted to TSV file '../data/AMP_new/final_AMP_anti_Gram-.tsv' with 15288 lines.
FASTA file '../data/AMP_new/final_AMP.fa'' has been converted to TSV file '../data/AMP_new/final_AMP_anti_Gram+.tsv' with 14864 lines.
FASTA file '../data/AMP_new/final_AMP.fa'' has been converted to TSV file '../data/AMP_new/final_AMP_antifungal.tsv' with 6305 lines.
FASTA file '../data/AMP_new/final_AMP.fa'' has been converted to TSV file '../data/AMP_new/final_AMP_antiviral.tsv' with 2499 lines.


In [43]:
all_labels[1] = ''
for label, sufix in all_labels.items():
    input_tsv_active = input_fasta.replace('.fa', '.tsv') if label < 2 \
    else input_fasta.replace('.fa', f'_{all_labels[label]}.tsv')
    input_tsv_non_active = fasta_files[0].replace('.fa', '.tsv')
    list_fasta = ' '.join([input_tsv_non_active, input_tsv_active])
    out_tsv = f'../data/AMP_new/AMP_2024_09_13_{sufix}.tsv'
    !cat {list_fasta} > {out_tsv}
    !wc -l {out_tsv}

   45493 ../data/AMP_new/AMP_2024_09_13_anti_Gram-.tsv
   45069 ../data/AMP_new/AMP_2024_09_13_anti_Gram+.tsv
   36510 ../data/AMP_new/AMP_2024_09_13_antifungal.tsv
   32704 ../data/AMP_new/AMP_2024_09_13_antiviral.tsv
   55209 ../data/AMP_new/AMP_2024_09_13_.tsv


In [38]:
# Split the merged AMP / non-AMP TSV into stratified train / validation / test
# sets (80% / 10% / 10%) and write each split as a headerless TSV.
#
# Read the AMP merge by explicit path. This cell previously read `out_fname`,
# which is only ever assigned to the toxicity merge, so a fresh Run All wrote
# toxicity data under the AMP file names.
# NOTE(review): the full AMP-vs-non-AMP merge is assumed to be the intended
# input; confirm, since the output names below still carry the 2024_08_09 date.
amp_merged_tsv = '../data/AMP_new/AMP_2024_09_13_.tsv'

# keep_default_na=False keeps string fields such as "NA" from being parsed as NaN.
df = pd.read_csv(amp_merged_tsv, sep='\t', header=None, keep_default_na=False)

# The last column is the target variable; everything before it is kept as features.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Perform the 80-20 split first
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Split the remaining 20% into validation and test sets (10% each of the original data)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# Combine X and y for each set so every split has the same column layout as the input
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Sanity check: the three splits must partition the input rows exactly.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

# Write to TSV files
train_df.to_csv('../data/AMP_new/AMP_2024_08_09_train.tsv', sep='\t', index=False, header=False)
val_df.to_csv('../data/AMP_new/AMP_2024_08_09_validation.tsv', sep='\t', index=False, header=False)
test_df.to_csv('../data/AMP_new/AMP_2024_08_09_test.tsv', sep='\t', index=False, header=False)