In [30]:
import pandas as pd
import numpy as np
import re
from scipy.stats import gmean

## Process Original Dataset

In [31]:
# Load the two raw input tables.
# The GRAMPA export carries a stray "Unnamed: 0" column, which is discarded here.
grampa = pd.read_csv("labeled_data/raw_data/origin_data/grampa_v2.csv")
grampa = grampa.drop(columns=["Unnamed: 0"])

uniprot = pd.read_csv("labeled_data/raw_data/origin_data/origin_negative.csv")

In [32]:
# Clean the UniProt table and export it as the "not immunogenic" set.
# This cell reassigns `uniprot`; the "_amd" suffix below is only added when it
# is not already present, so re-running the cell does not produce "_amd_amd".
uniprot = uniprot.rename(columns={"Sequence": "sequence"})

# Drop every sequence that contains one of the residue codes B, X, Z, O or U.
uniprot = uniprot[~uniprot["sequence"].str.contains("B|X|Z|O|U")].reset_index(drop=True)
uniprot["Keywords"] = uniprot["Keywords"].fillna("")

# Tag peptides whose keywords mention "amidation" (case-insensitive substring
# match) by appending "_amd" to the sequence.
is_amidated = uniprot["Keywords"].str.lower().str.contains("amidation", regex=False)
already_tagged = uniprot["sequence"].str.endswith("_amd")
uniprot["sequence"] = uniprot["sequence"].mask(
    is_amidated & ~already_tagged,
    uniprot["sequence"] + "_amd",
)

# Keep only the first occurrence of each (tagged) sequence.
uniprot = uniprot.drop_duplicates(subset="sequence").reset_index(drop=True)

# IDs are 1-based positions in the de-duplicated table: UID1, UID2, ...
uniprot["ID"] = ["UID" + str(n) for n in range(1, len(uniprot) + 1)]
uniprot["immunogenic"] = "No"
# -9999 is written as the MIC for every UniProt row (no measured value here).
uniprot["MIC"] = -9999

uniprot.to_csv(
    "labeled_data/raw_data/uniprot.csv",
    columns=["ID", "sequence", "immunogenic", "MIC"],
    index=False,
)

In [33]:
# Reduce GRAMPA to one row per sequence for E. coli and export it as the
# "immunogenic" set.
# NOTE: this cell overwrites `grampa` with the aggregated table, which no
# longer has the "bacterium" column read on the first line of the chain.
grampa = (
    grampa[grampa["bacterium"] == "E. coli"]
    .assign(
        # MIC is 10 raised to `value` (presumably `value` is log10 MIC -- confirm).
        MIC=lambda df: 10 ** df["value"],
        # C-terminally amidated peptides get an "_amd" suffix on the sequence.
        sequence=lambda df: [
            seq + "_amd" if amidated else seq
            for seq, amidated in zip(df["sequence"], df["has_cterminal_amidation"])
        ],
    )
    # Rows flagged as having an unusual modification are excluded.
    .loc[lambda df: ~df["has_unusual_modification"]]
    # Collapse repeated measurements of a sequence to their geometric mean.
    .groupby("sequence")
    .agg({"MIC": gmean})
    .reset_index()
)

# IDs are 1-based positions in the aggregated table: GID1, GID2, ...
grampa["ID"] = ["GID" + str(n) for n in range(1, len(grampa) + 1)]
grampa["immunogenic"] = 'Yes'

grampa.to_csv(
    "labeled_data/raw_data/grampa.csv",
    columns=["ID", "sequence", "immunogenic", "MIC"],
    index=False,
)

## Create *immuno_peptides.txt*

In [34]:
# Assemble the combined table and write it to immuno_peptides.txt
col_to_exp = ["ID", "sequence", "immunogenic"]
immuno = pd.concat([frame[col_to_exp] for frame in (grampa, uniprot)], sort=False)

# A sequence that occurs more than once in the combined table is removed
# entirely (keep=False drops every copy, not just the repeats).
immuno = immuno.drop_duplicates(subset="sequence", keep=False)

immuno.to_csv("labeled_data/tables/immuno_peptides.txt", index=False, sep=",", mode="w")

## Create *[ID]_graph.txt* Files

In [35]:
# Pull the IDs and sequences out of the combined table as plain Python lists
# (same row order), and record how many peptides there are.
ids = immuno["ID"].tolist()
sequences = immuno["sequence"].tolist()
num_sequences = len(sequences)

In [36]:
# Write one "<ID>_graph.txt" file per peptide: a MONOMERS section listing the
# numbered residues, then a BONDS section linking each residue to the next.
for peptide_id, seq in zip(ids, sequences):

    filename = "labeled_data/dataset/classification/" + str(peptide_id) + "_graph.txt"

    # Each token starts at a capital letter and runs up to the next one, so any
    # trailing non-capital characters (e.g. "_amd") stay attached to their residue.
    residues = re.findall('[A-Z][^A-Z]*', seq)

    # 1-based residue numbering: "<position> <residue>"
    monomer_lines = [
        str(position) + " " + str(residue)
        for position, residue in enumerate(residues, start=1)
    ]

    # Consecutive residues k and k+1 are joined by a bond labelled "AMB".
    bond_lines = [
        str(k) + " " + str(k + 1) + " AMB"
        for k in range(1, len(residues))
    ]

    # The "\n" entry, combined with the newline added to every entry on write,
    # leaves two blank lines between the MONOMERS and BONDS sections.
    data = ["MONOMERS"] + monomer_lines + ["\n", "BONDS"] + bond_lines

    with open(filename, "w") as txt_file:
        txt_file.write("".join(entry + "\n" for entry in data))