# Data Collection and Preprocessing

In [None]:
#Import
import os
import pandas as pd

### Load and combine data from NESG dataset

In [None]:
# Read the NESG accession/label table and discard its 'exp' column
NESG_df = pd.read_csv("./nesg_original/nesg.csv").drop(columns=["exp"])

In [None]:
#Load NESG sequences
def _iter_fasta_records(lines):
    """Yield ``(header, sequence)`` for every record in FASTA-formatted *lines*.

    ``header`` is the text after ``>`` without the trailing line break.
    ``sequence`` is the record's remaining lines joined with ALL whitespace
    removed. The final record is normalised exactly like the others.
    Lines that appear before the first header are ignored.
    """
    header = None
    chunks = []
    for line in lines:
        if line.startswith(">"):
            if header is not None:
                yield header, "".join("".join(chunks).split())
            header = line[1:].rstrip("\r\n")
            chunks = []
        elif header is not None:
            chunks.append(line)
    if header is not None:
        yield header, "".join("".join(chunks).split())


# Index the accessions once so every FASTA record is a dict lookup instead of
# a scan over the whole 'id' column.
rows_by_id = {}
for row_label, accession in NESG_df['id'].items():
    rows_by_id.setdefault(accession, []).append(row_label)

# seq_list is filled by row label, so it lines up with NESG_df when assigned
# as a column later (assumes NESG_df still has its default 0..n-1 index).
seq_list = [None] * len(NESG_df)
seq_count = 0
with open("./nesg_original/nesg.fasta", "r") as nesg_fasta:
    for header, sequence in _iter_fasta_records(nesg_fasta):
        seq_count += 1
        print(seq_count, end="\r")

        # Every FASTA header must match exactly one row of the label table
        idx = rows_by_id.get(header, [])
        assert len(idx) == 1, f"expected exactly one NESG row for {header!r}, found {len(idx)}"
        seq_list[idx[0]] = sequence

In [None]:
# Attach the parsed sequences as a new column and rename the id/label columns
NESG_df = NESG_df.assign(sequence=seq_list).rename(
    columns={"id": "AC", "sol": "NESG_label"}
)
NESG_df

### Load and combine data from PSI-Bio dataset

In [None]:
# Read the PSI-Bio accession/label table and discard the columns that are dropped here
PSI_BIO_df = pd.read_csv(
    "./PSI_Biology/psi_biology_nesg_partitioning_wl_th025_amT.csv"
).drop(columns=["priority", "between_connectivity", "cluster"])
PSI_BIO_df

In [None]:
#Load PSI-Bio sequences
def _iter_fasta_records(lines):
    """Yield ``(header, sequence)`` for every record in FASTA-formatted *lines*.

    ``header`` is the text after ``>`` without the trailing line break.
    ``sequence`` is the record's remaining lines joined with ALL whitespace
    removed. The final record is normalised exactly like the others.
    Lines that appear before the first header are ignored.
    """
    header = None
    chunks = []
    for line in lines:
        if line.startswith(">"):
            if header is not None:
                yield header, "".join("".join(chunks).split())
            header = line[1:].rstrip("\r\n")
            chunks = []
        elif header is not None:
            chunks.append(line)
    if header is not None:
        yield header, "".join("".join(chunks).split())


# Index the accessions once so every FASTA record is a dict lookup instead of
# a scan over the whole 'AC' column.
rows_by_ac = {}
for row_label, accession in PSI_BIO_df['AC'].items():
    rows_by_ac.setdefault(accession, []).append(row_label)

# seq_list is filled by row label, so it lines up with PSI_BIO_df when assigned
# as a column later (assumes PSI_BIO_df still has its default 0..n-1 index).
seq_list = [None] * len(PSI_BIO_df)
seq_count = 0
with open("./PSI_Biology/pET_full_without_his_tag.fa", "r") as psibio_fasta:
    for full_header, sequence in _iter_fasta_records(psibio_fasta):
        seq_count += 1
        print(seq_count, end="\r")

        # The accession is the part of the header before the first underscore
        header = full_header.split("_")[0]

        # Records whose accession is absent from, or ambiguous in, the label
        # table are skipped; the corresponding seq_list entries stay None.
        idx = rows_by_ac.get(header, [])
        if len(idx) == 1:
            seq_list[idx[0]] = sequence

In [None]:
# Attach the parsed sequences as a new column and rename the label column
PSI_BIO_df = PSI_BIO_df.assign(sequence=seq_list).rename(
    columns={"label-val": "PSI_BIO_label"}
)
PSI_BIO_df

### Combine NESG and PSI-Bio dataframes

In [None]:
# Convert both frames to {row label: {column: value}} dictionaries
psibio_dict = PSI_BIO_df.transpose().to_dict()
nesg_dict = NESG_df.transpose().to_dict()

# Report how many rows each dataset contributes
for row_mapping in (psibio_dict, nesg_dict):
    print(len(row_mapping))

In [None]:
#Make new dict with combined info from previous dicts
def _usable_sequence(seq, max_length=1000):
    """Return *seq* if it is a string of at most *max_length* characters, else None.

    A missing sequence (None/NaN) is handled like an over-long one instead of
    raising ``TypeError`` on ``len()``.
    """
    if isinstance(seq, str) and len(seq) <= max_length:
        return seq
    return None


def _rows_by_sequence(frame):
    """Map every sequence string in ``frame['sequence']`` to the row labels holding it.

    Row labels are listed in frame order; rows without a string sequence are
    left out, so looking up ``None`` never matches anything.
    """
    rows = {}
    for row_label, seq in frame['sequence'].items():
        if isinstance(seq, str):
            rows.setdefault(seq, []).append(row_label)
    return rows


# collect_dict maps a running 1-based counter to one combined record; the
# counter also forms the record's "ID" ("nAC<counter>").
collect_dict = dict()
collect_count = 0

# Pass 1: every PSI-Bio row, paired with each NESG row sharing its sequence
nesg_rows = _rows_by_sequence(NESG_df)
for value in psibio_dict.values():
    print(collect_count, end="\r")

    #Collect psibio knowledge
    seq = _usable_sequence(value["sequence"])
    psibio_label = int(value["PSI_BIO_label"])

    #Check if sequence is found in nesg data: one record per matching NESG
    #row, or a single record without NESG label when nothing matches
    nesg_labels = [int(NESG_df.at[i, 'NESG_label']) for i in nesg_rows.get(seq, [])] or [None]
    for matched_label in nesg_labels:
        collect_count += 1
        collect_dict[collect_count] = {"ID": "nAC" + str(collect_count),
                                       "PSI_BIO_label": psibio_label,
                                       "NESG_label": matched_label,
                                       "sequence": seq}

# Pass 2: every NESG row, paired with each PSI-Bio row sharing its sequence.
# Pairs already produced by pass 1 are emitted again under a new ID here.
psibio_rows = _rows_by_sequence(PSI_BIO_df)
for value in nesg_dict.values():
    print(collect_count, end="\r")

    seq = _usable_sequence(value["sequence"])
    nesg_label = int(value["NESG_label"])

    psibio_labels = [int(PSI_BIO_df.at[i, 'PSI_BIO_label']) for i in psibio_rows.get(seq, [])] or [None]
    for matched_label in psibio_labels:
        collect_count += 1
        collect_dict[collect_count] = {"ID": "nAC" + str(collect_count),
                                       "PSI_BIO_label": matched_label,
                                       "NESG_label": nesg_label,
                                       "sequence": seq}


In [None]:
# Build a dataframe from the combined records: with orient='index' every key of
# collect_dict becomes a row label and every record field becomes a column.
collect_df = pd.DataFrame.from_dict(collect_dict, orient='index')
# Bare expression: displays the frame when run as a notebook cell
collect_df

In [None]:
# Drop rows that agree in every column other than 'ID'
compare_columns = [column for column in collect_df.columns if column != 'ID']
collect_df = collect_df.drop_duplicates(subset=compare_columns)
collect_df

In [None]:
# Look for sequences that do not occur in exactly one row.
# duplicate_dict maps the row position to the IDs of all rows with that sequence.
duplicate_dict = dict()
for seq_count, seq in enumerate(collect_df["sequence"]):
    print(seq_count, end="\r")
    matching_ids = collect_df.loc[collect_df['sequence'] == seq, 'ID']
    if len(matching_ids) != 1:
        duplicate_dict[seq_count] = matching_ids

In [None]:
# Turn the duplicate groups into a frame with a fresh 0..n-1 row index
duplicate_df = pd.DataFrame.from_dict(duplicate_dict, orient='index').reset_index(drop=True)

# Relabel the columns 0, 1, 2, ... so they can be addressed by position
rename_dict = {
    name: position
    for position, name in enumerate(duplicate_df.columns.tolist())
}
duplicate_df = duplicate_df.rename(columns=rename_dict)
duplicate_df

In [None]:
# Remove every row whose ID is listed in duplicate_df: these sequences carry
# contradictory labels, which could cause frustration in the system
row_id = set()
for column in range(len(duplicate_df.columns)):
    row_id.update(duplicate_df[column].tolist())
keep_mask = ~collect_df['ID'].isin(row_id)
collect_df = collect_df[keep_mask]

In [None]:
# Renumber the rows 0..n-1; drop=True discards the old index instead of
# keeping it as a column.
collect_df = collect_df.reset_index(drop=True)
# Bare expression: displays the frame when run as a notebook cell
collect_df

In [None]:
# Sanity check: no sequence may appear in more than one row
seqs = collect_df["sequence"]
assert len(set(seqs)) == len(seqs)

# Count the rows without any missing value; these are reported as overlapping
total = int(collect_df.notnull().all(axis=1).sum())
print(f"Total overlapping sequences: {total}")

In [None]:
# Save the cleaned table to a CSV file in the working directory;
# index=False leaves the row index out of the file.
collect_df.to_csv("CleanedData.csv",index=False)