In [9]:
import pandas as pd 
import random
import pickle
# Read the SCOP sequence table and keep a single row per distinct value of
# the 'Sequence' column (drop_duplicates retains the first occurrence).
scop_seqs = (
    pd.read_csv('data_scop/scop_seqs_with_tags.gz')
    .drop_duplicates(subset='Sequence')
)

In [10]:
# Add two derived columns to scop_seqs:
#   'kw'    - for every row, a list holding that row's tags taken from the
#             columns SF, FA, CL, CF, TP (in exactly this order)
#   'Entry' - a running integer id 0 .. len(scop_seqs) - 1
scop_seqs = scop_seqs.assign(
    kw=scop_seqs[['SF', 'FA', 'CL', 'CF', 'TP']].values.tolist(),
    Entry=range(0, len(scop_seqs)),
)

In [11]:
def convert_to_dict(dataframe, seed=None):
    """Turn a sequence table into a shuffled ``{entry: record}`` dictionary.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``"Entry"``, ``"kw"``, ``"Sequence"`` and
        ``"seq_length"``.
    seed : int or None, optional
        When given, the shuffle is driven by a private ``random.Random(seed)``
        so the resulting order is reproducible and the global RNG state is
        left untouched. When ``None`` (the default) the module-level
        ``random.shuffle`` is used, which is the previous behaviour.

    Returns
    -------
    dict
        Maps every ``"Entry"`` value to a dict with the keys ``"kw"``,
        ``"ex"``, ``"seq"`` and ``"len"``. The insertion order of the
        returned dict is shuffled. If an ``"Entry"`` value occurs more than
        once, the last row with that value wins.
    """
    needed = ("Entry", "kw", "Sequence", "seq_length")
    # Iterate the four columns in lockstep instead of using iterrows(), which
    # would build a full pandas Series for every single row.
    records = {
        entry: {
            "kw": kw,
            # NOTE(review): "ex" is a fixed constant for every record; its
            # meaning is not visible in this notebook - confirm with consumer.
            "ex": 4,
            "seq": seq,
            "len": length,
        }
        for entry, kw, seq, length in zip(*(dataframe[col] for col in needed))
    }
    # Dicts preserve insertion order, so shuffling the item list and
    # rebuilding the dict randomises the iteration order of the result.
    items = list(records.items())
    if seed is None:
        random.shuffle(items)
    else:
        random.Random(seed).shuffle(items)
    return dict(items)

def save_to_pickle(dictionary, path):
    """Pickle ``dictionary`` into the file at ``path``.

    The file is opened in binary write mode, so any existing file at that
    location is overwritten. Nothing is returned.
    """
    with open(path, "wb") as handle:
        pickle.dump(dictionary, handle)


In [12]:
# Export the de-duplicated SCOP table as a pickled dictionary.
# convert_to_dict shuffles its entries using the global `random` module, so
# the generator is seeded first: a fresh "Restart & Run All" then writes the
# entries in the same order every time.
SHUFFLE_SEED = 0
random.seed(SHUFFLE_SEED)

path = "data_scop/scop_seqs_with_tags.p"
dictionary = convert_to_dict(scop_seqs)
save_to_pickle(dictionary, path)
print("Done")

Done


In [None]:
# Preview the first five rows of the table.
scop_seqs.head(n=5)

In [14]:
# Create mapping_files/vocab.txt, the vocabulary file for the model.
# Every line has the form
#     TAG INDEX
# where INDEX is the running position of the token in the vocabulary.
# Token order: unique tags of the columns TP, CL, CF, FA, SF (in that order),
# followed by the single-letter sequence symbols and finally the PAD token.

# Unique tags, concatenated level by level. unique() keeps first-seen order
# and de-duplicates across ALL five columns at once.
tag_values = pd.concat(
    [scop_seqs["TP"], scop_seqs["CL"], scop_seqs["CF"], scop_seqs["FA"], scop_seqs["SF"]]
).unique()

# Start index of each level inside the vocabulary (each range is half-open:
# a level runs from its own *_POS up to, but excluding, the next one).
TP_POS = 0
CL_POS = scop_seqs["TP"].unique().size
CF_POS = scop_seqs["CL"].unique().size + CL_POS
FA_POS = scop_seqs["CF"].unique().size + CF_POS
SF_POS = scop_seqs["FA"].unique().size + FA_POS
AA_POS = scop_seqs["SF"].unique().size + SF_POS

# The offsets above add up per-column unique counts, while tag_values was
# de-duplicated across columns. If the same value occurs in two levels the
# two disagree and the printed ranges would not match the indices written to
# the file - stop here instead of writing an inconsistent vocabulary.
assert AA_POS == len(tag_values), (
    f"per-level unique counts sum to {AA_POS} but the combined vocabulary has "
    f"{len(tag_values)} tags; a tag value is shared between levels"
)

# Sequence symbols: the uppercase letters A-Z without 'B', then 'PAD'.
# NOTE(review): 'B' is not in this list although the original comment said
# "all the letters of the alphabet". Confirm whether that is intentional
# before changing it - adding a letter shifts every following index.
vocab_tokens = list(tag_values)
vocab_tokens.extend(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'PAD'])

# PAD is the last token of the vocabulary.
PAD_POS = len(vocab_tokens) - 1

print(f'''
      TP goes from 0 to {CL_POS},
      CL goes from {CL_POS} to {CF_POS},
      CF goes from {CF_POS} to {FA_POS},
      FA goes from {FA_POS} to {SF_POS},
      SF goes from {SF_POS} to {AA_POS},
      AA goes from {AA_POS} to {PAD_POS}
      ''')

# Turn the token list into "TAG INDEX" strings: reset_index() exposes the
# running position as a column, which is then appended to the tag text.
vocab_table = pd.DataFrame(vocab_tokens).reset_index()
vocab_table.columns = ["index", "tag"]
vocab_table["tag"] = vocab_table["tag"].astype(str) + " " + vocab_table["index"].astype(str)

# `vocab` ends up as the Series of formatted lines, one per token.
vocab = vocab_table["tag"]

vocab.to_csv('mapping_files/vocab.txt', index=False, header=False)


      TP goes from 0 to 4,
      CL goes from 4 to 9,
      CF goes from 9 to 1540,
      FA goes from 1540 to 7330,
      SF goes from 7330 to 10035,
      AA goes from 10035 to 10060
      
