In [None]:
structure_path = "raw_data/alphafold_structures"

In [None]:
# Returns list of (uniprot_id, file_name) e.g., [("P49723", "P49723.pdb"), ...]
def get_uniprot_id_and_file_name_tuple_in_path(path):
    from os import listdir
    return {file.split(".")[0]: file
            for file in listdir(path)}  # "raw_data/alphafold_structures"

In [3]:
feature_names = ["#Gene_Name", "#Uniprot_ID"]

def make_prot_feat_df(gene_seq_path, prot_seq_path, annotation_path):
    import pandas as pd
    import numpy as np
    from Bio import SeqIO
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord
    from Bio.SeqUtils.ProtParam import ProteinAnalysis
    
    identifiers = list()
    uniprot_ids = dict()
    all_feat_dict = dict()
    
    # Load annotation file
    df = pd.read_csv(annotation_path, sep="\t")
    pdb_id_and_file_names = get_uniprot_id_and_file_name_tuple_in_path(structure_path)
    
    print("Working...")
    for seq_record, prot_record in zip(SeqIO.parse(gene_seq_path, "fasta"), SeqIO.parse(prot_seq_path, "fasta")):
        identifier = seq_record.name
        uniprot_id = df[df["#Gene_Name"]==identifier]["#Uniprot_ID"].values[0]
        
        if uniprot_id == "-" or uniprot_id not in pdb_id_and_file_names.keys():
            continue
            
        identifiers.append(identifier)
        uniprot_ids[identifier] = uniprot_id
    
    for identifier in identifiers:
        attribute_list = list()
        attribute_list.append(identifier)
        attribute_list.append(uniprot_ids[identifier])
        
        all_feat_dict[identifier] = attribute_list
        
    df = pd.DataFrame.from_dict(all_feat_dict, orient='index').reset_index(drop=True)
    
    print(df)
    
    df.columns = feature_names
    print("Done.")

In [4]:
ess_df = make_prot_feat_df("processed_ess/degseq-e.dat", "processed_ess/degaa-e.dat", "processed_ess/degannotation-e-uniprot.dat")
ness_df = make_prot_feat_df("processed_ness/degseq-ne.dat", "processed_ness/degaa-ne.dat", "processed_ness/degannotation-ne-uniprot.dat")
ess_df["label"] = 1
ness_df["label"] = 0

Working...
                0       1
0     DEG20010001  P34111
1     DEG20010002  P32471
2     DEG20010003  P10962
3     DEG20010004  P28004
4     DEG20010005  P28005
...           ...     ...
1085  DEG20011106  P54999
1086  DEG20011107  P14020
1087  DEG20011108  P39933
1088  DEG20011109  P20435
1089  DEG20011110  P32349

[1090 rows x 2 columns]
Done.
Working...
              0           1
0       YGL195W      P33892
1       YLR454W      Q06179
2       YJR066W      P35169
3       YHR165C      P33334
4       YFR019W      P34756
...         ...         ...
3940    YDL232W      Q99380
3941    YDR461W      P34165
3942  YCL054W-A  A0A0B7P221
3943  YDL133C-A      P0CX87
3944    YDL184C      P0CX86

[3945 rows x 2 columns]
Done.


In [5]:
import pickle

ess_df.to_csv("processed_data/yeast_ess_feat_tsv.tsv", sep="\t", index=False)
ness_df.to_csv("processed_data/yeast_ness_feat_tsv.tsv", sep="\t", index=False)

pickle.dump(ess_df, open("processed_data/yeast_ess_feat_df.pickle", "wb"))
pickle.dump(ness_df, open("processed_data/yeast_ness_feat_df.pickle", "wb"))