In [1]:
import pandas as pd

In [2]:
# Accumulator: feature-type label -> number of unique items of that type.
uniqu_feat_cnt_d = dict()

In [3]:
# Count distinct gene IDs in the pickled gene table.
gene_df = pd.read_pickle("./data/gene_df.pkl")
# dropna=False: a missing ID, if present, still counts as one distinct value.
uniqu_feat_cnt_d["genes"] = gene_df["GENE_ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694}

In [4]:
# Only intergenic regions are counted here, not inter-feature regions
# (genomic features). During genomic-feature assignment, a mutation that does
# not land in an explicit feature is given the intergenic region rather than
# the inter-feature region.
intergenic_region_df = pd.read_pickle("./data/intergenic_region_df.pkl")
# dropna=False: a missing ID, if present, still counts as one distinct value.
uniqu_feat_cnt_d["intergenic"] = intergenic_region_df["RegulonDB ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694, 'intergenic': 3744}

In [5]:
# Count distinct transcription unit IDs.
TU_df = pd.read_pickle("./data/TU_df.pkl")
# dropna=False: a missing ID, if present, still counts as one distinct value.
uniqu_feat_cnt_d["TU"] = TU_df["TRANSCRIPTION_UNIT_ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694, 'intergenic': 3744, 'TU': 3560}

In [6]:
# Count distinct operon IDs.
operon_df = pd.read_pickle("./data/operon_df.pkl")
# dropna=False: a missing ID, if present, still counts as one distinct value.
uniqu_feat_cnt_d["operon"] = operon_df["OPERON_ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694, 'intergenic': 3744, 'TU': 3560, 'operon': 2619}

In [7]:
# Count distinct values in the column labelled 0 of the TFBS table.
tfbs_df = pd.read_pickle("./data/TFBS_df.pkl")
# dropna=False: a missing value, if present, still counts as one distinct value.
uniqu_feat_cnt_d["TFBS"] = tfbs_df[0].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694, 'intergenic': 3744, 'TU': 3560, 'operon': 2619, 'TFBS': 207}

In [8]:
# Count distinct values in the column labelled 0 of the promoter table.
promoter_df = pd.read_pickle("./data/promoter_df.pkl")
# dropna=False: a missing value, if present, still counts as one distinct value.
uniqu_feat_cnt_d["promoter"] = promoter_df[0].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic': 3744,
 'TU': 3560,
 'operon': 2619,
 'TFBS': 207,
 'promoter': 8617}

In [9]:
# Count distinct RegulonDB IDs in the att_term table.
att_term_df = pd.read_pickle("./data/att_term_df.pkl")
# dropna=False: a missing ID, if present, still counts as one distinct value.
uniqu_feat_cnt_d["att_term"] = att_term_df["RegulonDB ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic': 3744,
 'TU': 3560,
 'operon': 2619,
 'TFBS': 207,
 'promoter': 8617,
 'att_term': 752}

In [10]:
# Count distinct values in the column labelled 0 of the terminator table.
terminator_df = pd.read_pickle("./data/term_df.pkl")
# dropna=False: a missing value, if present, still counts as one distinct value.
uniqu_feat_cnt_d["terminator"] = terminator_df[0].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic': 3744,
 'TU': 3560,
 'operon': 2619,
 'TFBS': 207,
 'promoter': 8617,
 'att_term': 752,
 'terminator': 306}

In [11]:
# Count distinct Shine-Dalgarno IDs in the RBS table.
RBS_df = pd.read_pickle("./data/RBS_df.pkl")
# dropna=False: a missing ID, if present, still counts as one distinct value.
uniqu_feat_cnt_d["RBS"] = RBS_df["SHINE_DALGARNO_ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic': 3744,
 'TU': 3560,
 'operon': 2619,
 'TFBS': 207,
 'promoter': 8617,
 'att_term': 752,
 'terminator': 306,
 'RBS': 179}

In [12]:
# Count distinct pathway names in the gene-to-pathway table.
gene_pathway_df = pd.read_pickle("./data/gene_pathway_df.pkl")
# dropna=False: a missing name, if present, still counts as one distinct value.
uniqu_feat_cnt_d["pathway"] = gene_pathway_df["pathway_name"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic': 3744,
 'TU': 3560,
 'operon': 2619,
 'TFBS': 207,
 'promoter': 8617,
 'att_term': 752,
 'terminator': 306,
 'RBS': 179,
 'pathway': 128}

In [13]:
# Count distinct COG descriptions.
gene_COG_df = pd.read_pickle("./data/COG_df.pkl")
# dropna=False: a missing description, if present, still counts as one distinct value.
uniqu_feat_cnt_d["COG"] = gene_COG_df["COG description"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic': 3744,
 'TU': 3560,
 'operon': 2619,
 'TFBS': 207,
 'promoter': 8617,
 'att_term': 752,
 'terminator': 306,
 'RBS': 179,
 'pathway': 128,
 'COG': 22}

In [14]:
# RegulonDB may provide a dedicated regulon data file, but this is the method
# used within AVA to assign regulons; counting regulons the same way keeps the
# count consistent with the assignment method.
TU_objects_df = pd.read_pickle("./data/TU_objects_df.pkl")
# Keep only the TU objects whose class is "ST".
regulator_df = TU_objects_df.loc[TU_objects_df["TU_OBJECT_CLASS"].eq("ST")]
# dropna=False: a missing ID, if present, still counts as one distinct value.
uniqu_feat_cnt_d["regulon"] = regulator_df["TU_OBJECT_ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic': 3744,
 'TU': 3560,
 'operon': 2619,
 'TFBS': 207,
 'promoter': 8617,
 'att_term': 752,
 'terminator': 306,
 'RBS': 179,
 'pathway': 128,
 'COG': 22,
 'regulon': 2776}

In [15]:
# Grand total of unique items across every feature type in uniqu_feat_cnt_d.
# Built-in sum() replaces the manual accumulation loop and avoids leaving a
# stray loop variable in the notebook namespace; an empty dict still yields 0.
tot_unique_item_cnt = sum(uniqu_feat_cnt_d.values())
tot_unique_item_cnt

27604

In [17]:
import pickle

# Serialize the per-feature-type unique-count dict to disk.
# Only uniqu_feat_cnt_d is written; the grand total is not part of the file.
# (A bare `tot_unique_item_cnt` expression used to sit here: it displayed
# nothing because it was not the cell's last expression, and it made this cell
# fail with NameError whenever the total had not been computed first.)
with open('./data/annot_cnt_d.pkl', 'wb') as handle:
    pickle.dump(uniqu_feat_cnt_d, handle, protocol=pickle.HIGHEST_PROTOCOL)