In [69]:
import pickle

import pandas as pd

In [70]:
# Accumulator: feature-type label -> number of unique IDs found for that type.
uniqu_feat_cnt_d = dict()

In [71]:
# Load gene_df from its pickle and record the number of distinct GENE_ID values.
# dropna=False keeps the count identical to len(unique()): a missing value,
# if present, is counted as one distinct value.
gene_df = pd.read_pickle("./data/gene_df.pkl")
uniqu_feat_cnt_d["genes"] = gene_df["GENE_ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694}

In [72]:
# Only intergenic regions are counted here, not inter-feature regions (genomic features).
# During genomic-feature assignment, a mutation that does not land in an explicit
# feature is assigned to the intergenic region rather than to an inter-feature region.
# dropna=False keeps the count identical to len(unique()).
intergenic_region_df = pd.read_pickle("./data/intergenic_region_df.pkl")
uniqu_feat_cnt_d["intergenic regions"] = intergenic_region_df["RegulonDB ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694, 'intergenic regions': 3744}

In [73]:
# Load TU_df from its pickle and record the number of distinct
# TRANSCRIPTION_UNIT_ID values (dropna=False matches len(unique())).
TU_df = pd.read_pickle("./data/TU_df.pkl")
uniqu_feat_cnt_d["transcription units"] = TU_df["TRANSCRIPTION_UNIT_ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694, 'intergenic regions': 3744, 'transcription units': 3560}

In [74]:
# Load operon_df from its pickle and record the number of distinct OPERON_ID
# values (dropna=False matches len(unique())).
operon_df = pd.read_pickle("./data/operon_df.pkl")
uniqu_feat_cnt_d["operons"] = operon_df["OPERON_ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic regions': 3744,
 'transcription units': 3560,
 'operons': 2619}

In [75]:
# Load tfbs_df from its pickle and record the number of distinct values in the
# column labelled 0 (dropna=False matches len(unique())).
# NOTE(review): column 0 is presumably the binding-site identifier -- confirm.
tfbs_df = pd.read_pickle("./data/TFBS_df.pkl")
uniqu_feat_cnt_d["TF binding sites"] = tfbs_df[0].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic regions': 3744,
 'transcription units': 3560,
 'operons': 2619,
 'TF binding sites': 207}

In [76]:
# Load promoter_df from its pickle and record the number of distinct values in
# the column labelled 0 (dropna=False matches len(unique())).
# NOTE(review): column 0 is presumably the promoter identifier -- confirm.
promoter_df = pd.read_pickle("./data/promoter_df.pkl")
uniqu_feat_cnt_d["promoters"] = promoter_df[0].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic regions': 3744,
 'transcription units': 3560,
 'operons': 2619,
 'TF binding sites': 207,
 'promoters': 8617}

In [77]:
# Load att_term_df from its pickle and record the number of distinct
# "RegulonDB ID" values (dropna=False matches len(unique())).
att_term_df = pd.read_pickle("./data/att_term_df.pkl")
uniqu_feat_cnt_d["attenuator terminators"] = att_term_df["RegulonDB ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic regions': 3744,
 'transcription units': 3560,
 'operons': 2619,
 'TF binding sites': 207,
 'promoters': 8617,
 'attenuator terminators': 752}

In [78]:
# Load terminator_df from its pickle and record the number of distinct values
# in the column labelled 0 (dropna=False matches len(unique())).
# NOTE(review): column 0 is presumably the terminator identifier -- confirm.
terminator_df = pd.read_pickle("./data/term_df.pkl")
uniqu_feat_cnt_d["terminators"] = terminator_df[0].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic regions': 3744,
 'transcription units': 3560,
 'operons': 2619,
 'TF binding sites': 207,
 'promoters': 8617,
 'attenuator terminators': 752,
 'terminators': 306}

In [79]:
# Load RBS_df from its pickle and record the number of distinct
# SHINE_DALGARNO_ID values (dropna=False matches len(unique())).
RBS_df = pd.read_pickle("./data/RBS_df.pkl")
uniqu_feat_cnt_d["ribosomal binding sites"] = RBS_df["SHINE_DALGARNO_ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic regions': 3744,
 'transcription units': 3560,
 'operons': 2619,
 'TF binding sites': 207,
 'promoters': 8617,
 'attenuator terminators': 752,
 'terminators': 306,
 'ribosomal binding sites': 179}

In [80]:
# Load gene_pathway_df from its pickle and record the number of distinct
# pathway_name values (dropna=False matches len(unique())).
gene_pathway_df = pd.read_pickle("./data/gene_pathway_df.pkl")
uniqu_feat_cnt_d["pathways"] = gene_pathway_df["pathway_name"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic regions': 3744,
 'transcription units': 3560,
 'operons': 2619,
 'TF binding sites': 207,
 'promoters': 8617,
 'attenuator terminators': 752,
 'terminators': 306,
 'ribosomal binding sites': 179,
 'pathways': 128}

In [81]:
# Load gene_COG_df from its pickle and record the number of distinct
# "COG description" values (dropna=False matches len(unique())).
gene_COG_df = pd.read_pickle("./data/COG_df.pkl")
uniqu_feat_cnt_d["COGs"] = gene_COG_df["COG description"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic regions': 3744,
 'transcription units': 3560,
 'operons': 2619,
 'TF binding sites': 207,
 'promoters': 8617,
 'attenuator terminators': 752,
 'terminators': 306,
 'ribosomal binding sites': 179,
 'pathways': 128,
 'COGs': 22}

In [82]:
# The regulon file read here is treated as the authoritative, complete list of
# regulons, even though not all of them get mapped to locations on the genome.
regulon_df = pd.read_csv(
    "./data/RegulonDB10/regulon_tmp.txt",
    sep="\t",
    comment="#",   # ignore everything from a '#' to the end of the line
    header=None,   # do not treat any row as a header; names are assigned below
    quoting=3,     # 3 == csv.QUOTE_NONE: quote characters get no special handling
)
# Assigning exactly three names also fails loudly if the file's column count differs.
regulon_df.columns = ["REGULON_ID", "REGULON_NAME", "KEY_ID_ORG"]
regulon_df

Unnamed: 0,REGULON_ID,REGULON_NAME,KEY_ID_ORG
0,EC12R0000073,"GalR,GalS",ECK12
1,EC12R0000075,"MraZ,PdhR",ECK12
2,EC12R0000077,CRP,ECK12
3,EC12R0000079,"CRP,Cra",ECK12
4,EC12R0000081,"ArgR,NtrC",ECK12
...,...,...,...
488,EC12R0001121,"CRP,RcsB-BglJ",ECK12
489,EC12R0001124,NimR,ECK12
490,EC12R0001126,YjjQ,ECK12
491,EC12R0001129,CecR,ECK12


In [83]:
# Record the number of distinct REGULON_ID values in regulon_df
# (dropna=False matches len(unique())).
uniqu_feat_cnt_d["regulons"] = regulon_df["REGULON_ID"].nunique(dropna=False)
uniqu_feat_cnt_d

{'genes': 4694,
 'intergenic regions': 3744,
 'transcription units': 3560,
 'operons': 2619,
 'TF binding sites': 207,
 'promoters': 8617,
 'attenuator terminators': 752,
 'terminators': 306,
 'ribosomal binding sites': 179,
 'pathways': 128,
 'COGs': 22,
 'regulons': 493}

In [84]:
# Grand total: the sum of the per-feature-type unique counts collected in
# uniqu_feat_cnt_d. The built-in sum() replaces the manual accumulation loop
# and avoids leaving a stray loop variable in the kernel namespace.
tot_unique_item_cnt = sum(uniqu_feat_cnt_d.values())
tot_unique_item_cnt

25321

In [85]:
# Serialize the feature-count dictionary to ./data/annot_cnt_d.pkl using the
# highest pickle protocol available to the running interpreter.
with open('./data/annot_cnt_d.pkl', 'wb') as handle:
    pickle.dump(uniqu_feat_cnt_d, handle, protocol=pickle.HIGHEST_PROTOCOL)