In [3]:
import pandas as pd
from pathlib import Path
import os
import json
from tqdm import tqdm

In [6]:
base_path = Path("/home/unix/wangyanz/codon_usage/star_ribo")
data_pool = base_path.joinpath("data")
tag = "RIBO_STAR_rep23"
star_ribo_data = data_pool.joinpath(f"{tag}.h5ad")

In [7]:

table = {
        "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
        "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
        "TAT": "Y", "TAC": "Y",                           # noqa: E241
        "TGT": "C", "TGC": "C",             "TGG": "W",   # noqa: E241
        "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
        "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
        "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
        "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
        "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
        "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
        "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
        "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
        "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
        "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
        "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
        "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
    }
stop_codons=["TAA", "TAG", "TGA"]
start_codons=["TTG", "CTG", "ATG"]

df = pd.read_csv(data_pool.joinpath(f"cell_type_norm_mean_{tag}.csv"))
df = df.set_index('cell_type')

with open(data_pool.joinpath(f"gene_name_{tag}.txt"), "r") as fp:
    gene_name_scRNA = fp.read().split()

gene_seq_pool = Path("/home/unix/wangyanz/13.yanze/data/gene_seq")
gene_count_pool = Path("/home/unix/wangyanz/13.yanze/data/gene_count")
gene_cell_type_pool = data_pool.joinpath("cell_type")
if not os.path.exists(gene_cell_type_pool):
    os.mkdir(gene_cell_type_pool)

cell_types = df.index.tolist()

for cell_type in tqdm(cell_types, desc="cell type", position=0):
    print(cell_type)
    empty_data = {}
    for codon in table:
        empty_data[codon] = 0
    for stop_codon in stop_codons:
        empty_data[stop_codon] = 0

    query_cell_type = cell_type
    query_gene_expr = df.loc[query_cell_type]
    count = 0

    available_gene_seq = gene_seq_pool.glob("*.dat")

    for agene in tqdm(available_gene_seq, desc="gene", position=1):
        agene_index = agene.stem
        gene_name = agene_index.split("_")[0]
        if gene_name in gene_name_scRNA:
            gene_count_file = gene_count_pool.joinpath(f"{agene_index}.json")
            with open(gene_count_file, "r") as fp:
                data = json.load(fp)
            for key in data.keys():
                empty_data[key] += query_gene_expr[gene_name] * data[key]

    canonical_cell_name = "_".join(query_cell_type.split())
    with open(gene_cell_type_pool.joinpath(f"{canonical_cell_name}.json"), "w") as fp:
        json.dump(empty_data, fp, indent=4)


cell type:   0%|                                                                 | 0/12 [00:00<?, ?it/s]

AC


gene: 18195it [01:23, 218.02it/s]
cell type:   8%|████▊                                                    | 1/12 [01:23<15:18, 83.48s/it]

CHOR_EPEN


gene: 18195it [00:09, 1993.14it/s]
cell type:  17%|█████████▌                                               | 2/12 [01:32<06:37, 39.75s/it]

CHO_PEP


gene: 18195it [00:04, 3710.55it/s]
cell type:  25%|██████████████▎                                          | 3/12 [01:37<03:34, 23.85s/it]

DE_MEN


gene: 18195it [00:08, 2054.64it/s]
cell type:  33%|███████████████████                                      | 4/12 [01:46<02:23, 17.94s/it]

INH


gene: 18195it [00:04, 3715.66it/s]
cell type:  42%|███████████████████████▊                                 | 5/12 [01:51<01:32, 13.24s/it]

MLG


gene: 18195it [00:08, 2104.09it/s]
cell type:  50%|████████████████████████████▌                            | 6/12 [02:00<01:10, 11.69s/it]

OLG


gene: 18195it [00:04, 3705.70it/s]
cell type:  58%|█████████████████████████████████▎                       | 7/12 [02:04<00:47,  9.48s/it]

OPC


gene: 18195it [00:08, 2080.79it/s]
cell type:  67%|██████████████████████████████████████                   | 8/12 [02:13<00:37,  9.26s/it]

Other


gene: 18195it [00:04, 3672.05it/s]
cell type:  75%|██████████████████████████████████████████▊              | 9/12 [02:18<00:23,  7.92s/it]

PVM


gene: 18195it [00:09, 2003.40it/s]
cell type:  83%|██████████████████████████████████████████████▋         | 10/12 [02:27<00:16,  8.28s/it]

TEPN


gene: 18195it [00:04, 3668.74it/s]
cell type:  92%|███████████████████████████████████████████████████▎    | 11/12 [02:32<00:07,  7.27s/it]

VAS


gene: 18195it [00:09, 2018.90it/s]
cell type: 100%|████████████████████████████████████████████████████████| 12/12 [02:41<00:00, 13.48s/it]
