In [1]:
import matplotlib.pyplot as plt
import json
import numpy as np
import os
from pathlib import Path
import pandas as pd

In [4]:
base_path = Path("/home/unix/wangyanz/codon_usage/star_ribo")
data_pool = base_path.joinpath("data")
tag = "RIBO_STAR_rep23"
star_ribo_data = data_pool.joinpath(f"{tag}.h5ad")
cell_type_pool = data_pool.joinpath("cell_type")


In [5]:
cells = cell_type_pool.glob("*.json")
cell_data = {}

for cell in cells:
    with open(cell, "r") as fp:
        data = json.load(fp)
    cell_data[cell.stem] = data
print(cell_data)

{'Other': {'TTT': 1886.98727647788, 'TTC': 2718.810954412611, 'TTA': 634.9487313451402, 'TTG': 1543.3990518059445, 'TCT': 1963.9175551168412, 'TCC': 2281.8108663239846, 'TCA': 1320.416088666862, 'TCG': 604.8672623001203, 'TAT': 1456.748462409259, 'TAC': 2200.1489579031877, 'TGT': 1200.055123053877, 'TGC': 1515.420322116831, 'TGG': 1459.7178780560232, 'CTT': 1433.6219587777284, 'CTC': 2473.434381723608, 'CTA': 838.9666515697811, 'CTG': 5067.071030149625, 'CCT': 2249.088680606279, 'CCC': 2469.9385166940824, 'CCA': 2057.5615051170316, 'CCG': 866.1992804621225, 'CAT': 1177.9494550030615, 'CAC': 1858.4917743108545, 'CAA': 1307.4859209458373, 'CAG': 4460.065793177988, 'CGT': 665.2668747459103, 'CGC': 1422.2581455370218, 'CGA': 923.3817474861073, 'CGG': 1449.8749107038316, 'ATT': 2087.940200918239, 'ATC': 3354.693742070596, 'ATA': 712.9766876530219, 'ATG': 3040.860939529488, 'ACT': 1625.218693959899, 'ACC': 2690.9490309873613, 'ACA': 1950.7735779384386, 'ACG': 794.48153268389, 'AAT': 1923.048

In [6]:

table={
        "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
        "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
        "TAT": "Y", "TAC": "Y",                           # noqa: E241
        "TGT": "C", "TGC": "C",             "TGG": "W",   # noqa: E241
        "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
        "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
        "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
        "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
        "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
        "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
        "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
        "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
        "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
        "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
        "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
        "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
        "TAA": "stop", "TAG": "stop", "TGA": "stop"
    }


codon_dict = {
    'Ala': ['GCT', 'GCC', 'GCA', 'GCG'],
    'Arg': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
    'Asn': ['AAT', 'AAC'],
    'Asp': ['GAT', 'GAC'],
    'Cys': ['TGT', 'TGC'],
    'Gln': ['CAA', 'CAG'],
    'Glu': ['GAA', 'GAG'],
    'Gly': ['GGT', 'GGC', 'GGA', 'GGG'],
    'His': ['CAT', 'CAC'],
    'Ile': ['ATT', 'ATC', 'ATA'],
    'Leu': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
    'Lys': ['AAA', 'AAG'],
    'Met': ['ATG'],
    'Phe': ['TTT', 'TTC'],
    'Pro': ['CCT', 'CCC', 'CCA', 'CCG'],
    'Ser': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
    'Thr': ['ACT', 'ACC', 'ACA', 'ACG'],
    'Trp': ['TGG'],
    'Tyr': ['TAT', 'TAC'],
    'Val': ['GTT', 'GTC', 'GTA', 'GTG'],
    'stop': ['TAA', 'TAG', 'TGA']
}

In [7]:
result = {}
for codon in table:
    result[codon] = []
    for cell_type in cell_data:
        result[codon].append(cell_data[cell_type][codon])
    result[codon].append(table[codon])
# print(result)

#, 
df = pd.DataFrame(data=result, index=list(cell_data.keys())+["aminoacid"], columns=table.keys()).T
df = df.groupby("aminoacid").apply(lambda x: x/x.sum()*100)
df = df.astype(float)
aminoacid = [table[codon] for codon in df.index]
vmax = df.max(axis=1)
vmin = df.min(axis=1)
vstd = df.std(axis=1)
cmin = df.idxmin(axis=1)
cmax = df.idxmax(axis=1)



df.insert(0, "max freq cell type", cmax)
df.insert(0, "max freq", vmax)
df.insert(0, "min freq cell type", cmin)
df.insert(0, "min freq", vmin)
df.insert(0, "range_differnece", vmax-vmin)
df.insert(0, "std", vstd)
df = df.round(2)
df.insert(0, "aminoacid", aminoacid)
df.to_csv(data_pool.joinpath(f"codon_frequence_{tag}.csv"))
df


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby("aminoacid").apply(lambda x: x/x.sum()*100)


Unnamed: 0,aminoacid,std,range_differnece,min freq,min freq cell type,max freq,max freq cell type,Other,TEPN,OPC,CHO_PEP,INH,AC,MLG,PVM,CHOR_EPEN,VAS,OLG,DE_MEN
TTT,F,0.61,2.13,40.26,AC,42.39,OLG,40.97,41.70,41.01,41.74,41.44,40.26,40.91,41.65,42.20,40.86,42.39,41.16
TTC,F,0.61,2.13,57.61,OLG,59.74,AC,59.03,58.30,58.99,58.26,58.56,59.74,59.09,58.35,57.80,59.14,57.61,58.84
TTA,L,0.20,0.62,5.13,AC,5.75,CHOR_EPEN,5.30,5.69,5.39,5.62,5.55,5.13,5.32,5.56,5.75,5.22,5.70,5.48
TTG,L,0.20,0.65,12.50,AC,13.14,CHO_PEP,12.87,13.09,12.89,13.14,12.99,12.50,12.80,13.11,13.07,12.86,12.63,13.08
TCT,S,0.29,1.12,19.15,OPC,20.27,CHOR_EPEN,19.43,19.31,19.15,19.65,19.51,19.27,19.28,19.36,20.27,19.27,19.50,19.49
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGA,G,0.53,2.11,23.85,AC,25.97,CHOR_EPEN,24.17,24.60,24.19,24.81,24.64,23.85,24.24,24.91,25.97,24.41,24.66,24.61
GGG,G,0.24,0.96,22.33,CHOR_EPEN,23.30,AC,23.01,22.95,22.99,22.79,22.78,23.30,22.85,22.87,22.33,22.79,22.60,23.02
TAA,stop,1.52,6.44,32.85,CHOR_EPEN,39.29,OLG,34.99,35.70,34.15,35.33,35.84,34.10,35.56,35.33,32.85,35.23,39.29,35.45
TAG,stop,0.52,1.71,18.04,OLG,19.76,OPC,18.57,19.49,19.76,19.22,19.22,18.53,18.85,19.74,18.98,19.31,18.04,19.43
