In [1]:
import matplotlib.pyplot as plt
import json
import numpy as np
import os
from pathlib import Path
import pandas as pd

In [2]:
basepath = Path("/home/unix/wangyanz/codon_usage/scRNA")
data_pool = basepath.joinpath("data")
cell_type_pool = data_pool.joinpath("cell_type")
pics_dir = data_pool.joinpath("pics")
if not os.path.exists(pics_dir):
    os.mkdir(pics_dir)
tag = "scRNA_Muris_10X_v2"

In [3]:
cells = cell_type_pool.glob("*.json")
cell_data = {}

for cell in cells:
    with open(cell, "r") as fp:
        data = json.load(fp)
    cell_data[cell.stem] = data
print(cell_data)


{'classical_monocyte': {'TTT': 20623.867497379513, 'TTC': 27113.198855384748, 'TTA': 7115.065471139918, 'TTG': 16322.761938900787, 'TCT': 21093.33966463588, 'TCC': 23354.568277581377, 'TCA': 13095.244060129002, 'TCG': 5588.850344563023, 'TAT': 16108.5923072808, 'TAC': 23195.790056029156, 'TGT': 14035.815776821628, 'TGC': 15317.677135125236, 'TGG': 15871.937249559687, 'CTT': 16065.721047947358, 'CTC': 24381.261835247904, 'CTA': 8541.9562313086, 'CTG': 55378.56778092508, 'CCT': 25453.797202296555, 'CCC': 25402.040139730045, 'CCA': 22382.60505330387, 'CCG': 8743.9214666373, 'CAT': 12872.934888830463, 'CAC': 19384.17054117776, 'CAA': 15044.521238908676, 'CAG': 47602.34358428966, 'CGT': 8906.544289753525, 'CGC': 15676.469334530593, 'CGA': 12046.731334326727, 'CGG': 15946.350702478643, 'ATT': 22419.706812888842, 'ATC': 34201.68082697007, 'ATA': 7926.6731153341, 'ATG': 33381.27419553631, 'ACT': 19745.63522148359, 'ACC': 27274.281753912273, 'ACA': 20169.723768904998, 'ACG': 7710.417277278013, 

In [4]:

table={
        "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
        "TCT": "S", "TCC": "S", "TCA": "S", "TCG": "S",
        "TAT": "Y", "TAC": "Y",                           # noqa: E241
        "TGT": "C", "TGC": "C",             "TGG": "W",   # noqa: E241
        "CTT": "L", "CTC": "L", "CTA": "L", "CTG": "L",
        "CCT": "P", "CCC": "P", "CCA": "P", "CCG": "P",
        "CAT": "H", "CAC": "H", "CAA": "Q", "CAG": "Q",
        "CGT": "R", "CGC": "R", "CGA": "R", "CGG": "R",
        "ATT": "I", "ATC": "I", "ATA": "I", "ATG": "M",
        "ACT": "T", "ACC": "T", "ACA": "T", "ACG": "T",
        "AAT": "N", "AAC": "N", "AAA": "K", "AAG": "K",
        "AGT": "S", "AGC": "S", "AGA": "R", "AGG": "R",
        "GTT": "V", "GTC": "V", "GTA": "V", "GTG": "V",
        "GCT": "A", "GCC": "A", "GCA": "A", "GCG": "A",
        "GAT": "D", "GAC": "D", "GAA": "E", "GAG": "E",
        "GGT": "G", "GGC": "G", "GGA": "G", "GGG": "G",
        "TAA": "stop", "TAG": "stop", "TGA": "stop"
    }


codon_dict = {
    'Ala': ['GCT', 'GCC', 'GCA', 'GCG'],
    'Arg': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
    'Asn': ['AAT', 'AAC'],
    'Asp': ['GAT', 'GAC'],
    'Cys': ['TGT', 'TGC'],
    'Gln': ['CAA', 'CAG'],
    'Glu': ['GAA', 'GAG'],
    'Gly': ['GGT', 'GGC', 'GGA', 'GGG'],
    'His': ['CAT', 'CAC'],
    'Ile': ['ATT', 'ATC', 'ATA'],
    'Leu': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
    'Lys': ['AAA', 'AAG'],
    'Met': ['ATG'],
    'Phe': ['TTT', 'TTC'],
    'Pro': ['CCT', 'CCC', 'CCA', 'CCG'],
    'Ser': ['TCT', 'TCC', 'TCA', 'TCG', 'AGT', 'AGC'],
    'Thr': ['ACT', 'ACC', 'ACA', 'ACG'],
    'Trp': ['TGG'],
    'Tyr': ['TAT', 'TAC'],
    'Val': ['GTT', 'GTC', 'GTA', 'GTG'],
    'stop': ['TAA', 'TAG', 'TGA']
}

# Amino-acid composition 

In [5]:
result = {}
for codon in table:
    result[codon] = []
    for cell_type in cell_data:
        result[codon].append(cell_data[cell_type][codon])
    result[codon].append(table[codon])

df = pd.DataFrame(data=result, index=list(cell_data.keys())+["aminoacid"], columns=table.keys())

amino_result = {}
for cell in list(cell_data.keys()):
    amino_result[cell] = {}

for codon in df:
    amino_acid = df[codon]["aminoacid"]
    # print(amino_acid)
    for cell in df[codon].index:
        if cell == "aminoacid":
            continue
        if amino_acid in amino_result[cell]:
            amino_result[cell][amino_acid] += df[codon][cell]
        else:
            amino_result[cell][amino_acid] = df[codon][cell]

amino_acid_df = pd.DataFrame(amino_result)

amino_acid_df = amino_acid_df / amino_acid_df.sum(axis=0) * 100
vmax = amino_acid_df.max(axis=1)
vmin = amino_acid_df.min(axis=1)
vstd = amino_acid_df.std(axis=1)
vmean = amino_acid_df.mean(axis=1)
cmin = amino_acid_df.idxmin(axis=1)
cmax = amino_acid_df.idxmax(axis=1)

amino_acid_df.insert(0, "max freq cell type", cmax)
amino_acid_df.insert(0, "max freq", vmax)
amino_acid_df.insert(0, "min freq cell type", cmin)
amino_acid_df.insert(0, "min freq", vmin)
amino_acid_df.insert(0, "range_differnece", vmax-vmin)
amino_acid_df.insert(0, "mean", vmean)
amino_acid_df.insert(0, "std", vstd)

amino_acid_df = amino_acid_df.round(3)
amino_acid_df.to_csv(data_pool.joinpath(f"amino_acid_composition_{tag}.csv"))
amino_acid_df

Unnamed: 0,std,mean,range_differnece,min freq,min freq cell type,max freq,max freq cell type,classical_monocyte,lung_endothelial_cell,myeloid_cell,...,granulocyte,endocardial_cell,bladder_urothelial_cell,ciliated_columnar_cell_of_tracheobronchial_tree,dendritic_cell,leukocyte,fibroblast,blood_cell,erythroblast,skeletal_muscle_satellite_cell
F,0.18,3.429,1.047,3.056,cardiac_muscle_cell,4.102,hepatocyte,3.39,3.475,3.418,...,3.956,3.317,3.346,3.477,3.411,3.514,3.353,3.406,3.739,3.274
L,0.326,8.977,2.002,7.841,cardiac_muscle_cell,9.843,hepatocyte,9.077,9.36,9.12,...,9.056,9.003,8.867,9.404,9.128,9.167,8.826,9.03,9.174,8.429
S,0.27,7.463,1.071,6.988,T_cell,8.059,mesangial_cell,7.277,7.748,7.27,...,7.202,7.763,7.289,7.709,7.407,7.459,7.669,7.327,7.382,7.333
Y,0.089,2.65,0.466,2.462,bladder_cell,2.928,hepatocyte,2.792,2.715,2.713,...,2.609,2.634,2.646,2.642,2.681,2.706,2.623,2.688,2.644,2.538
C,0.166,1.926,0.707,1.595,cardiac_muscle_cell,2.302,mesenchymal_stem_cell,2.085,2.071,1.899,...,2.181,2.079,1.769,1.889,1.954,1.961,2.064,1.879,1.857,1.989
W,0.073,1.042,0.3,0.91,basal_cell_of_epidermis,1.211,kidney_proximal_straight_tubule_epithelial_cell,1.127,1.127,1.1,...,0.931,1.063,0.992,1.048,1.106,1.069,1.098,1.062,1.041,0.915
P,0.388,5.925,2.436,5.062,granulocyte,7.498,bladder_cell,5.823,6.021,5.893,...,5.062,6.071,5.904,5.848,5.879,5.874,6.396,5.934,5.684,6.03
H,0.105,2.319,0.823,1.843,cardiac_muscle_cell,2.666,granulocyte,2.291,2.369,2.336,...,2.666,2.367,2.327,2.326,2.383,2.392,2.31,2.342,2.521,2.252
Q,0.161,4.469,0.993,3.826,cardiac_muscle_cell,4.819,epithelial_cell,4.449,4.571,4.417,...,4.801,4.6,4.403,4.67,4.512,4.508,4.566,4.439,4.577,4.181
R,0.342,6.016,1.636,5.108,hepatocyte,6.743,T_cell,6.064,5.822,6.101,...,5.328,5.88,6.41,5.85,6.118,5.876,5.895,6.06,5.734,6.477


# Codon usage

In [6]:
result = {}
for codon in table:
    result[codon] = []
    for cell_type in cell_data:
        result[codon].append(cell_data[cell_type][codon])
    result[codon].append(table[codon])
# print(result)

#, 
df = pd.DataFrame(data=result, index=list(cell_data.keys())+["aminoacid"], columns=table.keys()).T
df.to_csv(data_pool.joinpath(f"codon_frequence_raw_{tag}.csv"))
df = df.groupby("aminoacid").apply(lambda x: x/x.sum()*100)
df = df.astype(float)
aminoacid = [table[codon] for codon in df.index]
vmax = df.max(axis=1)
vmin = df.min(axis=1)
vstd = df.std(axis=1)
cmin = df.idxmin(axis=1)
cmax = df.idxmax(axis=1)
vmean = df.mean(axis=1)


df.insert(0, "max freq cell type", cmax)
df.insert(0, "max freq", vmax)
df.insert(0, "min freq cell type", cmin)
df.insert(0, "min freq", vmin)
df.insert(0, "range_differnece", vmax-vmin)
df.insert(0, "mean", vmean)
df.insert(0, "std", vstd)
df = df.round(2)
df.insert(0, "aminoacid", aminoacid)
df.to_csv(data_pool.joinpath(f"codon_frequence_{tag}.csv"))
df


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = df.groupby("aminoacid").apply(lambda x: x/x.sum()*100)


Unnamed: 0,aminoacid,std,mean,range_differnece,min freq,min freq cell type,max freq,max freq cell type,classical_monocyte,lung_endothelial_cell,...,granulocyte,endocardial_cell,bladder_urothelial_cell,ciliated_columnar_cell_of_tracheobronchial_tree,dendritic_cell,leukocyte,fibroblast,blood_cell,erythroblast,skeletal_muscle_satellite_cell
TTT,F,1.59,43.84,8.08,39.62,Langerhans_cell,47.70,proerythroblast,43.20,43.07,...,40.68,45.13,44.01,44.58,43.50,42.18,43.68,42.48,46.53,43.67
TTC,F,1.59,56.16,8.08,52.30,proerythroblast,60.38,Langerhans_cell,56.80,56.93,...,59.32,54.87,55.99,55.42,56.50,57.82,56.32,57.52,53.47,56.33
TTA,L,0.64,5.99,3.16,4.64,Langerhans_cell,7.80,cardiac_muscle_cell,5.57,5.92,...,4.86,6.80,5.70,6.46,6.02,5.49,6.18,5.64,5.18,5.74
TTG,L,0.63,13.24,4.29,11.12,type_II_pneumocyte,15.41,granulocyte,12.77,13.12,...,15.41,13.61,13.24,13.05,13.22,13.30,13.05,13.26,13.27,13.04
TCT,S,0.68,20.99,3.07,19.17,type_II_pneumocyte,22.24,granulocytopoietic_cell,20.59,20.04,...,22.01,21.00,20.98,20.17,21.35,20.86,21.09,21.35,21.69,21.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GGA,G,1.05,24.27,5.12,22.26,Langerhans_cell,27.39,cardiac_muscle_cell,24.15,24.24,...,24.12,25.55,23.37,24.32,24.29,23.58,25.31,23.43,25.12,22.95
GGG,G,1.11,21.12,6.80,16.39,bladder_cell,23.18,granulocyte,21.51,22.38,...,23.18,21.58,20.65,22.44,21.86,21.48,19.76,21.11,20.74,20.21
TAA,stop,3.90,36.67,20.13,24.30,type_II_pneumocyte,44.42,T_cell,34.63,33.33,...,34.85,36.41,40.80,31.05,36.13,36.65,35.05,38.19,43.35,40.29
TAG,stop,4.49,22.08,23.92,17.36,Langerhans_cell,41.27,granulocyte,20.70,22.19,...,41.27,21.33,21.43,24.66,19.54,23.16,21.00,20.58,21.94,19.71


In [7]:
# # Iterate over the subplots and plot something on each one
# pics_dir = Path("/home/unix/wangyanz/codon_usage/scRNA/pics")
# for aa in codon_dict:
#     if len(codon_dict[aa]) < 2:
#         continue
#     # codon_freq = {}
#     # for codon in codon_dict[aa]:
#     #     codon_freq[codon] = 0
#     idx = 0
#     fig, axs = plt.subplots(11, 5, figsize=(12, 24))
#     for cell in cell_data:
#         codon_value = []
        
#         for codon in codon_dict[aa]:
#             codon_value.append(cell_data[cell][codon])
#         codon_value = np.array(codon_value)
#         codon_value = codon_value / codon_value.sum()
#         j = idx%5
#         i = idx//5
#         ax = axs[i, j]
#         # Plot something on the current subplot
#         ax.set_title(f'{cell}')
#         ax.pie(codon_value, labels=codon_dict[aa], autopct='%1.1f%%', shadow=True, startangle=90)
#         ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
#         idx += 1
#     plt.tight_layout()
#     plt.savefig(pics_dir.joinpath(f"{aa}.png"))
