In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.family'] = 'Arial'
import scanpy as sc
from pathlib import Path
from tqdm.auto import tqdm

In [None]:
import sys

# Add the helper-library directory to the import search path (presumably
# project-local modules — verify what lives in ../../mylibs). The path is
# relative, so it is resolved against the notebook's working directory.
# The membership guard keeps the cell idempotent: re-running it does not pile
# up duplicate entries in sys.path.
mylibs_dir = '../../mylibs'
if mylibs_dir not in sys.path:
    sys.path.append(mylibs_dir)

In [None]:
# Root directory of this run's outputs, made absolute so later path handling
# does not depend on the working directory. Its final component ("ZT-238") is
# reused further down as the prefix of the saved figure's file name.
results_dir = Path("../../data/ZT-238/").absolute()

# Filtered and raw GeneFull count-matrix directories under the STARsolo output.
solo_out_filtered = results_dir.joinpath("starsolo_outputs/Solo.out/GeneFull/filtered")
solo_out_raw = results_dir.joinpath("starsolo_outputs/Solo.out/GeneFull/raw")

In [None]:
# Filtered matrix: its obs table is expected to carry a 'species' column, which
# the per-species split further down reads.
filtered_h5ad = solo_out_filtered.joinpath("matrix.stats.h5ad")
adata = sc.read_h5ad(filtered_h5ad)

# Raw matrix: every barcode, used to build the full barcode-rank curve.
raw_h5ad = solo_out_raw.joinpath("matrix.h5ad")
adata_raw = sc.read_h5ad(raw_h5ad)

## Barcode Rank Plot

In [None]:
# Total UMI count per barcode = row sums of the count matrix.
# If X is a scipy sparse matrix, .sum(axis=1) returns an (n_obs, 1) np.matrix;
# flatten it to a plain 1-D array before storing it as an obs column so the
# assignment does not rely on pandas accepting single-column 2-D input.
# For a dense X the row sums are already 1-D and the ravel is a no-op.
adata.obs['n_counts_umi_total'] = np.asarray(adata.X.sum(axis=1)).ravel()
adata_raw.obs['n_counts_umi_total'] = np.asarray(adata_raw.X.sum(axis=1)).ravel()

# Number of barcodes in the filtered matrix; drawn as the vertical marker on
# the barcode rank plot.
expected_num_cells = adata.n_obs

# Raw per-barcode totals sorted from highest to lowest.
umi_counts = np.sort(adata_raw.obs['n_counts_umi_total'].to_numpy())[::-1]

# 0-based position of each barcode in the descending order above.
barcode_rank = range(len(umi_counts))

In [None]:
# Display the filtered matrix's per-barcode metadata table (rendered as the
# cell output because it is the last expression). The 'species' and
# 'n_counts_umi_total' columns are the ones the following cell reads.
adata.obs

In [None]:
# Split the filtered barcodes' total UMI counts by their 'species' label.
# The boolean masks are applied positionally to the obs column, which selects
# the same rows as subsetting the AnnData object first.
species_label = adata.obs['species']
umi_totals = adata.obs['n_counts_umi_total']

grch38_counts = umi_totals[(species_label == 'GRCh38').to_numpy()].values
grcm39_counts = umi_totals[(species_label == 'GRCm39').to_numpy()].values
multiplet_counts = umi_totals[(species_label == 'Multiplet').to_numpy()].values

from joblib import Parallel, delayed

def get_index_by_value_or_nearest(array, val):
    """Locate ``val`` in ``array``, falling back to the closest element.

    Parameters
    ----------
    array : array-like
        Values to search.
    val : scalar
        Value to look up.

    Returns
    -------
    int
        Index of the first element equal to ``val``. If no element matches,
        the index of the element with the smallest absolute difference to
        ``val`` (the first such index when several are equally close).
    """
    exact_matches = np.nonzero(array == val)[0]
    if exact_matches.size == 0:
        # No exact hit: pick the element closest to val.
        return np.abs(array - val).argmin()
    return exact_matches[0]

def get_first_indices_in_descending(sorted_desc, values):
    """Vectorized lookup of values in an array sorted in descending order.

    Gives the same indices as calling ``get_index_by_value_or_nearest`` once
    per value, but uses a binary search instead of one full scan of the array
    per value, so no process pool is needed.

    Parameters
    ----------
    sorted_desc : array-like
        1-D values sorted from highest to lowest.
    values : array-like
        Values to look up.

    Returns
    -------
    list
        For each value, the index of its first occurrence in ``sorted_desc``;
        for a value that is not present, the index chosen by
        ``get_index_by_value_or_nearest``.
    """
    sorted_desc = np.asarray(sorted_desc)
    values = np.asarray(values)

    # searchsorted needs ascending order; the reversed view provides it.
    ascending = sorted_desc[::-1]
    left = np.searchsorted(ascending, values, side='left')
    right = np.searchsorted(ascending, values, side='right')

    # Matches occupy ascending[left:right]; the last of them is the first
    # occurrence in descending order, i.e. index size - right.
    indices = ascending.size - right

    # left == right means the value is absent: defer to the per-value helper
    # so the nearest-element fallback is unchanged.
    for pos in np.flatnonzero(left == right):
        indices[pos] = get_index_by_value_or_nearest(sorted_desc, values[pos])

    return list(indices)


# Position on the raw barcode-rank curve of each filtered barcode's UMI total.
# umi_counts is sorted in descending order, which the lookup relies on.
# Barcodes with equal totals map to the same (first) position.
grch38_indices = get_first_indices_in_descending(umi_counts, grch38_counts)
grcm39_indices = get_first_indices_in_descending(umi_counts, grcm39_counts)
multiplet_indices = get_first_indices_in_descending(umi_counts, multiplet_counts)

In [None]:
from matplotlib.lines import Line2D

# Barcode rank plot: total UMIs per raw barcode against its rank (log-log),
# with the filtered barcodes of each species marked on the curve.
fig, ax = plt.subplots(figsize=(10, 7), dpi=300)

# A log-scaled x axis has no valid position for 0, so ranks are drawn 1-based
# (top barcode = rank 1). Plotting the 0-based positions would leave the top
# barcode without a proper location on the axis.
plot_rank = np.arange(1, len(umi_counts) + 1)

# Draw the dashed gray line first so it's in the lowest layer
ax.loglog(plot_rank, umi_counts, linewidth=3, color="#bababa", linestyle='--', zorder=0)

# Draw the red vertical line above the gray line; it marks the number of
# barcodes in the filtered matrix.
ax.axvline(x=expected_num_cells, linewidth=1, linestyle='--', color="#c00000", zorder=1)

# Species markers sit on top of the curve. The x positions are shifted by one
# to match the 1-based ranks; the y values still index umi_counts 0-based.
human = ax.scatter(np.asarray(grch38_indices) + 1, umi_counts[grch38_indices], color="#f88ec6", s=100, marker='|', alpha=0.5, label="Human", zorder=2)
mouse = ax.scatter(np.asarray(grcm39_indices) + 1, umi_counts[grcm39_indices], color="#a46bb7", s=20, marker='|', alpha=0.5, label="Mouse", zorder=2)
multiplet = ax.scatter(np.asarray(multiplet_indices) + 1, umi_counts[multiplet_indices], color="#a6d7a3", s=300, marker='|', alpha=0.1, label="Multiplet", zorder=2)

ax.set_xlabel("Barcode rank", fontsize=10, fontweight='bold')
ax.set_ylabel("Total UMIs", fontsize=10, fontweight='bold')
# Fixed upper axis limits; barcodes ranked beyond 1e7 or totals above 1e5
# fall outside the visible area.
ax.set_xlim(right=1e7)
ax.set_ylim(top=1e5)
ax.tick_params(axis='both', which='major', labelsize=10)
ax.tick_params(axis='both', which='minor', labelsize=10)

# Legend entries are standalone opaque markers with a common size rather than
# the scatter artists themselves, which use per-species sizes and low alpha.
custom_lines = [
    Line2D([0], [0], color="#f88ec6", marker='|', linestyle='None', markersize=10, label="Human"),
    Line2D([0], [0], color="#a46bb7", marker='|', linestyle='None', markersize=10, label="Mouse"),
    Line2D([0], [0], color="#a6d7a3", marker='|', linestyle='None', markersize=10, label="Multiplet")
]
ax.legend(handles=custom_lines, loc="lower left", frameon=True, fontsize=10, title_fontsize=10, title=None)

ax.grid(False, which="both")
for label in (ax.get_xticklabels() + ax.get_yticklabels()):
    label.set_fontweight('bold')

# Write the figure to disk before showing it, so the saved file does not
# depend on what the active backend does with the figure after plt.show().
save_results_dir = Path("results/figures")
save_results_dir.mkdir(parents=True, exist_ok=True)
fig.savefig(save_results_dir / f"{results_dir.name}_barcode_rank.svg", dpi=300, bbox_inches='tight')

plt.show()