In [2]:
import scanpy as sc
import pandas as pd

Matplotlib is building the font cache; this may take a moment.


In [3]:
# Load the AnnData object from the .h5ad file.
H5AD_PATH = "fibroblast_CRISPRa_mean_pop.h5ad"
adata = sc.read_h5ad(H5AD_PATH)

# Quick inspection, in order: the AnnData summary (shape, obs/var columns,
# layers), the first rows of the per-row metadata (obs), and the first rows
# of the per-column metadata (var).
for summary in (adata, adata.obs.head(), adata.var.head()):
    print(summary)

# DataFrame view of X, plus its shape and the leading row/column labels.
df = adata.to_df()
print(df.shape, df.index[:5], df.columns[:5])

AnnData object with n_obs × n_vars = 10916 × 4914
    obs: 'protospacer', 'target_gene', 'target_expr', 'active', 'cluster', 'cluster_name', 'cluster_description', 'gene_driven', 'sequence_driven', 'sequence_suffix', 'bulk_expr', 'expressed', 'p_expressed', 'de_genes', 'strength', 'cell_count', 'target_gene_id', 'masked_active', 'masked_cluster', 'masked_cluster_name', 'masked_cluster_description', 'masked_cluster_size', 'masked_cluster_num_genes', 'stricter_masked_cluster', 'stricter_masked_cluster_description'
    var: 'feature_types', 'genome', 'mean', 'in_matrix', 'std', 'cv', 'fano', 'pairwise_p_cost', 'mean_adjusted_pairwise_p_cost', 'excess_cv', 'pairwise_chosen', 'gene_id'
    layers: 'adj_p', 'masked', 'p'
                                    protospacer target_gene  target_expr  \
guide_identity                                                             
AATF_GCGCAGAAGGTTGAAGGGAT  GCGCAGAAGGTTGAAGGGAT        AATF          NaN   
AATF_GCGTGCGAGTGCGCGGGAAG  GCGTGCGAGTGCGCGGGAAG

In [4]:
# STEP 2: Filter for high-confidence guides (masked_active)
#
# adata.obs holds per-guide metadata. The 'active' / 'masked_active' flags
# are used to decide which guides to keep. Per the author's notes,
# 'masked_active' is the stricter of the two, so it is preferred and 'active'
# is only a fallback. In this run only 'masked_active' ended up being used.

obs = adata.obs
print("obs columns:", obs.columns.tolist())

n_guides_total = adata.n_obs
print(f"Total guides before filtering: {n_guides_total}")

# Choose the flag column: prefer the stricter 'masked_active' if available.
if "masked_active" in obs.columns:
    print("Using 'masked_active' for filtering (strict).")
    flag_col = "masked_active"
elif "active" in obs.columns:
    print("WARNING: 'masked_active' not found, falling back to 'active'.")
    flag_col = "active"
else:
    print("WARNING: No 'masked_active' or 'active' found. Keeping all guides.")
    flag_col = None

if flag_col is None:
    # No activity flag at all: keep every guide.
    guide_mask = pd.Series(True, index=adata.obs_names)
else:
    flags = obs[flag_col]
    n_missing = int(flags.isna().sum())
    if n_missing:
        # A plain astype(bool) would turn NaN into True (and raises for the
        # nullable 'boolean' dtype), silently keeping guides whose activity
        # is unknown. Treat a missing flag as "not active" instead.
        print(f"WARNING: {n_missing} guides have a missing '{flag_col}' flag; "
              "treating them as inactive.")
        flags = flags.astype(object).where(flags.notna(), False)
    guide_mask = flags.astype(bool)

n_guides_keep = guide_mask.sum()
print(f"Guides kept after filter: {n_guides_keep} "
      f"({n_guides_keep / n_guides_total:.2%} of total)")

# .copy() materialises the subset so later cells do not operate on a view.
adata_filt = adata[guide_mask].copy()
print(adata_filt)


obs columns: ['protospacer', 'target_gene', 'target_expr', 'active', 'cluster', 'cluster_name', 'cluster_description', 'gene_driven', 'sequence_driven', 'sequence_suffix', 'bulk_expr', 'expressed', 'p_expressed', 'de_genes', 'strength', 'cell_count', 'target_gene_id', 'masked_active', 'masked_cluster', 'masked_cluster_name', 'masked_cluster_description', 'masked_cluster_size', 'masked_cluster_num_genes', 'stricter_masked_cluster', 'stricter_masked_cluster_description']
Total guides before filtering: 10916
Using 'masked_active' for filtering (strict).
Guides kept after filter: 661 (6.06% of total)
AnnData object with n_obs × n_vars = 661 × 4914
    obs: 'protospacer', 'target_gene', 'target_expr', 'active', 'cluster', 'cluster_name', 'cluster_description', 'gene_driven', 'sequence_driven', 'sequence_suffix', 'bulk_expr', 'expressed', 'p_expressed', 'de_genes', 'strength', 'cell_count', 'target_gene_id', 'masked_active', 'masked_cluster', 'masked_cluster_name', 'masked_cluster_descriptio

In [5]:
# Count how many of the retained guides map to each target gene
# (value_counts orders the result from most to least frequent).
target_genes = adata_filt.obs["target_gene"]
tf_counts = target_genes.value_counts()

print(tf_counts.head())
print("Total TFs:", tf_counts.shape[0])


target_gene
ALX4     6
CDX2     6
HELZ2    6
GLIS3    6
EGR3     6
Name: count, dtype: int64
Total TFs: 213


**We filtered on `masked_active` only, which leaves 661 guides covering 213 TFs — good enough for now.**

In [6]:
# STEP 3: guide-level data to TF-level profiles

# Guide-level matrix taken from the filtered AnnData object:
#   rows   = filtered guides (guide_identity)
#   cols   = genes
#   values = whatever is stored in X (presumably normalized expression —
#            TODO confirm against the dataset description)
df_guides = adata_filt.to_df()
print("Guide-level expression matrix:", df_guides.shape)

# sanity check indices/cols
print("Example guide IDs:", df_guides.index[:5].tolist())
print("Example gene names:", df_guides.columns[:5].tolist())

# Each guide targets one TF, stored in obs['target_gene'], so a TF can have
# several guide rows. Attach the TF label as a column, then group by it.
if "target_gene" not in adata_filt.obs.columns:
    raise ValueError("Expected 'target_gene' column in adata.obs, but it's missing.")

# 'TF' is also a real gene symbol (transferrin). If the expression matrix had
# a column with that name, the assignment below would silently overwrite its
# values with the TF labels, so fail loudly instead.
if "TF" in df_guides.columns:
    raise ValueError(
        "Expression matrix already has a column named 'TF'; "
        "it would be overwritten by the TF label column."
    )

df_guides["TF"] = adata_filt.obs["target_gene"].values

# distinct TFs we have after filtering
tf_counts = df_guides["TF"].value_counts()
print(f"Number of unique TFs after filtering: {tf_counts.shape[0]}")
print("Top 10 TFs by number of guides:")
print(tf_counts.head(10))

MIN_GUIDES_PER_TF = 1  # raise this to require several guides per TF (QC)
valid_tfs = tf_counts[tf_counts >= MIN_GUIDES_PER_TF].index
df_guides = df_guides[df_guides["TF"].isin(valid_tfs)]

print(f"Guides remaining after TF-level min-guide filter: {df_guides.shape[0]}")
print(f"TFs remaining after TF-level min-guide filter: {df_guides['TF'].nunique()}")

# Group by TF and average each gene across all of that TF's guides.
# observed=True matters when 'TF' is categorical: it restricts the result to
# TFs that actually have rows (so TFs removed by the min-guide filter do not
# come back as all-NaN rows) and avoids the pandas deprecation warning about
# the default value of `observed`.
df_tf = df_guides.groupby("TF", observed=True).mean()

print("TF-level matrix shape (TFs x genes):", df_tf.shape)
print("Example TFs:", df_tf.index[:5].tolist())
print("Example genes:", df_tf.columns[:5].tolist())


Guide-level expression matrix: (661, 4914)
Example guide IDs: ['AFF1_GGAACCCGGTACCGCAGCTC', 'AFF1_GGGCGCCACCCAAGCGGCAA', 'AFF1_GTTGGGTGGCGCCAGCTAGA', 'ALX4_GAGAGAGAGGCCGGCGTGGA', 'ALX4_GCGGGCGGGGACGCGAGCGA']
Example gene names: ['DPM1', 'FUCA2', 'NIPAL3', 'ANKIB1', 'CYP51A1']
Number of unique TFs after filtering: 213
Top 10 TFs by number of guides:
TF
ALX4      6
CDX2      6
HELZ2     6
GLIS3     6
EGR3      6
SALL3     6
TP73      6
ZNF296    6
PRDM1     6
FOXE1     5
Name: count, dtype: int64
Guides remaining after TF-level min-guide filter: 661
TFs remaining after TF-level min-guide filter: 213
TF-level matrix shape (TFs x genes): (213, 4914)
Example TFs: ['AFF1', 'ALX4', 'ARID1A', 'ARID3A', 'BARX1']
Example genes: ['DPM1', 'FUCA2', 'NIPAL3', 'ANKIB1', 'CYP51A1']


  df_tf = df_guides.groupby("TF").mean()


In [7]:
# STEP 4: Make a (TF, gene, value) df for ML

# Input is df_tf, the wide TF-level matrix:
#   rows   = TFs (index named 'TF')
#   cols   = genes
#   values = per-gene value averaged over that TF's guides

# Move the TF index into an ordinary 'TF' column so melt can use it as the id.
df_tf_reset = df_tf.reset_index()

# Wide -> long: one row per (TF, gene) combination.
melt_kwargs = {
    "id_vars": "TF",        # identifier kept on every row
    "var_name": "gene",     # former column labels
    "value_name": "expr",   # the matrix entries
}
pairs = df_tf_reset.melt(**melt_kwargs)

print("Long table shape (rows = TF-gene pairs):", pairs.shape)
print(pairs.head())

# Basic sanity checks
print("Number of unique TFs:", pairs["TF"].nunique())
print("Number of unique genes:", pairs["gene"].nunique())

# Report how many pairs have no value, then discard those rows.
n_nan = pairs["expr"].isna().sum()
print(f"Number of TF-gene pairs with NaN expr: {n_nan}")
pairs = pairs.dropna(subset=["expr"])
print("Shape after dropping NaN expr rows:", pairs.shape)


Long table shape (rows = TF-gene pairs): (1046682, 3)
       TF  gene      expr
0    AFF1  DPM1 -0.068800
1    ALX4  DPM1 -0.107178
2  ARID1A  DPM1 -0.221361
3  ARID3A  DPM1  0.031947
4   BARX1  DPM1 -0.118612
Number of unique TFs: 213
Number of unique genes: 4914
Number of TF-gene pairs with NaN expr: 0
Shape after dropping NaN expr rows: (1046682, 3)


In [9]:
# Persist the TF-gene table and the gene / TF name lists for the
# promoter-sequence step.

import os


def _write_lines(path, items):
    """Write each item of `items` to `path`, one per line."""
    with open(path, "w") as handle:
        handle.writelines(item + "\n" for item in items)


OUT_DIR = "data_processed"
os.makedirs(OUT_DIR, exist_ok=True)

# TF-gene table: per the author's note, the main input for the promoter
# sequence pulling script.
pairs_path = os.path.join(OUT_DIR, "tf_gene_pairs.csv")
pairs.to_csv(pairs_path, index=False)
print(f"Saved TF-gene pairs to: {pairs_path}")
print(pairs.head())

# Sorted list of the distinct genes, one per line.
gene_list = sorted(pairs["gene"].unique())
genes_path = os.path.join(OUT_DIR, "genes_in_pairs.txt")
_write_lines(genes_path, gene_list)
print(f"Saved {len(gene_list)} unique genes to: {genes_path}")

# Sorted list of the distinct TFs, one per line.
tf_list = sorted(pairs["TF"].unique())
tfs_path = os.path.join(OUT_DIR, "tfs_in_pairs.txt")
_write_lines(tfs_path, tf_list)
print(f"Saved {len(tf_list)} unique TFs to: {tfs_path}")


Saved TF-gene pairs to: data_processed/tf_gene_pairs.csv
       TF  gene      expr
0    AFF1  DPM1 -0.068800
1    ALX4  DPM1 -0.107178
2  ARID1A  DPM1 -0.221361
3  ARID3A  DPM1  0.031947
4   BARX1  DPM1 -0.118612
Saved 4914 unique genes to: data_processed/genes_in_pairs.txt
Saved 213 unique TFs to: data_processed/tfs_in_pairs.txt
