In [1]:
import pandas as pd
import os
import anndata as ad

In [2]:
# Location of the single-cell dataset. Note that this points at one .h5ad
# file (eye.h5ad), not at a directory; it is the path handed to
# ad.read_h5ad() further down.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
scrna_path = "/mnt/c/Users/donna/Downloads/Thesis/data/scrnatest/eye.h5ad"

In [3]:
# Load the reference spreadsheet (only the cell_type and tissue columns), drop
# rows where either value is missing, and add two normalized columns used for
# matching against the AnnData annotations:
#   normalized_cell_type - cell_type with everything after its last "_" removed
#                          (presumably a trailing suffix/index - TODO confirm)
#   normalized_tissue    - tissue lower-cased, with every run of hyphens or
#                          whitespace collapsed into a single "_"
# Built as one non-mutating chain (no inplace=True) so re-running the cell
# always starts from the file contents, and with vectorized .str operations
# instead of a Python-level loop.
reference_df = (
    pd.read_excel(
        "/mnt/c/Users/donna/Downloads/Thesis/rankjes/paper_data.xlsx",
        usecols=["cell_type", "tissue"],
    )
    .dropna(subset=["cell_type", "tissue"])
    .assign(
        normalized_cell_type=lambda df: df["cell_type"].str.rsplit("_", n=1).str[0],
        normalized_tissue=lambda df: df["tissue"].str.lower().str.replace(r'[-\s]+', '_', regex=True),
    )
)

In [6]:
# Collect the .h5ad file names to process.
# scrna_path may be a directory of .h5ad files or a single .h5ad file; calling
# os.listdir() on a file raises NotADirectoryError, so both cases are handled.
# Directory listings are sorted because os.listdir() returns names in
# arbitrary order.
if os.path.isdir(scrna_path):
    h5ad_files = sorted(f for f in os.listdir(scrna_path) if f.endswith(".h5ad"))
elif os.path.isfile(scrna_path):
    h5ad_files = [os.path.basename(scrna_path)] if scrna_path.endswith(".h5ad") else []
else:
    raise FileNotFoundError(f"No such file or directory: {scrna_path!r}")

In [4]:
# Compare the cell-type names listed in the reference table with the ones
# annotated in the AnnData file for a single tissue, and export both lists
# side by side (shared names first) as a CSV.
cell_name_rows = []

# Load the dataset and rewrite its cell-type labels so that ", ", " " and "-"
# all become "_", making them comparable with normalized_cell_type.
adata = ad.read_h5ad(scrna_path)
adata.obs["cell_type"] = [
    name.replace(", ", "_").replace(" ", "_").replace("-", "_")
    for name in adata.obs["cell_type"]
]
# Keep only the cells whose assay is exactly "10x 3' v3".
adata = adata[adata.obs["assay"] == "10x 3' v3", :]

# Derive the tissue from the file that was actually loaded (instead of a
# hard-coded "eye.h5ad"), then lower-case it and turn hyphens/whitespace into
# "_" so it follows the same convention as reference_df["normalized_tissue"].
tissue_stem = os.path.splitext(os.path.basename(scrna_path))[0]
tissue_name = "_".join(tissue_stem.lower().replace("-", " ").split())

filtered_reference = reference_df[reference_df["normalized_tissue"] == tissue_name]
paper_types = sorted(filtered_reference["normalized_cell_type"].unique())
anndata_types = sorted(adata.obs["cell_type"].unique())

# Names present on both sides, then the names unique to each side.
matches = sorted(set(paper_types) & set(anndata_types))
paper_only = sorted(set(paper_types) - set(matches))
anndata_only = sorted(set(anndata_types) - set(matches))
print(len(matches))

# Matches go first so identical names share a row; the shorter column is then
# padded with None once so both columns have the same length.
max_len = len(matches) + max(len(paper_only), len(anndata_only))
combined_paper = matches + paper_only
combined_paper += [None] * (max_len - len(combined_paper))
combined_anndata = matches + anndata_only
combined_anndata += [None] * (max_len - len(combined_anndata))

# Build tissue-specific DataFrame
df_tissue = pd.DataFrame({
    f"{tissue_name}_paper_types": combined_paper,
    f"{tissue_name}_anndata_types": combined_anndata
})

cell_name_rows.append(df_tissue)


# Combine all tissue DataFrames horizontally (currently a single tissue)
final_df = pd.concat(cell_name_rows, axis=1)

# Save to CSV; the file name follows the tissue (celltype_eye.csv for eye.h5ad)
final_df.to_csv(f"/mnt/c/Users/donna/Downloads/Thesis/rankjes/celltype_{tissue_name}.csv", index=False)

# Optional preview
final_df


21


Unnamed: 0,eye_paper_types,eye_anndata_types
0,B_cell,B_cell
1,CD4_positive_alpha_beta_T_cell,CD4_positive_alpha_beta_T_cell
2,CD8_positive_alpha_beta_T_cell,CD8_positive_alpha_beta_T_cell
3,Mueller_cell,Mueller_cell
4,T_cell,T_cell
5,conjunctival_epithelial_cell,conjunctival_epithelial_cell
6,corneal_epithelial_cell,corneal_epithelial_cell
7,eye_photoreceptor_cell,eye_photoreceptor_cell
8,fibroblast,fibroblast
9,keratocyte,keratocyte
