In [4]:
import scanpy as sc
import anndata as an
import pandas as pd

In [5]:
# Absolute path to the h5ad file that this notebook starts from.
fpath = "/nfs/turbo/umms-indikar/shared/projects/HSC/data/pellin_2019/pellin.anndata.h5ad"

# Read the file with scanpy; the bare `adata` on the last line makes the cell
# display the object's summary.
adata = sc.read_h5ad(fpath)
adata

AnnData object with n_obs × n_vars = 21412 × 20582
    obs: 'Barcode', 'Library', 'dataset'
    var: 'gene_name'
    layers: 'raw_counts'

In [None]:
# NOTE(review): `break` outside a loop is a SyntaxError, so this cell always
# fails. Presumably a deliberate guard to halt "Run All" at this point -
# confirm before removing.
break

In [None]:
def aggregate_gene_counts(adata, layer='raw_counts'):
    """
    Collapse suffixed gene identifiers and sum their counts cell by cell.

    Every variable name is truncated at its first "." (so ``GENE.1`` and
    ``GENE.2`` both become ``GENE``) and all variables that share the
    resulting base name are added together. The sum is taken separately for
    each cell; counts are never pooled across cells.

    Args:
        adata: An AnnData object (anything exposing ``to_df(layer=...)`` that
            returns a cells x genes pandas DataFrame works).
        layer: Name of the layer to read the counts from. Defaults to
            'raw_counts'.

    Returns:
        pandas.DataFrame: Cells as rows and base gene names as columns (the
        columns axis is named 'gene_name' and is sorted by ``groupby``). The
        values are the summed counts of every variable mapped to that base
        name.
    """
    # Work gene-wise: one row per variable, one column per cell.
    counts = adata.to_df(layer=layer).T

    # Move the gene identifiers out of the index so they can be rewritten.
    counts = counts.reset_index(names='gene_name')

    # Pre-aggregation info for debugging (includes the 'gene_name' column).
    print(f"Initial shape: {counts.shape}")

    # Keep only the text before the first "."; regex=False makes it explicit
    # that the dot is a literal separator, not a wildcard.
    counts['gene_name'] = (
        counts['gene_name'].astype(str).str.split(".", n=1, regex=False).str[0]
    )

    # Merge rows that now share a base gene name, summing within each cell.
    counts = counts.groupby('gene_name').sum()

    # Post-aggregation info
    print(f"Final shape: {counts.shape}")

    # Hand back the conventional cells x genes orientation.
    return counts.T

# Aggregate the loaded dataset and preview the first rows of the returned frame.
df = aggregate_gene_counts(adata)
df.head()

In [None]:
# Inline gene-level aggregation: unlike aggregate_gene_counts, the result is
# left with genes as rows and cells as columns (no final transpose).
df = adata.to_df(layer='raw_counts').T.reset_index(names='gene_name')
print(f"{df.shape=}")

# Keep only the part of each identifier that precedes the first "."
df['gene_name'] = [name.split(".")[0] for name in df['gene_name']]

# Sum rows that share a base identifier, column by column.
df = df.groupby('gene_name').sum()
print(f"{df.shape=}")
df.head()

In [None]:
def create_anndata_from_dataframe(df, adata):
    """
    Build a new AnnData object from a count table, reusing the observation
    metadata of an existing AnnData object.

    Args:
        df: A pandas DataFrame laid out as observations x variables: one row
            per cell (row labels become ``obs_names``) and one column per
            gene (column labels become ``var_names`` and are also stored in a
            var column named 'ensembl_id'). Rows must be in the same order as
            ``adata.obs``, because the metadata is attached positionally and
            no index alignment is performed.
        adata: An existing AnnData object whose ``obs`` table is copied onto
            the new object.

    Returns:
        anndata.AnnData: The newly created AnnData object.

    Raises:
        ValueError: If ``df`` does not have exactly one row per row of
            ``adata.obs`` (typically because ``df`` was passed as
            genes x cells instead of cells x genes).
    """
    # Fail early with an actionable message rather than a length mismatch
    # raised from inside the AnnData setters.
    n_rows = df.shape[0]
    n_expected = adata.obs.shape[0]
    if n_rows != n_expected:
        raise ValueError(
            f"df has {n_rows} rows but adata.obs has {n_expected}; "
            "df must be cells x genes (transpose it if it is genes x cells)."
        )

    # Create AnnData from the DataFrame's numerical values (rows = observations)
    pdf = an.AnnData(df.to_numpy())

    # Copy observation metadata (e.g., cell IDs, annotations)
    pdf.obs = adata.obs.copy()

    # Record the column labels of df as a variable annotation
    pdf.var = pd.DataFrame({'ensembl_id': df.columns})

    # Set variable and observation names from the DataFrame
    pdf.var_names = df.columns
    pdf.obs_names = df.index

    return pdf

# `df` as left by the inline aggregation cell above is genes x cells, while
# create_anndata_from_dataframe takes obs_names from df.index and var_names
# from df.columns. Transpose so that rows are cells.
pdf = create_anndata_from_dataframe(df.T, adata)
pdf

In [None]:
# Preview the first rows of the observation metadata attached to `pdf`.
pdf.obs.head()

In [None]:
# `df` is genes x cells here (gene identifiers in the index, cells in the
# columns), but AnnData stores observations as rows. Transpose the matrix so
# it matches the obs/var assignments below.
pdf = an.AnnData(df.T.to_numpy())
pdf.obs = adata.obs.copy()
pdf.var = pd.DataFrame({'ensembl_id' : df.index})
pdf.var_names = df.index
pdf.obs_names = df.columns

pdf


In [None]:
# Preview the first rows of the variable annotation attached to `pdf`.
pdf.var.head()

In [None]:
# An empty-string key is not a column of adata.var and raises KeyError; the
# only var column reported when the object was loaded is 'gene_name'.
adata.var['gene_name'].head()

In [None]:
# Point at a different h5ad file. Note that `fpath` and `adata` are rebound
# here, so the previously loaded object is no longer reachable by these names.
fpath = "/nfs/turbo/umms-indikar/shared/projects/geneformer/data/rajapakse_lab_data_jpic.h5ad"

adata = sc.read_h5ad(fpath)
adata

In [None]:
# Display the observation metadata table of the currently loaded object.
adata.obs

In [None]:
# Load yet another h5ad file, again rebinding `fpath` and `adata`.
fpath = "/scratch/indikar_root/indikar1/cstansbu/HSC/scanpy/clustered.anndata.h5ad"

adata = sc.read_h5ad(fpath)
adata

In [None]:
# Count how many observations carry each value of the 'celltype' column.
adata.obs['celltype'].value_counts()

In [None]:
# For observations whose 'celltype' is 'FB', tally the 'organ_tissue' values.
adata.obs.loc[adata.obs['celltype'] == 'FB', 'organ_tissue'].value_counts()