In [None]:
"""
UMAP - all genes
"""

#imports
import scanpy as sc
import matplotlib.pyplot as plt

# Load the dataset and show its summary
adata = sc.read_h5ad("GSE185862_SSv4_norm.h5ad")
print(adata)

# Keep only cells whose donor sex label is 'M' or 'F'
is_m_or_f = adata.obs['donor_sex_label'].isin(['M', 'F'])
adata = adata[is_m_or_f, :].copy()
print(f"Filtered shape: {adata.shape}")

# NOTE(review): sc.pp.normalize_total(target_sum=1e4) + sc.pp.log1p are
# intentionally not run here; this presumes the input file is already
# normalised / log-transformed — TODO confirm against how the file was made.

# Scale each gene, capping scaled values at 10
sc.pp.scale(adata, max_value=10)

# PCA on every gene (no highly-variable-gene selection)
sc.tl.pca(adata, svd_solver='arpack')

# Neighbour graph on the first 30 PCs, then the UMAP embedding
sc.pp.neighbors(adata, n_pcs=30, n_neighbors=15)
sc.tl.umap(adata)

# Plot: one scatter layer per sex, drawn in the palette's order (M first, then F)
custom_palette = {
    'M': '#6baed6',    # light blue
    'F': '#e377c2'     # pink
}

fig, ax = plt.subplots(figsize=(8, 8))

umap_coords = adata.obsm['X_umap']
for group, group_colour in custom_palette.items():
    # Boolean row mask for this sex, applied directly to the embedding
    in_group = (adata.obs['donor_sex_label'] == group).to_numpy()
    group_xy = umap_coords[in_group]
    ax.scatter(
        group_xy[:, 0],
        group_xy[:, 1],
        s=30,
        label=group,
        color=group_colour,
        alpha=0.6,
        edgecolors='none'
    )

# Axis labels, ticks and spines
ax.set_xlabel('UMAP 1', fontsize=16, fontweight='bold')
ax.set_ylabel('UMAP 2', fontsize=16, fontweight='bold')
ax.tick_params(axis='both', labelsize=14, width=1.5, length=6)
plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), fontweight='bold')
for hidden_side in ('top', 'right'):
    ax.spines[hidden_side].set_visible(False)
for visible_side in ('left', 'bottom'):
    ax.spines[visible_side].set_linewidth(1.5)

# Legend with bold title and entries
leg = ax.legend(title='Sex', fontsize=14, title_fontsize=16, frameon=True)
leg.get_title().set_fontweight('bold')
plt.setp(leg.get_texts(), fontweight='bold')

# Save at 600 dpi, then display
plt.tight_layout()
plt.savefig('umap_sex_full_adata_600dpi.png', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
"""
UMAP - gene subset
"""

#imports
import scanpy as sc
import matplotlib.pyplot as plt

# Load the dataset and show its summary
adata = sc.read_h5ad("GSE185862_SSv4_norm.h5ad")
print(adata)

# Keep only cells whose donor sex label is 'M' or 'F'
adata = adata[adata.obs['donor_sex_label'].isin(['M', 'F'])].copy()
print(f"Filtered shape: {adata.shape}")

# Subset to the 6 genes
genes_of_interest = ['Tsix', 'Xist', 'Eif2s3y', 'Ddx3y', 'Uty', 'Kdm5d']

# Fail early with an explicit message if any requested gene is absent,
# rather than relying on the indexing error raised by the subset below.
missing_genes = [g for g in genes_of_interest if g not in adata.var_names]
if missing_genes:
    raise KeyError(f"Genes not found in adata.var_names: {missing_genes}")

adata = adata[:, genes_of_interest].copy()
print(f"Subsetted shape: {adata.shape}")

# Scale each gene, capping scaled values at 10
sc.pp.scale(adata, max_value=10)

# Neighbourhood size shared by BOTH branches below. Previously the fallback
# branch omitted n_neighbors and therefore used scanpy's default instead of
# the value used in the PCA branch.
n_neighbors = 10

# PCA (only meaningful with more than two features), then neighbours
n_features = adata.shape[1]
if n_features > 2:
    # arpack needs n_comps strictly below the number of features
    n_comps = min(5, n_features - 1)
    print(f"Running PCA with n_comps = {n_comps}")
    sc.tl.pca(adata, n_comps=n_comps, svd_solver='arpack')
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, n_pcs=n_comps)
else:
    print("Skipping PCA, using raw X for neighbors.")
    sc.pp.neighbors(adata, n_neighbors=n_neighbors, use_rep='X')

# UMAP
sc.tl.umap(adata)

# Plot: one scatter layer per sex, drawn in the palette's order (M first, then F)
custom_palette = {
    'M': '#6baed6',  # light blue
    'F': '#e377c2'   # pink
}

fig, ax = plt.subplots(figsize=(8, 8))

umap_coords = adata.obsm['X_umap']
for group, group_colour in custom_palette.items():
    # Boolean row mask for this sex, applied directly to the embedding
    in_group = (adata.obs['donor_sex_label'] == group).to_numpy()
    group_xy = umap_coords[in_group]
    ax.scatter(
        group_xy[:, 0],
        group_xy[:, 1],
        s=30,
        label=group,
        color=group_colour,
        alpha=0.6,
        edgecolors='none'
    )

# Axis labels, ticks and spines
ax.set_xlabel('UMAP 1', fontsize=16, fontweight='bold')
ax.set_ylabel('UMAP 2', fontsize=16, fontweight='bold')
ax.tick_params(axis='both', labelsize=14, width=1.5, length=6)
plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), fontweight='bold')
for hidden_side in ('top', 'right'):
    ax.spines[hidden_side].set_visible(False)
for visible_side in ('left', 'bottom'):
    ax.spines[visible_side].set_linewidth(1.5)

# Legend with bold title and entries
leg = ax.legend(title='Sex', fontsize=14, title_fontsize=16, frameon=True)
leg.get_title().set_fontweight('bold')
plt.setp(leg.get_texts(), fontweight='bold')

# Save at 600 dpi, then display
plt.tight_layout()
plt.savefig('umap_sex_6genes_600dpi.png', dpi=600, bbox_inches='tight')
plt.show()

In [None]:
"""
LogREG & Barplot
"""

# Configuration for the logistic-regression cell (paths, grouping, plot size)
INPUT_H5AD = "GSE185862_SSv4_norm.h5ad"  # file read with sc.read_h5ad
OUTPUT_DIR = "DE_Results_LogReg"         # created if missing; results go in its GLOBAL/ subfolder
GROUP_COLUMN = "donor_sex_label"         # adata.obs column used for filtering and as `groupby`
REFERENCE = "M"                          # passed as `reference` to rank_genes_groups
QUERY = "F"                              # group whose rows are kept in the result table
TOP_N_GENES = 20                         # number of highest-scoring genes shown in the barplot

#imports
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib defaults for the figures produced in this cell
logreg_plot_style = {
    # titles and axis labels
    "axes.titlesize": 16,
    "axes.titleweight": "bold",
    "axes.labelsize": 14,
    "axes.labelweight": "bold",
    # tick labels
    "xtick.labelsize": 12,
    "ytick.labelsize": 12,
    # legend
    "legend.fontsize": 12,
    "legend.title_fontsize": 12,
    # axes frame
    "axes.edgecolor": "black",
    "axes.linewidth": 1.2,
}
plt.rcParams.update(logreg_plot_style)

# Make sure the output folder exists before anything is written
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the dataset
adata = sc.read_h5ad(INPUT_H5AD)
print(f"Loaded {adata.n_obs} cells × {adata.n_vars} genes.")

# Keep only cells belonging to the query or reference group
in_comparison = adata.obs[GROUP_COLUMN].isin([QUERY, REFERENCE])
adata = adata[in_comparison, :].copy()
print(f"After filtering for F/M: {adata.n_obs} cells.")

#helpers
def run_logreg_de(adata_sub, fname_prefix):
    """Rank genes for QUERY vs REFERENCE with logistic regression and save the table.

    The analysis runs on an internal copy of ``adata_sub``: the highly-variable
    gene subsetting and the scaling below modify their input in place, so
    working on a copy keeps the caller's object (all genes, unscaled values)
    intact. As a consequence the ranking is NOT stored on the caller's object;
    it is only returned as a DataFrame and written to disk.

    Parameters
    ----------
    adata_sub : AnnData
        Cells to analyse; ``adata_sub.obs[GROUP_COLUMN]`` must hold the group labels.
    fname_prefix : str
        Output path without extension; ``.csv`` and ``.xlsx`` are appended.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``names`` and ``score`` (plus ``log2FC`` if the ranking
        provides ``logfoldchanges``), restricted to the QUERY group.
        ``None`` if a group is missing / too small or no results were produced.
    """
    counts = adata_sub.obs[GROUP_COLUMN].value_counts()
    print(f"Groups: {dict(counts)}")

    # Both comparison groups must be present with at least 2 cells each.
    # A group that is absent altogether does not show up as "< 2" in `counts`,
    # so it is checked explicitly instead of letting rank_genes_groups fail.
    unusable = [g for g in (QUERY, REFERENCE) if counts.get(g, 0) < 2]
    if unusable or (counts < 2).any():
        print("Too few cells in one group → skipping.")
        return None

    # Private working copy — see docstring.
    adata_work = adata_sub.copy()

    #HVGs only
    sc.pp.highly_variable_genes(adata_work, n_top_genes=2000, subset=True)
    sc.pp.scale(adata_work, max_value=10)

    #Logistic Regression test
    sc.tl.rank_genes_groups(
        adata_work,
        groupby=GROUP_COLUMN,
        reference=REFERENCE,
        method="logreg",
        pts=True,
        key_added="logreg"
    )

    res = sc.get.rank_genes_groups_df(adata_work, group=None, key='logreg')
    if res.empty:
        print("No results.")
        return None

    # The 'group' column is only present in some result layouts; filter when it is.
    if 'group' in res.columns:
        res = res[res['group'] == QUERY]

    # Keep whichever of the expected columns exist, then apply the short names
    # used by the plotting helper.
    res = res[[c for c in ['names', 'scores', 'logfoldchanges'] if c in res.columns]]
    res = res.rename(columns={"scores": "score", "logfoldchanges": "log2FC"})

    res.to_csv(f"{fname_prefix}.csv", index=False)
    res.to_excel(f"{fname_prefix}.xlsx", index=False)
    print(f"Saved: {fname_prefix}.csv/.xlsx")
    return res


def plot_barplot(df, title, outpath, top_n=20):
    """Save a horizontal barplot of the ``top_n`` highest-scoring genes.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``score`` column and a ``names`` column; an optional
        ``log2FC`` column is used to colour the bars.
    title : str
        Text appended to the plot title and used in log messages.
    outpath : str
        Image file to write.
    top_n : int
        Number of rows with the largest ``score`` to show.

    Returns
    -------
    None. Nothing is drawn or written if ``score`` is missing or all-null.
    """
    if "score" not in df.columns:
        print(f"No 'score' column → skipping barplot for {title}.")
        return

    # Work on a copy restricted to rows that actually have a score
    scored = df.copy()
    scored = scored[scored["score"].notnull()]
    if scored.empty:
        print(f"No valid scores to plot for {title}.")
        return

    ranked = scored.nlargest(top_n, "score")
    colour_by_fold_change = "log2FC" in ranked.columns

    fig, ax = plt.subplots(figsize=(8, 6), dpi=600)
    if colour_by_fold_change:
        sns.barplot(
            data=ranked,
            x="score", y="names",
            hue="log2FC", dodge=False,
            palette="vlag",
            ax=ax
        )
        # Legend sits outside the axes, to the right
        ax.legend(title="log2FC", bbox_to_anchor=(1.05, 1), loc="upper left")
    else:
        sns.barplot(
            data=ranked,
            x="score", y="names",
            color="gray",
            ax=ax
        )
        if ax.legend_:
            ax.legend_.remove()

    ax.set_title(f"Top {top_n} Markers: {title}", fontsize=16, weight="bold")
    ax.set_xlabel("Logistic Regression Score", fontsize=14, weight="bold")
    ax.set_ylabel("Gene", fontsize=14, weight="bold")
    fig.tight_layout()
    fig.savefig(outpath, dpi=600)
    plt.close(fig)
    print(f"Barplot saved: {outpath}")

# Global logistic-regression run: all filtered cells, QUERY vs REFERENCE
print("\n--- GLOBAL Logistic Regression ---")
global_out = os.path.join(OUTPUT_DIR, "GLOBAL")
os.makedirs(global_out, exist_ok=True)

deg_prefix = os.path.join(global_out, "DEG")
barplot_png = os.path.join(global_out, "barplot.png")

global_res = run_logreg_de(adata, deg_prefix)
if global_res is not None:
    # Only plot when at least one gene has a usable score
    scored_res = global_res.dropna(subset=["score"])
    top_genes = list(scored_res.nlargest(TOP_N_GENES, "score")["names"])

    if top_genes:
        plot_barplot(global_res, "Global", barplot_png, top_n=TOP_N_GENES)

print("\n DONE!")

In [None]:
"""
Wilcoxon & Volcano
"""

# Configuration for the Wilcoxon / volcano cell (paths, grouping, plot settings)
INPUT_H5AD    = "GSE185862_SSv4_norm.h5ad"  # file read with sc.read_h5ad
OUTPUT_DIR    = "DE_Results_Wilcoxon"       # created if missing; results go in its GLOBAL/ subfolder
GROUP_COLUMN  = "donor_sex_label"           # adata.obs column used for filtering and as `groupby`
REFERENCE     = "M"                         # passed as `reference` to rank_genes_groups
QUERY         = "F"                         # group whose rows are kept in the result table
TOP_LABELS_VOLCANO = 10                     # max number of significant genes labelled on the volcano
DPI           = 600                         # resolution for the volcano figure and the saved image

#imports
import os
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams

# Matplotlib defaults for the figures produced in this cell
volcano_plot_style = {
    # titles and axis labels
    "axes.titlesize": 18,
    "axes.titleweight": "bold",
    "axes.labelsize": 16,
    "axes.labelweight": "bold",
    # tick labels
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
    # legend
    "legend.fontsize": 12,
    "legend.title_fontsize": 14,
    # axes frame
    "axes.edgecolor": "black",
    "axes.linewidth": 1.5,
}
rcParams.update(volcano_plot_style)

# Make sure the output folder exists before anything is written
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load the dataset, then keep only query / reference cells
adata = sc.read_h5ad(INPUT_H5AD)
print(f"Loaded {adata.n_obs} cells × {adata.n_vars} genes.")
in_comparison = adata.obs[GROUP_COLUMN].isin([QUERY, REFERENCE])
adata = adata[in_comparison, :].copy()
print(f"Filtered: {adata.obs[GROUP_COLUMN].value_counts().to_dict()}")

#helpers
def safe_rank_genes_groups(adata_sub, fname_prefix):
    """Run a Wilcoxon QUERY-vs-REFERENCE ranking and save the result table.

    The ranking is stored on ``adata_sub`` under the key ``"wilcoxon"``
    (``key_added``), so pass a copy if the original object must stay unchanged.

    Parameters
    ----------
    adata_sub : AnnData
        Cells to analyse; ``adata_sub.obs[GROUP_COLUMN]`` must hold the group labels.
    fname_prefix : str
        Output path without extension; ``.csv`` and ``.xlsx`` are appended.

    Returns
    -------
    pandas.DataFrame or None
        Result rows for the QUERY group with ``logfoldchanges`` renamed to
        ``log2FC`` and ``pvals_adj`` renamed to ``p_val_adj``; rows lacking
        either value are dropped. ``None`` if a group is missing / has fewer
        than 5 cells, or if no results were produced.
    """
    counts = adata_sub.obs[GROUP_COLUMN].value_counts()
    print(f"Groups: {dict(counts)}")

    # Both comparison groups must be present with at least 5 cells each.
    # A group that is absent altogether does not show up as "< 5" in `counts`,
    # so it is checked explicitly instead of letting rank_genes_groups fail.
    unusable = [g for g in (QUERY, REFERENCE) if counts.get(g, 0) < 5]
    if unusable or (counts < 5).any():
        print("Too few cells - Skipping")
        return None

    sc.tl.rank_genes_groups(
        adata_sub,
        groupby=GROUP_COLUMN,
        reference=REFERENCE,
        method="wilcoxon",
        pts=True,
        key_added="wilcoxon"
    )

    res = sc.get.rank_genes_groups_df(adata_sub, group=None, key='wilcoxon')
    if res.empty:
        print("Empty results")
        return None

    # The 'group' column is only present in some result layouts; filter when it is.
    if 'group' in res.columns:
        res = res[res['group'] == QUERY]

    # Short column names expected by plot_volcano; drop rows it cannot place.
    res = res.rename(columns={'logfoldchanges': 'log2FC', 'pvals_adj': 'p_val_adj'})
    res = res.dropna(subset=["log2FC", "p_val_adj"])

    res.to_csv(f"{fname_prefix}.csv", index=False)
    res.to_excel(f"{fname_prefix}.xlsx", index=False)
    return res


def plot_volcano(df, title, outpath, padj_cutoff=0.05, lfc_cutoff=1.0):
    """Draw and save a volcano plot from a differential-expression table.

    Points are classed as up / down in the QUERY group when their adjusted
    p-value is below ``padj_cutoff`` and their log2 fold change exceeds
    ``lfc_cutoff`` in magnitude. The category labels are built from the
    module-level ``QUERY`` so they stay correct if the comparison is changed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``names``, ``log2FC`` and ``p_val_adj`` columns.
        The input frame is not modified.
    title : str
        Text appended to the plot title.
    outpath : str
        Image file to write (saved at the module-level ``DPI``).
    padj_cutoff : float, default 0.05
        Adjusted p-value significance threshold (also the horizontal guide line).
    lfc_cutoff : float, default 1.0
        Absolute log2 fold-change threshold (also the vertical guide lines).
    """
    up_label = f"Up in {QUERY}"
    down_label = f"Down in {QUERY}"

    df = df.copy()
    significant = df["p_val_adj"] < padj_cutoff
    df["sig"] = "Not sig"
    df.loc[significant & (df["log2FC"] > lfc_cutoff), "sig"] = up_label
    df.loc[significant & (df["log2FC"] < -lfc_cutoff), "sig"] = down_label

    # Floor p-values at 1e-300 so exact zeros do not become infinite on the log axis
    df["-log10p"] = -np.log10(df["p_val_adj"].clip(lower=1e-300))

    plt.figure(figsize=(8,8), dpi=DPI)
    palette = {up_label: "red", down_label: "blue", "Not sig": "gray"}
    ax = sns.scatterplot(
        data=df, x="log2FC", y="-log10p",
        hue="sig", palette=palette, edgecolor=None, s=40
    )

    # Guide lines at the significance and fold-change thresholds
    plt.axhline(-np.log10(padj_cutoff), ls="--", color="black")
    plt.axvline(lfc_cutoff, ls="--", color="black")
    plt.axvline(-lfc_cutoff, ls="--", color="black")

    plt.title(f"Volcano: {title}", weight="bold")
    plt.xlabel("log2FC", weight="bold")
    plt.ylabel("-log10(adj p)", weight="bold")

    # Label only the TOP_LABELS_VOLCANO most significant hits
    top_hits = df[df["sig"] != "Not sig"].nlargest(TOP_LABELS_VOLCANO, "-log10p")
    for _, row in top_hits.iterrows():
        # Key lookup rather than attribute access: unambiguous for a column called 'names'
        ax.text(row["log2FC"], row["-log10p"], row["names"],
                 fontsize=10, weight="bold", ha="center", va="bottom")

    ax.legend(title="Significance", loc="upper center", bbox_to_anchor=(0.5, -0.12), ncol=3, frameon=False)
    plt.tight_layout()
    plt.savefig(outpath, dpi=DPI)
    plt.close()

# Global Wilcoxon run: all filtered cells, QUERY vs REFERENCE
print("\n--- GLOBAL Wilcoxon ---")
global_out = os.path.join(OUTPUT_DIR, "GLOBAL")
os.makedirs(global_out, exist_ok=True)

deg_prefix = os.path.join(global_out, "DEG")
volcano_png = os.path.join(global_out, "volcano.png")

# The ranking helper writes into the object it receives, so hand it a copy
global_subset = adata.copy()
res = safe_rank_genes_groups(global_subset, deg_prefix)

if res is not None:
    plot_volcano(res, "Global", volcano_png)

print("\n DONE!")

In [None]:
"""
Violinplots
"""

#imports
import scanpy as sc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Configuration for the violin-plot cell (input file, genes, grouping, colours)
H5AD_PATH = "GSE185862_SSv4_norm.h5ad"  # file read with sc.read_h5ad
GENES_OF_INTEREST = ["Tsix", "Xist", "Eif2s3y", "Ddx3y", "Uty", "Kdm5d"]  # one violin panel per gene
GROUP_COLUMN = "donor_sex_label"  # adata.obs column used for filtering and as the x axis
GROUP_ORDER = ["M", "F"]  # groups kept by the filter, and their left-to-right order in each panel
# NOTE(review): the figure is saved as "violinplots_sex_umapcolors.png", but these
# hex values differ from the palette used in the UMAP cells
# ('#6baed6' / '#e377c2') — confirm which colours are intended.
PALETTE = {"M": "#56B4E9", "F": "#E78AC3"} 

# Load the dataset and keep only the groups listed in GROUP_ORDER
adata = sc.read_h5ad(H5AD_PATH)
keep_cells = adata.obs[GROUP_COLUMN].isin(GROUP_ORDER)
adata = adata[keep_cells].copy()

# Prefer the .raw layer when the file has one, otherwise use the main matrix
expression_source = adata if adata.raw is None else adata.raw
adata_use = expression_source[:, GENES_OF_INTEREST]

# Densify if the matrix is not already a NumPy array
X = adata_use.X if isinstance(adata_use.X, np.ndarray) else adata_use.X.toarray()

# One row per cell: the gene columns followed by the group label
df = pd.DataFrame(X, columns=GENES_OF_INTEREST)
df[GROUP_COLUMN] = adata.obs[GROUP_COLUMN].values

# log1p is applied unconditionally.
# NOTE(review): if the values are already log-transformed this logs them a
# second time — TODO confirm what the input matrix contains.
df[GENES_OF_INTEREST] = np.log1p(df[GENES_OF_INTEREST])

# Long format: one row per (cell, gene) pair
value_label = "log1p(Counts)"
df_long = df.melt(id_vars=GROUP_COLUMN, var_name="Gene", value_name=value_label)

# Facet layout: one panel per gene, two panels per row, independent y axes
facet_options = dict(col="Gene", col_wrap=2, sharey=False, height=4, aspect=1)
# Violin appearance: stop at the data range, equal widths, fixed group order/colours
violin_options = dict(cut=0, scale="width", linewidth=1.2, palette=PALETTE, order=GROUP_ORDER)

g = sns.catplot(
    data=df_long,
    x=GROUP_COLUMN,
    y=value_label,
    kind="violin",
    **facet_options,
    **violin_options
)

# Panel titles (gene name in bold italic) and shared axis labels
g.set_titles(col_template="{col_name}", size=14, fontweight='bold', style='italic')
g.set_axis_labels("Sex", "log1p(Counts)")

# Bold axis labels and tick labels on every panel
for panel in g.axes.flat:
    panel.set_xlabel(panel.get_xlabel(), fontsize=12, fontweight='bold')
    panel.set_ylabel(panel.get_ylabel(), fontsize=12, fontweight='bold')
    panel.tick_params(axis='both', labelsize=11, width=1.5, length=6)
    plt.setp(panel.get_xticklabels() + panel.get_yticklabels(), fontweight='bold')

# Figure title, with head-room so it does not overlap the top panels
g.fig.suptitle("Violin Plots of Selected Genes by Sex", fontsize=16, fontweight='bold')
plt.subplots_adjust(top=0.88)

# Save at 600 dpi, then display
g.fig.savefig("violinplots_sex_umapcolors.png", dpi=600, bbox_inches='tight')
plt.show()