In [None]:
"""
UMAP
"""

#import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import umap
from collections import Counter

# Load the labelled matrix. The file is read without a pandas header:
# row 0 holds one 0/1 label per column (from column 1 onward) and
# column 0 is later used as the row index (`gene_col`).
csv_path = "H358_with_fixed_noise.csv"
print(f"Reading CSV: {csv_path}")

df_raw = pd.read_csv(csv_path, header=None)
print(f"Raw shape: {df_raw.shape}")

# Labels go through float first so that values such as 1.0 end up as int 1.
first_row = df_raw.iloc[0, :]
labels_row = first_row.iloc[1:].astype(float).astype(int).tolist()
unique_labels = set(labels_row)
print(f"Labels found in first row: {unique_labels}")
if not unique_labels <= {0, 1}:
    raise ValueError(f"ERROR: Invalid label values: {unique_labels}")

# Everything below row 0 is data; row 0 is reused as the column names.
df_data = df_raw.iloc[1:, :].copy()
df_data.columns = first_row

# Index by the first column, then parse the remaining cells as numbers.
gene_col = df_data.columns[0]
df_data = df_data.set_index(gene_col).apply(pd.to_numeric)

# Transpose so each labelled CSV column becomes one row.
df = df_data.T
print(f"Data shape (cells x genes): {df.shape}")

# Translate the numeric labels into readable group names.
label_mapping = {0: "H358", 1: "H358_noise"}
label_names = list(map(label_mapping.__getitem__, labels_row))

# Count the cells per group and report the totals.
label_counts = Counter(label_names)
print("\n Labels and counts:")
for group_name, n_cells in label_counts.items():
    print(f"  {group_name}: {n_cells} cells")

# PCA: reduce the matrix to at most 50 components before running UMAP.
print("\n Running PCA...")
X = df.values
y = np.array(label_names)

# PCA cannot extract more components than min(n_rows, n_columns); asking for
# 50 on a smaller matrix raises a ValueError, so cap the request.
n_pca_components = min(50, X.shape[0], X.shape[1])
if n_pca_components < 50:
    print(f"Matrix is smaller than 50 in one dimension; using {n_pca_components} PCA components.")
pca = PCA(n_components=n_pca_components, random_state=42)
X_pca = pca.fit_transform(X)

# UMAP: embed the PCA scores; the seed is fixed so reruns give the same layout.
print("Running UMAP...")
umap_model = umap.UMAP(n_neighbors=15, min_dist=0.3, random_state=42)
X_umap = umap_model.fit_transform(X_pca)

# Scatter plot of the embedding (one colour per group), saved as a PNG.
print("Plotting and Saving UMAP...")

# Soft colours for H358 vs. noise
palette = {
    "H358": "#0072B2",       # Blue
    "H358_noise": "#D55E00"  # Orange
}

axis_label_style = dict(fontsize=14, fontweight='bold')
tick_label_style = dict(fontsize=12, fontweight='bold')

plt.figure(figsize=(8, 6))
# Groups are drawn in sorted order, which also fixes the legend order.
for group in sorted(label_counts):
    in_group = (y == group)
    plt.scatter(
        X_umap[in_group, 0],
        X_umap[in_group, 1],
        s=12,
        alpha=0.5,
        color=palette[group],
        label=f"{group} (n={label_counts[group]})",
    )

plt.xlabel("UMAP 1", **axis_label_style)
plt.ylabel("UMAP 2", **axis_label_style)
plt.xticks(**tick_label_style)
plt.yticks(**tick_label_style)

# Bold legend title and entries, matching the axis styling.
legend = plt.legend(frameon=True, fontsize=12)
legend.get_title().set_fontweight('bold')
for entry in legend.get_texts():
    entry.set_fontweight('bold')

plt.tight_layout()
plt.savefig("UMAP_H358_vs_noise.png", dpi=300)
plt.show()
print("UMAP saved as 'UMAP_H358_vs_noise.png'")

In [None]:
"""
LogREG
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Load the labelled matrix. The file is read without a pandas header:
# row 0 holds one 0/1 label per column (from column 1 onward) and
# column 0 holds the row names used as the index.
csv_path = "integrated_alpha_as_0_integrated_beta_as_1.csv"
df_raw = pd.read_csv(csv_path, header=None)
print(f"CSV loaded. Shape (raw): {df_raw.shape}")

# Labels: parse via float first so values stored as "0.0"/"1.0" do not make
# the integer cast fail.
labels = df_raw.iloc[0, 1:].astype(float).astype(int).values
labels_numeric = labels

if not set(labels_numeric).issubset({0, 1}):
    raise ValueError("Error: Label have to be 0 or 1.")

df_data = df_raw.iloc[1:, :].copy()
df_data.columns = df_raw.iloc[0, :]
# Index by the first column whatever its header cell is called, instead of
# requiring the top-left cell to be exactly "label" (which raised KeyError).
df_data = df_data.set_index(df_data.columns[0])
df_data = df_data.apply(pd.to_numeric)

# Transpose so rows are cells and columns are genes.
df = df_data.T
X = df.values
gene_names = df.columns.to_list()

print(f" Matrix: {X.shape[0]} cells × {X.shape[1]} genes")

# Standardise the matrix column-wise so coefficient magnitudes are comparable.
print("Scaling gene expression matrix (Z-score)...")
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

# Fit one L2-penalised logistic regression on all cells.
print("Fitting Logistic Regression (L2, liblinear)...")
logreg_options = dict(penalty='l2', solver='liblinear', random_state=42, max_iter=1000)
clf = LogisticRegression(**logreg_options).fit(X_scaled, labels_numeric)

# One coefficient per gene; its sign says which label the gene favours.
coefs = clf.coef_[0]

# Rank genes by absolute coefficient, largest first, and write the table.
logreg_df = (
    pd.DataFrame({'gene': gene_names, 'coefficient': coefs})
    .assign(
        score=lambda frame: frame['coefficient'].abs(),
        direction=lambda frame: np.where(frame['coefficient'] > 0, 'Up in 1', 'Up in 0'),
    )
    .sort_values(by='score', ascending=False)
)
logreg_df.to_csv("LogReg_results_scaled.csv", index=False)
print("saved LogReg_results_scaled.csv")

# Bar plot of the top genes by |coefficient|, coloured by direction.
label_mapping = {0: "integrated_alpha", 1: "integrated_beta"}
up_in_label_0 = f"Up in {label_mapping[0]}"
up_in_label_1 = f"Up in {label_mapping[1]}"
palette = {
    up_in_label_0: "#4daf4a",  # green
    up_in_label_1: "#984ea3",  # purple
}

# Replace the numeric direction tags with the group names used in the legend:
# a positive coefficient means higher in label 1.
logreg_df["direction"] = np.where(
    logreg_df["coefficient"] > 0, up_in_label_1, up_in_label_0
)

top_n = 10
top_logreg = logreg_df.sort_values(by="score", ascending=False).iloc[:top_n]

plt.figure(figsize=(10, 6), dpi=600)
bar_options = dict(hue='direction', dodge=False, palette=palette, edgecolor='black')
ax = sns.barplot(data=top_logreg, x='score', y='gene', **bar_options)

ax.set_title(f"Top {top_n} Genes by Logistic Regression", fontsize=16, weight='bold')
ax.set_xlabel('Absolute Coefficient (|β|, standardized)', fontsize=14, weight='bold')
ax.set_ylabel('Gene', fontsize=14, weight='bold')
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=12, width=1.5, length=6)

for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
    tick_label.set_fontweight('bold')

# Legend with bold title and entries.
leg = ax.legend(title="Direction", fontsize=12, title_fontsize=13, loc="lower right")
leg.get_title().set_fontweight('bold')
for entry in leg.get_texts():
    entry.set_fontweight('bold')

sns.despine()
plt.tight_layout()
plt.savefig("logreg_top10_scaled_colored.png", dpi=600)
plt.show()
print("saved logreg_top10_scaled_colored.png")

In [None]:
"""
Wilcoxon
"""

import pandas as pd
import numpy as np
from scipy.stats import ranksums
from statsmodels.stats.multitest import multipletests

# Configuration: input/output files and the thresholds used to call DEGs.
INPUT_CSV = "H358_with_fixed_noise.csv"
OUTPUT_CSV = "DEG_Wilcoxon_results.csv"
LOG2FC_THRESHOLD = 1
ADJ_PVAL_THRESHOLD = 0.05

# Load: the file is read without a pandas header; row 0 holds the 0/1 labels.
print(f"Reading: {INPUT_CSV}")
df_raw = pd.read_csv(INPUT_CSV, header=None)
print(f"Raw shape: {df_raw.shape}")

# Labels: parse via float first so values stored as "0.0"/"1.0" do not make
# the integer cast fail.
labels = df_raw.iloc[0, 1:].astype(float).astype(int).values
if not set(labels).issubset({0, 1}):
    raise ValueError("Labels must be 0 or 1 only.")

header = df_raw.iloc[0, :].tolist()
header[0] = "label"  # first column = gene names
df = df_raw.iloc[1:, :].copy()
df.columns = header
# header[0] was just set to "label", so that column always exists; the former
# membership check could never fire and has been removed.
df = df.set_index("label")
df = df.apply(pd.to_numeric, errors="coerce").T  # cells = rows

# errors="coerce" turns unparsable cells into NaN without any message, so
# report how many values were affected.
n_missing = int(df.isna().sum().sum())
if n_missing:
    print(f"Warning: {n_missing} expression values are missing or non-numeric (NaN after coercion).")

print(f"Expression matrix shape: {df.shape}")

# Per-gene Wilcoxon rank-sum test between label-0 and label-1 cells.
# Produces `results`, a list of (gene, p-value, log2 fold change) tuples.
print("Running Wilcoxon rank-sum test...")
genes = df.columns
X = df.values
group_0 = labels == 0
group_1 = labels == 1
# With an empty group every mean and test statistic would be NaN.
if not group_0.any() or not group_1.any():
    raise ValueError("Both label 0 and label 1 must contain at least one cell.")

results = []
for i, gene in enumerate(genes):
    vals_0 = X[group_0, i]
    vals_1 = X[group_1, i]

    # Shortcut for groups with exactly the same values. np.array_equal returns
    # False when the group sizes differ, whereas `vals_0 == vals_1` cannot be
    # broadcast in that case and raises on recent NumPy versions.
    if np.array_equal(vals_0, vals_1):
        p, log2fc = 1.0, 0.0
    else:
        try:
            _, p = ranksums(vals_0, vals_1)
            # The 1e-9 pseudocount keeps log2 defined when a group mean is 0.
            log2fc = np.log2(np.mean(vals_1) + 1e-9) - np.log2(np.mean(vals_0) + 1e-9)
        except Exception as e:
            # Best effort: record the gene as untested and keep going.
            p, log2fc = np.nan, 0.0
            print(f"Error for gene {gene}: {e}")

    results.append((gene, p, log2fc))

# Assemble the result table, adjust p-values, call DEGs and save to CSV.
res_df = pd.DataFrame(results, columns=["gene", "p_val", "log2FC"])

# Benjamini-Hochberg correction on the genes that have a p-value. A single
# NaN passed to multipletests turns every adjusted p-value into NaN, so
# untested genes are excluded and keep p_val_adj = NaN.
testable = res_df["p_val"].notna()
res_df["p_val_adj"] = np.nan
if testable.any():
    res_df.loc[testable, "p_val_adj"] = multipletests(
        res_df.loc[testable, "p_val"], method="fdr_bh"
    )[1]
# The 1e-300 offset keeps -log10 finite when an adjusted p-value is 0.
res_df["neglog10padj"] = -np.log10(res_df["p_val_adj"] + 1e-300)

# DEG call: |log2FC| above the threshold and adjusted p below the threshold.
# Comparisons against NaN are False, so untested genes stay "Not significant".
is_significant = res_df["p_val_adj"] < ADJ_PVAL_THRESHOLD
res_df["Significance"] = "Not significant"
res_df.loc[
    (res_df["log2FC"] > LOG2FC_THRESHOLD) & is_significant,
    "Significance"
] = "Up in label 1"

res_df.loc[
    (res_df["log2FC"] < -LOG2FC_THRESHOLD) & is_significant,
    "Significance"
] = "Up in label 0"

# Most significant genes first; NaN rows sort to the end.
res_df = res_df.sort_values("p_val_adj")

# Save
res_df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ DEG results saved to: {OUTPUT_CSV}")

In [None]:
"""
Violin
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the labelled matrix. The file is read without a pandas header:
# row 0 holds one 0/1 label per column (from column 1 onward) and
# column 0 holds the row names used as the index.
csv_path = "integrated_alpha_as_0_integrated_beta_as_1.csv"
df_raw = pd.read_csv(csv_path, header=None)

# Labels: parse via float first so values stored as "0.0"/"1.0" do not make
# the integer cast fail.
labels = df_raw.iloc[0, 1:].astype(float).astype(int).values
if not set(labels).issubset({0, 1}):
    raise ValueError("Error: Labels have to to be 0 or 1")

label_mapping = {0: "integrated_alpha", 1: "integrated_beta"}
label_names = [label_mapping[l] for l in labels]

df_data = df_raw.iloc[1:, :].copy()
df_data.columns = df_raw.iloc[0, :]
# Index by the first column whatever its header cell is called, instead of
# requiring the top-left cell to be exactly "label" (which raised KeyError).
df_data = df_data.set_index(df_data.columns[0])
# Transpose so rows are cells and columns are genes.
df_data = df_data.apply(pd.to_numeric).T

# Select the genes to plot and fail early, with a readable message, if any
# of them is absent from the matrix columns.
genes_of_interest = ["INS", "GCG", "GC"]
missing_genes = [g for g in genes_of_interest if g not in df_data.columns]
if missing_genes:
    raise ValueError(f"Genes of interest not found in expression matrix: {missing_genes}")

# log1p-transform all selected genes in one vectorised call; the result is a
# new frame, so df_data itself is left untouched.
df_plot = np.log1p(df_data[genes_of_interest])
df_plot["Group"] = label_names

# Long format: one row per (cell, gene), as seaborn's catplot expects.
df_long = df_plot.melt(id_vars="Group", var_name="Gene", value_name="log1p(Counts)")

# One violin panel per gene, comparing the two groups; saved as a PNG.
group_palette = {"integrated_alpha": "#4daf4a", "integrated_beta": "#984ea3"}
# NOTE(review): newer seaborn releases appear to rename `scale` to
# `density_norm` for violins; confirm against the installed version.
violin_options = dict(cut=0, scale="width", linewidth=1.2, palette=group_palette)
g = sns.catplot(
    data=df_long,
    kind="violin",
    x="Group",
    y="log1p(Counts)",
    col="Gene",
    col_wrap=3,
    sharey=False,
    height=4,
    aspect=1,
    **violin_options,
)

# Styling: bold axis labels and tick labels on every panel.
g.set_axis_labels("Group", "log1p(Counts)")

label_style = dict(fontsize=12, fontweight='bold')
for ax in g.axes.flat:
    ax.set_xlabel(ax.get_xlabel(), **label_style)
    ax.set_ylabel(ax.get_ylabel(), **label_style)
    ax.tick_params(axis='both', labelsize=11, width=1.5, length=6)
    for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
        tick_label.set_fontweight('bold')

# Leave room above the panels for the figure-level title.
g.fig.suptitle('Violin Plots of Top Genes by Group', fontsize=16, fontweight='bold')
plt.subplots_adjust(top=0.88)

# Save
g.fig.savefig("violinplots_top3_by_group_600dpi.png", dpi=600, bbox_inches='tight')
plt.show()