In [14]:
# 1) Patient-level aggregation: compute per-patient / per-timepoint features and save CSV
import pandas as pd
import numpy as np
from scipy.stats import entropy
from pathlib import Path
import scanpy as sc
import warnings
warnings.filterwarnings('ignore')

output_dir = Path('../Processed_Data')
output_dir.mkdir(parents=True, exist_ok=True)

# Ensure `adata` is available (load processed file if not in memory)
if 'adata' not in globals():
    processed_path = Path('../Processed_Data/processed_s_rna_seq_data.h5ad')
    if processed_path.exists():
        print(f'Loading processed AnnData from {processed_path}')
        adata = sc.read_h5ad(processed_path)
    else:
        raise FileNotFoundError('`adata` not in memory and processed file not found.')

# Identify patient and timepoint columns
patient_col = 'patient_id' if 'patient_id' in adata.obs.columns else ('patient' if 'patient' in adata.obs.columns else None)
timepoint_col = 'timepoint' if 'timepoint' in adata.obs.columns else ('sample_id' if 'sample_id' in adata.obs.columns else None)
if patient_col is None:
    raise RuntimeError('No patient identifier column found in adata.obs (expected `patient_id` or `patient`).')

# Prefer TRB clonotypes, fallback to TRA or generic clone id
clone_col = None
for c in ['cdr3_TRB','cdr3_TRA','clone_id','clonotype']:
    if c in adata.obs.columns:
        clone_col = c
        break

# Helper diversity functions
def simpson_index(counts):
    counts = np.asarray(counts)
    if counts.sum() == 0:
        return np.nan
    p = counts / counts.sum()
    return float((p ** 2).sum())

def gini_coefficient(counts):
    x = np.asarray(counts, dtype=float)
    if x.size == 0 or x.sum() == 0:
        return np.nan
    x = np.sort(x)
    n = x.size
    index = np.arange(1, n + 1)
    return float((2.0 * np.sum(index * x)) / (n * x.sum()) - (n + 1) / n)

# Heuristic cluster columns to produce cluster fractions
cluster_columns = [c for c in adata.obs.columns if ('cluster' in c.lower() or c.endswith('_clusters') or c == 'gene_expression_modules')]
if not cluster_columns:
    cluster_columns = [c for c in adata.obs.columns if pd.api.types.is_categorical_dtype(adata.obs[c]) and 2 <= adata.obs[c].nunique() <= 50]
cluster_columns = list(dict.fromkeys(cluster_columns))

group_cols = [patient_col] + ([timepoint_col] if timepoint_col is not None else [])
grp = adata.obs.groupby(group_cols)

records = []
for name, df in grp:
    rec = {}
    if isinstance(name, tuple):
        rec['patient_id'] = name[0]
        if timepoint_col is not None:
            rec['timepoint'] = name[1]
    else:
        rec['patient_id'] = name
        if timepoint_col is not None:
            rec['timepoint'] = ''
    rec['n_cells'] = int(df.shape[0])
    # sample/patient-level response if present
    if 'response' in df.columns:
        try:
            rec['response'] = str(df['response'].mode().iloc[0])
        except Exception:
            rec['response'] = ''
    else:
        rec['response'] = ''

    # clonotype metrics
    if clone_col is not None:
        clones = df[clone_col].dropna().astype(str)
        if len(clones) > 0:
            vc = clones.value_counts()
            counts = vc.values
            rec['n_unique_clones'] = int(vc.size)
            # Shannon (entropy), Simpson, Gini (clonality metrics)
            rec['shannon_diversity'] = float(entropy(counts / counts.sum()))
            rec['simpson_index'] = float(simpson_index(counts))
            rec['gini_clonality'] = float(gini_coefficient(counts))
            # top clonotypes
            top = vc.head(5)
            total = counts.sum() if counts.sum() > 0 else 1
            for i, (cl, v) in enumerate(top.items(), start=1):
                rec[f'top_clonotype_{i}_seq'] = cl
                rec[f'top_clonotype_{i}_count'] = int(v)
                rec[f'top_clonotype_{i}_frac'] = float(v / total)
            rec['frac_top1'] = float(top.iloc[0] / total) if len(top) > 0 else 0.0
            rec['frac_top5'] = float(top.sum() / total) if len(top) > 0 else 0.0
        else:
            rec['n_unique_clones'] = 0
            rec['shannon_diversity'] = np.nan
            rec['simpson_index'] = np.nan
            rec['gini_clonality'] = np.nan
            rec['frac_top1'] = np.nan
            rec['frac_top5'] = np.nan
    else:
        rec['n_unique_clones'] = np.nan
        rec['shannon_diversity'] = np.nan
        rec['simpson_index'] = np.nan
        rec['gini_clonality'] = np.nan
        rec['frac_top1'] = np.nan
        rec['frac_top5'] = np.nan

    # mean physicochemical properties (if encoded)
    phys_cols = [c for c in adata.obs.columns if c.startswith('tra_') or c.startswith('trb_')]
    for pc in phys_cols:
        rec[f'mean_{pc}'] = float(df[pc].dropna().mean()) if pc in df.columns else np.nan

    # cluster fractions + cluster entropy
    for ccol in cluster_columns:
        vc = df[ccol].value_counts(normalize=True)
        for label, frac in vc.items():
            rec[f'frac_{ccol}_{label}'] = float(frac)
        rec[f'{ccol}_entropy'] = float(entropy(vc.values)) if len(vc) > 0 else np.nan

    records.append(rec)

patient_df = pd.DataFrame(records).fillna(np.nan)
out_path = output_dir / 'patient_level_features.csv'
patient_df.to_csv(out_path, index=False)
print(f'Saved patient-level features to {out_path}')
patient_df.head()


Saved patient-level features to ..\Processed_Data\patient_level_features.csv


Unnamed: 0,patient_id,timepoint,n_cells,response,n_unique_clones,shannon_diversity,simpson_index,gini_clonality,top_clonotype_1_seq,top_clonotype_1_count,...,frac_leiden_integrated_14,frac_leiden_integrated_16,frac_leiden_integrated_19,frac_leiden_integrated_18,frac_leiden_integrated_21,frac_leiden_integrated_24,frac_leiden_integrated_23,frac_leiden_integrated_22,frac_leiden_integrated_20,leiden_integrated_entropy
0,PT1,Baseline,4008,Responder,3524,8.06662,0.00055,0.083556,CATSREGISGANVLTF,50,...,0.013473,0.011727,0.007236,0.005739,0.003244,0.000749,0.000749,0.000499,0.0,2.600359
1,PT1,Post-Chemo,3855,Responder,3133,7.86951,0.000833,0.144393,CASSEEGRATDTQYF,51,...,0.014527,0.019196,0.008301,0.014267,0.001297,0.000778,0.000259,0.012192,0.0,2.644746
2,PT2,Baseline,3127,Non-Responder,2748,7.853918,0.000528,0.071568,CASSGGNQPQHF,32,...,0.01535,0.019508,0.004157,0.008954,0.002558,0.00064,0.001919,0.00064,0.0,2.512519
3,PT2,Post-Chemo,2471,Non-Responder,1975,7.354026,0.001772,0.15354,CASSLGHYGYTF,71,...,0.011736,0.023472,0.010522,0.006475,0.002428,0.0,0.003238,0.0,0.0,2.64823
4,PT3,Baseline,915,Responder,844,6.707935,0.001325,0.043208,CSARDRTGNGYTF,9,...,0.009836,0.015301,0.006557,0.005464,0.001093,0.0,0.001093,0.0,0.0,2.429956


In [15]:
# 2) Grouped CV supervised pipeline (patient-level folds using GroupKFold)
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
from joblib import dump
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import RandomizedSearchCV

pf = Path('../Processed_Data/patient_level_features.csv')
if not pf.exists():
    raise FileNotFoundError('Run the patient-level aggregation cell first (patient_level_features.csv not found).')
patient_df = pd.read_csv(pf)
# require a response column with values like 'Responder' / 'Non-Responder'
if 'response' not in patient_df.columns or patient_df['response'].isnull().all():
    raise RuntimeError('`response` column not present in patient-level table. Ensure patient-level aggregation included response.')
# normalize response strings
patient_df['response_norm'] = patient_df['response'].astype(str).str.lower().map(lambda s: 'Responder' if 'respon' in s else ('Non-Responder' if 'non' in s else s))
if patient_df['response_norm'].isnull().all():
    raise RuntimeError('No recognizable response labels after normalization.')
# binary label (1: Responder, 0: Non-Responder/Other)
patient_df['label'] = (patient_df['response_norm'] == 'Responder').astype(int)
# groups are patient_id (multiple timepoints per patient allowed)
groups = patient_df['patient_id']
# features: numeric columns only (drop identifiers and response)
drop_cols = ['patient_id','timepoint','response','response_norm','label']
num_cols = patient_df.select_dtypes(include=[np.number]).columns.tolist()
feature_cols = [c for c in num_cols if c not in drop_cols]
if len(feature_cols) == 0:
    raise RuntimeError('No numeric features found in patient-level table. Check that patient aggregation produced numeric features.')
X = patient_df[feature_cols].fillna(0)
y = patient_df['label']

n_groups = len(groups.unique())
if n_groups < 2:
    raise RuntimeError(f'Need at least 2 patients for GroupKFold; found {n_groups}')

n_splits = min(5, n_groups)
gkf = GroupKFold(n_splits=n_splits)
metrics = []
all_y_true = []
all_y_score = []
fold = 0
for train_idx, test_idx in gkf.split(X, y, groups):
    fold += 1
    X_train_df, X_test_df = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    groups_train = groups.iloc[train_idx]

    # pipeline with feature selection
    k_max = min(50, X.shape[1])
    sel_k_choices = sorted(list(set([max(1, min(10, X.shape[1])), max(1, min(20, X.shape[1])), k_max, X.shape[1]])))
    pipe = Pipeline([('scaler', StandardScaler()), ('sel', SelectKBest(f_classif, k=k_max)), ('clf', RandomForestClassifier(class_weight='balanced', random_state=42))])

    # nested/randomized search (inner GroupKFold) when sufficient groups available
    n_inner = min(3, len(np.unique(groups_train)))
    if n_inner >= 2:
        param_dist = {
            'sel__k': sel_k_choices,
            'clf__n_estimators': [100, 200, 500],
            'clf__max_depth': [None, 5, 10, 20]
        }
        search = RandomizedSearchCV(pipe, param_dist, n_iter=8, scoring='roc_auc', cv=GroupKFold(n_splits=n_inner), random_state=42, n_jobs=1)
        try:
            search.fit(X_train_df, y_train, groups=groups_train)
            best_pipe = search.best_estimator_
        except Exception as e:
            print('Nested search failed, falling back to default pipeline:', e)
            best_pipe = pipe
            best_pipe.fit(X_train_df, y_train)
    else:
        best_pipe = pipe
        best_pipe.fit(X_train_df, y_train)

    # scoring
    try:
        y_score = best_pipe.predict_proba(X_test_df)[:, 1]
    except Exception:
        try:
            y_score = best_pipe.decision_function(X_test_df)
        except Exception:
            y_score = np.zeros(len(X_test_df))
    y_pred = best_pipe.predict(X_test_df)
    try:
        auc_score = roc_auc_score(y_test, y_score)
    except Exception:
        auc_score = float('nan')
    # robust confusion matrix
    try:
        cm = confusion_matrix(y_test, y_pred, labels=[0,1])
        tn, fp, fn, tp = cm.ravel()
    except Exception:
        tn = fp = fn = tp = np.nan
    sens = tp / (tp + fn) if (not np.isnan(tp) and (tp + fn) > 0) else np.nan
    spec = tn / (tn + fp) if (not np.isnan(tn) and (tn + fp) > 0) else np.nan
    metrics.append({'fold': fold, 'auc': float(auc_score), 'sensitivity': float(sens) if not np.isnan(sens) else np.nan, 'specificity': float(spec) if not np.isnan(spec) else np.nan})
    all_y_true.extend(y_test.tolist())
    all_y_score.extend(y_score.tolist())

metrics_df = pd.DataFrame(metrics)
metrics_df.to_csv('../Processed_Data/patient_level_groupcv_results.csv', index=False)
print('Saved GroupKFold results to Processed_Data/patient_level_groupcv_results.csv')

# train final model on all data (with a small randomized search) and save
scaler = StandardScaler()
X_all = X
pipe_all = Pipeline([('scaler', scaler), ('sel', SelectKBest(f_classif, k=min(50, X.shape[1]))), ('clf', RandomForestClassifier(class_weight='balanced', random_state=42))])
param_dist_all = {'sel__k': [min(10, X.shape[1]), min(20, X.shape[1]), min(50, X.shape[1]), X.shape[1]], 'clf__n_estimators': [200, 500], 'clf__max_depth': [None, 10, 20]}
try:
    rs_all = RandomizedSearchCV(pipe_all, param_dist_all, n_iter=8, scoring='roc_auc', cv=GroupKFold(n_splits=min(3, n_groups)) if n_groups >=3 else 3, random_state=42, n_jobs=1)
    if n_groups >= 2:
        rs_all.fit(X_all, y, groups=groups)
        clf_full = rs_all.best_estimator_
    else:
        pipe_all.fit(X_all, y)
        clf_full = pipe_all
except Exception as e:
    print('Final model search failed, training default pipeline on all data:', e)
    pipe_all.fit(X_all, y)
    clf_full = pipe_all

dump({'pipeline': clf_full}, '../Processed_Data/patient_level_model_groupcv.joblib')
print('Saved trained model to Processed_Data/patient_level_model_groupcv.joblib')

# patient-level ROC curve (aggregated across folds)
from sklearn.metrics import roc_curve, auc
if len(set(all_y_true)) > 1:
    fpr, tpr, _ = roc_curve(all_y_true, all_y_score)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(6,6))
    plt.plot(fpr, tpr, label=f'GroupKFold ROC (AUC = {roc_auc:.3f})')
    plt.plot([0,1],[0,1],'--',color='grey')
    plt.xlabel('FPR'); plt.ylabel('TPR'); plt.legend()
    plt.tight_layout()
    plt.savefig('../Processed_Data/patient_level_groupcv_roc.png')
    plt.show()
    print('Saved ROC to Processed_Data/patient_level_groupcv_roc.png')
else:
    print('Not enough positive/negative samples across folds to compute ROC curve')


Saved GroupKFold results to Processed_Data/patient_level_groupcv_results.csv
Saved trained model to Processed_Data/patient_level_model_groupcv.joblib
Not enough positive/negative samples across folds to compute ROC curve


In [20]:
# Diagnostics: check patient/responders distribution
import pandas as pd
from pathlib import Path
pf = Path('../Processed_Data/patient_level_features.csv')
df = pd.read_csv(pf)
print('Total patient-level rows:', len(df))
print('Unique patient IDs:', df['patient_id'].nunique())
# compute per-patient response mode
patient_mode = df.groupby('patient_id')['response'].agg(lambda x: pd.Series(x).mode().iloc[0] if len(pd.Series(x).mode())>0 else None)
print('\nPer-patient response mode counts:')
print(patient_mode.value_counts(dropna=False))
print('\nRow-level response counts:')
print(df['response'].value_counts(dropna=False))
# list patients by mode
responder_patients = patient_mode[patient_mode.astype(str).str.lower().str.contains('respon', na=False)].index.tolist()
nonresponder_patients = patient_mode[patient_mode.astype(str).str.lower().str.contains('non', na=False)].index.tolist()
print(f'Patients with responder mode: {len(responder_patients)}, Non-responder mode: {len(nonresponder_patients)}')
print('Responder patient IDs (sample):', responder_patients[:10])
print('Non-responder patient IDs (sample):', nonresponder_patients[:10])


Total patient-level rows: 10
Unique patient IDs: 6

Per-patient response mode counts:
response
Responder        4
Non-Responder    2
Name: count, dtype: int64

Row-level response counts:
response
Responder        7
Non-Responder    3
Name: count, dtype: int64
Patients with responder mode: 6, Non-responder mode: 2
Responder patient IDs (sample): ['PT1', 'PT11', 'PT2', 'PT3', 'PT4', 'PT5']
Non-responder patient IDs (sample): ['PT2', 'PT4']


In [16]:
# 3) Pseudobulk differential expression + gene-set scoring (interferon/complement)
import scanpy as sc
import pandas as pd
import numpy as np
from pathlib import Path
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')

output_dir = Path('../Processed_Data')
output_dir.mkdir(parents=True, exist_ok=True)

# Ensure adata is available
if 'adata' not in globals():
    processed_path = Path('../Processed_Data/processed_s_rna_seq_data.h5ad')
    if processed_path.exists():
        adata = sc.read_h5ad(processed_path)
    else:
        raise FileNotFoundError('`adata` not found; run preprocessing first or provide processed_h5ad file.')

# Build pseudobulk sample labels (prefer sample_id; fallback to patient_id::timepoint)
if 'sample_id' in adata.obs.columns:
    sample_labels = adata.obs['sample_id'].astype(str)
else:
    tp = adata.obs['timepoint'].astype(str) if 'timepoint' in adata.obs.columns else ''
    if 'patient_id' in adata.obs.columns:
        sample_labels = adata.obs['patient_id'].astype(str) + '::' + tp
    elif 'patient' in adata.obs.columns:
        sample_labels = adata.obs['patient'].astype(str) + '::' + tp
    else:
        sample_labels = pd.Series(adata.obs_names, index=adata.obs_names)

# Use adata.raw if available for counts else adata
adata_counts = adata.raw if getattr(adata, 'raw', None) is not None else adata
if getattr(adata_counts, 'var_names', None) is None:
    raise RuntimeError('adata.var_names (gene names) not found in adata or adata.raw. Cannot run pseudobulk DE.')

X = adata_counts.X
genes = list(adata_counts.var_names)
import scipy.sparse as sp

# Sum counts per sample
pseudobulk = []
samples = sample_labels.unique()
for s in samples:
    mask = (sample_labels == s).values
    sub = X[mask]
    if sp.issparse(sub):
        summed = np.asarray(sub.sum(axis=0)).ravel()
    else:
        summed = np.sum(sub, axis=0)
    pseudobulk.append(summed)
pseudobulk_counts = pd.DataFrame(np.vstack(pseudobulk).T, index=genes, columns=samples)
pseudobulk_counts.to_csv(output_dir / 'pseudobulk_counts.csv')
print('Saved pseudobulk counts to', output_dir / 'pseudobulk_counts.csv')

# Build sample metadata (response per sample by majority of cells)
sample_meta = []
for s in samples:
    mask = (sample_labels == s).values
    if 'response' in adata.obs.columns:
        resp = adata.obs.loc[mask, 'response']
    else:
        resp = pd.Series(dtype=object)
    mode = resp.mode() if len(resp) > 0 else pd.Series(dtype=object)
    sample_meta.append({'sample': s, 'response': mode.iloc[0] if len(mode) > 0 else None})
sample_meta = pd.DataFrame(sample_meta).set_index('sample')

# Normalize response labels to canonical 'Responder'/'Non-Responder'
def _norm_resp(x):
    if pd.isna(x):
        return None
    s = str(x).strip().lower()
    if s in ['', 'nan', 'none', 'na']:
        return None
    if 'respon' in s or s in ('r','1','yes','y','responder'):
        return 'Responder'
    if 'non' in s or s in ('nr','0','no','n','non-responder','nonresponder'):
        return 'Non-Responder'
    return s.capitalize()

sample_meta['response'] = sample_meta['response'].apply(_norm_resp)

# Differential testing (Responder vs Non-Responder) across pseudobulk samples
res_samples = sample_meta[sample_meta['response'] == 'Responder'].index.tolist()
nonres_samples = sample_meta[sample_meta['response'] == 'Non-Responder'].index.tolist()
results = []
from scipy.stats import mannwhitneyu
for gene in pseudobulk_counts.index:
    a = pseudobulk_counts.loc[gene, res_samples].values if len(res_samples) > 0 else np.array([])
    b = pseudobulk_counts.loc[gene, nonres_samples].values if len(nonres_samples) > 0 else np.array([])
    if len(a) > 0 and len(b) > 0:
        # CPM per sample then log2 fold (pseudo-cnt)
        a_cpm = (a / (a.sum()+1e-9)) * 1e6
        b_cpm = (b / (b.sum()+1e-9)) * 1e6
        try:
            stat, p = mannwhitneyu(a_cpm, b_cpm, alternative='two-sided')
        except Exception:
            stat, p = np.nan, np.nan
        mean_fc = np.log2((a_cpm.mean() + 1) / (b_cpm.mean() + 1))
    else:
        p = np.nan
        mean_fc = np.nan
    results.append({'gene': gene, 'log2FC': float(mean_fc) if not np.isnan(mean_fc) else np.nan, 'pval': float(p) if not np.isnan(p) else np.nan})
de_df = pd.DataFrame(results).set_index('gene')
de_df = de_df.dropna(subset=['pval'], how='all')
if not de_df.empty:
    de_df['fdr'] = multipletests(de_df['pval'].fillna(1), method='fdr_bh')[1]
    de_df.to_csv(output_dir / 'pseudobulk_DE_results.csv')
    print('Saved pseudobulk DE results to', output_dir / 'pseudobulk_DE_results.csv')
else:
    print('No DE results (too few samples in groups)')

# Gene-set scoring (interferon and complement) using Scanpy's score_genes
interferon_genes = ['IFIT1','IFIT2','IFIT3','MX1','ISG15','OAS1','OAS2','OASL','IFI44','IFI6']
complement_genes = ['C1QA','C1QB','C1QC','C2','C3','C4A','C4B','C5']
for name, geneset in [('Interferon',interferon_genes), ('Complement', complement_genes)]:
    genes_present = [g for g in geneset if g in adata.var_names]
    if len(genes_present) == 0:
        print(f'Skipping {name} scoring — none of the genes present in adata.var_names')
        continue
    use_raw_flag = True if getattr(adata, 'raw', None) is not None and getattr(adata.raw, 'var_names', None) is not None else False
    try:
        sc.tl.score_genes(adata, gene_list=genes_present, score_name=f'{name}_score', use_raw=use_raw_flag)
    except Exception as e:
        print(f'Could not score {name} gene set: {e}')

# Save sample-level gene set scores (average per sample)
scores = []
for s in samples:
    mask = (sample_labels == s).values
    row = {'sample': s}
    for name in ['Interferon','Complement']:
        col = f'{name}_score'
        row[col] = float(adata.obs.loc[mask, col].mean()) if col in adata.obs.columns else np.nan
    scores.append(row)
scores_df = pd.DataFrame(scores).set_index('sample')
scores_df.to_csv(output_dir / 'pseudobulk_gene_set_scores.csv')
print('Saved gene-set scores to', output_dir / 'pseudobulk_gene_set_scores.csv')


Saved pseudobulk counts to ..\Processed_Data\pseudobulk_counts.csv
No DE results (too few samples in groups)
Saved gene-set scores to ..\Processed_Data\pseudobulk_gene_set_scores.csv


In [17]:
# 4) Tumor–blood TCR overlap (Jaccard) — only runs if tumor TCR files are detected
import glob
import pandas as pd
import numpy as np
from pathlib import Path
from scipy.stats import mannwhitneyu
output_dir = Path('../Processed_Data')
output_dir.mkdir(parents=True, exist_ok=True)

data_dir = Path('../Data/GSE300475_RAW')
# look for 10x/Cellranger contig annotations files
tcr_files = list(data_dir.glob('*_all_contig_annotations.csv')) if data_dir.exists() else []
if len(tcr_files) == 0:
    print(f'No *_all_contig_annotations.csv files found in {data_dir} (skipping tumor–blood overlap).')
else:
    # prefer files explicitly mentioning tumor, otherwise use all contig files
    tumor_files = [f for f in tcr_files if 'tumor' in f.name.lower() or 'tumour' in f.name.lower()]
    if len(tumor_files) == 0:
        tumor_files = tcr_files
    print(f'Detected {len(tumor_files)} tumor/all contig files — computing overlaps')
    tumor_by_patient = {}
    for f in tumor_files:
        try:
            df_t = pd.read_csv(f)
            pid = f.stem.split('_')[0]
            # find plausible cdr3/sequence column
            seq_col = None
            for cand in ['cdr3', 'cdr3b', 'productive_cdr3', 'sequence', 'cdr3_nt', 'cdr3aa', 'sequence_complete']:
                matches = [c for c in df_t.columns if cand in c.lower()]
                if matches:
                    seq_col = matches[0]
                    break
            seqs = df_t[seq_col] if seq_col is not None else df_t.iloc[:, 0]
            tumor_by_patient[pid] = set(seqs.dropna().astype(str))
        except Exception as e:
            print(f'Could not read {f}: {e}')

    # Blood clonotypes per sample/patient from adata.obs (use best available column)
    clonotype_col = None
    for c in ['cdr3_TRB', 'cdr3_TRA', 'cdr3b', 'cdr3a', 'cdr3', 'productive_cdr3', 'clone_id']:
        if c in adata.obs.columns:
            clonotype_col = c
            break
    overlaps = []
    if clonotype_col is None:
        print('No clonotype column found in adata.obs (cdr3_TRB/TRA). Cannot compute overlap.')
    else:
        # ensure patient_df available for response merging
        pf = Path('../Processed_Data/patient_level_features.csv')
        patient_df = pd.read_csv(pf) if pf.exists() else None
        if patient_df is None:
            print('Patient-level features not available; overlap table will be saved but response association skipped.')

        # choose grouping columns
        group_cols = []
        if 'patient_id' in adata.obs.columns:
            group_cols.append('patient_id')
        if 'timepoint' in adata.obs.columns:
            group_cols.append('timepoint')
        elif 'sample_id' in adata.obs.columns:
            group_cols.append('sample_id')

        if len(group_cols) == 0:
            print('No patient/sample grouping columns found in adata.obs; skipping overlap.')
        else:
            grp = adata.obs.groupby(group_cols)
            for name, df in grp:
                pid = name[0] if isinstance(name, tuple) else name
                blood_set = set(df[clonotype_col].dropna().astype(str))
                # try to match tumor_by_patient by several heuristics
                candidate_keys = [k for k in tumor_by_patient.keys() if pid in k or k in pid or k.split('-')[0] == pid or pid.split('-')[0] == k]
                if len(candidate_keys) == 0:
                    continue
                tumor_set = tumor_by_patient[candidate_keys[0]]
                inter = len(blood_set & tumor_set)
                union = len(blood_set | tumor_set)
                jaccard = inter / union if union > 0 else 0.0
                overlaps.append({'patient_id': pid, 'timepoint': name[1] if isinstance(name, tuple) and len(name) > 1 else '', 'jaccard': jaccard, 'n_shared': inter, 'n_blood': len(blood_set), 'n_tumor': len(tumor_set)})

            overlaps_df = pd.DataFrame(overlaps)
            overlaps_df.to_csv(output_dir / 'tumor_blood_overlap.csv', index=False)
            print('Saved tumor–blood overlap table to', output_dir / 'tumor_blood_overlap.csv')

            # test association with response if available
            if patient_df is not None and 'response' in patient_df.columns and not overlaps_df.empty:
                merged = overlaps_df.merge(patient_df[['patient_id','response']].drop_duplicates(), on='patient_id', how='left')
                # normalize response labels
                merged['response_norm'] = merged['response'].astype(str).str.lower().map(lambda s: 'Responder' if 'respon' in s else ('Non-Responder' if 'non' in s else s))
                res = merged[merged['response_norm']=='Responder']['jaccard'].dropna()
                non = merged[merged['response_norm']=='Non-Responder']['jaccard'].dropna()
                if len(res) > 0 and len(non) > 0:
                    stat, p = mannwhitneyu(res, non, alternative='two-sided')
                    print(f'Jaccard overlap R vs NR: U={stat:.2f}, p={p:.3e}')
                else:
                    print('Not enough samples in response groups to test association')
            else:
                print('Patient-level response not available or overlaps empty; skipping association test.')


Detected 10 tumor/all contig files — computing overlaps


Saved tumor–blood overlap table to ..\Processed_Data\tumor_blood_overlap.csv
Patient-level response not available or overlaps empty; skipping association test.


In [18]:
# 5) Multiple-testing corrected k-mer enrichment (pseudobulk per sample; FDR via BH)
import numpy as np
import pandas as pd
from pathlib import Path
from statsmodels.stats.multitest import multipletests
import warnings
warnings.filterwarnings('ignore')
output_dir = Path('../Processed_Data')
output_dir.mkdir(parents=True, exist_ok=True)

# Build pseudobulk k-mer counts per sample from adata.obs cdr3_TRB (k=3)
def kmer_counts_from_seq_list(seqs, k=3):
    counts = {}
    for s in seqs.dropna().astype(str):
        for i in range(len(s)-k+1):
            kmer = s[i:i+k]
            counts[kmer] = counts.get(kmer, 0) + 1
    return counts

if 'cdr3_TRB' not in adata.obs.columns and 'cdr3_TRA' not in adata.obs.columns:
    print('No CDR3 sequences found in adata.obs (skipping k-mer enrichment).')
else:
    seq_col = 'cdr3_TRB' if 'cdr3_TRB' in adata.obs.columns else 'cdr3_TRA'
    if 'sample_id' in adata.obs.columns:
        sample_labels = adata.obs['sample_id'].astype(str)
    else:
        tp = adata.obs['timepoint'].astype(str) if 'timepoint' in adata.obs.columns else ''
        sample_labels = adata.obs['patient_id'].astype(str) + '::' + tp
    samples = sample_labels.unique()
    k=3
    # build dataframe of k-mer counts per sample
    rows = []
    for s in samples:
        mask = sample_labels == s
        seqs = adata.obs.loc[mask, seq_col]
        kc = kmer_counts_from_seq_list(seqs, k=k)
        row = {'sample': s}
        row.update(kc)
        rows.append(row)
    kmer_df = pd.DataFrame(rows).fillna(0).set_index('sample')
    # attach response label per sample
    sample_meta = []
    for s in samples:
        mask = sample_labels == s
        resp = adata.obs.loc[mask, 'response'] if 'response' in adata.obs.columns else pd.Series(dtype=object)
        mode = resp.mode() if len(resp) > 0 else pd.Series(dtype=object)
        sample_meta.append({'sample': s, 'response': mode.iloc[0] if len(mode) > 0 else None})
    sample_meta = pd.DataFrame(sample_meta).set_index('sample')
    # normalize responses
    def _norm_resp(x):
        if pd.isna(x):
            return None
        s = str(x).strip().lower()
        if 'respon' in s or s in ('r','1','yes','y'):
            return 'Responder'
        if 'non' in s or s in ('nr','0','no','n'):
            return 'Non-Responder'
        return None
    sample_meta['response'] = sample_meta['response'].apply(_norm_resp)

    # perform Mann-Whitney U test per k-mer across samples (Responder vs Non-Responder)
    res_samples = sample_meta[sample_meta['response']=='Responder'].index.tolist()
    non_samples = sample_meta[sample_meta['response']=='Non-Responder'].index.tolist()
    from scipy.stats import mannwhitneyu
    results = []
    n_samples = len(samples)
    # prevalence filter: require k-mer present in at least min_samples samples
    min_samples = max(2, int(0.1 * n_samples))
    cols = [c for c in kmer_df.columns if (kmer_df[c] > 0).sum() >= min_samples]
    if len(cols) == 0:
        print('No k-mers pass prevalence filter; relaxing to top-occurring k-mers')
        cols = list(kmer_df.columns)
    for kmer in cols:
        a = kmer_df.loc[res_samples, kmer].values if len(res_samples) > 0 else np.array([])
        b = kmer_df.loc[non_samples, kmer].values if len(non_samples) > 0 else np.array([])
        if len(a) > 0 and len(b) > 0:
            try:
                stat, p = mannwhitneyu(a, b, alternative='two-sided')
            except Exception:
                stat, p = np.nan, np.nan
            mean_diff = float(a.mean() - b.mean())
        else:
            stat, p, mean_diff = np.nan, np.nan, np.nan
        results.append({'kmer': kmer, 'stat': stat, 'pval': p, 'mean_diff': mean_diff})
    res_df = pd.DataFrame(results).set_index('kmer').dropna(subset=['pval'])
    if not res_df.empty:
        res_df['fdr'] = multipletests(res_df['pval'], method='fdr_bh')[1]
        res_df.sort_values('fdr', inplace=True)
        res_df.to_csv(output_dir / 'kmer_enrichment_results.csv')
        print('Saved k-mer enrichment results to', output_dir / 'kmer_enrichment_results.csv')
        significant = res_df[res_df['fdr'] < 0.05]
        print(f'{len(significant)} k-mers with FDR < 0.05')
    else:
        print('No k-mer statistics computed (insufficient samples)')


No k-mer statistics computed (insufficient samples)


In [19]:
# 6) Batch correction and integration (Harmony primary, bbknn fallback) and re-clustering
import scanpy as sc
from pathlib import Path
import warnings
import numpy as np
warnings.filterwarnings('ignore')
output_dir = Path('../Processed_Data')
output_dir.mkdir(parents=True, exist_ok=True)
# Ensure adata is present
if 'adata' not in globals():
    path = Path('../Processed_Data/processed_s_rna_seq_data.h5ad')
    if path.exists():
        adata = sc.read_h5ad(path)
    else:
        raise FileNotFoundError('No adata in memory and no processed file found.')

import scipy.sparse as sp
# Filter low-prevalence genes to avoid HVG failures
try:
    sc.pp.filter_genes(adata, min_cells=3)
except Exception as e:
    print('filter_genes failed or not applicable:', e)

if adata.n_vars < 50:
    print('Too few genes after filtering; skipping HVG selection and using all genes')
else:
    try:
        sc.pp.normalize_total(adata, target_sum=1e4)
        sc.pp.log1p(adata)
        sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)
    except Exception as e:
        print('HVG selection failed, falling back to variance-based selection:', e)
        Xmat = adata.X.toarray() if sp.issparse(adata.X) else adata.X
        var = np.nanvar(Xmat, axis=0)
        if np.all(np.isnan(var)):
            raise RuntimeError('All gene variances are NaN; check adata.X for valid counts')
        idx = np.argsort(-np.nan_to_num(var))[:min(2000, Xmat.shape[1])]
        adata = adata[:, idx].copy()

sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, n_comps=50, svd_solver='arpack')

use_rep = 'X_pca'
# Try Harmony integration if available
try:
    import harmonypy as hm
    print('Running Harmony integration on `sample_id`...')
    ho = hm.run_harmony(adata.obsm['X_pca'], adata.obs, 'sample_id')
    adata.obsm['X_pca_harmony'] = ho.Z_corr.T
    use_rep = 'X_pca_harmony'
except Exception as e:
    print('Harmony not available or failed, falling back to ComBat/bbknn as available:', e)
    try:
        sc.pp.combat(adata, key='sample_id')
        sc.tl.pca(adata, n_comps=50, svd_solver='arpack')
        adata.obsm['X_pca_harmony'] = adata.obsm['X_pca']
        use_rep = 'X_pca_harmony'
    except Exception as e2:
        print('ComBat failed or not appropriate:', e2)

# Build neighbors on integrated representation and cluster
try:
    sc.pp.neighbors(adata, use_rep=use_rep)
except Exception as e:
    print('sc.pp.neighbors failed; trying bbknn fallback:', e)
    try:
        import bbknn
        if use_rep in adata.obsm:
            bbknn.bbknn(adata, batch_key='sample_id', use_rep=use_rep)
        else:
            print('bbknn fallback not possible; using default neighbors on X_pca')
            sc.pp.neighbors(adata, use_rep='X_pca')
    except Exception as e3:
        print('bbknn not available or failed:', e3)
        sc.pp.neighbors(adata, use_rep='X_pca')

sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=1.0, key_added='leiden_integrated')
# Rank markers for integrated clusters
try:
    sc.tl.rank_genes_groups(adata, groupby='leiden_integrated', method='wilcoxon')
    # export marker tables per cluster
    for g in adata.obs['leiden_integrated'].cat.categories:
        try:
            df_mark = sc.get.rank_genes_groups_df(adata, group=g)
            df_mark.to_csv(output_dir / f'leiden_integrated_markers_group_{g}.csv', index=False)
        except Exception:
            pass
    print('Saved integrated clustering and marker tables')
except Exception as e:
    print('rank_genes_groups failed on integrated data:', e)

# Save integrated AnnData
adata.write_h5ad(output_dir / 'processed_s_rna_seq_data_integrated.h5ad')
print('Saved integrated AnnData to', output_dir / 'processed_s_rna_seq_data_integrated.h5ad')


HVG selection failed, falling back to variance-based selection: Bin edges must be unique: Index([nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,
       nan, nan, nan, nan, nan, nan, nan],
      dtype='float64').
You can drop duplicate edges by setting the 'duplicates' kwarg


RuntimeError: All gene variances are NaN; check adata.X for valid counts