# DeepSearch Consistency and GO Coverage Analysis

This notebook summarizes run-to-run stability of DeepSearch gene programs and how those programs align with GO enrichment results provided in the `Comparisons/` folder.

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Apply one plot style to every figure in the notebook.
plt.style.use('seaborn-v0_8')

# Directory holding the CSV inputs, relative to the notebook's working directory.
DATA_DIR = Path('..', 'data')


## Run-to-run gene program consistency

In [None]:
# Inputs: pairwise program matches, per-run program counts, and the
# unmatched-program list.
matches = pd.read_csv(DATA_DIR / 'deepsearch_program_matches.csv')
runs = pd.read_csv(DATA_DIR / 'deepsearch_runs.csv')
unmatched_programs = pd.read_csv(DATA_DIR / 'deepsearch_unmatched_programs.csv')

# Every summary below is grouped and joined on the same gene-set identifier.
group_keys = ['metamodule', 'annotation', 'folder']

# Similarity statistics over the matched program pairs of each gene set.
match_summary = (
    matches.groupby(group_keys)
    .agg(
        avg_similarity=('combined_similarity', 'mean'),
        median_similarity=('combined_similarity', 'median'),
        avg_gene_jaccard=('gene_jaccard', 'mean'),
        matched_pairs=('combined_similarity', 'size')
    )
    .reset_index()
)

# Total number of programs per gene set, summed over its runs.
run_counts = (
    runs.groupby(group_keys)
    .agg(total_programs=('program_count', 'sum'))
    .reset_index()
)

# Number of unmatched programs per gene set. `size()` only yields rows for
# gene sets that have at least one unmatched program.
unmatched_counts = (
    unmatched_programs.groupby(group_keys)
    .size()
    .reset_index(name='unmatched_programs')
)

consistency = (
    match_summary
    .merge(run_counts, on=group_keys)
    # Left join: a gene set with zero unmatched programs has no row in
    # `unmatched_counts`, and an inner join would silently drop it from the
    # table (and from the plot built on it).
    .merge(unmatched_counts, on=group_keys, how='left')
)
# Gene sets absent from `unmatched_counts` have no unmatched programs.
consistency['unmatched_programs'] = consistency['unmatched_programs'].fillna(0).astype(int)

# Each matched pair accounts for two programs.
# NOTE(review): presumably one program from each of two runs — confirm this
# still holds if more than two runs are compared.
consistency['matched_fraction'] = (consistency['matched_pairs'] * 2) / consistency['total_programs']
consistency['unmatched_fraction'] = consistency['unmatched_programs'] / consistency['total_programs']

# Most consistent gene sets first.
consistency = consistency.sort_values('avg_similarity', ascending=False)
consistency


In [None]:
# Bar chart of the mean combined similarity per gene set, in the row order
# of `consistency`.
fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(
    x=consistency['annotation'],
    height=consistency['avg_similarity'],
    color='#4C72B0',
)
ax.set(
    xlabel='Gene set annotation',
    ylabel='Average combined similarity',
    title='Run-to-run similarity per gene set',
    ylim=(0, 1),
)
# Slant the annotation labels so long names do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()


Run-to-run similarity is a weighted score (0.6 × gene-set Jaccard + 0.4 × text similarity). Even when similarity is high, the `unmatched_fraction` column highlights how stochasticity introduces novel programs in each run.

## GO enrichment coverage from comparison reports

In [None]:
comparison_summary = pd.read_csv(DATA_DIR / 'comparison_summary.csv')

# NOTE(review): the mapping file is read relative to the working directory,
# not DATA_DIR like the other inputs — confirm this is intended.
mapping = pd.read_csv('geneset_folder_mapping.csv')

# Full list of gene sets, with the mapping's `new_folder` exposed as `folder`.
all_sets = (
    mapping[['metamodule', 'annotation', 'new_folder']]
    .rename(columns={'new_folder': 'folder'})
)

# Gene sets whose metamodule never appears in the comparison summary.
missing_sets = all_sets[~all_sets['metamodule'].isin(comparison_summary['metamodule'])]

# Express matched / unmatched GO term counts as percentages of the GSEA term
# total, then list the best-covered gene sets first.
comparison_summary = (
    comparison_summary
    .assign(
        go_match_pct=lambda df: df['matched_go_terms_estimated'] / df['total_gsea_terms'] * 100,
        go_unmatched_pct=lambda df: df['unmatched_go_terms_reported'] / df['total_gsea_terms'] * 100,
    )
    .sort_values('go_match_pct', ascending=False)
)
comparison_summary


In [None]:
# Bar chart of the percentage of GO terms matched per gene set, in the row
# order of `comparison_summary`.
fig, ax = plt.subplots(figsize=(10, 4))
ax.bar(
    x=comparison_summary['annotation'],
    height=comparison_summary['go_match_pct'],
    color='#55A868',
)
ax.set(
    xlabel='Gene set annotation',
    ylabel='% GO terms matched',
    title='GO coverage reported in comparison files',
    ylim=(0, 110),  # upper limit sits above 100
)
# Slant the annotation labels so long names do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

# Show the gene sets that have no comparison row, if there are any.
if not missing_sets.empty:
    display(missing_sets)


Gene set `geneset_9` (metamodule 8) lacks a comparison table, so it is excluded from the coverage stats above.

## Novel gene programs without GO counterparts

In [None]:
program_catalog = pd.read_csv(DATA_DIR / 'deepsearch_programs.csv')

# Supporting-gene counts for the programs of run 1 only.
run1_catalog = program_catalog.loc[
    program_catalog['run_index'] == 1,
    ['folder', 'program_name', 'supporting_gene_count'],
]

# Attach the run 1 count to each novel program and use it wherever the novel
# program table has no count of its own.
# NOTE(review): this relies on 'comparison_novel_programs.csv' already having a
# `supporting_gene_count` column; otherwise the merge adds no `_run_data`
# suffix and the lookup below fails — confirm against the file.
novel_programs = (
    pd.read_csv(DATA_DIR / 'comparison_novel_programs.csv')
    .merge(
        run1_catalog,
        how='left',
        on=['folder', 'program_name'],
        suffixes=('', '_run_data'),
    )
    .assign(
        supporting_gene_count=lambda df: df['supporting_gene_count'].fillna(
            df['supporting_gene_count_run_data']
        )
    )
)

# Largest programs first; ties are ordered by annotation.
novel_ranked = novel_programs.sort_values(
    by=['supporting_gene_count', 'annotation'],
    ascending=[False, True],
)
novel_ranked.head(20)


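The table shows the top 20 programs without GO counterparts, ranked by their number of supporting genes in descending order (ties are ordered by annotation). Counts are taken from `comparison_novel_programs.csv`; where a count is missing there, it is filled in from the DeepSearch output (run 1 for each gene set).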