In [None]:
import pandas as pd
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from pprint import pprint
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from IPython.display import display
import random
import pyLDAvis
import pyLDAvis.gensim  # For older gensim versions
import pyLDAvis.gensim_models  # For gensim 4+


## LDA sur rapports génétiques

In [None]:
# Path of the CSV file holding the matrix loaded below.
# NOTE(review): this is an empty string in the notebook; it has to be set to a real
# file path before pd.read_csv can load anything.
DATA_PATH = ""
# Text file of HPO terms; it is parsed further down by reading_hpo_terms().
HPO_TERMS = "../data/hpoterms08022021.txt"

# Load the matrix, using the first CSV column as the row index.
mat = pd.read_csv(DATA_PATH, index_col=0)

#### FILTER WITH SMALL TERMS
# NOTE(review): the assignment below has no right-hand side, so this cell is not
# valid Python as written. The expression that builds mdMat (presumably a filtered
# version of `mat` — TODO confirm the intended filter) needs to be restored.
mdMat = 

(6037, 4085)
(5951, 4085)


In [6]:
def conversion_matrix_EHRxHPO_to_coherence(matrix: pd.DataFrame) -> list:
    """
    Convert an EHRxHPO binary matrix (DataFrame) into a list of lists for coherence analysis.

    Each row (EHR) is transformed into the list of column labels (HPO terms) whose
    cell value equals 1. Rows without any such cell yield an empty list, so the
    result always has one entry per row, in row order.

    Args:
        matrix (pd.DataFrame): A binary matrix where rows are EHRs and columns are HPO terms (0/1).

    Returns:
        list of list of str: A list where each sublist contains active HPO terms for each EHR.
    """
    # Compare the whole frame to 1 in a single vectorized operation. The previous
    # implementation went through DataFrame.iterrows(), which builds a new Series
    # for every row and is very slow on a matrix with thousands of rows and columns.
    is_present = matrix.eq(1).to_numpy(dtype=bool)
    hpo_labels = matrix.columns.to_numpy()

    # Boolean-mask the column labels once per row; tolist() yields plain Python objects.
    topics = [hpo_labels[row_mask].tolist() for row_mask in is_present]

    return topics

def removing_empty_ehr(topics):
    """
    Drop the EHRs that have no term.

    Args:
        topics (list of list): One list of terms per EHR.

    Returns:
        list of list: A new list with the entries of `topics` that differ from the
        empty list, in their original order. The input list is not modified.
    """
    return [ehr_terms for ehr_terms in topics if ehr_terms != []]
            

In [None]:
# One list of HPO terms per EHR, built from mdMat; EHRs without any term are dropped
texts = removing_empty_ehr(conversion_matrix_EHRxHPO_to_coherence(mdMat))

# Gensim dictionary built from the term lists
dictionary = Dictionary(texts)

# Bag-of-words corpus: one doc2bow() result per term list
corpus = list(map(dictionary.doc2bow, texts))

## Recherche du nombre de topics

In [None]:
# Search over the number of topics: for every value in topic_range, train an LDA
# model and record its perplexity and its c_v coherence. The two result lists are
# index-aligned with topic_range.
perplexities = []
coherence_scores = []

start_k = 2
n_topics = 200

topic_range = range(start_k, n_topics + 1)

# Test topic numbers from start_k to n_topics (inclusive)
for num_topics in topic_range:
    print("num_topics :", num_topics)
    # Train LDA model (fixed random_state so that runs are repeatable)
    lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)

    # Calculate Perplexity.
    # log_perplexity() does not return the perplexity itself but the per-word
    # likelihood bound (higher is better). Gensim derives its perplexity estimate
    # from it as 2 ** (-bound), which is what gets stored here (lower is better).
    per_word_bound = lda_model.log_perplexity(corpus)
    perplexity = 2 ** (-per_word_bound)

    # Calculate Coherence
    coherence_model = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()

    # Store both scores only once both are available, so that the two lists keep
    # the same length even if the (long) search is interrupted mid-iteration.
    perplexities.append(perplexity)
    coherence_scores.append(coherence_score)


num_topics : 1
num_topics : 2
num_topics : 3
num_topics : 4
num_topics : 5
num_topics : 6
num_topics : 7
num_topics : 8
num_topics : 9
num_topics : 10
num_topics : 11
num_topics : 12
num_topics : 13
num_topics : 14
num_topics : 15
num_topics : 16
num_topics : 17
num_topics : 18
num_topics : 19
num_topics : 20
num_topics : 21
num_topics : 22
num_topics : 23
num_topics : 24
num_topics : 25
num_topics : 26


KeyboardInterrupt: 

In [None]:
# Plot perplexity (left y-axis) and coherence (right y-axis) against the number of topics.

# Only plot the topic numbers that were actually evaluated: when the search loop is
# interrupted, the score lists are shorter than topic_range and plotting them against
# the full range raises a shape-mismatch error.
n_evaluated = min(len(perplexities), len(coherence_scores))
evaluated_topics = topic_range[:n_evaluated]

# Create figure and axes
fig, ax1 = plt.subplots()


# Plot Perplexity
ax1.set_xlabel('Number of Topics')
ax1.set_ylabel('Perplexity', color='tab:blue')
perplexity_line = ax1.plot(evaluated_topics, perplexities[:n_evaluated], color="tab:blue", label="Perplexity")
ax1.tick_params(axis='y', labelcolor='tab:blue')

# Create second y-axis to plot Coherence
ax2 = ax1.twinx() 
ax2.set_ylabel('Coherence Score', color='tab:green')
coherence_line = ax2.plot(evaluated_topics, coherence_scores[:n_evaluated], label="Coherence", color="tab:green")
ax2.tick_params(axis='y', labelcolor='tab:green')

# Title and legend
# The two curves live on different axes, so their handles are merged into a single
# legend; it is attached to ax2 because that axis is drawn on top of ax1.
legend_handles = perplexity_line + coherence_line
ax2.legend(legend_handles, [handle.get_label() for handle in legend_handles], loc="best")
plt.suptitle('Perplexity and Coherence vs. Number of Topics \n With filtering rare tokens')
fig.tight_layout()
plt.savefig("../output/coherence_perplexity_with_filtering_150_300groupes_2025_03_19.png")

plt.show()

## Observing LDA

In [None]:
# Adding HPO terms dictionary
def reading_hpo_terms(path: str, encoding: str = "utf-8") -> dict:
    """
    Read an HPO terms file into an ``{ID: name}`` dictionary.

    Every non-empty line is split on its last run of whitespace (space or tab):
    the last token is used as the key (ID) and everything before it as the value
    (name). Lines made of a single token cannot be split and are reported on
    stdout, then skipped. When an ID appears several times, the first name wins.

    Args:
        path (str): Path of the terms file.
        encoding (str): Text encoding of the file. Passed explicitly so that the
            result does not depend on the platform's default encoding.

    Returns:
        dict: Mapping from ID (last token of a line) to name (rest of the line).
    """
    d = {}
    with open(path, "r", encoding=encoding) as fh:
        for line in fh:
            line = line.strip()  # Remove newline & spaces
            if not line:
                continue  # Skip empty lines

            # Split on last space/tab to get (name, ID)
            try:
                val, key = line.rsplit(maxsplit=1)  # Works for both space & tab-separated
            except ValueError:
                # Only one token on the line: there is no (name, ID) pair to unpack
                print(f"Skipping invalid line: {line}")  # Debugging bad format
                continue

            # Keep only the first occurrence of the key
            d.setdefault(key, val)

    return d

# Lookup table built from the HPO_TERMS file: last token of each line (the ID) -> rest of the line (the name).
hpo_dict = reading_hpo_terms(HPO_TERMS)

In [None]:
## Same documents as `texts`, with each term replaced by its hpo_dict entry;
## terms missing from hpo_dict are kept unchanged
texts_named = [[hpo_dict.get(hpo_id, hpo_id) for hpo_id in ehr_terms] for ehr_terms in texts]

In [None]:
## Topic choice
# Number of topics of the model inspected below.
# NOTE(review): this rebinds `num_topics`, the name also used as the loop variable of
# the topic-number search; a value of 1 looks like a placeholder — confirm the intended value.
num_topics = 1
# Train the LDA model on the bag-of-words corpus, with the same fixed seed as in the search loop.
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics, random_state=42)
# Prepare the pyLDAvis data for this model. The result is only stored in `vis`;
# this statement does not display anything by itself.
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary, mds="mmds")