In [1]:
import pdb
import pathlib
import pandas as pd
import numpy as np
from scipy import sparse
from qa_metrics.prompt_llm import CloseLLM
import os
from dotenv import load_dotenv
import time
from gensim import corpora

### Paths

In [2]:
model_path = pathlib.Path(
    "/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/LDA/passage/rosie_lg_lda_1_20"
)

In [42]:
path_env = pathlib.Path(os.getcwd()).parent.parent / '.env'
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")
gpt_model = CloseLLM()
gpt_model.set_openai_api_key(api_key)

In [43]:
prompt_template = load_prompt_template("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/src/mapping/promt.txt")

### Auxiliary functions

In [44]:
def get_most_representative_per_tpc(mat, topn=10):
    # Find the most representative document for each topic based on a matrix mat
    top_docs_per_topic = []

    for doc_distr in mat.T:
        sorted_docs_indices = np.argsort(doc_distr)[::-1]
        top = sorted_docs_indices[:topn].tolist()
        top_docs_per_topic.append(top)
    return top_docs_per_topic

In [45]:
def load_prompt_template(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        file_contents = file.read()
    return file_contents

### Read corpus

In [3]:
path_corpus = model_path / "train_data" / "corpus_EN.txt"
with path_corpus.open("r", encoding="utf-8") as f:
    lines = [line for line in f.readlines()]
corpus = [line.rsplit(" 0 ")[1].strip().split() for line in lines]

ids = [line.split(" 0 ")[0] for line in lines]
df = pd.DataFrame({"lemmas": [" ".join(doc) for doc in corpus]})
df["doc_id"] = ids
df["len"] = df['lemmas'].apply(lambda x: len(x.split()))

In [47]:
df

Unnamed: 0,lemmas,doc_id,len
0,decrease initiation prevalence smoking hungary...,EN_492297_60866-9,63
1,value add table calculated entrance_exit skin ...,EN_143330_25224-123,61
2,outbreak duval_county begin april peak october...,EN_524864_63868-2,58
3,broder rapid communication bethesda system rep...,EN_518687_63341-14,13
4,opportunity meet social_worker child life spec...,EN_569477_70415-6,9
...,...,...,...
1393108,acip cdc determine priority group rank_tier ba...,EN_485225_59974-3,61
1393109,surgical lie operating table intravenous_intra...,EN_1219267_277263-11,53
1393110,cure rate people stage tumor people stage intr...,EN_1238215_279154-29,11
1393111,suggested_citation article schneider kl lapane...,EN_321620_48080-1,21


In [4]:
raw = pd.read_parquet("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/df_1.parquet")
raw

Unnamed: 0,id_preproc,lemmas,doc_id,text,lang
0,0,decrease initiation prevalence smoking hungary...,EN_492297_60866-9,To decrease the initiation and prevalence of s...,EN
1,1,value add table calculated entrance_exit skin ...,EN_143330_25224-123,Values added to Table 3-3; The calculated entr...,EN
2,2,outbreak duval_county begin april peak october...,EN_524864_63868-2,The outbreak in Duval County began in April 19...,EN
3,3,broder rapid communication bethesda system rep...,EN_518687_63341-14,Broder S. Rapid communication: the Bethesda Sy...,EN
4,4,opportunity meet social_worker child life spec...,EN_569477_70415-6,You will have the opportunity to meet with our...,EN
...,...,...,...,...,...
1062898,1062898,depresión_posparto ayudar averiguar depresión ...,ES_948641_119976-6,¿Para qué se usa: Una evaluación de depresión ...,ES
1062899,1062899,comprobar recipiente uses microondas etiquetad...,ES_550575_61526-8,Comprueba que cualquier recipiente que uses en...,ES
1062900,1062900,descubrar_estrategia programa hogar centrar co...,ES_234978_26512-2,Descubra estrategias para que las evaluaciones...,ES
1062901,1062901,tratamiento necesitar medicamento combatir tox...,ES_973721_122671-12,Tratamiento: Usted necesitará medicamento para...,ES


In [5]:
df_merge = df.merge(raw, how="inner", on="doc_id")[
    ["doc_id", "id_preproc", "lemmas_x", "text", "len"]]
df_merge

Unnamed: 0,doc_id,id_preproc,lemmas_x,text,len
0,EN_492297_60866-9,0,decrease initiation prevalence smoking hungary...,To decrease the initiation and prevalence of s...,63
1,EN_143330_25224-123,1,value add table calculated entrance_exit skin ...,Values added to Table 3-3; The calculated entr...,61
2,EN_524864_63868-2,2,outbreak duval_county begin april peak october...,The outbreak in Duval County began in April 19...,58
3,EN_518687_63341-14,3,broder rapid communication bethesda system rep...,Broder S. Rapid communication: the Bethesda Sy...,13
4,EN_569477_70415-6,4,opportunity meet social_worker child life spec...,You will have the opportunity to meet with our...,9
...,...,...,...,...,...
1393108,EN_485225_59974-3,1393108,acip cdc determine priority group rank_tier ba...,"ACIP and CDC determined the priority groups, r...",61
1393109,EN_1219267_277263-11,1393109,surgical lie operating table intravenous_intra...,For a surgical biopsy:\n- You'll lie on an ope...,53
1393110,EN_1238215_279154-29,1393110,cure rate people stage tumor people stage intr...,The cure rates for people with stage III tumor...,11
1393111,EN_321620_48080-1,1393111,suggested_citation article schneider kl lapane...,Suggested citation for this article: Schneider...,21


In [6]:
# Read thetas
thetas = sparse.load_npz(model_path.joinpath(
    'mallet_output/EN/thetas.npz')).toarray()
thetas.shape

(1393113, 20)

In [7]:
# Get topic keys
with open(model_path.joinpath('mallet_output/EN/topickeys.txt'), 'r') as file:
    lines = file.readlines()
    
# Strip newline characters and any leading/trailing whitespace from each line
topic_keys = [line.split("\t")[-1].split() for line in lines]

In [52]:
responses = []
for topic in range(len(topic_keys)):
    #print(f"Topic {topic}: {topic_keys[topic]}")
    most_repr = get_most_representative_per_tpc(thetas, topn=3)[topic]
    #print(most_repr)
    most_repr_docs = [df_merge[df_merge.id_preproc == id].text.values.tolist()[0][:500] for id in most_repr]
    
    time.sleep(1)
    this_tpc_promt = prompt_template.format(
        topic_keys[topic],
        *most_repr_docs
    )
    llm_response = gpt_model.prompt_gpt(
        prompt=this_tpc_promt, model_engine='gpt-3.5-turbo', temperature=0.1, max_tokens=500
    )  
    label, add, rationale = llm_response.split(" - ")
    
    responses.append(
        [topic_keys[topic], "\n".join(most_repr_docs), label, add, rationale]
    )

In [53]:
responses_df_en = pd.DataFrame(responses, columns=["topic", "most_repr_docs", "label", "add", "rationale"])

In [62]:
responses_df_en[responses_df_en["add"] == "False"]

Unnamed: 0,topic,most_repr_docs,label,add,rationale
6,"[bladder, surgery, urine, pain, problem, infec...",Overview: Proctitis is inflammation of the lin...,Urinary Tract Conditions,False,"The provided documents focus on proctitis, dou..."
8,"[review, document, section, include, niosh, pa...",Revision Includes: Revised to incorporate addi...,Workplace Safety and Dose Management,False,The topic of the provided documents revolves a...
9,"[cell, study, gene, research, human, disease, ...","2011: Kirino Y, Vourekas A, Khandros E, Mourel...",Genetic Research and Disease Mechanisms,False,The provided documents focus on molecular biol...
12,"[health, cdc, public, response, country, globa...","*In southern Sudan, NIDs were implemented with...",Global Health Response,False,The documents primarily focus on disease outbr...
18,"[worker, safety, work, employee, niosh, health...",Discussion: The employer should develop and im...,Occupational Safety and Health,False,"The documents focus on workplace safety, emplo..."
22,"[tobacco, smoking, smoke, cigarette, smoker, r...",Current drinking was defined as consuming one ...,Tobacco and Alcohol Use Analysis,False,The documents primarily focus on alcohol consu...
30,"[information, website, contact, visit, health,...",What will it look like when I log in using two...,Health Information Access,False,The provided documents focus on topics related...
31,"[fda, product, food, device, drug, drug_admini...",To get a device cleared or approved by the FDA...,FDA Approval Process,False,The documents primarily focus on the FDA appro...
32,"[doctor_begin, focus, areas_focus, area, docto...",Find a doctor whose last name begins with the ...,Doctor Search,False,The topic of the provided documents focuses on...
36,"[victim, foot, work, incident, fall, worker, l...",Discussion: The utility has a two-way radio in...,Workplace Safety,False,The topic of the provided documents focuses on...


In [67]:
responses_df_en.iloc[80].rationale

'The provided documents focus on the Mayo Clinic, medical education, professional memberships, and academic achievements of individuals. These documents do not contain information relevant to maternal and infant health topics such as medication dosages, breastfeeding, postpartum depression, or developmental milestones.'

### Calculate S3 and get document's topic

In [31]:
def get_words_assigned(bow, thetas, betas, doc_id, tp_id):
    """Simulates LDA's word assignment process for a given document and topic."""

    words_doc_idx = [tup[0] for tup in bow[doc_id]]  # Words of document doc_id
    thetas_doc = thetas[doc_id]  # Doc-topic proportions of document doc_id
    # Get the words assigned to topic tp_id for document doc_id
    words_assigned = []
    for idx_w in words_doc_idx:
        try:
            p_z = np.multiply(thetas_doc, betas[:, idx_w])
            p_z_args = np.argsort(p_z)
            if p_z[p_z_args[-1]] > 20*p_z[p_z_args[-2]]:
                assignment = p_z_args[-1]
                if assignment == tp_id:
                    words_assigned.append(idx_w)
            else:
                sampling = np.random.multinomial(1, np.multiply(
                    thetas_doc, betas[:, idx_w])/np.sum(np.multiply(thetas_doc, betas[:, idx_w])))
                assignment = int(np.nonzero(sampling)[0][0])
                if assignment == tp_id:
                    words_assigned.append(idx_w)
        except Exception as e:
            print(e)
            continue
    return words_assigned

In [32]:
# Load betas
betas = np.load(model_path.joinpath('mallet_output/EN/betas.npy'))

def file_lines(fname):
    """
    Count number of lines in file

    Parameters
    ----------
    fname: Path
        the file whose number of lines is calculated

    Returns
    -------
    number of lines
    """
    with fname.open('r', encoding='utf8') as f:
        for i, l in enumerate(f):
            pass
    return i + 1


# Load vocab dictionaries
wtcFile = model_path.joinpath('mallet_output/EN/word-topic-counts.txt')
vocab_size = file_lines(wtcFile)
vocab = []
term_freq = np.zeros((vocab_size,))

with wtcFile.open('r', encoding='utf8') as fin:
    for i, line in enumerate(fin):
        elements = line.split()
        vocab.append(elements[1])
        for counts in elements[2:]:
            tpc = int(counts.split(':')[0])
            cnt = int(counts.split(':')[1])

vocab_w2id = {}
vocab_id2w = {}
for i, wd in enumerate(vocab):
    vocab_w2id[wd] = i
    vocab_id2w[str(i)] = wd

# Create Gensim dictionary
gensimDict = corpora.Dictionary(corpus)

# Get BoW
bow = [gensimDict.doc2bow(doc) for doc in corpus]

In [33]:
###################
# Create Gensim BoW with the same order as in the betas batrix
###################
# Dicionary to map Gensim IDs to Mallet IDs
gensim_to_mallet_ids = {word_tuple[0]: (vocab_w2id[gensimDict[word_tuple[0]]] if gensimDict[word_tuple[0]] in vocab_w2id.keys(
) else None) for doc in bow for word_tuple in doc}
gensim_to_mallet_ids = {
    key: value for key, value in gensim_to_mallet_ids.items() if value is not None}

sorted_bow = []
for doc in bow:
    new_doc = []
    for gensim_word_id, weight in doc:
        if gensim_word_id in gensim_to_mallet_ids.keys():
            new_doc.append(
                (gensim_to_mallet_ids[gensim_word_id], weight))
    new_doc = sorted(new_doc, key=lambda x: x[0])
    sorted_bow.append(new_doc)

In [34]:
# Create a sparse matrix for the BoW representation
num_docs = len(sorted_bow)
vocab_size = betas.shape[1]

# Initialize sparse matrix in COO format
row = []
col = []
data = []

for doc_id, doc in enumerate(sorted_bow):
    for word_id, weight in doc:
        row.append(doc_id)
        col.append(word_id)
        data.append(weight)

# Convert to COO matrix
bow_mat_sparse = sparse.coo_matrix((data, (row, col)), shape=(num_docs, vocab_size), dtype=np.int32)

# If needed, convert to CSR format for more efficient arithmetic operations
bow_mat_sparse = bow_mat_sparse.tocsr()

print(f"BoW sparse matrix shape: {bow_mat_sparse.shape}")
print(f"Sparse matrix memory usage: {bow_mat_sparse.data.nbytes / 1024**2:.2f} MB")

# Save copy of GenSim BoW (assuming bow is a dense matrix or similar structure)
gensim_bow = bow.copy()

# Save sorted_bow as the new bow
bow = sorted_bow

BoW sparse matrix shape: (1393113, 431517)
Sparse matrix memory usage: 118.77 MB


In [None]:
# Dar mayor prioridad como buenos representantes a los documentos más largos.  Al fin y al cabo, si se ajusta bien a la probabilidad con muchos tokens el ejemplo debe ser mejor.
# - Una métrica diferente, pero que también podría funcionar bien y que automáticamente prioriza los documentos largos: para el documento d y el tópico t, suma los pesos que asigna el tópico a cada una de las palabras de d.  El problema que puede tener esta es que te va a colar documentos que quizá son un poco menos buenos, pero que son más largos ...

print("Calculating approach 3...")
start = time.time()

sample = len(thetas)
num_topics = thetas.shape[1]
num_words = betas.shape[1]

S3 = np.zeros((sample, num_topics))

# Create a mask for the entire betas matrix to avoid recalculating it for each document
betas_masks = np.ones((num_topics, num_words), dtype=bool)

# For each topic, prepare masks for words assigned to topics across all documents
for topic in range(num_topics):
    for doc in range(sample):
        try:
            words_assigned = get_words_assigned(bow, thetas, betas, doc, topic)
        except Exception as e:
            print(e)
            print(doc,topic)
        betas_masks[topic, words_assigned] = False

# Compute S3 using vectorized operations
for doc in range(sample):
    for topic in range(num_topics):
        betas_doc = betas[topic].copy()
        betas_doc[betas_masks[topic]] = 0
        S3[doc, topic] = np.sum(betas_doc)

end = time.time()
print(f"Approach 3 calculated in {end - start:.2f} seconds")

#S3_sparse = sparse.csr_matrix(S3)
#sparse.save_npz(results_folder.joinpath('S3.npz'), S3_sparse)

Calculating approach 3...


  sampling = np.random.multinomial(1, np.multiply(


pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals contains NaNs
pvals < 0, pvals > 1 or pvals co

In [36]:
S3.shape

(1393113, 20)

In [37]:
S3_sparse = sparse.csr_matrix(S3)
sparse.save_npz('S3.npz', S3_sparse)