In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import pathlib
import os
from dotenv import load_dotenv
import time
import sys
import csv

csv.field_size_limit(100000000)

131072

In [3]:
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain.chat_models import ChatOpenAI
from langchain.docstore.document import Document
#from langchain_core.prompts import PromptTemplate
from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI

## OpenAI APIKEY

In [4]:
path_env = pathlib.Path(os.getcwd()).parent.parent / '.env'
load_dotenv(path_env)
api_key = os.getenv("OPENAI_API_KEY")

os.environ["OPENAI_API_KEY"] = api_key

## Paths

In [5]:
path_orig_en = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_en_tr.parquet")
path_orig_es = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/corpus_pass_es_tr.parquet")
path_source = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/df_1.parquet")

path_model = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/LDA/passage/rosie_lg_lda_1_20")

path_corpus_en = path_model / "train_data" / "corpus_EN.txt"
path_corpus_es = path_model / "train_data" / "corpus_ES.txt"

persist_directory = (path_model / 'db11').as_posix()

## Read

In [6]:
raw_en = pd.read_parquet(path_source)
raw_en.head()

Unnamed: 0,id_preproc,lemmas,doc_id,text,lang
0,0,decrease initiation prevalence smoking hungary...,EN_492297_60866-9,To decrease the initiation and prevalence of s...,EN
1,1,value add table calculated entrance_exit skin ...,EN_143330_25224-123,Values added to Table 3-3; The calculated entr...,EN
2,2,outbreak duval_county begin april peak october...,EN_524864_63868-2,The outbreak in Duval County began in April 19...,EN
3,3,broder rapid communication bethesda system rep...,EN_518687_63341-14,Broder S. Rapid communication: the Bethesda Sy...,EN
4,4,opportunity meet social_worker child life spec...,EN_569477_70415-6,You will have the opportunity to meet with our...,EN


In [7]:
raw_en.columns

Index(['id_preproc', 'lemmas', 'doc_id', 'text', 'lang'], dtype='object')

In [8]:
with path_corpus_en.open("r", encoding="utf-8") as f:
    lines = [line for line in f.readlines()]
corpus_en = [line.rsplit(" 0 ")[1].strip().split() for line in lines]

ids = [line.split(" 0 ")[0] for line in lines]
df_en = pd.DataFrame({"lemmas": [" ".join(doc) for doc in corpus_en]})
df_en["doc_id"] = ids
df_en["len"] = df_en['lemmas'].apply(lambda x: len(x.split()))
df_en["id_top"] = range(len(df_en))

In [9]:
df_en_raw = df_en.merge(raw_en, how="inner", on="doc_id")[["doc_id", "id_top", "id_preproc", "lemmas_x", "text", "len"]]
df_en_raw

Unnamed: 0,doc_id,id_top,id_preproc,lemmas_x,text,len
0,EN_492297_60866-9,0,0,decrease initiation prevalence smoking hungary...,To decrease the initiation and prevalence of s...,63
1,EN_143330_25224-123,1,1,value add table calculated entrance_exit skin ...,Values added to Table 3-3; The calculated entr...,61
2,EN_524864_63868-2,2,2,outbreak duval_county begin april peak october...,The outbreak in Duval County began in April 19...,58
3,EN_518687_63341-14,3,3,broder rapid communication bethesda system rep...,Broder S. Rapid communication: the Bethesda Sy...,13
4,EN_569477_70415-6,4,4,opportunity meet social_worker child life spec...,You will have the opportunity to meet with our...,9
...,...,...,...,...,...,...
1393108,EN_485225_59974-3,1393108,1393108,acip cdc determine priority group rank_tier ba...,"ACIP and CDC determined the priority groups, r...",61
1393109,EN_1219267_277263-11,1393109,1393109,surgical lie operating table intravenous_intra...,For a surgical biopsy:\n- You'll lie on an ope...,53
1393110,EN_1238215_279154-29,1393110,1393110,cure rate people stage tumor people stage intr...,The cure rates for people with stage III tumor...,11
1393111,EN_321620_48080-1,1393111,1393111,suggested_citation article schneider kl lapane...,Suggested citation for this article: Schneider...,21


In [10]:
# Read thetas 
thetas = sparse.load_npz(path_model.joinpath(f"mallet_output/{'EN'}/thetas.npz")).toarray()
betas = np.load((path_model.joinpath(f"mallet_output/{'EN'}/betas.npy")))
def get_thetas_str(row,thetas):
    return " ".join([f"{id_}|{round(el, 4)}" for id_,el in enumerate(thetas[row]) if el!=0.0])

def get_most_repr_tpc(row,thetas):
    return np.argmax(thetas[row])

df_en_raw["thetas"] = df_en_raw.apply(lambda row: get_thetas_str(row['id_top'], thetas), axis=1)
df_en_raw["id_tpc"] = df_en_raw.apply(lambda row: get_most_repr_tpc(row['id_top'], thetas), axis=1)

In [11]:
df_en_raw.head()

Unnamed: 0,doc_id,id_top,id_preproc,lemmas_x,text,len,thetas,id_tpc
0,EN_492297_60866-9,0,0,decrease initiation prevalence smoking hungary...,To decrease the initiation and prevalence of s...,63,1|0.30140000581741333 6|0.04839999973773956 10...,10
1,EN_143330_25224-123,1,1,value add table calculated entrance_exit skin ...,Values added to Table 3-3; The calculated entr...,61,3|0.08810000121593475 7|0.45579999685287476 11...,7
2,EN_524864_63868-2,2,2,outbreak duval_county begin april peak october...,The outbreak in Duval County began in April 19...,58,1|0.22429999709129333 15|0.7408999800682068 17...,15
3,EN_518687_63341-14,3,3,broder rapid communication bethesda system rep...,Broder S. Rapid communication: the Bethesda Sy...,13,4|0.22939999401569366 6|0.004100000020116568 1...,19
4,EN_569477_70415-6,4,4,opportunity meet social_worker child life spec...,You will have the opportunity to meet with our...,9,4|0.0035000001080334187 6|0.005900000222027302...,14


In [12]:
# Load vocab dictionaries
vocab_w2id = {}
vocab_id2w = {}

# Open the file and read the lines
with open((path_model / "mallet_output/EN" / "vocab_freq.txt"), 'r', encoding='utf8') as file:
    for i, line in enumerate(file):
        # Strip leading and trailing whitespace
        stripped_line = line.strip()
        # Split the line into words and numbers
        parts = stripped_line.split()
        if parts:
            # Get the word (first part)
            wd = parts[0]
            # Populate the dictionaries
            vocab_w2id[wd] = i
            vocab_id2w[str(i)] = wd

In [13]:
print("Calculating approach 3...")
start = time.time()
S3 = np.zeros((len(thetas), len(betas)))

# For each document
for doc in range(len(thetas)):
    # For each topic
    for topic in range(thetas.shape[1]):

        # ids of the words of document doc in the vocabulary
        wd_ids = []
        for word in corpus_en[doc]:
            try:
                wd_ids.append(vocab_w2id[word])
            except Exception as e:
                #print(f"Word {word} not found in vocabulary") 
                continue

        # sum of the weights that topic assings to each word in the document
        S3[doc, topic] = np.sum(betas[topic, wd_ids])

print(f"S3 shape: {S3.shape}")

Calculating approach 3...
S3 shape: (1393113, 20)


## Bad topics

In [14]:
# Load topic-keys
keys = []
with open((path_model / "mallet_output/EN" / "topickeys.txt"), 'r') as file:
    for line in file:
        # Strip leading and trailing whitespace
        stripped_line = line.strip()
        # Split the line into parts and ignore the first two parts (number and float)
        parts = stripped_line.split(maxsplit=2)
        if len(parts) > 2:
            text_part = parts[2]
            keys.append(text_part)

In [15]:
keys

['test heart doctor_begin blood disease image doctor technology lung information procedure magnetic_resonance imaging diagnose surgery result provider body ultrasound condition',
 'age year high rate report health student death prevalence increase world organization state woman white adult person black population group',
 'food eat water outbreak product information people report healthy illness diet include ill store animal source drink technology fda day',
 'information technology skin symptom infection child people body common area severe bacteria treatment include condition treat pain day spread eye',
 'case infection patient health hiv test report person tuberculosis world disease organization testing laboratory result cdc virus risk treatment transmission',
 'medication information technology treatment medicine doctor drug treat symptom effect dose day prescribe pain provider time stop healthcare_provider talk child',
 'information technology child time family good health day par

In [16]:
lowest_indices = np.argsort(S3[:,-2], axis=0)[:10]
lowest_indices

array([ 949321,  752527,  558908, 1392968,  144049,  896003,  428385,
        428353,  346378,  949171])

In [17]:
df_en_raw[df_en_raw['id_top'].isin(lowest_indices)]["text"].values.tolist()

["Dr. Ashish Shah's office is located at 601 5th St S St Petersburg, FL 33701.",
 'NHPCC is governed by the Steering Committee, Advisory Committee and Regional Leadership Committee. Danielle Deery, regional coordinator, sits on the Advisory Committee and Regional Leadership Committee. Regina Butler, regional director, also sits on the Regional Leadership Committee.',
 'These services are currently available for patients admitted in the hospital at the Children’s Sheikh Zayed campus, as well as at the Children’s Montgomery County Regional Outpatient Center.',
 '6. Glass RI, Urrutia JJ, Sibony S, et al. Earthquake injuries related to housing in a Guatemalan village. Science 1977;197:638-43.',
 '1Significant linear decrease from 2007 through 2017 (p < 0.05).',
 'Reviewed on Jun 24, 2022: Dr. Blair is an exceptional radiation oncologist. We are so fortunate that he has been my physician through this ordeal.',
 'The thoracic outlet is the space between your collarbone (clavicle) and your fi

## Bad topics new implementation

In [18]:
# 0 = document's id
# 1 = document's name
# 3
# 4 = word
# 5 = topic to which the word belongs
import gzip
with gzip.open((path_model / "mallet_output/EN" / "topic-state.gz")) as fin:
    topic_state_df = pd.read_csv(fin, delim_whitespace=True,
                                 names=['docid', 'NA3','wd_idx_doc', 'wd_vocab','word', 'tpc'],
                                 header=None, skiprows=3)


In [19]:
topic_state_df

Unnamed: 0,docid,NA3,wd_idx_doc,wd_vocab,word,tpc
0,0,,0,0,decrease,1
1,0,,1,1,initiation,10
2,0,,2,2,prevalence,1
3,0,,3,3,smoking,1
4,0,,4,4,hungary,1
...,...,...,...,...,...,...
39480603,1393112,,15,4736,µg,15
39480604,1393112,,16,9415,diphtheria_toxoid,15
39480605,1393112,,17,507,single,15
39480606,1393112,,18,54,dose,15


In [20]:
z = topic_state_df.copy().groupby(['docid'])['tpc'].apply(list).reset_index(name='new')
z = z.new.values.tolist()

In [21]:
documents = topic_state_df.copy().groupby(['docid'])['wd_idx_doc'].apply(list).reset_index(name='new')
documents = documents.new.values.tolist()

In [22]:
documents_texts = topic_state_df.copy().groupby(['docid'])['word'].apply(list).reset_index(name='new')
documents_texts = documents_texts.new.values.tolist()

In [174]:
z = topic_state_df.copy().groupby(['docid'])['tpc'].apply(list).reset_index(name='new')

In [175]:
z

Unnamed: 0,docid,new
0,0,"[1, 10, 1, 1, 1, 10, 10, 10, 10, 1, 10, 10, 10..."
1,1,"[7, 11, 11, 7, 3, 7, 11, 7, 3, 7, 11, 11, 7, 7..."
2,2,"[15, 1, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15..."
3,3,"[19, 4, 19, 19, 19, 4, 18, 18, 19, 4, 19, 19, 19]"
4,4,"[14, 14, 14, 14, 14, 14, 14, 14, 14]"
...,...,...
1393092,1393108,"[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1..."
1393093,1393109,"[12, 12, 12, 12, 12, 12, 12, 12, 8, 12, 12, 12..."
1393094,1393110,"[17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17]"
1393095,1393111,"[19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 12, 1..."


In [148]:
len(topic_state_df.docid.unique())

1393097

In [141]:
print(len(thetas))
print(len(z))

1393113
1393097


In [111]:
bad_document_indices[:20]

array([ 467830, 1258767, 1292988,  502588,  422843,   50612,  195516,
        481622,  528311,  205506,  801297,  326944,  490613, 1334574,
        234499,  950088,  929209,  431261, 1155056, 1356299])

In [112]:
df_en_raw[df_en.id_top == 326944].text.values

array(['Under ordinary circumstances, a person’s vision happens because their brain is processing signals sent by their eyes. Once the signals reach your occipital lobe, neurons in that part of your brain send and relay signals to other areas in your brain. Visual hallucinations are when neurons in the occipital lobe act as if they’re processing signals from your eyes, but in reality, they’re acting on their own without such signals.'],
      dtype=object)

In [114]:
keys[11]

'datum report study survey estimate include analysis population year state participant health base number rate age information group measure level'

In [183]:
z

Unnamed: 0,docid,new
0,0,"[1, 10, 1, 1, 1, 10, 10, 10, 10, 1, 10, 10, 10..."
1,1,"[7, 11, 11, 7, 3, 7, 11, 7, 3, 7, 11, 11, 7, 7..."
2,2,"[15, 1, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15..."
3,3,"[19, 4, 19, 19, 19, 4, 18, 18, 19, 4, 19, 19, 19]"
4,4,"[14, 14, 14, 14, 14, 14, 14, 14, 14]"
...,...,...
1393092,1393108,"[15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 1..."
1393093,1393109,"[12, 12, 12, 12, 12, 12, 12, 12, 8, 12, 12, 12..."
1393094,1393110,"[17, 17, 17, 17, 17, 17, 17, 17, 17, 17, 17]"
1393095,1393111,"[19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 12, 1..."


In [232]:
# Approach 1: Identify bad documents using actual topic assignments and log-probabilities
def identify_bad_documents_v1_norm(thetas, log_betas, z, documents):
    D = len(documents)
    doc_log_probs = np.zeros(D)

    for d in range(D):
        doc = documents[d]
        topic_assignments = z[d]
        doc_log_prob = 0
        for i, word in enumerate(doc):
            topic = topic_assignments[i]
            doc_log_prob += log_betas[topic, word]
        # Normalize the log-probability by the length of the document
        doc_log_probs[d] = doc_log_prob / len(doc)

    # Rank documents by their normalized log-probabilities (lower is worse)
    bad_document_indices = np.argsort(doc_log_probs)
    return bad_document_indices, doc_log_probs[bad_document_indices]

# Timing the execution of each approach
print("Calculating approach 1 norm ...")
start = time.time()
bad_document_indices_v1_norm, scores_v1_norm = identify_bad_documents_v1_norm(thetas, log_betas, z, documents)
end = time.time()
print(f"Time taken for approach 1: {end - start:.2f} seconds")
print("Bad documents (indices):", bad_document_indices_v1_norm)


# Approach 1: Identify bad documents using actual topic assignments and log-probabilities
def identify_bad_documents_v1(thetas, log_betas, z, documents):
    D = len(documents)
    doc_log_probs = np.zeros(D)

    for d in range(D):
        doc = documents[d]
        topic_assignments = z[d]
        doc_log_prob = 0
        for i, word in enumerate(doc):
            topic = topic_assignments[i]
            doc_log_prob += log_betas[topic, word]
        doc_log_probs[d] = doc_log_prob

    # Rank documents by their log-probabilities (lower is worse)
    bad_document_indices = np.argsort(doc_log_probs)
    return bad_document_indices, doc_log_probs[bad_document_indices]

# Timing the execution of each approach
print("Calculating approach 1...")
start = time.time()
bad_document_indices_v1, scores_v1 = identify_bad_documents_v1(thetas, log_betas, z, documents)
end = time.time()
print(f"Time taken for approach 1: {end - start:.2f} seconds")
print("Bad documents (indices):", bad_document_indices_v1)

Calculating approach 1 norm ...
Time taken for approach 1: 6.97 seconds
Bad documents (indices): [ 428087 1141579  735761 ... 1063179 1226107  281344]
Calculating approach 1...
Time taken for approach 1: 6.27 seconds
Bad documents (indices): [ 467830 1258767 1292988 ... 1226107  237678  281344]


In [234]:
df_en_raw[df_en_raw['id_top'].isin(bad_document_indices_v1[:10])]["text"].values.tolist()

['Baby walkers give quick mobility (up to 4 feet per second) to young children before they are developmentally ready. Despite the decrease in baby walker-related injuries over the years, there are still too many serious injuries occurring related to this product.',
 'Cancer is the most serious, but also least likely, cause of bleeding after menopause. If testing finds cancerous cells or cell changes that could lead to cancer, your ob-gyn should refer you to a specialist called a gynecologic oncologist.',
 'Reviewed on Apr 5, 2023: Although Dr Li was able to quickly rule out MG, we still have not been able to fully understand why I am having some of the symptoms I have. I am hoping to find a Doctor that can help me soon. Dr. Li did recommend that I follow up with another neurologist in Cleveland Clinic. So, I will follow up with that Dr.',
 'Reviewed on Feb 22, 2023: Dr Svets always listen and helps me make the best choice for my health issues , She is awesome and I always recommend her

In [240]:
scores_v1[:10]

array([-21045.50423397, -14136.89039311, -12767.10933651, -10997.8950622 ,
       -10630.7427062 , -10378.35480469,  -9352.55861622,  -9123.01228571,
        -8843.52656937,  -8791.16231595])

In [241]:
df_en_raw[df_en_raw['id_top'].isin(bad_document_indices_v1_norm[:10])]["text"].values.tolist()

['*** The data on state Medicaid coverage of cessation medications in this report do not reflect this requirement because, as of the writing of this report, state Medicaid programs are still in the process of submitting state plan amendments to bring them into compliance with this provision.',
 'Compared with children living at or above poverty level, children living below poverty level had significantly lower coverage for all vaccines. Coverage for DTP3 for children living below poverty level compared with coverage for children living above poverty level was 93% and 97%, respectively (pless than 0.03); for polio, coverage was 90% and 92%, respectively (pless than 0.05); for Hib, coverage was 90% and 94%, respectively (pless than 0.03); for MCV, coverage was 86% and 92%, respectively, (pless than 0.03); and for hepatitis B vaccine, coverage was 80% and 85% (pless than 0.03), respectively.',
 'Lube may not be just for engines, but it gets lots of engines running, if you know what we mea

In [242]:
scores_v1_norm[:10]

array([-23.02585093, -23.02585093, -23.02585093, -23.02585093,
       -23.02585093, -23.02585093, -23.02585093, -23.02585093,
       -23.02585093, -23.02585093])

In [192]:
"""
epsilon = 1e-10
betas += epsilon
thetas += epsilon

# Log-probabilities for numerical stability
log_betas = np.log(betas)
log_thetas = np.log(thetas)
"""

# Approach 1: Identify bad documents using actual topic assignments and log-probabilities
def identify_bad_documents_v1(thetas, log_betas, z, documents):
    D = len(documents)
    doc_log_probs = np.zeros(D)

    for d in range(D):
        doc = documents[d]
        topic_assignments = z[d]
        doc_log_prob = 0
        for i, word in enumerate(doc):
            topic = topic_assignments[i]
            doc_log_prob += log_betas[topic, word]
        doc_log_probs[d] = doc_log_prob

    # Rank documents by their log-probabilities (lower is worse)
    bad_document_indices = np.argsort(doc_log_probs)
    return bad_document_indices

# Approach 2: Identify bad documents by summing weights assuming all words from one topic
def identify_bad_documents_v2(thetas, betas, corpus_en, vocab_w2id):
    D = len(thetas)
    K = len(betas)
    S3 = np.zeros((D, K))

    for doc in range(D):
        for topic in range(K):
            wd_ids = [vocab_w2id[word] for word in corpus_en[doc] if word in vocab_w2id]
            S3[doc, topic] = np.sum(betas[topic, wd_ids])
    
    # Summing the scores over topics for each document
    doc_scores = np.sum(S3, axis=1)
    
    # Rank documents by their scores (lower is worse)
    bad_document_indices = np.argsort(doc_scores)
    return bad_document_indices

def identify_bad_documents_v3(thetas, betas, z, documents, documents_texts, vocab_w2id):
    D = len(thetas)
    K = len(betas)
    S3 = np.zeros((D, K))

    for doc in range(D):
        for topic in range(K):
            try:
                wd_ids = [i for i, word in zip(documents[doc], documents_texts[doc]) if word in vocab_w2id and z[doc][i] == topic]
            except Exception as e: 
                #print(e)
                #print(doc,topic)
                continue
            S3[doc, topic] = np.sum(betas[topic, wd_ids])
    
    # Summing the scores over topics for each document
    doc_scores = np.sum(S3, axis=1)
    
    # Rank documents by their scores (lower is worse)
    bad_document_indices = np.argsort(doc_scores)
    return bad_document_indices

# Timing the execution of each approach
print("Calculating approach 1...")
start = time.time()
bad_document_indices_v1 = identify_bad_documents_v1(thetas, log_betas, z, documents)
end = time.time()
print(f"Time taken for approach 1: {end - start:.2f} seconds")
print("Bad documents (indices):", bad_document_indices_v1)

print("Calculating approach 2...")
start = time.time()
bad_document_indices_v2 = identify_bad_documents_v2(thetas, betas, corpus_en, vocab_w2id)
end = time.time()
print(f"Time taken for approach 2: {end - start:.2f} seconds")
print("Bad documents (indices):", bad_document_indices_v2)

print("Calculating approach 3...")
start = time.time()
bad_document_indices_v3 = identify_bad_documents_v3(thetas, betas, z, documents, documents_texts, vocab_w2id)
end = time.time()
print(f"Time taken for approach 3: {end - start:.2f} seconds")
print("Bad documents (indices):", bad_document_indices_v3)

Calculating approach 1...
Time taken for approach 1: 7.28 seconds
Bad documents (indices): [ 467830 1258767 1292988 ... 1226107  237678  281344]
Calculating approach 2...
Time taken for approach 2: 215.15 seconds
Bad documents (indices): [1200268 1374123 1102457 ...  467835  205508 1293002]
Calculating approach 3...
Time taken for approach 3: 207.53 seconds
Bad documents (indices): [1393112 1393111 1393110 ... 1292988 1258767  467830]


In [209]:
doc = 0
k = 0

In [223]:
for doc in range(D):
    wd_ids_v1 = []
    for word in corpus_en[doc]:
        #print(word)
        try:
            wd_ids_v1.append(vocab_w2id[word])
        except Exception as e:
            #print(f"Word {word} not found in vocabulary") 
            continue
    wd_ids_v2 = [vocab_w2id[word] for word in corpus_en[doc] if word in vocab_w2id]
    if len(wd_ids_v1) != len(wd_ids_v2):
        print("NO DIFF!!!!")

In [218]:
wd_ids_v2 = [vocab_w2id[word] for word in corpus_en[doc] if word in vocab_w2id]

In [219]:
len(wd_ids_v1) == len(wd_ids_v2)

True

In [97]:
# Approach 2: Identify bad documents by summing weights assuming all words from one topic
def identify_bad_documents_v2(thetas, betas, corpus_en, vocab_w2id):
    D = len(thetas)
    K = len(betas)
    S3 = np.zeros((D, K))

    for doc in range(D):
        for topic in range(K):
            wd_ids = [vocab_w2id[word] for word in corpus_en[doc] if word in vocab_w2id]
            S3[doc, topic] = np.sum(betas[topic, wd_ids])

    return S3

def identify_bad_documents_v2_norm(thetas, betas, corpus_en, vocab_w2id):
    D = len(thetas)
    K = len(betas)
    S3 = np.zeros((D, K))

    for doc in range(D):
        for topic in range(K):
            wd_ids = [vocab_w2id[word] for word in corpus_en[doc] if word in vocab_w2id]
            try:
                S3[doc, topic] = (1/len(wd_ids)) * np.sum(betas[topic, wd_ids])
            except:
                #print(doc,topic, wd_ids)
                continue
                
    return S3

print("Calculating approach 2...")
start = time.time()
s3_v1 = identify_bad_documents_v2(thetas, betas, corpus_en, vocab_w2id)
end = time.time()
print(f"Time taken for approach 2: {end - start:.2f} seconds")

print("Calculating approach 2...")
start = time.time()
s3_v1_norm = identify_bad_documents_v2_norm(thetas, betas, corpus_en, vocab_w2id)
end = time.time()
print(f"Time taken for approach 2: {end - start:.2f} seconds")

Calculating approach 2...
Time taken for approach 2: 204.41 seconds
Calculating approach 2...
Time taken for approach 2: 207.51 seconds


In [95]:
lowest_indices_s3_v1 = np.argsort(s3_v1[:,-2], axis=0)[:10]
print(s3_v1[lowest_indices_s3_v1,-2])
result_texts = df_en_raw[df_en_raw['id_top'].isin(lowest_indices_s3_v1)]["text"].values.tolist()
result_texts

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


["Dr. Ashish Shah's office is located at 601 5th St S St Petersburg, FL 33701.",
 'NHPCC is governed by the Steering Committee, Advisory Committee and Regional Leadership Committee. Danielle Deery, regional coordinator, sits on the Advisory Committee and Regional Leadership Committee. Regina Butler, regional director, also sits on the Regional Leadership Committee.',
 'These services are currently available for patients admitted in the hospital at the Children’s Sheikh Zayed campus, as well as at the Children’s Montgomery County Regional Outpatient Center.',
 '6. Glass RI, Urrutia JJ, Sibony S, et al. Earthquake injuries related to housing in a Guatemalan village. Science 1977;197:638-43.',
 '1Significant linear decrease from 2007 through 2017 (p < 0.05).',
 'Reviewed on Jun 24, 2022: Dr. Blair is an exceptional radiation oncologist. We are so fortunate that he has been my physician through this ordeal.',
 'The thoracic outlet is the space between your collarbone (clavicle) and your fi

In [96]:
lowest_indices_s3_v1_norm = np.argsort(s3_v1_norm[:,-2], axis=0)[:10]
print(s3_v1[lowest_indices_s3_v1_norm,-2])
result_texts = df_en_raw[df_en_raw['id_top'].isin(lowest_indices_s3_v1_norm)]["text"].values.tolist()
result_texts

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


['Brothers JA. Harris MA. Right Coronary Artery From Right Sinus of Valsalva and Ventricular Tachycardia. Ann Thorac Surg. 2014 Sep;98(3):1091-4. doi: 10.1016/j.athoracsur.2013.10.099.',
 'Reviewed on Apr 12, 2023: Dr. Saliba and his staff are outstanding!!',
 '† Minnesota intensity thresholds: IT50 = 255; IT90 = 511; IT98 = 778.',
 'Reviewed on Mar 6, 2023: Very nice Dr . Excellent service.',
 'James Jerger Award for Excellence in Student ResearchAmerican Academy of Audiology Convention, AudiologyNow!',
 'Schwartz DS, Keller MS: Maturational Descent of the Epiglottis. Arch Otolaryngol Head Neck Surg 123(6): 627-628, June 1997.',
 'To schedule an appointment for any of the services below, call 216.444.8500.',
 'Gurnaney HG. "Epidural analgesia for kasai portoenterostomy", Pediatric Anesthesiology 2021, Virtual Meeting. Feb 2021.',
 'Reviewed on Feb 9, 2023: *Dr. Ciltea is extremely attentive and interacts excellently.',
 '*Arizona, California, Colorado, Connecticut, Delaware, Florida, 

In [74]:
lowest_indices_s3_v1 = np.argsort(s3_v1[:,-2], axis=0)[:10]
lowest_indices_s3_v1_norm = np.argsort(s3_v1_norm[:,-2], axis=0)[:10]

print("Lowest indices in s3_v1:", lowest_indices_s3_v1)
print("Lowest indices in s3_v1_norm:", lowest_indices_s3_v1_norm)

Lowest indices in s3_v1: [ 949321  752527  558908 1392968  144049  896003  428385  428353  346378
  949171]
Lowest indices in s3_v1_norm: [1050666  884154 1030594  224924  157732  109661  481489  791054  353860
  889444]


In [92]:
def identify_bad_documents_v3(thetas, betas, z, documents, documents_texts, vocab_w2id):
    D = len(thetas)
    K = len(betas)
    S3 = np.zeros((D, K))

    for doc in range(D):
        for topic in range(K):
            try:
                wd_ids = [i for i, word in zip(documents[doc], documents_texts[doc]) if word in vocab_w2id and z[doc][i] == topic]
                S3[doc, topic] = np.sum(betas[topic, wd_ids])
            except Exception as e: 
                #print(e)
                #print(doc,topic)
                continue
    return S3

def identify_bad_documents_v3_norm(thetas, betas, z, documents, documents_texts, vocab_w2id):
    D = len(thetas)
    K = len(betas)
    S3 = np.zeros((D, K))

    for doc in range(D):
        for topic in range(K):
            try:
                wd_ids = [i for i, word in zip(documents[doc], documents_texts[doc]) if word in vocab_w2id and z[doc][i] == topic]
                S3[doc, topic] = (1/len(wd_ids))* np.sum(betas[topic, wd_ids])
            except Exception as e: 
                continue
    return S3

print("Calculating approach 3...")
start = time.time()
s3_v3 = identify_bad_documents_v3(thetas, betas, z, documents, documents_texts, vocab_w2id)
end = time.time()
print(f"Time taken for approach 3: {end - start:.2f} seconds")


print("Calculating approach 3...")
start = time.time()
s3_v3_norm = identify_bad_documents_v3_norm(thetas, betas, z, documents, documents_texts, vocab_w2id)
end = time.time()
print(f"Time taken for approach 3: {end - start:.2f} seconds")

Calculating approach 3...
Time taken for approach 3: 198.40 seconds
Calculating approach 3...
Time taken for approach 3: 95.34 seconds


In [66]:
np.argsort(s3_v3[:,0], axis=0)[-10]

1273171

In [65]:
np.argsort(s3_v3_norm[:,0], axis=0)[-10]

208653

In [93]:
lowest_indices_s3_v3 = np.argsort(s3_v3[:,-2], axis=0)[:10]
print(s3_v3[lowest_indices_s3_v3,-2])
result_texts = df_en_raw[df_en_raw['id_top'].isin(lowest_indices_s3_v3)]["text"].values.tolist()
result_texts

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


['Blood pressure was measured 3 times at 30-second intervals in the right arm after 5 minutes of rest with participants in a seated position by using an electronic sphygmomanometer (HEM-770AFuzzy, Omron, Kyoto, Japan) according to the American Heart Association standardized protocol. The average value of the 3 measurements was used for analyses (20).',
 'In recognition of Lung Cancer Awareness Month, Massachusetts General Hospital physicians answer common questions related to lung cancer.',
 'If the drainage angle is blocked, your eye cannot clear fluid. This causes fluid and pressure to build up within the eye. Over time, this can lead to glaucoma, optic nerve damage and blindness. The drainage angle is challenging to access, which is why gonioscopy is necessary.',
 'Osteoarthritis: Osteoarthritis is a condition that causes pain, swelling and reduced motion in your joints.',
 'One study in particular gave the Watkins hope: The Management of Myelomeningocele Study (MOMS) co-led by CHOP

In [94]:
lowest_indices_s3_v3_norm = np.argsort(s3_v3_norm[:,-2], axis=0)[:10]
print(s3_v3_norm[lowest_indices_s3_v3_norm,-2])
result_texts = df_en_raw[df_en_raw['id_top'].isin(lowest_indices_s3_v3_norm)]["text"].values.tolist()
result_texts

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


['Blood pressure was measured 3 times at 30-second intervals in the right arm after 5 minutes of rest with participants in a seated position by using an electronic sphygmomanometer (HEM-770AFuzzy, Omron, Kyoto, Japan) according to the American Heart Association standardized protocol. The average value of the 3 measurements was used for analyses (20).',
 'In recognition of Lung Cancer Awareness Month, Massachusetts General Hospital physicians answer common questions related to lung cancer.',
 'If the drainage angle is blocked, your eye cannot clear fluid. This causes fluid and pressure to build up within the eye. Over time, this can lead to glaucoma, optic nerve damage and blindness. The drainage angle is challenging to access, which is why gonioscopy is necessary.',
 'Osteoarthritis: Osteoarthritis is a condition that causes pain, swelling and reduced motion in your joints.',
 'One study in particular gave the Watkins hope: The Management of Myelomeningocele Study (MOMS) co-led by CHOP

In [199]:
"""
epsilon = 1e-10
betas += epsilon
thetas += epsilon

# Log-probabilities for numerical stability
log_betas = np.log(betas)
log_thetas = np.log(thetas)
"""

# Approach 2: Identify bad documents by summing weights assuming all words from one topic
def identify_bad_documents_v2(thetas, betas, corpus_en, vocab_w2id):
    D = len(thetas)
    K = len(betas)
    S3 = np.zeros((D, K))

    for doc in range(D):
        for topic in range(K):
            wd_ids = [vocab_w2id[word] for word in corpus_en[doc] if word in vocab_w2id]
            S3[doc, topic] = np.sum(betas[topic, wd_ids])

    return S3

def identify_bad_documents_v2_norm(thetas, betas, corpus_en, vocab_w2id):
    D = len(thetas)
    K = len(betas)
    S3 = np.zeros((D, K))

    for doc in range(D):
        for topic in range(K):
            wd_ids = [vocab_w2id[word] for word in corpus_en[doc] if word in vocab_w2id]
            S3[doc, topic] = (1/len(wd_ids))* np.sum(betas[topic, wd_ids])

    return S3

def identify_bad_documents_v3(thetas, betas, z, documents, documents_texts, vocab_w2id):
    D = len(thetas)
    K = len(betas)
    S3 = np.zeros((D, K))

    for doc in range(D):
        for topic in range(K):
            try:
                wd_ids = [i for i, word in zip(documents[doc], documents_texts[doc]) if word in vocab_w2id and z[doc][i] == topic]
            except Exception as e: 
                #print(e)
                #print(doc,topic)
                continue
            S3[doc, topic] = np.sum(betas[topic, wd_ids])
    
    return S3

print("Calculating approach 2...")
start = time.time()
s3_v1 = identify_bad_documents_v2(thetas, betas, corpus_en, vocab_w2id)
end = time.time()
print(f"Time taken for approach 2: {end - start:.2f} seconds")

print("Calculating approach 3...")
start = time.time()
s3_v2 = identify_bad_documents_v3(thetas, betas, z, documents, documents_texts, vocab_w2id)
end = time.time()
print(f"Time taken for approach 3: {end - start:.2f} seconds")

Calculating approach 2...
Time taken for approach 2: 212.13 seconds
Calculating approach 3...
Time taken for approach 3: 202.42 seconds


In [225]:
lowest_indices = np.argsort(S3[:,-2], axis=0)[:10]
lowest_indices

array([1200268,  995405, 1058703,  994729,  101285,  915806, 1102457,
        972411, 1371677, 1374123])

In [201]:
lowest_indices_v1 = np.argsort(s3_v1[:,-2], axis=0)[:10]
lowest_indices_v1

array([1200268,  995405, 1058703,  994729,  101285,  915806, 1102457,
        972411, 1371677, 1374123])

In [203]:
df_en_raw[df_en_raw['id_top'].isin(lowest_indices_v1)]["text"].values.tolist()

['Take what you need and come as often as you need.',
 'So I think we really â€“ we are seeing something different right now.',
 'Look at what you have done and what you can do!',
 "So, how would they see me, you're going to have to ask them.",
 'Do this, do this. There you go. Look at you.',
 "Don't assume that everyone knows everything they need to know.",
 'Now here they were once more. “How are we going to do this again?” they wondered.',
 "But why? That's what you need to find out next!",
 'I want to be able to go where I want, when I want, without depending on others.',
 'Answers:\\n- a) about 85 to 90 decibels\\n- b) between 15 to 45 minutes\\n- b) about 95 to 100 decibels\\n- c) between 1 to 5 minutes\\n- c) about 105 to 120 decibels\\n- d) between 1 to 30 seconds.']

In [207]:
lowest_indices_v2 = np.argsort(s3_v2[:,0], axis=0)[:10]
lowest_indices_v2

array([ 857258,  857268,  857267,  857266,  857265,  857264,  857262,
       1323925,  857260,  857259])

In [208]:
df_en_raw[df_en_raw['id_top'].isin(lowest_indices_v2)]["text"].values.tolist()

['How long does electrolysis treatment last: An electrolysis treatment lasts anywhere between 15 minutes and one hour.',
 "O'Byrne ML, Desai S, Lane M, McBride M, Paridon S, Goldmuntz E: Relationship Between Habitual Exercise and Performance on Cardiopulmonary Exercise Testing Differs Between Children With Single and Biventricular Circulations. Pediatric Cardiology 38(3): 472-483, Mar 2017.",
 'Influenza testing and empiric antimicrobial treatment. Influenza testing should be strongly considered, particularly during influenza season.§ It might be difficult to differentiate EVALI, a diagnosis of exclusion, from influenza or community-acquired pneumonia on initial assessment, and EVALI might co-occur with respiratory infections. Treatment with empiric antimicrobials, including antivirals, should be considered in accordance with established guidelines and local microbiology and resistance patterns for bacterial pneumonia (12–14). Persons with suspected influenza who are at high risk for i

In [194]:
len(bad_document_indices_v1)

1393097

In [196]:
df_en_raw[df_en_raw['id_top'].isin(bad_document_indices_v1[:10])]["text"].values.tolist()

['Baby walkers give quick mobility (up to 4 feet per second) to young children before they are developmentally ready. Despite the decrease in baby walker-related injuries over the years, there are still too many serious injuries occurring related to this product.',
 'Cancer is the most serious, but also least likely, cause of bleeding after menopause. If testing finds cancerous cells or cell changes that could lead to cancer, your ob-gyn should refer you to a specialist called a gynecologic oncologist.',
 'Reviewed on Apr 5, 2023: Although Dr Li was able to quickly rule out MG, we still have not been able to fully understand why I am having some of the symptoms I have. I am hoping to find a Doctor that can help me soon. Dr. Li did recommend that I follow up with another neurologist in Cleveland Clinic. So, I will follow up with that Dr.',
 'Reviewed on Feb 22, 2023: Dr Svets always listen and helps me make the best choice for my health issues , She is awesome and I always recommend her

In [229]:
df_en_raw[df_en_raw['id_top'].isin(bad_document_indices_v1_norm[:10])]["text"].values.tolist()

['*** The data on state Medicaid coverage of cessation medications in this report do not reflect this requirement because, as of the writing of this report, state Medicaid programs are still in the process of submitting state plan amendments to bring them into compliance with this provision.',
 'Compared with children living at or above poverty level, children living below poverty level had significantly lower coverage for all vaccines. Coverage for DTP3 for children living below poverty level compared with coverage for children living above poverty level was 93% and 97%, respectively (pless than 0.03); for polio, coverage was 90% and 92%, respectively (pless than 0.05); for Hib, coverage was 90% and 94%, respectively (pless than 0.03); for MCV, coverage was 86% and 92%, respectively, (pless than 0.03); and for hepatitis B vaccine, coverage was 80% and 85% (pless than 0.03), respectively.',
 'Lube may not be just for engines, but it gets lots of engines running, if you know what we mea

In [197]:
df_en_raw[df_en_raw['id_top'].isin(bad_document_indices_v2[:10])]["text"].values.tolist()

["But because some come in. Or others, they don't want to know that you're there.",
 'You may have to help others know how they can help.',
 'Thanks to all of you for being on the call.',
 'So I think we really â€“ we are seeing something different right now.',
 "So, how would they see me, you're going to have to ask them.",
 'Do this, do this. There you go. Look at you.',
 'Now here they were once more. “How are we going to do this again?” they wondered.',
 "But why? That's what you need to find out next!",
 'I want to be able to go where I want, when I want, without depending on others.',
 'Answers:\\n- a) about 85 to 90 decibels\\n- b) between 15 to 45 minutes\\n- b) about 95 to 100 decibels\\n- c) between 1 to 5 minutes\\n- c) about 105 to 120 decibels\\n- d) between 1 to 30 seconds.']

In [198]:
df_en_raw[df_en_raw['id_top'].isin(bad_document_indices_v3[:10])]["text"].values.tolist()

['Note: Currently, OSHA standard 1910.266 applies to pulpwood logging but does not apply to the logging of sawtimber-sized trees, as in this incident. OSHA is currently revising their logging regulations to include all types of logging operations. Although not enforceable, sections of 1910.266 of the pulpwood standard, particularly relating to safe work practices, do apply in this case.',
 'Learning Community for Partners: CDC provides learning support to partners, including technical assistance, coaching, subject matter expertise, peer-to-peer learning, and a central hub for outreach materials and resources.',
 'What can I do at home to treat formication: Formication isn’t a symptom you should try to treat at home, as it takes a trained healthcare provider to determine what’s causing it. You should also seek medical attention if you have this symptom unexpectedly, as this symptom can happen with certain dangerous conditions like stroke or drug overdoses. You should also contact a heal

## Create VectorDB for each topic

In [17]:
# Create a text splitter with specified chunk size and overlap
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Select topic to analyze
topic = 18
df_topic = df_en_raw[df_en_raw.id_tpc == topic]

# Create documents and split them into chunks
#documents = [
#    Document(page_content=chunk, metadata={"url": "local", "source": "initial", "identifier": row['doc_id'], "id_top": row["id_top"]})
#    for idx, row in df_topic.iterrows()
#    for chunk in text_splitter.split_text(row['text'])
#]

# Get most representative document for that topic and assume it is free of contradictions
thetas_topic = thetas.T[topic]
top_doc_topic = np.argsort(thetas_topic)[::-1][0]
df_topic_doc = df_topic[df_topic.id_top == top_doc_topic]

# Create Langchain document for that doc
documents = [
    Document(page_content=chunk, metadata={"identifier": row['doc_id'], "source": row["id_top"]})
    for idx, row in df_topic_doc.iterrows()
    for chunk in text_splitter.split_text(row['text'])
]

In [18]:
df_topic_doc

Unnamed: 0,doc_id,id_top,id_preproc,lemmas_x,text,len,thetas,id_tpc
921550,EN_752626_108872-11,921550,921550,birth control pill work medicine talk doctor e...,Birth control pills may not work properly whil...,21,18|1.0,18


In [19]:
documents

[Document(page_content='Birth control pills may not work properly while you are taking this medicine. Talk to your doctor about using an extra method of birth control. Women who can still have children must use a reliable form of barrier contraception, like a condom or diaphragm.', metadata={'identifier': 'EN_752626_108872-11', 'source': 921550})]

In [20]:
start = time.time()
# Define embeddings
embedding = OpenAIEmbeddings()

# Create vector database with this document
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    persist_directory=persist_directory 
)
# Tiempo total de ejecución, tarda 51 mins en 300k docs
end = time.time()
print(f"Total time is {end - start} seconds")

  warn_deprecated(


Total time is 3.4222564697265625 seconds


In [21]:

prompt_template_text = """
Please analyze the given documents and compare them with the new document to identify any contradictions. Note that in detecting contradictions, only instances where the new document directly contradicts information present in the provided documents should be considered. As contradiction we understand: 'You should breastfeed your baby for six months' and 'Breastfeeding is only necessary for 1 month'. If the new document introduces new information not mentioned in the other documents, it should not be treated as a contradiction.Documents:

If you find any contradiction, your answer should have the following format: CONTRADICTION - CONTRADICTIONS contradiction_rationale: ids_contradiction, where contradiction_rationale is an explanation of the contradiction and ids_contradiction is a list with the 'source' of the documents in which the contradictions were found. Otherwise, your answer should just be 'CONSISTENT'.

--------------------
Examples:

Example 1:
Documents:

"Exercise for at least 30 minutes a day to maintain good health." (source: 1234)
"Regular physical activity helps in reducing stress and anxiety." (source: 5678)
New document:

"Exercise is harmful and should be avoided to maintain good health."
Answer:

CONTRADICTION - CONTRADICTIONS contradiction_rationale: The new document claims that exercise is harmful and should be avoided, which directly contradicts the statements in 1234 and 5678 about the health benefits of exercise. ids_contradiction: [1234, 5678]
Example 2:
Documents:

"A balanced diet includes a variety of fruits and vegetables." (source: 9101)
"Consuming whole grains is beneficial for digestion." (source: 1121)
New document:

"Avoid consuming fruits and vegetables as they are not necessary for a balanced diet."
Answer:

CONTRADICTION - CONTRADICTIONS contradiction_rationale: The new document advises against consuming fruits and vegetables, which directly contradicts the statement in 9101 about their importance in a balanced diet. ids_contradiction: [9101]
Example 3:
Documents:

"The Earth revolves around the Sun." (source: 3141)
"Our solar system includes eight planets orbiting the Sun." (source: 5161)
New document:

"The Earth is the center of the universe, and everything revolves around it."
Answer:

CONTRADICTION - CONTRADICTIONS contradiction_rationale: The new document claims that the Earth is the center of the universe and everything revolves around it, which directly contradicts the established scientific information in 3141 and 5161 about the Earth's position in the solar system. ids_contradiction: [3141, 5161]

--------------------
Documents:
{summaries}

New document:
{question}
--------------------
"""

prompt_template = PromptTemplate.from_template(template=prompt_template_text)

In [22]:
# Set up the turbo LLM
llm = ChatOpenAI(
    temperature=0,
    model_name='gpt-4o'
)

# Define retriever
nb_retrieval_docs = 5
retriever = vectorstore.as_retriever(
    search_kwargs={"k": nb_retrieval_docs}
)

# Crear la chain
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
    chain_type_kwargs={
        "verbose": False,
        "prompt": prompt_template
    }
)

In [37]:
df_topic.iloc[0].text

"Why it's done: Tubal ligation is one of the most commonly used surgical sterilization procedures for women. Tubal ligation permanently prevents pregnancy, so you no longer need any type of birth control. However, it does not protect against sexually transmitted infections."

In [116]:
retriever.get_relevant_documents('“The lab called me the next day and told me to take Arianna to the nearest hospital immediately,” Katrina says. The pathologist suspected cancer.')

Number of requested results 10 is greater than number of elements in index 1, updating n_results = 1


[Document(page_content='Birth control pills may not work properly while you are taking this medicine. Talk to your doctor about using an extra method of birth control. Women who can still have children must use a reliable form of barrier contraception, like a condom or diaphragm.', metadata={'identifier': 'EN_752626_108872-11', 'source': 921550})]

In [29]:
# Filter df by those predicted as postive
df_pos = predictions_with_text[predictions_with_text.predicted_label==1].merge(df_topic, how="inner", on="doc_id")
print(len(df_pos))
print(len(df_topic))

47237
47579


In [None]:
non_consistent = []
for index, row in df_pos.sample(n=1000, random_state=4).iterrows():
    if index % 100 == 0:
        print(f"-- -- Processed index {index / len(df_pos)}")
    #print("#"*100)
    #print(f"-- -- DOC: {row['text_x']}")
    d_response = chain({"question": row.text_x})
    #print(d_response)
    if d_response["answer"] != "CONSISTENT":
        print("Response: ", d_response["answer"])
        non_consistent.append({row.doc_id:d_response["answer"]})
    else:
        vectorstore.add_documents([Document(page_content=row['text_x'], metadata={"identifier": row['doc_id'], "source": row["id_top_x"]})])

## Consistencies

In [232]:
non_consistent[30:40]

[]

In [None]:
posible_candidates = [6]

In [229]:
df_pos[df_pos.doc_id == "EN_1389539_301249-57"].text_x

1027    Key points about pregnancy loss:\n- Pregnancy loss is the death of an unborn baby (fetus) at any time during pregnancy.\n- Pregnancy loss occurs in up to 1 in every 4 pregnancies. Most happen during the first trimester.\n- About half of early pregnancy losses are from defects in genes or chromosomes.\n- Vaginal bleeding is the most common symptom of pregnancy loss.\n- The loss of a baby at any time in pregnancy can be emotionally and physically hard for the mother and other members of the family. Counseling and support of the family are important.
Name: text_x, dtype: object

In [231]:
df_pos[df_pos.id_top_x == 1985].text_x

58    What is my risk of miscarriage by week: Your risk of pregnancy loss declines each week you’re pregnant. Around 15% of pregnancies end in miscarriage. Miscarriage risk in the second trimester (13 to 19 weeks) is between 1% and 5%. Many factors affect your risk of miscarriage such as your age and health. However, everyone’s risk of miscarriage declines each week of pregnancy if the pregnant person has no other health conditions.
Name: text_x, dtype: object

In [168]:
non_consistent[28]

{'EN_7751_462-40': "CONTRADICTION - Hormone treatment is not a replacement for birth control according to the new document, while document 3774 states that testosterone doesn't completely stop egg production, so some trans men can still get pregnant even while on hormone treatment. contradiction_rationale: The new document implies that hormone treatment alone is not sufficient for birth control, while document 3774 suggests that hormone treatment may not fully prevent pregnancy. ids_contradiction: 3774"}

In [25]:
import logging
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import vstack, csr_matrix, hstack
import copy

class Blade(object):
    def __init__(
        self,
        thetas: np.ndarray,
        S3: np.ndarray,
        df: pd.DataFrame,
        logger: logging.Logger = None
    ):
        """_summary_

        Parameters
        ----------
        thetas: np.ndarray
            Document-topic distribution
        S3: np.ndarray
            For each document and topic, sum of the betas of the words in the document
        df: pd.DataFrame
            DataFrame containing document metadata
        logger: logging.Logger
        """
        
        self._logger = logger if logger else logging.getLogger(__name__)
        
        # Save input data
        self.thetas = thetas
        self.S3 = S3
        self.df_docs = df
        # Construct features as the concatenation of thetas and S3
        self.X = hstack(
            [csr_matrix(copy.deepcopy(thetas)).astype(np.float64),
             csr_matrix(copy.deepcopy(S3)).astype(np.float64)
            ], format='csr'
        )
        
        # Read keywords
        try:
            with open("words.txt", "r") as file:
                self.keys = [line.strip() for line in file]
        except:
            self.keys = [
                'infant',
                'postpartum',
                'pregnant',
            ]    
        
        # Initialize classifier
        self._init_classifier()
        
        # Placeholder for training data
        self.X_train = np.empty((0, self.X.shape[1]))
        self.y_train = np.array([])
        
        # Initialize pool
        self.X_pool = self.X
        self.df_pool = df.copy()
        
        # Preprocess positive and negative indices
        self._preprocess_indices()
        
    def _init_classifier(self):
        self.learner = SGDClassifier(loss="log_loss", penalty='l2', tol=1e-3, random_state=42, learning_rate="optimal", eta0=0.1, validation_fraction=0.2, alpha=0.000005)
        self._logger.info("-- -- Active Learner initialized.")
    
    def _preprocess_indices(self):
        # Identify probable positive samples
        self.positive_indices = self.df_pool[self.df_pool['text'].str.contains('|'.join(self.keys), case=False, na=False)].index.to_list()
        
        # Identify probable negative samples
        self.avg_S3 = np.mean(self.S3, axis=1)
        self.negative_indices = np.argsort(self.avg_S3).tolist()
    
    def update_indices(self, used_index):
        # Remove used index from positive_indices and negative_indices
        if used_index in self.positive_indices:
            self.positive_indices.remove(used_index)
        if used_index in self.negative_indices:
            self.negative_indices.remove(used_index)
    
    def preference_function(self, iteration):
        # Get prediction probabilities from the classifier
        if len(self.y_train) > 0:
            probas = self.learner.predict_proba(self.X_pool)
            # Calculate uncertainty as the entropy of the prediction probabilities
            uncertainty = -np.sum(probas * np.log(probas + 1e-10), axis=1)
        else:
            uncertainty = np.ones(self.X_pool.shape[0])
        
        selection_type = iteration % 3
        
        if selection_type == 0 and len(self.positive_indices) > 0:
            # Select probable positive samples
            positive_uncertainty = uncertainty[self.positive_indices]
            selected_idx = self.positive_indices[np.argmax(positive_uncertainty)]
        
        elif selection_type == 1 and len(self.negative_indices) > 0:
            # Select probable negative samples
            valid_negative_indices = [i for i in self.negative_indices if i < len(uncertainty)]
            combined_scores = uncertainty[valid_negative_indices] / (self.avg_S3[valid_negative_indices] + 1e-10)  # Adding a small value to avoid division by zero
            selected_idx = valid_negative_indices[np.argmax(combined_scores)]
            
        else:
            # Select based on uncertainty
            selected_idx = np.argmax(uncertainty)
        
        return [selected_idx]

    def request_labels(self, query_instances, indices):
        # Simulate user labeling
        labels = []
        for query_instance, idx in zip(query_instances, indices):
            doc_id = self.df_pool.iloc[idx]['id_top']
            doc_content = self.df_pool.iloc[idx]['text']  # Assuming there is a 'text' column
            print(f"Document ID: {doc_id}")
            print(f"Document Content: {doc_content}")
            label = int(input("Please provide the label for the queried instance (0 or 1): "))
            labels.append(label)
        return np.array(labels)

    def active_learning_loop(self, n_queries=10):
        for idx in range(n_queries):
            # Use preference function to get the preferred indices from the pool
            preferred_indices = self.preference_function(idx)
            
            # Select the most preferred instance from the pool
            query_idx = preferred_indices[0]
            query_instance = self.X_pool[query_idx].reshape(1, -1)
            label = self.request_labels(query_instance, [query_idx])

            # Add the queried instance to the training set
            self.X_train = vstack([self.X_train, query_instance])
            self.y_train = np.append(self.y_train, label)
            
            # Fit the classifier with the new data
            self.learner.partial_fit(self.X_train, self.y_train, classes=np.array([0, 1]))

            # Remove queried instance from the pool
            self.X_pool = vstack([self.X_pool[:query_idx], self.X_pool[query_idx+1:]])
            self.df_pool = self.df_pool.drop(self.df_pool.index[query_idx]).reset_index(drop=True)
            
            # Update indices
            self.update_indices(query_idx)

            # Log the process
            self._logger.info(f'Iteration {idx + 1}/{n_queries}, Document ID: {query_idx}')

    def evaluate(self):
        print("Evaluation is not applicable since there are no ground truth labels.")
        self._logger.info('No labeled data available to evaluate the model.')

    def predict(self):
        if len(self.y_train) > 0:
            predictions = self.learner.predict(self.X_pool)
            self.df_pool['predicted_label'] = predictions
            return self.df_pool[['id_top', 'text', 'predicted_label']]
        else:
            self._logger.info('No labeled data available to train the model.')
            return None

    def get_predictions_with_text(self):
        if len(self.y_train) > 0:
            predictions = self.learner.predict(self.X_pool)
            self.df_pool['predicted_label'] = predictions
            return self.df_pool
        else:
            self._logger.info('No labeled data available to train the model.')
            return None

In [26]:
blade = Blade(thetas, S3, df_en_raw)

In [27]:
blade.active_learning_loop()

Document ID: 2
Document Content: The outbreak in Duval County began in April 1991 and peaked in October 1991. The last case was reported in January 1992 (Figure 1). Seventy-five (51%) cases were reported from three inner-city zip code areas in Jacksonville. The overall incidence of measles in Duval County was 22 cases per 100,000 population. The age-specific incidence was highest for children aged less than 5 years (205 cases per 100,000). Of the 146 reported measles cases, 111 (76%) occurred among children aged less than 5 years, including 42 (29%) among children aged less than 12 months. Transmission between mother and infant was documented in 12 cases (six mother-infant pairs). School-aged children (aged 5-19 years) and adults aged greater than or equal to 20 years accounted for 15% and 9% of cases, respectively.


Please provide the label for the queried instance (0 or 1):  1


Document ID: 1371678
Document Content: Sep 01, 2015: Mark Peeples, PhD, and Octavio Ramilo, MD, both principal investigators in the Center for Vaccines and Immunity at Nationwide Children’s Hospital were recently awarded a $6.75 million grant from the National Institute of Allergy and Infectious Diseases.


Please provide the label for the queried instance (0 or 1):  0


Document ID: 1009508
Document Content: So parents need to know how to keep kids safe in and on the water — whether they're in the bathtub, on a boat, in your backyard pool, or out and about.


Please provide the label for the queried instance (0 or 1):  1


Document ID: 257164
Document Content: “Partners For Kids is at its best when we can successfully ‘fill in the gaps’ so that a complex health care system delivers better care to children who need it, ” says Sean Gleeson, MD, president of Partners For Kids.


Please provide the label for the queried instance (0 or 1):  0


Document ID: 1200271
Document Content: To dispose of used syringes, needles and cannulas: CAUTION: Proper disposal of needles and syringes is very important.


Please provide the label for the queried instance (0 or 1):  0


Document ID: 639735
Document Content: This test helps diagnose heart disease. A healthcare provider injects a small amount of a radioactive substance (called a tracer or radiopharmaceutical) into the bloodstream. Your blood vessels and heart muscle absorb the tracer, making them more visible in images. Then the provider uses a special camera to take pictures of blood flow in and around the heart.


Please provide the label for the queried instance (0 or 1):  1


Document ID: 663765
Document Content: On October 3, 2012, FDA issued a document that lists observations Cdc-pdf[PDF – 2 pages]External made by the FDA investigators during the inspection of Chamberlain Farms.


Please provide the label for the queried instance (0 or 1):  0


Document ID: 101286
Document Content: There are many options for birth control after you have a baby, such as pills, the birth control implant, or the intrauterine device (IUD). Many birth control options can be started right after giving birth.


Please provide the label for the queried instance (0 or 1):  1


Document ID: 990227
Document Content: Perivascular space dilation is associated with vascular amyloid-β accumulation in the overlying cortex.


Please provide the label for the queried instance (0 or 1):  1


Document ID: 361473
Document Content: Reported by: EJ Woo, MD, R Ball, MD, M Braun, MD, Center for Biologics Evaluation and Research, Food and Drug Admin, Rockville, Maryland. T Clark, MD, N Rosenstein Messonnier, MD, Div of Bacterial Diseases; M Wharton, MD, National Center for Immunization and Respiratory Diseases (proposed); C Vellozzi, MD, S Campbell, MSPH, E Weintraub, MPH, R Davis, MD, Immunization Safety Office, Office of the Chief Science Officer, CDC.


Please provide the label for the queried instance (0 or 1):  0


In [28]:
predictions = blade.predict()
#if predictions is not None:
#    print(predictions)

predictions_with_text = blade.get_predictions_with_text()
pd.set_option('display.max_colwidth', None)
predictions_with_text[predictions_with_text.predicted_label==0]

Unnamed: 0,doc_id,id_top,id_preproc,lemmas_x,text,len,thetas,id_tpc,predicted_label
3,EN_569477_70415-6,4,4,opportunity meet social_worker child life specialist nursing staff period,"You will have the opportunity to meet with our social worker, child life specialist and nursing staff during this period.",9,4|0.0035000001080334187 6|0.005900000222027302 9|0.004399999976158142 10|0.0052999998442828655 12|0.0032999999821186066 14|0.9776999950408936,14,0
6,EN_1068430_185949-22,7,7,information technology mean observe hospital ed provider speak observe goal observation care additional treatment rule illness care ed area observation unit people observation hour monitor meet doctor specialty condition receive treatment test condition diagnose care team admit_hospital inpatient additional medical care,"What does it mean to be “observed” in the hospital: Our ED provider may speak with you about being observed. The goal of observation care is to provide additional treatment and rule out serious illness. This type of care may be provided in either our ED area or upstairs in our observation unit. Most people are under observation for less than 24 hours. You will be monitored, meet with doctors from specialties related to your condition, receive treatment or get tests done. If a serious condition is diagnosed by your care team, you may be admitted into the hospital as an inpatient for additional medical care.",41,9|0.026000000536441803 14|0.9739999771118164,14,0
11,EN_37525_9590-4,12,12,cdc release editorial series american_journal public health highlight national centers_excellence youth violence prevention yvpcs series include yvpc lesson_learn youth violence prevention approach community level intervention model insight social_determinant health structural_racism social_norm impact youth read editorial today,"CDC released an editorial series with the American Journal of Public Health, highlighting the National Centers of Excellence in Youth Violence Prevention (YVPCs). This series includes YVPC lessons learned, youth violence prevention approaches, community-level intervention models, and insights on social determinants of health, structural racism, and social norms that impact youth. Read the editorials today!",36,10|1.0,10,0
12,EN_617595_89294-0,13,13,help children_hospital philadelphia_chop center celiac_disease recognize good nutrition lead well outcome nutritional management gluten_free diet treatment celiac_disease consultation medical team,"We can help: At Children’s Hospital of Philadelphia (CHOP), our Center for Celiac Disease recognizes that good nutrition can lead to better outcomes. Nutritional management via a gluten free diet is currently the only treatment for celiac disease, and should only been done in consultation with your medical team.",20,2|0.3880000114440918 8|0.5543000102043152 14|0.05779999867081642,8,0
13,EN_861579_118723-8,14,14,watch medication visit doctor health care professional check direct doctor symptom improve new symptom need lab work,What should I watch for while using this medication: Visit your doctor or health care professional for check ups as directed. Tell your doctor if your symptoms do not improve or if you get new symptoms. You will need to have lab work done regularly.,17,5|0.9932000041007996 6|0.003599999938160181 10|0.0031999999191612005,5,0
...,...,...,...,...,...,...,...,...,...
1393089,EN_673463_97758-29,1393099,1393099,forrest_cb fiks_ag bailey_lc localio_r grundmeier_rw richards t karavite_dj elden_l alessandrini_ea improve adherence otitis_medium guideline clinical decision support physician feedback pediatric pmid,"Forrest CB, Fiks AG, Bailey LC, Localio R, Grundmeier RW, Richards T, Karavite DJ, Elden L, Alessandrini EA. (2013). Improving adherence to otitis media guidelines with clinical decision support and physician feedback. Pediatrics, 131(4):e1071-81. PMID: 23478860.",21,10|0.20180000364780426 19|0.7982000112533569,19,0
1393090,EN_903635_122676-12,1393100,1393100,home treat formication formication symptom treat home information technology train healthcare_provider determine cause information technology seek medical attention symptom symptom happen certain dangerous condition stroke drug_overdose contact healthcare_provider symptom connection prescribed_medication,"What can I do at home to treat formication: Formication isn’t a symptom you should try to treat at home, as it takes a trained healthcare provider to determine what’s causing it. You should also seek medical attention if you have this symptom unexpectedly, as this symptom can happen with certain dangerous conditions like stroke or drug overdoses. You should also contact a healthcare provider if this symptom could have a connection with a prescribed medication you take.",31,5|0.8615000247955322 8|0.13850000500679016,5,0
1393091,EN_951247_150247-1,1393101,1393101,patient center care parent important health care team patient family receive specialized care support emotional work community physician continuity care,"We provide patient-centered care, with parents as an important part of the health care team. Patients and their families receive specialized care as well as support for their emotional well-being, and we work together with community physicians to provide continuity of care.",20,14|1.0,14,0
1393093,EN_694773_101263-20,1393103,1393103,healthy year old widni graduate niños_primeros salud nutrition program ready big adventure kindergarten,"Now a healthy 5-year-old, Widni is graduating from the Niños Primeros en Salud nutrition program and getting ready for her next big adventure: kindergarten.",13,6|0.49570000171661377 8|0.4932999908924103 9|0.0032999999821186066 10|0.004000000189989805 14|0.003700000001117587,6,0
