In [7]:
import pathlib
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import normalize
import numpy as np
import matplotlib.pyplot as plt
import sys
import os
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
import ast
from gensim.matutils import corpus2csc

In [8]:
# Reset the three pandas display options to None: for max_colwidth and
# max_columns this lifts the truncation caps; for display.width it leaves the
# width to pandas.
for _display_option in ('display.max_colwidth', 'display.max_columns', 'display.width'):
    pd.set_option(_display_option, None)

In [3]:
def get_doc_top_tpcs(doc_distr, topn=10):
    """Return the `topn` largest entries of `doc_distr` as (index, value) pairs.

    Parameters
    ----------
    doc_distr : array-like indexable by integer position
        Weights to rank (sorted with ``np.argsort``).
    topn : int, default 10
        Maximum number of pairs to return.

    Returns
    -------
    list of tuple
        ``(index, doc_distr[index])`` pairs ordered from largest to smallest
        value; indices are plain Python ints.
    """
    # argsort is ascending, so reverse it before keeping the leading entries.
    ranked_indices = np.argsort(doc_distr)[::-1][:topn]
    return [(idx, doc_distr[idx]) for idx in ranked_indices.tolist()]

def get_doc_main_topc(doc_distr):
    """Return the index of the largest entry of `doc_distr`.

    The index is taken from the end of the ascending ``np.argsort`` ordering,
    so it is returned as a numpy integer. An empty input raises ``IndexError``.
    """
    ascending_order = np.argsort(doc_distr)
    # The last position of the ascending ordering holds the largest value.
    return ascending_order[-1]

In [None]:
# Location of the source passages (parquet). NOTE(review): absolute,
# machine-specific path; the notebook only runs where this file exists.
path_source = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/source/corpus_rosie/passages/29_dec/all/df_1.parquet")
# Load the full source table into memory.
df_raw = pd.read_parquet(path_source)

In [None]:
# Keep the rows whose doc_id contains the substring "EN".
# NOTE(review): presumably this selects the English documents - confirm that
# no non-English doc_id can contain "EN" as well.
df_raw_en = df_raw.loc[df_raw["doc_id"].str.contains("EN")]

In [None]:
# Display the column labels of the filtered source frame.
df_raw_en.columns

In [6]:
# Root folder of the trained model. NOTE(review): absolute, machine-specific path.
path_model = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/29_dec/all/poly_rosie_1_20")

print("Reading thetas start")
# thetas is stored as a scipy sparse matrix and is densified here; later cells
# use len(thetas) as the number of documents.
thetas = sparse.load_npz(path_model / "mallet_output" / "thetas_EN.npz").toarray()
print("Reading thetas finshed")

print("Reading betas start")
# betas is a dense array; later cells read shape[0] as the number of topics
# and shape[1] as the vocabulary size.
betas = np.load(path_model / "mallet_output" / "betas_EN.npy")
print("Reading betas finshed")

Reading thetas start
Reading thetas finshed
Reading betas start
Reading betas finshed


In [None]:
# betas has one row per topic and one column per vocabulary entry.
ntopics, size_vocab = betas.shape[0], betas.shape[1]

# Down-scored copy of betas: every entry is multiplied by the difference
# between its own log and the mean log of that word across all topics.
betas_ds = np.copy(betas)
if betas_ds.min() < 1e-12:
    # Shift the whole matrix slightly before taking logarithms.
    betas_ds += 1e-12
deno = np.reshape(sum(np.log(betas_ds)) / ntopics, (size_vocab, 1))
# Repeat the per-word mean log on every topic row so it lines up with betas_ds.
deno = np.ones((ntopics, 1)).dot(deno.T)
betas_ds = betas_ds * (np.log(betas_ds) - deno)

# Weight of each word: its maximum over topics, raw and down-scored.
betas_word = betas.max(axis=0)
betas_word_ds = betas_ds.max(axis=0)

# read source
print("Reading documents")
# The lemmatised documents are read from the model's training corpus file
# rather than from the source parquet. Each line is expected to hold a prefix,
# the marker " EN", and then the whitespace-separated lemmas.
# NOTE(review): absolute, machine-specific path.
corpus_en_path = pathlib.Path("/export/usuarios_ml4ds/lbartolome/Repos/umd/LinQAForge/data/models/29_dec/all/poly_rosie_1_20/train_data/corpus_EN.txt")
with open(corpus_en_path) as file:
    lines = file.readlines()

# Only the first len(thetas) lines are used, one per row of thetas.
lines_en = lines[:len(thetas)]
if len(lines_en) < len(thetas):
    # Fail here with a clear message instead of an IndexError in a later cell.
    raise ValueError(
        f"{corpus_en_path} has {len(lines_en)} lines but thetas has {len(thetas)} rows."
    )

# Split on the FIRST " EN" only, so a later " EN" inside the text does not
# truncate the document.
corpus_en = [el.strip().split(" EN", 1)[1].split() for el in lines_en]

# One row per document: its lemma list and the number of lemmas.
df_lang = pd.DataFrame(
    {
        "lemmas": corpus_en,
        "len": [len(el) for el in corpus_en]
    }
)

# Two lookup tables built from the model's vocabulary file:
#   vocab_w2id: word -> line number of the word in the file
#   vocab_id2w: str(line number) -> word
vocab_w2id = {}
vocab_id2w = {}
with open(path_model / "mallet_output" / "vocab_EN.txt") as file:
    for i, line in enumerate(file):
        # split() with no separator already ignores surrounding whitespace.
        parts = line.split()
        if not parts:
            # Blank line: no entry is added, but its line number is still consumed.
            continue
        # Only the first field of the line is the word; the rest is ignored.
        wd = parts[0]
        vocab_w2id[wd] = i
        vocab_id2w[str(i)] = wd

# Token lists, one per document.
documents_texts = df_lang["lemmas"].tolist()

# The vectorizer expects strings, so each token list is re-joined with spaces.
documents_texts_join = list(map(' '.join, df_lang["lemmas"]))

# Reuse the model vocabulary as the TF-IDF vocabulary; an empty mapping falls
# back to None, in which case the vectorizer learns its own vocabulary.
custom_vocab = vocab_w2id or None
vectorizer = TfidfVectorizer(vocabulary=custom_vocab)

# Fit on the joined documents and keep the document-term TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(documents_texts_join)

# Retrieve the column index of the word
def get_tfidf_value(word, doc_index):
    """Look up the entry of `tfidf_matrix` for `word` in document `doc_index`.

    The column is taken from the notebook-level `custom_vocab` mapping and the
    value from the notebook-level `tfidf_matrix`.

    Raises
    ------
    ValueError
        If `word` is not a key of `custom_vocab`.
    """
    if word in custom_vocab:
        # custom_vocab maps the word to its column in tfidf_matrix.
        return tfidf_matrix[doc_index, custom_vocab[word]]
    raise ValueError(f"Word '{word}' is not in the vocabulary.")


####
# SCORE
####
# Per-document score:
#   sum(betas_word_ds over every in-vocabulary token occurrence)
#     * penalization / (number of distinct in-vocabulary token ids)
# where penalization = 1 / (number of distinct out-of-vocabulary words), or 1
# when the document has none. A TF-IDF weighted variant (via get_tfidf_value)
# was tried and is not used.

D = len(thetas)
doc_score = np.zeros((D, 1))

for doc in range(D):
    tokens = documents_texts[doc]
    wd_ids = [vocab_w2id[word] for word in tokens if word in vocab_w2id]
    oov_words = [word for word in tokens if word not in vocab_w2id]

    # Out-of-vocabulary compounds ("a_b") are split on "_" and their parts are
    # looked up individually; only non-compound words stay in the OOV list.
    # The list is rebuilt instead of calling .remove() while iterating over it,
    # which skipped the element following every removed compound.
    additional_words = []
    words_not_in_vocab = []
    for word in oov_words:
        if "_" in word:
            additional_words += word.split("_")
        else:
            words_not_in_vocab.append(word)
    wd_ids += [vocab_w2id[word] for word in additional_words if word in vocab_w2id]

    # The penalty counts each distinct out-of-vocabulary word once.
    words_not_in_vocab = list(set(words_not_in_vocab))

    if len(words_not_in_vocab) >= 1:
        penalization = 1/len(words_not_in_vocab)
    else:
        penalization = 1

    # Documents with at most one in-vocabulary token keep the default score 0.
    if len(wd_ids) <= 1:
        doc_score[doc] = 0
        continue

    # Normalise by the number of distinct ids; repeated tokens still contribute
    # to the sum once per occurrence.
    denominator = len(set(wd_ids))
    doc_score[doc] = (np.sum(betas_word_ds[wd_ids]) * penalization) / denominator

In [None]:
# Attach the scores and the raw text to the per-document frame.
df_lang["doc_score"] = doc_score
# NOTE(review): this assignment aligns on index labels, not on row position.
# It is only correct if df_raw_en's index matches df_lang's 0..D-1 index for
# the same documents - confirm against the corpus file ordering.
df_lang["text"] = df_raw_en["text"]

# Documents scoring below the 1st percentile, prepared for manual labelling.
# .copy() makes `labelled` an independent frame, so adding the column below
# does not trigger SettingWithCopyWarning.
labelled = df_lang[df_lang.doc_score < np.percentile(doc_score, 1)].copy()
labelled["label"] = 0

In [None]:
# Display the lowest-scoring documents selected for labelling.
labelled

In [None]:
#labelled.to_excel("labelled.xlsx")

In [None]:
# KDE of the scores
document_scores = doc_score
plt.figure(figsize=(10, 6))
# doc_score is a (D, 1) column. seaborn reads a 2-D array as wide-form data
# (one series per column), in which case `color` and `label` are not applied
# to a single histogram as intended; flatten it to a 1-D vector for plotting.
sns.histplot(np.ravel(doc_score), kde=True, bins=30, color='blue', label='Document Scores')
plt.title('Histogram and KDE of Document Scores')
plt.xlabel('$\\xi_d$ Scores')
plt.ylabel('Frequency')
plt.legend()
plt.show()

# Candidate thresholds: one standard deviation below the mean, and the
# 10th percentile of the scores.
mean_score = np.mean(document_scores)
std_score = np.std(document_scores)
threshold_mu_sigma = mean_score - std_score
threshold_percentile = np.percentile(document_scores, 10)

print(f"Mean (\u03bc): {mean_score:.3f}")
print(f"Standard Deviation (\u03c3): {std_score:.3f}")
print(f"Threshold (\u03bc - \u03c3): {threshold_mu_sigma:.3f}")
print(f"Threshold (10th Percentile): {threshold_percentile:.3f}")

# Scores falling below each threshold (boolean indexing returns 1-D arrays).
irrelevant_mu_sigma = document_scores[document_scores < threshold_mu_sigma]
irrelevant_percentile = document_scores[document_scores < threshold_percentile]

print(f"Number of irrelevant documents (\u03bc - \u03c3): {len(irrelevant_mu_sigma)}")
print(f"Number of irrelevant documents (10th Percentile): {len(irrelevant_percentile)}")

# Distribution of the scores flagged by the percentile threshold, as a visual
# aid for the manual review of the flagged documents.
plt.figure(figsize=(10, 6))
sns.histplot(irrelevant_percentile, kde=False, bins=20, color='red', label='Flagged as Irrelevant')
plt.title('Documents Flagged as Irrelevant')
plt.xlabel('$\\xi_d$ Scores')
plt.ylabel('Frequency')
plt.legend()
plt.show()

In [None]:
# Documents whose score lies below the 10th percentile of all scores
# (optionally sub-sampled with .sample(200) before export).
removed = df_lang.loc[df_lang["doc_score"] < np.percentile(doc_score, 10)]
#removed.to_excel("removed.xlsx")

In [15]:
# Load the manually annotated spreadsheet from the working directory.
# NOTE(review): presumably an annotated copy of the `removed` export - confirm.
annotated = pd.read_excel("removed (1).xlsx")
# Percentage of annotated rows whose label is NOT 1.
# NOTE(review): label == 1 presumably marks a document that was flagged but is
# actually relevant - confirm the annotation convention.
accuracy =  (1 - len(annotated[annotated.label == 1]) / len(annotated)) * 100
accuracy

67.0

In [13]:
# Display every annotated row whose label equals 1.
annotated[annotated.label == 1]

Unnamed: 0.1,Unnamed: 0,lemmas,len,doc_score,text,label
3,651693,"['sunscreen_spf', 'high']",2,0.009958,"Use sunscreen with SPF 15 or higher, depending on how long you'll be outside.",1
4,1366430,"['feed', 'wash', 'bottle_nipple', 'ring', 'cap', 'bowl', 'hot_soapy', 'water_dishwasher', 'follow', 'package', 'instruction', 'rinse', 'let', 'air', 'dry', 'clean', 'bottle', 'brush_basin', 'bowl', 'hot_soapy', 'water']",21,0.015131,"After Feeding: Wash the bottle, nipple, rings, and cap in a bowl of hot soapy water or in a dishwasher (follow package instructions). Rinse completely and let air dry. Clean the bottle brush and basin or bowl with hot soapy water.",1
5,425018,"['vee', 'appear', 'venezuela', 'april', 'spread', 'case', 'colombia', 'recognize', 'august', 'common', 'border', 'country', 'virus', 'likely', 'transfer', 'viremic', 'equine', 'human', 'spread', 'western', 'coast', 'la_guajira', 'comparison', 'viral', 'strain', 'colombia_venezuela', 'indicate', 'close', 'genetic', 'relation']",30,0.009786,"VEE appeared in Venezuela in April 1995 and spread westward, with the first cases in Colombia recognized in August at the common border of the two countries. The virus most likely was transferred in a viremic equine or human and spread from there to the western coast of La Guajira. Comparisons of viral strains from Colombia and Venezuela indicate a close genetic relation (2).",1


In [18]:
# Display at most the first 66 annotated rows whose label equals 1.
annotated[annotated.label == 1].head(66)

Unnamed: 0.1,Unnamed: 0,lemmas,len,doc_score,text,label
3,651693,"['sunscreen_spf', 'high']",2,0.009958,"Use sunscreen with SPF 15 or higher, depending on how long you'll be outside.",1
4,1366430,"['feed', 'wash', 'bottle_nipple', 'ring', 'cap', 'bowl', 'hot_soapy', 'water_dishwasher', 'follow', 'package', 'instruction', 'rinse', 'let', 'air', 'dry', 'clean', 'bottle', 'brush_basin', 'bowl', 'hot_soapy', 'water']",21,0.015131,"After Feeding: Wash the bottle, nipple, rings, and cap in a bowl of hot soapy water or in a dishwasher (follow package instructions). Rinse completely and let air dry. Clean the bottle brush and basin or bowl with hot soapy water.",1
5,425018,"['vee', 'appear', 'venezuela', 'april', 'spread', 'case', 'colombia', 'recognize', 'august', 'common', 'border', 'country', 'virus', 'likely', 'transfer', 'viremic', 'equine', 'human', 'spread', 'western', 'coast', 'la_guajira', 'comparison', 'viral', 'strain', 'colombia_venezuela', 'indicate', 'close', 'genetic', 'relation']",30,0.009786,"VEE appeared in Venezuela in April 1995 and spread westward, with the first cases in Colombia recognized in August at the common border of the two countries. The virus most likely was transferred in a viremic equine or human and spread from there to the western coast of La Guajira. Comparisons of viral strains from Colombia and Venezuela indicate a close genetic relation (2).",1
12,703612,"['traditional', 'case_definition', 'cf', 'base', 'sweat_chloride', 'level', 'meq_l', 'pilocarpine', 'iontophoresis', 'sweat', 'test', 'presence', 'pulmonary', 'disease', 'pancreatic_insufficiency', 'sweat', 'testing', 'consist', 'electrical', 'chemical', 'stimulation', 'skin', 'produce', 'sweat', 'collection', 'sweat', 'gauze_pad', 'filter', 'paper', 'laboratory', 'analysis', 'chloride', 'content', 'collect', 'sweat', 'cff', 'consensus_panel', 'develop', 'new', 'case_definition', 'cf', 'base', 'multiple', 'criterion', 'presence', 'characteristic', 'phenotypic', 'feature', 'history', 'cf', 'sibling', 'positive', 'newborn_screening', 'test', 'laboratory', 'evidence', 'cftr', 'abnormality', 'document', 'elevate', 'sweat_chloride', 'concentration', 'identification', 'cftr_mutation', 'associate', 'cf', 'vivo', 'demonstration', 'characteristic', 'abnormality', 'ion_transport', 'nasal', 'epithelium', 'sweat_chloride', 'level', 'meq_l', 'diagnostic', 'infant', 'cf', 'initial', 'sweat', 'value', 'meq_l', 'sweat', 'testing', 'perform', 'majority', 'infant', 'age', 'week', 'infant', 'sufficient_quantity', 'sweat', 'reliable', 'testing']",95,0.010798,"The traditional case definition for CF was based on a sweat chloride level of >60 mEq/L from pilocarpine iontophoresis (sweat test) and the presence of pulmonary disease or pancreatic insufficiency. Sweat testing consists of electrical-chemical stimulation of skin to produce sweat, collection of sweat on gauze pads or filter paper, and laboratory analysis of chloride content in collected sweat. 
In 1999, a CFF consensus panel developed a new case definition for CF based on multiple criteria: the presence of >1 characteristic phenotypic feature or a history of CF in a sibling or a positive newborn screening test, together with laboratory evidence of a CFTR abnormality as documented by 1) elevated sweat chloride concentrations, 2) identification of two CFTR mutations associated with CF, or 3) in vivo demonstration of characteristic abnormalities in ion transport across the nasal epithelium (37,15). Although a sweat chloride level of 60 mEq/L is diagnostic, infants with CF often have initial sweat values of 30--59 mEq/L (38,39). Sweat testing can be performed accurately on the majority of infants at age 2--3 weeks; however, not all infants have sufficient quantities of sweat for reliable testing (40).",1
20,420982,"['human', 'health', 'effect', 'etu_ptu', 'low', 'environmental', 'dose_biomonitored', 'level', 'low', 'environmental', 'exposure', 'unknown', 'ebdcs_propineb', 'absorb', 'follow', 'ingestion_inhalation', 'dermal_exposure', 'ebdc', 'skin', 'inhale', 'produce', 'irritation', 'ebdcs_propineb', 'toxic', 'human', 'animal', 'ipcs', 'fungicide', 'breakdown', 'body', 'toxic', 'effect', 'primary', 'concern', 'etu_ptu', 'animal', 'study', 'etu', 'absorb', 'gastrointestinal_tract', 'excrete_urine', 'unchanged', 'etu', 'oxidative_metabolite', 'allen', 'camoni', 'houeto', 'iverson', 'propineb', 'administer', 'animal', 'metabolize', 'urinary_metabolite', 'include', 'ptu', 'animal', 'testing', 'etu_ptu', 'respective', 'parent', 'fungicide', 'produce', 'thyroid', 'hyperplasia', 'decrease', 'serum', 'thyroxine', 'level', 'increase', 'thyroid_stimulate', 'hormone', 'level', 'effect', 'attribute', 'inhibition', 'enzyme', 'thyroid_peroxidase', 'fao', 'world', 'health', 'organization', 'ntp', 'epa', 'small', 'number', 'rubber', 'manufacture', 'worker', 'expose', 'etu', 'report', 'low', 'serum', 'level', 'thyroxine', 'smith', 'increase', 'animal', 'follicular', 'cell', 'adenoma', 'carcinoma', 'liver', 'pituitary_tumor', 'follow', 'prolong', 'etu', 'dosing', 'ntp']",109,0.011677,"Human health effects from ETU or PTU at low environmental doses or at biomonitored levels from low environmental exposures are unknown. The EBDCs and propineb are absorbed following ingestion, inhalation, and dermal exposure. EBDCs applied to the skin or inhaled can produce irritation, but neither EBDCs nor propineb are highly toxic to humans or animals (IPCS, 1988). These fungicides breakdown rapidly in the body, so that the toxic effects of primary concern are from ETU and PTU, respectively. 
In animal studies ETU was well absorbed from the gastrointestinal tract and rapidly excreted in the urine as unchanged ETU and several oxidative metabolites (Allen et al., 1978; Camoni et al., 1984; Houeto et al., 1995; Iverson et al., 1980). Propineb administered to animals is rapidly metabolized and urinary metabolites include PTU. In animal testing, ETU and PTU, as well as the respective parent fungicides, have produced thyroid hyperplasia, decreased serum thyroxine levels, and increased thyroid stimulating hormone levels; these effects were attributed to inhibition of the enzyme thyroid peroxidase (FAO/WHO, 1999; NTP, 1992; U.S. EPA 2005a). A small number of rubber manufacturing workers exposed to ETU were reported to have lower serum levels of thyroxine (Smith, 1984). Increases in animal thyroidal follicular cell adenomas and carcinomas, liver and pituitary tumors have followed prolonged ETU dosing (NTP, 1992).",1
...,...,...,...,...,...,...
184,1108836,"['colon', 'size', 'time', 'specialist', 'large', 'hospital', 'brayden', 'home', 'montrose', 'pa', 'perform', 'test', 'boy', 'colon', 'know', 'large_intestine', 'large', 'average', 'adult', 'colon', 'information', 'technology', 'expel_waste', 'brayden', 'diagnose', 'gastrointestinal_gastrointestinal', 'motility_disorder']",27,0.014855,"A colon twice the size: This time, a specialist at a larger hospital near Brayden's home in Montrose, PA, performed several tests and discovered the boy's colon (also known as the large intestine) was twice as large as an average adult's colon. It was also not properly expelling waste. Brayden was diagnosed with a gastrointestinal (GI) motility disorder.",1
185,3548,"['valley_fever', 'call', 'coccidioidomycosis', 'sid_ee', 'oy_doh', 'koh_sis', 'infection', 'cause', 'breathe', 'fungus_coccidioide', 'sid_ee', 'oy_deze', 'coccidioide', 'live', 'soil', 'southwest', 'part', 'washington', 'state', 'central', 'south_america', 'information', 'technology', 'name', 'san_joaquin', 'valley', 'california', 'information', 'technology']",29,0.012264,"Valley fever, also called coccidioidomycosis (“cahk-sid-ee-oy-doh-my-KOH-sis”), is an infection caused by breathing in the fungus Coccidioides (“cahk-sid-ee-OY-deze”). Coccidioides lives in the soil in the southwest U.S., parts of Washington state, and Central and South America. It’s named for the San Joaquin Valley in California, where it was first discovered.",1
186,484853,"['editorial_note', 'cryptosporidium_chlorine', 'resistant', 'parasite', 'cause', 'illness', 'ingestion_oocyst', 'remain', 'infectious', 'month', 'moist_environment', 'outbreak', 'investigation', 'detection', 'identical', 'subtype_homini', 'specie', 'restrict', 'human', 'stool_specimen', 'patient', 'water', 'sample', 'sand_filter', 'drinking_fountain', 'implicate', 'ingestion', 'contaminate', 'splash', 'feature', 'drinking_fountain', 'water', 'illness', 'reported', 'exposure', 'occur', 'july', 'splash_park', 'water', 'collect', 'august', 'test', 'positive', 'cryptosporidium', 'initial', 'contamination', 'splash_park', 'water', 'ill', 'visitor', 'cause', 'persistent', 'contamination', 'splash_park', 'system', 'result', 'ongoing', 'transmission', 'similar', 'outbreak', 'occur', 'splash_park', 'lack', 'ultraviolet_ozone', 'treatment', 'system', 'inactivate_cryptosporidium', 'splash_park', 'operator', 'rely', 'high', 'flow', 'sand_filtration', 'chlorine_disinfection', 'protect', 'patron', 'cryptosporidium']",77,0.007462,"Editorial Note: Cryptosporidium, a chlorine-resistant parasite, can cause illness after ingestion of as few as 10 oocysts, and can remain infectious for up to 6 months in moist environments (8). In this outbreak investigation, detection of identical subtypes of C. hominis, a species primarily restricted to humans (9), in the stool specimens of patients and in water samples from the sand filters and drinking fountain implicated ingestion of fecally contaminated splash-feature and drinking fountain water as the cause of the illnesses. Because reported exposures occurred during July 23--August 10 and splash park water collected on August 20 tested positive for Cryptosporidium, initial contamination of splash park water by an ill visitor likely caused persistent contamination of the splash park system and resulted in ongoing transmission. 
Similar outbreaks have occurred at other splash parks that lacked ultraviolet or ozone treatment systems that can inactivate Cryptosporidium (1,3). Splash park operators cannot rely solely upon high-flow sand filtration and chlorine disinfection to protect patrons from Cryptosporidium.",1
190,731332,"['high', 'efficiency_filter', 'mean', 'filter', 'efficient_mono', 'disperesed', 'particle_µm', 'micrometer_diameter', 'high']",9,0.005252,** A high-efficiency filter means a filter that is at least 99.97% efficient against mono-disperesed particles of 0.3 µm (micrometers) in diameter or higher.,1
