In [197]:
import pandas as pd
import re
import string
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
import spacy
import random
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet
import random

tqdm.pandas()
nlp = spacy.load("en_core_web_sm")

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\annap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\annap\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\annap\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from nltk.stem import SnowballStemmer
SnowballStemmer.languages

('arabic',
 'danish',
 'dutch',
 'english',
 'finnish',
 'french',
 'german',
 'hungarian',
 'italian',
 'norwegian',
 'porter',
 'portuguese',
 'romanian',
 'russian',
 'spanish',
 'swedish')

In [3]:
fn_files = ['cf74', 'cf75', 'cf76', 'cf77', 'cf78', 'cf79']
fn_queries = 'cfquery'

# Read Text

In [4]:
def extract_informations(filename):
    data = {}
    id_actual = None
    information = ""

    try:
        with open(filename, "r", encoding='ansi') as file:
            for line in file:
                if line.startswith("PN"):
                    # Se há uma informação anterior, armazena no dicionário
                    if id_actual and information:
                        data[id_actual] = information
                        information = ""

                    # Obtém o ID a partir da segunda palavra
                    id_actual = line.split()[1]
                elif line.startswith(("TI", "MJ", "MN", "AB", "EX")) and id_actual:
                    information += line[3:].strip()
                    while True:
                        next_line = file.readline()
                        if not next_line or re.match(r'\b[A-Z]{2}\s', next_line):
                            break
                        information += " " + next_line.strip()

            # Adiciona a última informação ao dicionário
            if id_actual and information:
                data[id_actual] = information

    except FileNotFoundError:
        print("File not founded.")

    return data

# Dicionário para armazenar os data
data_complete = {}

# Extrai as informações de cada file
for fn in fn_files:
    path_file = f"../data/{fn}"
    data = extract_informations(path_file)

    data_complete.update(data)

df_data = pd.DataFrame(list(data_complete.items()), columns=['ID', 'TEXT'])

df_data.head()

Unnamed: 0,ID,TEXT
0,74001,Pseudomonas aeruginosa infection in cystic fib...
1,74002,Amylase content of mixed saliva in children.SA...
2,74003,A clinical study of the diagnosis of cystic fi...
3,74004,A methodological study of the diagnosis of cys...
4,74005,Proteolytic activity in duodenal juice in infa...


# Read Queries

In [5]:
def read_file_query(file_path):
 
    data = {'QN': [], 'QU': [], 'NR': [], 'RD': []}

    qn_id = None
    qu_texto = None
    nr_numero = None
    rd_lista = []

    with open(file_path, 'r') as arquivo:
        for linha in arquivo:
            if linha.startswith('RD'):
                rd_lista = [int(x) for x in re.findall(r'\d+', linha)]
                while True:
                    try:
                        proxima_linha = next(arquivo)
                        if proxima_linha.startswith('QN'):
                            # Salva os dados acumulados até aqui
                            if qn_id is not None:
                                data['QN'].append(qn_id)
                                data['QU'].append(qu_texto)
                                data['NR'].append(nr_numero)
                                data['RD'].append(rd_lista)
                            # Reinicia as variáveis para o próximo 'QN'
                            qn_id = int(re.search(r'\d+', proxima_linha).group())
                            qu_texto = None
                            nr_numero = None
                            rd_lista = []
                            break
                        rd_lista.extend([int(x) for x in re.findall(r'\d+', proxima_linha)])
                    except StopIteration:
                        break
            elif linha.startswith('QN'):
                qn_id = int(re.search(r'\d+', linha).group())
            elif linha.startswith('QU'):
                qu_texto = linha[3:].strip()
            elif linha.startswith('NR'):
                nr_numero = int(re.search(r'\d+', linha).group())

    # Adiciona os últimos dados, se houver
    if qn_id is not None:
        data['QN'].append(qn_id)
        data['QU'].append(qu_texto)
        data['NR'].append(nr_numero)
        data['RD'].append(rd_lista)

    df = pd.DataFrame(data)
    return df

# Lê o arquivo e cria o DataFrame
df_query = read_file_query(f'../data/{fn_queries}')
df_query.head()

Unnamed: 0,QN,QU,NR,RD
0,1,What are the effects of calcium on the physica...,34,"[139, 1222, 151, 2211, 166, 1, 311, 1, 370, 10..."
1,2,Can one distinguish between the effects of muc...,7,"[169, 1000, 434, 1001, 454, 100, 498, 1000, 49..."
2,3,How are salivary glycoproteins from CF patient...,43,"[23, 1000, 40, 10, 139, 2122, 190, 1, 221, 1, ..."
3,4,What is the lipid composition of CF respirator...,9,"[503, 1, 538, 100, 539, 100, 540, 100, 553, 1,..."
4,5,Is CF mucus abnormal?,131,"[23, 2220, 47, 2221, 50, 1, 60, 1, 114, 11, 13..."


# Text preprocessing

In [213]:
pp_config = { 'convert': 'lower', # lower or upper 
              'reduce': 'steamming', # stemming or lemming  lemming
              'stop_words': True,  # remove if true else ~remove
              'punctuation':True, # remove if true, else ~remove
              'number': True,
             }

def lemmatize_text_spacy(text):
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc])

def stem_text(text):
    stemmer = PorterStemmer()
    return ' '.join([stemmer.stem(word) for word in word_tokenize(text)])

def remove_stopwords_from_text(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return ' '.join(filtered_words)

def remove_punctuation_nltk(text):
    
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    tokens_without_punctuation = [word for word in tokens if word not in string.punctuation]
    
    return ' '.join(tokens_without_punctuation)

def remove_number(text):
    return ' '.join(re.sub(r'\w*\d\w*', '', word) for word in word_tokenize(text))

def preprocess_english_text(text, conf:dict, n_exemple:int):
    
    pp_stats = {}

    if conf['convert'] == 'lower':
        print(f"Before lower ex.: {text[n_exemple].split('.')[0]}")
        text = text.str.lower()
        pp_count_words = list(map(lambda string: len(string.split()), text))
        pp_stats['lower'] = pp_count_words
        print(f"-> After lower ex.:  {text[n_exemple].split('.')[0]}")

    if conf['convert'] == 'upper':
        print(f"Before upper ex.: {text[n_exemple].split('.')[0]}")
        text = text.str.upper()
        pp_count_words = list(map(lambda string: len(string.split()), text))
        pp_stats['upper'] = pp_count_words
        print(f"-> After upper ex.:  {text[n_exemple].split('.')[0]}")

    if conf['reduce'] == 'lemming':
        print(f"Before lemmatizer ex.: {text[n_exemple].split('.')[0]}")
        processed = text.apply(lemmatize_text_spacy)
        pp_count_words = list(map(lambda string: len(string.split()), text))
        pp_stats['lemm'] = pp_count_words
        print(f"-> After lemmatizer ex.: {processed[n_exemple].split('.')[0]}")

    if conf['reduce'] == 'steamming':
        print(f"Before stemming ex.: {text.iloc[n_exemple].split('.')[0]}")
        processed = text.apply(stem_text)
        pp_count_words = processed.apply(lambda x: len(x.split()))
        pp_stats['stem'] = pp_count_words
        print(f"-> After stemming ex.: {processed.iloc[n_exemple].split('.')[0]}")

    if conf['stop_words'] == True:        
        print(f"Before stopword ex.: {processed[n_exemple].split('.')[0]}")
        processed = processed.apply(remove_stopwords_from_text)
        pp_count_words = processed.apply(lambda x: len(x.split()))
        pp_stats['stop_words'] = pp_count_words
        print(f"-> After stopword ex.: {processed.iloc[n_exemple].split('.')[0]}")

    if conf['punctuation'] == True:
        print(f"Before remove punctuation ex.:  {processed[n_exemple].split('.')[0]}")
        processed = processed.apply(remove_punctuation_nltk)
        pp_count_words = processed.apply(lambda x: len(x.split()))
        pp_stats['punctuation'] = pp_count_words
        print(f"-> After remove ponctuation ex.: {processed.iloc[n_exemple].split('.')[0]}")
    
    if conf['number'] == True:

        print(f"Before remove numbers ex.:  {processed[n_exemple].split('.')[0]}")
        processed = [remove_number(text) for text in processed]
        pp_count_words = [len(text.split()) for text in processed]
        pp_stats['number'] = pp_count_words
        print(f"-> After remove numbers ex.: {processed[n_exemple].split('.')[0]}")

    return processed, pp_stats

In [214]:
print(f"{'-'*50}Data: Document{'-'*50}")
df_data_pp = df_data.copy()
df_data_pp['pp'], pp_count_d= preprocess_english_text(df_data['TEXT'], pp_config, n_exemple=0)

print(f"{'-'*50}Data: Queries{'-'*50}")
df_query_pp = df_query.copy()
df_query_pp['pp'], pp_count_q= preprocess_english_text(df_query['QU'], pp_config, n_exemple=0)

--------------------------------------------------Data: Document--------------------------------------------------
Before lower ex.: Pseudomonas aeruginosa infection in cystic fibrosis
-> After lower ex.:  pseudomonas aeruginosa infection in cystic fibrosis
Before stemming ex.: pseudomonas aeruginosa infection in cystic fibrosis
-> After stemming ex.: pseudomona aeruginosa infect in cystic fibrosi 
Before stopword ex.: pseudomona aeruginosa infect in cystic fibrosi 
-> After stopword ex.: pseudomona aeruginosa infect cystic fibrosi 
Before remove punctuation ex.:  pseudomona aeruginosa infect cystic fibrosi 
-> After remove ponctuation ex.: pseudomona aeruginosa infect cystic fibrosi occurr precipit antibodi pseudomona aeruginosa relat concentr sixteen serum protein clinic radiograph statu lungscysticfibrosi co pseudomonasaeruginosa im pseudomonasinfect co respiratorytractinfect coth signific pseudomona aeruginosa infect respiratori tract 9 cystic fibrosi patient studi mean immunoelect

In [215]:
def get_vocabulary(data_text, query_text):
    
    data_complete = data_text.to_list()
    data_complete.extend(query_text.to_list())

    words = []

    for phrase in data_complete:
        for word in phrase.split():
            words.append(word)


    print("Número total de palavras:", len(set(words)))
    return set(words)

def get_synonyms(word):

    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

def choose_reference_synonym(synonyms):
    return random.choice(synonyms) if synonyms else None

def replace_synonyms_in_text(text, synonyms_dict):

    if isinstance(text, str):
        words = text.split()
        replaced_words = [synonyms_dict.get(word, word) for word in words]
        return ' '.join(replaced_words)
    else:
        return text

def reduce_vocabulary(data, query, col_text):
 
    vocabulary = get_vocabulary(data[col_text], query[col_text])
    print("Len vocabulary initial:", len(vocabulary))
    
    synonyms_uniques = {}  

    for word in vocabulary:
        synonyms = get_synonyms(word)
        word_reference = choose_reference_synonym(synonyms)
        if word_reference:
            for synonym in synonyms:
                synonyms_uniques[synonym] = word_reference

    print(f"Before synonyms ex.: {data.loc[0, col_text]}")

    # Aplicando a substituição de sinônimos
    r_data = data.copy()
    r_query = query.copy()
    r_data[col_text] = data[col_text].apply(lambda text: replace_synonyms_in_text(text, synonyms_uniques))
    r_query[col_text] = query[col_text].apply(lambda text: replace_synonyms_in_text(text, synonyms_uniques))

    print(f"After synonyms ex.: {data.loc[0, col_text]}")

    vocabulary = get_vocabulary(r_data[col_text], r_query[col_text])
    print("Len vocabulary final:", len(vocabulary))
   
    return r_data, r_query

In [216]:
# Involve both datas
df_dr, df_qr = reduce_vocabulary(df_data_pp, df_query_pp, 'pp')

# Involve both data, but random choose the synonims
#data, query = reduce_vocabulary(df_data, df_query, ['TEXT', 'QU'])

Número total de palavras: 9064
Len vocabulary initial: 9064
Before synonyms ex.: pseudomona aeruginosa infect cystic fibrosi occurr precipit antibodi pseudomona aeruginosa relat concentr sixteen serum protein clinic radiograph statu lungscysticfibrosi co pseudomonasaeruginosa im pseudomonasinfect co respiratorytractinfect coth signific pseudomona aeruginosa infect respiratori tract  cystic fibrosi patient studi mean immunoelectrophoret analysi patient sera number precipitin pseudomona aeruginosa concentr  serum protein addit clinic radiograph statu lung evalu use  score system precipitin pseudomona aeruginosa demonstr sera maximum number one serum wa  concentr  serum protein significantli chang compar match control person notabl igg iga elev acut phase protein chang latter suggest activ tissu damag concentr  acut phase protein notabl haptoglobin correl number precipitin suggest respiratori tract infect patient mani precipitin accompani tissu damag infect patient precipitin result indic

In [217]:
df_data_pp.head()

Unnamed: 0,ID,TEXT,pp
0,74001,Pseudomonas aeruginosa infection in cystic fib...,pseudomona aeruginosa infect cystic fibrosi oc...
1,74002,Amylase content of mixed saliva in children.SA...,amylas content mix saliva childrensaliva en am...
2,74003,A clinical study of the diagnosis of cystic fi...,clinic studi diagnosi cystic fibrosi instrumen...
3,74004,A methodological study of the diagnosis of cys...,methodolog studi diagnosi cystic fibrosi instr...
4,74005,Proteolytic activity in duodenal juice in infa...,proteolyt activ duoden juic infant children ad...


In [218]:
df_dr.head()

Unnamed: 0,ID,TEXT,pp
0,74001,Pseudomonas aeruginosa infection in cystic fib...,pseudomona aeruginosa infect cystic fibrosi oc...
1,74002,Amylase content of mixed saliva in children.SA...,amylas case mix tongue childrensaliva nut amyl...
2,74003,A clinical study of the diagnosis of cystic fi...,clinic studi diagnosi cystic fibrosi tool_arou...
3,74004,A methodological study of the diagnosis of cys...,methodolog studi diagnosi cystic fibrosi tool_...
4,74005,Proteolytic activity in duodenal juice in infa...,proteolyt activ duoden juic infant children ad...


In [88]:
len(result)

958309

In [18]:
df_data.head()

Unnamed: 0,ID,TEXT
0,74001,Pseudomonas aeruginosa infection in cystic fib...
1,74002,Amylase content of mixed saliva in children.SA...
2,74003,A clinical study of the diagnosis of cystic fi...
3,74004,A methodological study of the diagnosis of cys...
4,74005,Proteolytic activity in duodenal juice in infa...


In [37]:
def tf_idf_technicals(data_documents:dict, data_queries:pd.DataFrame, type:str):

    if type == 'tf':
        vectorizer_tf = TfidfVectorizer(use_idf=False, norm='l1') 
    elif type == 'idf':
        vectorizer_idf = TfidfVectorizer(use_idf=True, smooth_idf=False, norm='l2')  # norm='l2' para obter o IDF
    elif type == 'tf-idf':
        # Inicialize o vetorizador TF-IDF
        vectorizer = TfidfVectorizer()

    document_matrix = vectorizer.fit_transform(data_documents)
    query_vector = vectorizer.transform(data_queries)

    # Obtenha os nomes das features (words)
    words = vectorizer.get_feature_names_out()

    # Crie DataFrames para os vetores TF-IDF
    df_documents = pd.DataFrame(data=document_matrix.toarray(), columns=words)
    df_queries = pd.DataFrame(data=query_vector.toarray(), columns=words)

    return df_documents, df_queries

In [38]:
df_documents, df_queries =  tf_idf_technicals(data_preprocessed.values(), df_queries['QU_preprocessed'], 'tf-idf')

In [40]:
df_documents.head()

Unnamed: 0,aa,aat,aathe,aathis,ab,abalthough,aban,abandon,abandoned,abdomen,...,zeta,zinc,zn,zona,zone,zones,zymogen,zymogengranule,zymograms,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
df_queries.head()

Unnamed: 0,aa,aat,aathe,aathis,ab,abalthough,aban,abandon,abandoned,abdomen,...,zeta,zinc,zn,zona,zone,zones,zymogen,zymogengranule,zymograms,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
import pandas as pd

# Exemplo de documentos e consultas
documentos = ["Este é o primeiro documento.", "Este documento é o segundo documento.", "E este é o terceiro documento."]
consultas = ["Esta é uma consulta.", "Outra consulta aqui."]

# Inicialize o vetorizador TF-IDF
vectorizer = TfidfVectorizer()

# Ajuste o vetorizador aos documentos
tfidf_matrix = vectorizer.fit_transform(documentos)

# Transforme as consultas em vetores TF-IDF
vetores_consultas = vectorizer.transform(consultas)

# Obtenha os nomes das features (palavras)
palavras = vectorizer.get_feature_names_out()

# Crie DataFrames para os vetores TF-IDF
df_documentos = pd.DataFrame(data=tfidf_matrix.toarray(), columns=palavras)
df_consultas = pd.DataFrame(data=vetores_consultas.toarray(), columns=palavras)

print("Vetores TF-IDF dos documentos:")
print(df_documentos)

print("\nVetores TF-IDF das consultas:")
print(df_consultas)


Vetores TF-IDF dos documentos:
   documento      este  primeiro   segundo  terceiro
0   0.453295  0.453295  0.767495  0.000000  0.000000
1   0.713070  0.356535  0.000000  0.603667  0.000000
2   0.453295  0.453295  0.000000  0.000000  0.767495

Vetores TF-IDF das consultas:
   documento  este  primeiro  segundo  terceiro
0        0.0   0.0       0.0      0.0       0.0
1        0.0   0.0       0.0      0.0       0.0


In [10]:
type(vectorizer)

sklearn.feature_extraction.text.TfidfVectorizer