In [1]:
import numpy as np
import pandas as pd
from unidecode import unidecode
import re
import string
from tqdm import tqdm
from nltk.corpus import stopwords
import nltk
tqdm.pandas()
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\annap\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
fn_files = ['cf74', 'cf75', 'cf76', 'cf77', 'cf78', 'cf79']
fn_queries = 'cfquery'

# Read Text

In [3]:
def extract_informations(filename):
    data = {}
    id_actual = None
    information = ""

    try:
        with open(filename, "r", encoding='ansi') as file:
            for line in file:
                if line.startswith("PN"):
                    # Se há uma informação anterior, armazena no dicionário
                    if id_actual and information:
                        data[id_actual] = information
                        information = ""

                    # Obtém o ID a partir da segunda palavra
                    id_actual = line.split()[1]
                elif line.startswith(("TI", "MJ", "MN", "AB", "EX")) and id_actual:
                    information += line[3:].strip()
                    while True:
                        next_line = file.readline()
                        if not next_line or re.match(r'\b[A-Z]{2}\s', next_line):
                            break
                        information += " " + next_line.strip()

            # Adiciona a última informação ao dicionário
            if id_actual and information:
                data[id_actual] = information

    except FileNotFoundError:
        print("File not founded.")

    return data

# Dicionário para armazenar os data
data_complete = {}

# Extrai as informações de cada file
for fn in fn_files:
    path_file = f"../data/{fn}"
    data = extract_informations(path_file)

    data_complete.update(data)

data_complete

{'74001': 'Pseudomonas aeruginosa infection in cystic fibrosis.  Occurrence of precipitating antibodies against pseudomonas aeruginosa in relation to the concentration of sixteen serum proteins and the clinical and radiographical status of the lungs.CYSTIC-FIBROSIS: co.  PSEUDOMONAS-AERUGINOSA: im. PSEUDOMONAS-INFECTIONS: co.  RESPIRATORY-TRACT-INFECTIONS: co.The significance of Pseudomonas aeruginosa infection in the respiratory tract of 9 cystic fibrosis patients have been studied by means of immunoelectrophoretical analysis of patients\' sera for the number of precipitins against Pseudomonas aeruginosa and the concentrations of 16 serum proteins.  In addition, the clinical and radiographical status of the lungs have been evaluated using 2 scoring systems.  Precipitins against Pseudomonas aeruginosa were demonstrated in all sera, the maximum number in one serum was 22. The concentrations of 12 of the serum proteins were significantly changed compared with matched control persons.  No

# Read Queries

In [98]:
def read_file_query(file_path):
 
    data = {'QN': [], 'QU': [], 'NR': [], 'RD': []}

    qn_id = None
    qu_texto = None
    nr_numero = None
    rd_lista = []

    with open(file_path, 'r') as arquivo:
        for linha in arquivo:
            if linha.startswith('RD'):
                rd_lista = [int(x) for x in re.findall(r'\d+', linha)]
                while True:
                    try:
                        proxima_linha = next(arquivo)
                        if proxima_linha.startswith('QN'):
                            # Salva os dados acumulados até aqui
                            if qn_id is not None:
                                data['QN'].append(qn_id)
                                data['QU'].append(qu_texto)
                                data['NR'].append(nr_numero)
                                data['RD'].append(rd_lista)
                            # Reinicia as variáveis para o próximo 'QN'
                            qn_id = int(re.search(r'\d+', proxima_linha).group())
                            qu_texto = None
                            nr_numero = None
                            rd_lista = []
                            break
                        rd_lista.extend([int(x) for x in re.findall(r'\d+', proxima_linha)])
                    except StopIteration:
                        break
            elif linha.startswith('QN'):
                qn_id = int(re.search(r'\d+', linha).group())
            elif linha.startswith('QU'):
                qu_texto = linha[3:].strip()
            elif linha.startswith('NR'):
                nr_numero = int(re.search(r'\d+', linha).group())

    # Adiciona os últimos dados, se houver
    if qn_id is not None:
        data['QN'].append(qn_id)
        data['QU'].append(qu_texto)
        data['NR'].append(nr_numero)
        data['RD'].append(rd_lista)

    df = pd.DataFrame(data)
    return df

# Lê o arquivo e cria o DataFrame
df_queries = read_file_query(f'../data/{fn_queries}')

In [94]:
df.head()

Unnamed: 0,QN,QU,NR,RD
0,1,What are the effects of calcium on the physica...,34,"[139, 1222, 151, 2211, 166, 1, 311, 1, 370, 10..."
1,2,Can one distinguish between the effects of muc...,7,"[169, 1000, 434, 1001, 454, 100, 498, 1000, 49..."
2,3,How are salivary glycoproteins from CF patient...,43,"[23, 1000, 40, 10, 139, 2122, 190, 1, 221, 1, ..."
3,4,What is the lipid composition of CF respirator...,9,"[503, 1, 538, 100, 539, 100, 540, 100, 553, 1,..."
4,5,Is CF mucus abnormal?,131,"[23, 2220, 47, 2221, 50, 1, 60, 1, 114, 11, 13..."


# Text preprocessing

In [102]:
def preprocess_english_text(text: str):

    # convert to lower, remove accents and remove pontuactions
    text = text.lower()
    text = unidecode(text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # remove numbers
    text = re.sub(r'\d+', '', text)
    
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

In [103]:
data_preprocessed = data_complete.copy()

for key, value in tqdm(data_complete.items()):

    data_preprocessed[key] = preprocess_english_text(value)

100%|██████████| 1211/1211 [00:00<00:00, 2588.03it/s]


In [104]:
df_queries['QU_preprocessed'] = df_queries['QU'].progress_apply(lambda x: preprocess_english_text(x) )

100%|██████████| 100/100 [00:00<00:00, 2448.37it/s]


In [105]:
df_queries.head()

Unnamed: 0,QN,QU,NR,RD,QU_preprocessed
0,1,What are the effects of calcium on the physica...,34,"[139, 1222, 151, 2211, 166, 1, 311, 1, 370, 10...",effects calcium physical properties mucus
1,2,Can one distinguish between the effects of muc...,7,"[169, 1000, 434, 1001, 454, 100, 498, 1000, 49...",one distinguish effects mucus hypersecretion
2,3,How are salivary glycoproteins from CF patient...,43,"[23, 1000, 40, 10, 139, 2122, 190, 1, 221, 1, ...",salivary glycoproteins cf patients different
3,4,What is the lipid composition of CF respirator...,9,"[503, 1, 538, 100, 539, 100, 540, 100, 553, 1,...",lipid composition cf respiratory secretions
4,5,Is CF mucus abnormal?,131,"[23, 2220, 47, 2221, 50, 1, 60, 1, 114, 11, 13...",cf mucus abnormal


In [56]:
len(df_queries.loc[0]['RD'])

14