In [2]:
import gzip
import io
import tarfile
from tarfile import TarInfo
import threading

import sys
import import_ipynb

from typing import TextIO, BinaryIO
from typing import List

sys.path.append('../')

import pre_processing.TextProcessor as text_proc

importing Jupyter notebook from C:\Users\gabri\Documents\GitHub\MultimediaProject\pre_processing\..\pre_processing\TextProcessor.ipynb


In [19]:
def parse_compressed_tsv_line_by_line(compressed_file_path):
    try:
        with open(compressed_file_path, 'rb') as f:
            text_processor = TextProcessor()
            with gzip.GzipFile(fileobj=f) as gz: 
                buffer = io.TextIOWrapper(gz, encoding='utf-8')

                # C'è bisogno di leggere la prima riga a parte perchè essa inizia con i seguenti metadati "# collection.tsv0000777000175000017502663675055413400073633015704 0ustar  spacemanidolspacemanidol0 "
                # Quindi prima di entrare nel ciclo che legge tutte le righe, elimino questo prefisso
                first_line = next(buffer)  # Leggi la prima riga
                cleaned_first_line = ' '.join(first_line.split()[3:])  # Rimuovi la parte iniziale indesiderata
                text_processor.process_text(text)(0, cleaned_first_line)

                for line in buffer:
                    pid, text = line.strip().split('\t')
                    text_processor.process_text(text)(pid, text)
                    print(pid,text)

    except FileNotFoundError:
        print(f"File {compressed_file_path} non trovato.")
    except Exception as e:
        print(e)
        print(f"Si è verificato un errore durante l'analisi del file {compressed_file_path}: {e}")

# Esempio di utilizzo
#parse_compressed_tsv_line_by_line('collection.tar.gz')


In [33]:
def decompress_strings(compressed_file_path):
    try:
        with open(compressed_file_path, 'rb') as f:
            text_processor = TextProcessor()
            with gzip.GzipFile(fileobj=f) as gz: 
                buffer = io.TextIOWrapper(gz, encoding='utf-8')
                print(buffer)
                # C'è bisogno di leggere la prima riga a parte perchè essa inizia con i seguenti metadati "# collection.tsv0000777000175000017502663675055413400073633015704 0ustar  spacemanidolspacemanidol0 "
                # Quindi prima di entrare nel ciclo che legge tutte le righe, elimino questo prefisso
                first_line = next(buffer)  # Leggi la prima riga
                cleaned_first_line = ' '.join(first_line.split()[3:])  # Rimuovi la parte iniziale indesiderata
                text_processor.process_text(text)(0, cleaned_first_line)

                for line in buffer:
                    pid, text = line.strip().split('\t')
                    text_processor.process_text(text)(pid, text)
                    #print(pid,text)

    except FileNotFoundError:
        print(f"File {compressed_file_path} non trovato.")
    except Exception as e:
        print(e)
        print(f"Si è verificato un errore durante l'analisi del file {compressed_file_path}: {e}")
    
    return decompressed_strings

In [None]:
# Specifica il percorso del file TSV
file_path = "collection_cleaned.tsv"
with open(file_path, 'rb') as f:
    count = 0
    for line in f:
        print(line)
        count = count + 1
        if count == 5:
            break

In [20]:
parse_compressed_tsv_line_by_line("C:/Users/Davide/IR/collection.tar.gz")

'module' object is not callable
Si è verificato un errore durante l'analisi del file C:/Users/Davide/IR/collection.tar.gz: 'module' object is not callable


In [4]:
class Collection_Reader:
    collection_file: tarfile.TarFile
        
    MEMORY_BASED_BUFFER_SIZE:int
    memory_based_buffer:List[str]
        
    text_processor:text_proc.TextProcessor
    file_position: int
    file_members:List[TarInfo]
    
    def __init__(self,path_collection_file:str,memory_based_buffer_size:int):
        
        print("Init Collection Reader")
        
        self.collection_file = tarfile.open(path_collection_file, 'r:gz')
        self.MEMORY_BASED_BUFFER_SIZE = memory_based_buffer_size
        self.text_processor = text_proc.TextProcessor(True, True)
        self.file_position = 0
        self.memory_based_buffer = []
        self.file_member = self.collection_file.getmembers()[0]

        self.lock = threading.Lock()

    def read_collection(self):
        print("Reading collection...")        

        # Leggi MEMORY_BASED_BUFFER_SIZE righe in un colpo dal membro corrente
        with self.collection_file.extractfile(self.file_member) as file_handle:
            file_handle.seek(self.file_position)
            
            for _ in range(self.MEMORY_BASED_BUFFER_SIZE):
                line = file_handle.readline().decode('utf-8')
    
                # end of file
                if not line:
                    break 
                    
                line_processed = self.text_processor.process_text(line)
                self.memory_based_buffer.append(line_processed)

                self.file_position = file_handle.tell()

    def read_collection_threaded(self):
       with self.lock:
           self.read_collection()
    
    def get_documents(self):
        """ This is the only function to be called from outside to have a list of documents ready to be processed.
        
        Returns:
           the memory_based_buffer list of strings
            
        """
        self.memory_based_buffer.clear()

        # Start thread to read next block
        thread = threading.Thread(target=self.read_collection_threaded)
        thread.start()

        # Waiting for the end of reading
        thread.join()
        
        return self.memory_based_buffer
    
    
    
    def close_file_collection(self):
        collection_file.close()
         
#Qua potrebbe essere utile pensare di farlo con 2 thread.
#Quando viene chiamata la funzione get_documents, si restituisce al chiamante il blocco di documenti richiesti
#e nel mentre si potrebbe attivare un thread che parallelamente carica il successivo blocco di documenti e lo
#salva localmente nella struttura dati, quindi alla prossima chiamata get_documents ho già pronto il blocco e non
# devo stare ad aspettare tutta la lettura.

In [11]:
path = "C:/Users/gabri/Desktop/Materie magistrale/Multimedia/progetto/collection.tar.gz"
reader = Collection_Reader(path, 5)
# print("Primo blocco: \n")
block_0 = reader.get_documents()

assert len(block_0) == reader.MEMORY_BASED_BUFFER_SIZE
assert block_0[2] == '2 essay manhattan project manhattan project manhattan project see make atom bomb possibl success project would forev chang world forev make known someth power manmad'
# print(block_0)

# print("Secondo blocco: \n")
block_2 = reader.get_documents()

for i, document in enumerate(block_2, start=5):
    expected_prefix = str(i)
    assert document.startswith(expected_prefix)
    
# print(block_2)

Init Collection Reader
Reading collection...
Reading collection...
5


TypeError: startswith() takes at least 1 argument (0 given)