### Filtrage des chapitres qui font mention d'un gène mitochondrial 

#### Lecture des gènes mitocarta3

In [1]:
import json

json_path = "../data/pivot_output/mitocarta_pivot_full.json"
with open(json_path, "r", encoding="utf8") as f:
    mitocarta_pivot = json.load(f)

# Récupérer tous les HumanGeneID
mito_symbols = set()
for gene_id, gene_info in mitocarta_pivot.items():
    symbol = gene_info.get("symbol")
    if symbol:
        mito_symbols.add(symbol)

print("Nombre de gènes mitochondriaux :", len(mito_symbols))
print(list(mito_symbols)[:10])  # aperçu des 10 premiers symbols

Nombre de gènes mitochondriaux : 1136
['HSD17B4', 'LDHD', 'NDUFB2', 'HINT1', 'MRPL43', 'MRPL4', 'COX7A1', 'GCAT', 'NUDT5', 'ACAA2']


#### Filtrage des chapitres 

In [2]:
import tarfile

tar_path = "../data/public_db/genereviews/gene_NBK1116.tar.gz"

with tarfile.open(tar_path, "r:gz") as tar:
    # Choisir un fichier à tester (le premier fichier)
    for member in tar.getmembers():
        if member.isfile():
            f = tar.extractfile(member)
            raw = f.read(200)  # lire juste les 200 premiers bytes
            f.close()
            
            print(f"Test encodage pour {member.name}:")
            for enc in ["utf-8", "utf-16", "ISO-8859-1", "cp1252"]:
                try:
                    snippet = raw.decode(enc)
                    print(f"  ✅ {enc} fonctionne : {snippet[:100]!r}")
                except UnicodeDecodeError:
                    print(f"  ❌ {enc} échoue")
            break  # tester juste un fichier


Test encodage pour gene_NBK1116/ibm-Image001.jpg:
  ❌ utf-8 échoue
  ❌ utf-16 échoue
  ✅ ISO-8859-1 fonctionne : 'ÿØÿà\x00\x10JFIF\x00\x01\x01\x01\x00\x96\x00\x96\x00\x00ÿí\x00,Photoshop 3.0\x008BIM\x03í\x00\x00\x00\x00\x00\x10\x00\x96\x00\x00\x00\x01\x00\x01\x00\x96\x00\x00\x00\x01\x00\x01ÿáNÑhttp://ns.adobe.com/xap/1.0/\x00<'
  ✅ cp1252 fonctionne : 'ÿØÿà\x00\x10JFIF\x00\x01\x01\x01\x00–\x00–\x00\x00ÿí\x00,Photoshop 3.0\x008BIM\x03í\x00\x00\x00\x00\x00\x10\x00–\x00\x00\x00\x01\x00\x01\x00–\x00\x00\x00\x01\x00\x01ÿáNÑhttp://ns.adobe.com/xap/1.0/\x00<'


In [4]:
import re

gene_reviews_mito = {}

# Fonction de chunking simple (peut être remplacée par un tokenizer)
def chunk_text(text, chunk_size=300):
    words = text.split()
    chunks = []
    for i in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[i:i+chunk_size]))
    return chunks

with tarfile.open(tar_path, "r:gz") as tar:
    for member in tar.getmembers():
        # On ne prend que les fichiers texte (.txt ou .html)
        if member.isfile() and (member.name.endswith(".txt") or member.name.endswith(".html")):
            f = tar.extractfile(member)
            raw = f.read()
            f.close()
            
            # Décodage robuste
            try:
                content = raw.decode("utf-8")
            except UnicodeDecodeError:
                try:
                    content = raw.decode("ISO-8859-1")
                except UnicodeDecodeError:
                    print(f"Impossible de décoder {member.name}, fichier ignoré")
                    continue
            
            # Filtrer chapitres mitochondriaux
            genes_in_text = [symbol for symbol in mito_symbols if re.search(rf"\b{symbol}\b", content)]
            if not genes_in_text:
                continue  # pas de gènes mitochondriaux, on skip
            
            # Chunking
            chunks = chunk_text(content)
            
            # Création des records pour chaque chunk
            gene_reviews_mito[member.name] = []
            for i, chunk in enumerate(chunks):
                gene_reviews_mito[member.name].append({
                    "chunk_id": f"{member.name}_chunk{i+1}",
                    "content": chunk,
                    "mito_genes": genes_in_text
                })

print("Nombre de chapitres mitochondriaux :", len(gene_reviews_mito))

Nombre de chapitres mitochondriaux : 1


In [5]:
print(gene_reviews_mito)

{'gene_NBK1116/license.txt': [{'chunk_id': 'gene_NBK1116/license.txt_chunk1', 'content': 'Terms of Use for ca/84/gene_NBK1116.tar.gz; GeneReviews&#174;; University of Washington, Seattle; 1993; NBK1116 For each document, read the license and copyright statements in the XML or PDF files before reusing or redistributing any part of the document. The license and copyright statements define what uses of the document are permitted, and apply to all associated files, including images and supplementary material. The terms and conditions of use are not identical for all documents. If there is no license or copyright statement in the XML or PDF files, then the document is in the public domain. No permission is needed to reproduce or distribute public domain content, but the authoring institute or agency must be given appropriate attribution. Contact the publisher if you have any questions about the permissible uses of the documents. For more information, see the Bookshelf Copyright Notice (http