<a href="https://colab.research.google.com/github/Dallah2002/a8alatr0clown/blob/main/Acquisition_Automatique_de_Donn%C3%A9es_Brevet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><b>Acquisition automatique de données : Brevets</b></h1>

L'objectif principal de ce projet est d'automatiser la collecte et l'analyse de brevets concernant la 6G et la sécurité, en se focalisant sur des brevets trouvés sur Google Patents, d'enregistrer les données extraites dans un fichier .csv, puis de créer une application permettant de visualiser les données extraites des brevets.



*   <h2>Recueil des brevets</h2>



In [None]:
pip install aiohttp aiofiles

Collecting aiofiles
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-24.1.0


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import aiohttp
import asyncio
from aiofiles import open as aio_open
import time


# --- CONSTANTS ---
# URL template of a Google Patents page; "{}" is filled in with the patent identifier.
URL_BASE = "https://patents.google.com/patent/US{}"

# File paths used to keep track of the patents
# (URLs found, CSV export, URLs already processed, last patent number reached).
fichier_brevets = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_urls.txt"
fichier_csv = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_6G.csv"
fichier_brevets_traite = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_traite.txt"
fichier_dernier_brevet = "/content/drive/MyDrive/acquisitionbrevet/brevets/dernier_brevet.txt"

# --- 6G-related keywords ---
# A patent page is kept when it mentions at least one of these terms.
MOTS_CLES_6G = [
    "terahertz communication",
    "THz",
    "massive MIMO",
    "ultra-massive MIMO",
    "reconfigurable intelligent surfaces",
    "RIS",
    "artificial intelligence",
    "AI",
    "machine learning",
    "blockchain",
    "distributed ledger technology",
    "DLT",
    "energy harvesting",
    "quantum communication",
    "non-terrestrial networks",
    "NTN",
    "visible light communication",
    "VLC",
    "orbital angular momentum",
    "OAM",
    "6G frequency bands",
    "digital twins",
    "photonic communication",
    "holographic communication",
    "self-sustaining networks",
    "enhanced security and privacy",
    "ultra-low latency",
    "ultra-high speed",
    "terabit-per-second",
    "advanced beamforming",
    "RIS channel modeling",
    "RIS-network integration",
    "Privacy",
    "Data",
    "Security",
    "Privacy Policies",
    "Device Sensors",
    "Network Slicing",
    "Edge Computing",
    "Ultra-dense networks",
    "Zero-touch networks",
    "Massive IoT",
    "Autonomous Vehicles",
    "Smart Cities",
    "Artificial General Intelligence",
    "Self-Organizing Networks",
    "Blockchain for Network Management",
    "Direct Device-to-Device Communication",
    "Cognitive Radio",
    "Massive Cloud-RAN",
    "Ambient Intelligence",
    "Heterogeneous Networks",
    "Molecular Communication"
]


# --- VALIDATION FUNCTION (ASYNCHRONOUS) ---
async def est_brevet_valide(session, numero_brevet):
    """Return the patent URL if its page mentions a 6G keyword, else ``None``.

    Args:
        session: an open ``aiohttp.ClientSession`` used to fetch the page.
        numero_brevet: patent identifier inserted into ``URL_BASE``.

    Returns:
        The page URL when the request answers 200 and at least one entry of
        ``MOTS_CLES_6G`` is found in the page, otherwise ``None`` (non-200
        status, network error, or no keyword).

    Matching is case-insensitive on both sides. Previously only the page was
    lower-cased, so every keyword containing a capital letter ("THz",
    "Security", "Massive IoT", ...) could never match.
    NOTE(review): generic entries such as "Data" are now effective and will
    match many pages — the keyword list may need tightening.
    """
    import re  # local import: the cell's import list does not include ``re``

    url = URL_BASE.format(numero_brevet)
    try:
        async with session.get(url, timeout=10) as reponse:
            if reponse.status != 200:
                return None
            # Lower-case once instead of once per keyword.
            contenu = (await reponse.text()).lower()
    except Exception as e:
        # repr() keeps the exception type visible: timeouts have an empty str().
        print(f"Erreur lors de la vérification du brevet {numero_brevet}: {e!r}")
        return None

    for mot_cle in MOTS_CLES_6G:
        aiguille = mot_cle.lower()
        if len(aiguille) <= 3:
            # Short acronyms ("AI", "RIS", ...) must stand alone as a word,
            # otherwise "ai" would match inside "contain", "ris" inside
            # "comprising", and so on.
            motif = rf"(?<!\w){re.escape(aiguille)}(?!\w)"
            if re.search(motif, contenu):
                return url
        elif aiguille in contenu:
            return url
    return None

# --- LOAD THE URLS ALREADY RECORDED ---
async def charger_urls_existantes():
    """Return the stripped lines of ``fichier_brevets_traite`` as a set.

    An empty set is returned when the file does not exist.
    """
    try:
        async with aio_open(fichier_brevets_traite, "r") as fichier:
            lignes = await fichier.readlines()
    except FileNotFoundError:
        return set()
    return set(map(str.strip, lignes))

# --- FIND THE LAST PATENT NUMBER VISITED ---
async def obtenir_dernier_brevet():
    """Return the number stored in ``fichier_dernier_brevet``.

    Falls back to 10318759 when the file cannot be read or when its content
    is not made of digits only.
    """
    numero_par_defaut = 10318759
    try:
        async with aio_open(fichier_dernier_brevet, "r") as fichier:
            texte = (await fichier.read()).strip()
            if texte.isdigit():
                return int(texte)
            return numero_par_defaut
    except Exception:
        return numero_par_defaut

# --- UPDATE THE TRACKING FILES ---
async def mettre_a_jour_fichiers(nouveaux_brevets, dernier_numero):
    """Persist the outcome of one acquisition run.

    Each URL of ``nouveaux_brevets`` is appended (one per line) to both
    ``fichier_brevets`` and ``fichier_brevets_traite``; then
    ``fichier_dernier_brevet`` is overwritten with ``dernier_numero``.
    """
    # Same append logic for the two URL lists, in the original order.
    for chemin in (fichier_brevets, fichier_brevets_traite):
        async with aio_open(chemin, "a") as sortie:
            await sortie.writelines(f"{url}\n" for url in nouveaux_brevets)

    async with aio_open(fichier_dernier_brevet, "w") as sortie:
        await sortie.write(str(dernier_numero))

# --- ASYNCHRONOUS PATENT ACQUISITION ---
async def acquisition_brevets(nombre_a_verifier):
    """Check a range of consecutive patent numbers and record the 6G-related ones.

    Starting from the number returned by ``obtenir_dernier_brevet()``,
    ``nombre_a_verifier`` consecutive numbers (suffixed with "B2") are checked
    concurrently with ``est_brevet_valide``. Matching URLs are written through
    ``mettre_a_jour_fichiers`` and the start number is advanced by
    ``nombre_a_verifier``.

    Args:
        nombre_a_verifier: how many consecutive patent numbers to check.
    """
    debut_temps = time.time()

    urls_traitees = await charger_urls_existantes()
    dernier_brevet = await obtenir_dernier_brevet()

    print(f"Démarrage de la recherche à partir du numéro : {dernier_brevet}")

    # Index the already-processed URLs by their last path segment (e.g.
    # "US10319064B2") once, so each lookup is an exact O(1) membership test
    # instead of a substring scan over every stored URL (which could also
    # match a number that is merely the suffix of a longer one).
    identifiants_traites = {url.rsplit("/", 1)[-1] for url in urls_traitees}

    numeros_a_verifier = []
    for numero_courant in range(dernier_brevet, dernier_brevet + nombre_a_verifier):
        numero_brevet = f"{numero_courant}B2"
        if f"US{numero_brevet}" in identifiants_traites:
            print(f"🔁 Déjà traité pour US{numero_brevet}")
            continue
        numeros_a_verifier.append(numero_brevet)

    async with aiohttp.ClientSession() as session:
        resultats = await asyncio.gather(
            *(est_brevet_valide(session, numero) for numero in numeros_a_verifier)
        )

    # est_brevet_valide returns None for pages that were rejected or failed.
    nouveaux_brevets = [url for url in resultats if url]
    for url in nouveaux_brevets:
        print(f"✅ Nouveau brevet trouvé : {url}")

    # Update the tracking files
    await mettre_a_jour_fichiers(nouveaux_brevets, dernier_brevet + nombre_a_verifier)

    # Run summary
    fin_temps = time.time()
    print(f"🔍 Nombre total de brevets trouvés : {len(nouveaux_brevets)}")
    print(f"⏱️ Temps total d'exécution : {fin_temps - debut_temps:.2f} secondes")
    print("🚀 Récupération des brevets terminée. Fichiers mis à jour.")

# --- RUN THE SCRIPT ---
# Top-level ``await`` as used in a notebook cell; checks 100 patent numbers per run.
await acquisition_brevets(nombre_a_verifier=100)


Démarrage de la recherche à partir du numéro : 10319059
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319064B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319066B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319069B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319070B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319073B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319075B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319076B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319080B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319086B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319092B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319093B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US10319095B2
✅ Nouveau brevet trouvé : https://patents.google.com/patent/US103190






*    <h2>Extraction des données des brevets</h2>

In [None]:
import aiofiles
import pandas as pd
from bs4 import BeautifulSoup


# --- CONSTANTS ---
BASE_URL = "https://patents.google.com"
fichier_brevets = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_urls.txt"
fichier_csv = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_6G.csv"

# --- COLUMNS, IN THE REQUESTED ORDER ---
COLUMNS = [
    "Lien", "Numéro Brevet", "Titre", "Date de publication", "Mots-clés",
    "Description", "Domaine Technologique", "Inventeurs", "Titulaire du brevet",
    "Statut du brevet"
]

# --- Stopwords removed when building the keyword column ---
STOPWORDS = {"a", "and", "that", "to", "be", "in", "the", "of","an","one","present","some","for","are","is","with","this","provided","herein","or","more","each","includes","include"}

# --- Load the patents already processed ---
def load_processed_patents():
    """Return the distinct non-null "Numéro Brevet" values stored in the CSV.

    An empty set is returned when ``fichier_csv`` does not exist yet.
    """
    try:
        table = pd.read_csv(fichier_csv)
    except FileNotFoundError:
        return set()
    numeros = table["Numéro Brevet"].dropna()
    return set(numeros.unique())


# --- Technology field detection ---
# Keywords per field. Fields are tried in this order and the first match wins.
# Built once at import time instead of on every call.
_TECHNOLOGY_KEYWORDS = {
    "Télécommunications": [
        "5G", "6G", "network", "antenna", "MIMO", "massive MIMO", "ultra-massive MIMO",
        "signal", "terahertz communication", "THz", "6G frequency bands", "advanced beamforming",
        "RIS", "RIS channel modeling", "RIS-network integration", "non-terrestrial networks", "NTN",
        "visible light communication", "VLC", "orbital angular momentum", "OAM", "ultra-high speed",
        "terabit-per-second", "network slicing", "ultra-dense networks", "direct device-to-device communication",
        "heterogeneous networks", "massive Cloud-RAN"
    ],
    "Intelligence Artificielle": [
        "artificial intelligence", "AI", "machine learning", "deep learning", "neural network",
        "artificial general intelligence", "ambient intelligence", "cognitive radio", "self-organizing networks",
        "zero-touch networks"
    ],
    "Sécurité": [
        "cryptography", "authentication", "security", "hacking", "protection", "enhanced security and privacy",
        "privacy", "privacy policies", "data", "blockchain", "distributed ledger technology", "DLT",
        "blockchain for network management"
    ],
    "Énergie": [
        "battery", "energy", "recharge", "autonomy", "energy harvesting", "self-sustaining networks"
    ],
    "Quantique": [
        "quantum", "qubit", "quantum communication", "molecular communication"
    ],
    "IoT et Réseaux": [
        "massive IoT", "device sensors", "smart cities", "autonomous vehicles", "edge computing"
    ],
    "Communication Avancée": [
        "photonic communication", "holographic communication", "digital twins"
    ],
}

# Keywords of at most this many characters are treated as acronyms.
_ACRONYM_MAX_LENGTH = 3


def _keyword_in_text(keyword, lowered_text):
    """Tell whether *keyword* occurs in *lowered_text* (already lower-cased).

    Acronym-sized keywords must appear as a standalone word; longer keywords
    keep plain substring matching so plurals such as "networks" still match.
    """
    import re  # local import: the cell's import list does not include ``re``

    needle = keyword.lower()
    if len(needle) <= _ACRONYM_MAX_LENGTH:
        pattern = rf"(?<!\w){re.escape(needle)}(?!\w)"
        return re.search(pattern, lowered_text) is not None
    return needle in lowered_text


def detect_technology_field(title, abstract):
    """Classify a patent into a technology field from its title and abstract.

    Args:
        title: patent title.
        abstract: patent abstract / description.

    Returns:
        The name of the first field (in declaration order) one of whose
        keywords occurs in the title or abstract, case-insensitively, or
        "Autre" when nothing matches.

    Short acronyms ("AI", "RIS", "OAM", ...) are matched as whole words only:
    as raw substrings they matched inside ordinary words ("ris" in
    "comprising", "ai" in "contain"), which sent almost every patent to the
    first field. Title and abstract are joined with a space so that the last
    word of one cannot fuse with the first word of the other.
    """
    text = f"{title} {abstract}".lower()
    for field, words in _TECHNOLOGY_KEYWORDS.items():
        if any(_keyword_in_text(word, text) for word in words):
            return field
    return "Autre"


# --- ASYNCHRONOUS PATENT EXTRACTION ---
def _meta_content(soup, name, default="Non trouvé"):
    """Return the ``content`` of ``<meta name=name>``, or *default* if the tag or attribute is absent."""
    tag = soup.find("meta", {"name": name})
    if tag is None:
        return default
    return tag.get("content", default)


def _joined_itemprop_text(soup, itemprop, default="Non trouvé"):
    """Return the stripped texts of every tag carrying *itemprop*, comma-joined, or *default* if none."""
    texts = [tag.text.strip() for tag in soup.find_all(itemprop=itemprop)]
    return ", ".join(texts) or default


async def fetch_patent_details(session, url):
    """Fetch a Google Patents page and extract its main fields.

    Args:
        session: an open ``aiohttp.ClientSession``.
        url: address of the patent page; its last path segment is used as the
            patent number.

    Returns:
        A dict whose keys are the CSV column names ("Lien", "Numéro Brevet",
        "Titre", ...), or ``None`` when the page does not answer 200 or when
        any error occurs. Both failure cases are reported on stdout.
    """
    try:
        async with session.get(url, timeout=10) as response:
            if response.status != 200:
                # Previously dropped silently, which hid missing patents.
                print(f"⚠️ Statut HTTP {response.status} pour {url}")
                return None
            content = await response.text()

        soup = BeautifulSoup(content, "html.parser")

        # Title, publication date and abstract come from Dublin Core <meta> tags.
        title = _meta_content(soup, "DC.title").strip()
        patent_number = url.split("/")[-1]
        pub_date = _meta_content(soup, "DC.date")
        abstract = _meta_content(soup, "DC.description")

        # Inventors and current assignee(s)
        inventors = _joined_itemprop_text(soup, "inventor")
        assignees = _joined_itemprop_text(soup, "assigneeCurrent")

        # Technology field detection
        domain = detect_technology_field(title, abstract)

        # Legal status via itemprop "legalStatusIfi"
        status_tag = soup.find(itemprop="legalStatusIfi")
        status = status_tag.text.strip() if status_tag else "Inconnu"

        # Keywords: the first 10 words of the abstract, minus stopwords
        words = [word for word in abstract.split()[:10] if word.lower() not in STOPWORDS]
        keywords = ", ".join(words)

        return {
            "Lien": url,
            "Numéro Brevet": patent_number,
            "Titre": title,
            "Date de publication": pub_date,
            "Mots-clés": keywords,
            "Description": abstract,
            "Domaine Technologique": domain,
            "Inventeurs": inventors,
            "Titulaire du brevet": assignees,
            "Statut du brevet": status
        }
    except Exception as e:
        # repr() keeps the exception type visible: a timeout has an empty
        # str(), which produced blank error messages.
        print(f"❌ Erreur lors de l'extraction de {url} : {e!r}")
    return None

# --- LOAD THE PATENT URLS FOUND ---
async def load_patent_urls():
    """Return the non-empty, stripped lines of ``fichier_brevets``.

    Returns:
        The list of URLs, or an empty list (after printing a message) when
        the file does not exist.
    """
    try:
        async with aiofiles.open(fichier_brevets, "r") as f:
            lines = await f.readlines()
    except FileNotFoundError:
        # Report the path actually opened; the old message named a file
        # ("patent_urls.txt") that this code never reads.
        print(f"📁 Fichier {fichier_brevets} introuvable. Vérifie le chemin.")
        return []
    return [line.strip() for line in lines if line.strip()]

# --- SAVE TO CSV ---
async def save_to_csv(data):
    """Append *data* (a list of row dicts) to ``fichier_csv``.

    The rows are laid out following ``COLUMNS``. When the CSV already exists
    its content is kept and the new rows are added after it; otherwise the
    file is created.
    """
    nouvelles_lignes = pd.DataFrame(data, columns=COLUMNS)
    try:
        existant = pd.read_csv(fichier_csv)
    except FileNotFoundError:
        resultat = nouvelles_lignes
    else:
        resultat = pd.concat([existant, nouvelles_lignes], ignore_index=True)
    resultat.to_csv(fichier_csv, index=False)
    print(f"✅ {len(data)} brevets ajoutés à {fichier_csv}")

# --- START THE EXTRACTION ---
async def extract_patents_data():
    """Extract the details of every listed patent not yet present in the CSV.

    URLs come from ``load_patent_urls()``; those whose last path segment is
    already returned by ``load_processed_patents()`` are skipped. The rest are
    fetched with ``fetch_patent_details`` and the successful results are
    appended with ``save_to_csv``.
    """
    urls = await load_patent_urls()
    if not urls:
        print("⚠️ Aucun brevet à extraire.")
        return

    processed_patents = load_processed_patents()
    # dict.fromkeys drops duplicate URLs while keeping their order, so a URL
    # listed twice is not fetched and saved twice.
    urls_to_process = [
        url for url in dict.fromkeys(urls)
        if url.split("/")[-1] not in processed_patents
    ]

    if not urls_to_process:
        print("✅ Tous les brevets sont déjà traités.")
        return

    print(f"🔍 Extraction des données pour {len(urls_to_process)} brevets...")

    # Cap the number of simultaneous requests: launching them all at once
    # makes slow answers more likely to exceed the per-request timeout.
    max_concurrent_requests = 10
    limiter = asyncio.Semaphore(max_concurrent_requests)

    async def fetch_limited(session, url):
        async with limiter:
            return await fetch_patent_details(session, url)

    async with aiohttp.ClientSession() as session:
        tasks = [fetch_limited(session, url) for url in urls_to_process]
        results = await asyncio.gather(*tasks)

    valid_results = [res for res in results if res is not None]
    if valid_results:
        await save_to_csv(valid_results)
    else:
        print("⚠️ Aucune donnée valide extraite.")

# --- RUN THE SCRIPT ---
# Top-level ``await`` as used in a notebook cell.
await extract_patents_data()


🔍 Extraction des données pour 90 brevets...
❌ Erreur lors de l'extraction de https://patents.google.com/patent/US10319022B2 : 
❌ Erreur lors de l'extraction de https://patents.google.com/patent/US10318882B2 : 
❌ Erreur lors de l'extraction de https://patents.google.com/patent/US10318941B2 : 
✅ 87 brevets ajoutés à /content/drive/MyDrive/acquisitionbrevet/brevets/brevets_6G.csv


*    <h2>Ajout des colonnes Résumé, Problème et Solution aux données extraites</h2>

In [None]:
import os
import time
import pandas as pd
import asyncio
import json
import aiohttp
import nest_asyncio

nest_asyncio.apply()

# --- CONFIGURATION ---
# The OpenRouter API key is a secret and must not be stored in the notebook:
# it is read from the OPENROUTER_API_KEY environment variable instead.
# Any key that was previously committed in this file should be revoked.
# When the variable is unset the key is empty and API calls will be rejected.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
MODEL_ID = "cognitivecomputations/dolphin3.0-r1-mistral-24b:free"
CSV_FILE = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_6G.csv"
BATCH_SIZE = 10  # Number of patents processed per run

# Prompt sent to the model; "{content}" receives the patent description and
# the doubled braces are literal braces of the expected JSON answer.
PROMPT_TEMPLATE = """
Analyze the following patent description and extract:
- Summary of the patent in 2 to 3 sentences.
- Problem addressed
- Solution provided

Description:
{content}

Return the response in JSON format:
{{
  "summary": "...",
  "problem": "...",
  "solution": "..."
}}
"""

# --- ANALYSIS FUNCTION ---
def _parse_analysis(raw):
    """Turn the model's textual answer into ``(summary, problem, solution)``.

    Models often wrap the JSON in a Markdown code fence or add a sentence
    around it, so only the span from the first "{" to the last "}" is parsed
    (the whole text when no such span exists). Anything that is not a JSON
    object yields three empty strings after printing the raw answer.
    """
    start = raw.find("{")
    end = raw.rfind("}")
    candidate = raw[start:end + 1] if start != -1 and end > start else raw

    try:
        data = json.loads(candidate)
    except json.JSONDecodeError:
        data = None

    if not isinstance(data, dict):
        print(f"⚠️ Erreur JSON: Réponse brute non valide\n{raw}")
        return "", "", ""

    return data.get("summary", ""), data.get("problem", ""), data.get("solution", "")


async def analyze_patent(content):
    """Ask the OpenRouter model for the summary, problem and solution of a patent.

    Args:
        content: the patent description; ``None``, NaN, empty or blank text
            is not sent to the API.

    Returns:
        A ``(summary, problem, solution)`` tuple. All three are empty strings
        when there is nothing to analyse, when the API call fails, or when
        the answer cannot be parsed; failures are reported on stdout.
    """
    if not content or pd.isna(content) or content.strip() == "":
        return "", "", ""

    prompt = PROMPT_TEMPLATE.format(content=content)

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": MODEL_ID,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 500
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post("https://openrouter.ai/api/v1/chat/completions", json=payload, headers=headers) as response:
                response_json = await response.json()

        # Error payloads (bad key, rate limit, ...) carry no "choices" entry.
        if "choices" not in response_json:
            print(f"⚠️ Réponse API invalide: {response_json}")
            return "", "", ""

        result = response_json["choices"][0]["message"]["content"]
        return _parse_analysis(result)

    except Exception as e:
        # repr() keeps the exception type visible even when str(e) is empty.
        print(f"❌ Erreur API: {e!r}")
        return "", "", ""

# --- CSV PROCESSING ---
async def process_csv():
    """Fill the "Résumé", "Problème" and "Solution" columns of the CSV.

    Loads ``CSV_FILE``, adds the three columns when missing, selects up to
    ``BATCH_SIZE`` rows that have a description and at least one of the three
    columns empty, analyses them concurrently with ``analyze_patent`` and
    writes the file back. Cells that already hold a value are never
    overwritten.
    """
    try:
        df = pd.read_csv(CSV_FILE, dtype=str).fillna("")
    except FileNotFoundError:
        print(f"❌ Erreur: Fichier {CSV_FILE} introuvable.")
        return

    result_columns = ["Résumé", "Problème", "Solution"]

    # Add the columns if they do not exist yet
    for col in result_columns:
        if col not in df.columns:
            df[col] = ""

    # Rows with at least one empty result column and a non-blank description
    missing_result = (df[result_columns] == "").any(axis=1)
    has_description = df["Description"].str.strip() != ""
    to_analyze = df[missing_result & has_description].head(BATCH_SIZE)

    if to_analyze.empty:
        print("✅ Tous les brevets ont déjà été analysés !")
        return

    print(f"🔍 Nombre de brevets à analyser: {len(to_analyze)}")

    # The selection above already guarantees a non-blank description, so
    # every selected row is analysed (the old per-row re-check could never
    # take its "ignored" branch).
    indices = list(to_analyze.index)
    results = await asyncio.gather(
        *(analyze_patent(description) for description in to_analyze["Description"])
    )

    # Fill only the cells that are still empty, and count the rows for which
    # the analysis actually returned something.
    analysed = 0
    for idx, values in zip(indices, results):
        if any(values):
            analysed += 1
        for col, value in zip(result_columns, values):
            if df.at[idx, col] == "":
                df.at[idx, col] = value

    try:
        df.to_csv(CSV_FILE, index=False, encoding='utf-8')
        print(f"✅ Mise à jour terminée ! {analysed} brevets analysés.")
        failed = len(indices) - analysed
        if failed:
            # Failed rows keep their empty cells and are selected again next run.
            print(f"⚠️ {failed} brevets sans résultat ; ils seront repris à la prochaine exécution.")
    except Exception as e:
        print(f"❌ Erreur d'écriture du fichier CSV: {e}")

# --- RUN ---
# Top-level ``await`` as used in a notebook cell.
await process_csv()


❌ Erreur: Fichier /content/drive/MyDrive/acquisitionbrevet/brevets/brevets_6G.csv introuvable.
