<a href="https://colab.research.google.com/github/Dallah2002/a8alatr0clown/blob/main/Acquisition_Automatique_de_Donn%C3%A9es_Brevet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1><b>Acquisition automatique de données : brevets</b></h1>

L'objectif principal de ce projet est d'automatiser la collecte et l'analyse de brevets concernant la 6G et la sécurité dans un fichier .csv, en se focalisant sur des brevets qu’on a retrouvés sur Google Patents, puis de créer une application qui permet la visualisation des données extraites des brevets.



*   <h2>Recueil des brevets</h2>



In [None]:
pip install aiohttp aiofiles

Collecting aiofiles
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Installing collected packages: aiofiles
Successfully installed aiofiles-24.1.0


In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive (Colab helper imported above).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import aiohttp
import asyncio
from aiofiles import open as aio_open
import time


# --- CONSTANTS ---
# URL template of a US patent page on Google Patents; the placeholder receives
# the patent identifier (number plus kind code).
URL_BASE = "https://patents.google.com/patent/US{}"

# File paths used to track the patents
fichier_brevets = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_urls.txt"
fichier_csv = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_6G.csv"
fichier_brevets_traite = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_traite.txt"
fichier_dernier_brevet = "/content/drive/MyDrive/acquisitionbrevet/brevets/dernier_brevet.txt"

# --- 6G-related keywords ---
# A patent page is kept when at least one of these terms is found in it.
MOTS_CLES_6G = [
    "terahertz communication",
    "THz",
    "massive MIMO",
    "ultra-massive MIMO",
    "reconfigurable intelligent surfaces",
    "RIS",
    "artificial intelligence",
    "AI",
    "machine learning",
    "blockchain",
    "distributed ledger technology",
    "DLT",
    "energy harvesting",
    "quantum communication",
    "non-terrestrial networks",
    "NTN",
    "visible light communication",
    "VLC",
    "orbital angular momentum",
    "OAM",
    "6G frequency bands",
    "digital twins",
    "photonic communication",
    "holographic communication",
    "self-sustaining networks",
    "enhanced security and privacy",
    "ultra-low latency",
    "ultra-high speed",
    "terabit-per-second",
    "advanced beamforming",
    "RIS channel modeling",
    "RIS-network integration",
    "Privacy",
    "Data",
    "Security",
    "Privacy Policies",
    "Device Sensors",
    "Network Slicing",
    "Edge Computing",
    "Ultra-dense networks",
    "Zero-touch networks",
    "Massive IoT",
    "Autonomous Vehicles",
    "Smart Cities",
    "Artificial General Intelligence",
    "Self-Organizing Networks",
    "Blockchain for Network Management",
    "Direct Device-to-Device Communication",
    "Cognitive Radio",
    "Massive Cloud-RAN",
    "Ambient Intelligence",
    "Heterogeneous Networks",
    "Molecular Communication"
]


# --- VALIDATION FUNCTION (ASYNCHRONOUS) ---
async def est_brevet_valide(session, numero_brevet):
    """Return the patent URL when its page mentions a 6G keyword, else None.

    Args:
        session: HTTP client whose ``get(url, timeout=...)`` is an async
            context manager yielding a response with ``status`` and
            ``text()`` (the callers pass an ``aiohttp.ClientSession``).
        numero_brevet: Patent identifier substituted into ``URL_BASE``.

    Returns:
        The page URL when the response status is 200 and at least one entry
        of ``MOTS_CLES_6G`` occurs in the page; ``None`` otherwise. Any
        exception is printed and also results in ``None``.
    """
    import re  # local import so this block does not depend on the import cell

    url = URL_BASE.format(numero_brevet)

    # Case-insensitive, whole-word search. The keywords used to be compared
    # as-is against lower-cased page text, so every keyword containing an
    # upper-case letter ("THz", "RIS", "AI", "Privacy", ...) could never match.
    # Word boundaries keep short acronyms from matching inside other words.
    # NOTE(review): generic entries such as "Data" or "AI" can now match, so
    # the keyword list may need pruning to keep the filter selective.
    motif = r"\b(?:" + "|".join(re.escape(mot_cle) for mot_cle in MOTS_CLES_6G) + r")\b"

    try:
        async with session.get(url, timeout=10) as reponse:
            if reponse.status == 200:
                contenu = await reponse.text()
                # An empty keyword list must not match anything.
                if MOTS_CLES_6G and re.search(motif, contenu, re.IGNORECASE):
                    return url
    except Exception as e:
        # Some exceptions (e.g. timeouts) carry an empty message, so the
        # exception type is printed as well.
        print(f"Erreur lors de la v√©rification du brevet {numero_brevet}: {type(e).__name__}: {e}")
    return None

# --- LOAD THE URLS ALREADY RECORDED ---
async def charger_urls_existantes():
    """Return the set of stripped lines of ``fichier_brevets_traite``.

    A missing file yields an empty set.
    """
    deja_vues = set()
    try:
        async with aio_open(fichier_brevets_traite, "r") as fichier:
            lignes = await fichier.readlines()
    except FileNotFoundError:
        return deja_vues

    for ligne in lignes:
        deja_vues.add(ligne.strip())
    return deja_vues

# --- DETERMINE THE LAST NUMBER VISITED ---
async def obtenir_dernier_brevet():
    """Return the number stored in ``fichier_dernier_brevet``.

    Falls back to 10318759 when the stored text is not made of digits or
    when anything goes wrong while reading or converting it.
    """
    numero_par_defaut = 10318759
    try:
        async with aio_open(fichier_dernier_brevet, "r") as fichier:
            texte = (await fichier.read()).strip()
        if not texte.isdigit():
            return numero_par_defaut
        return int(texte)
    except Exception:
        return numero_par_defaut

# --- FILE UPDATES ---
async def mettre_a_jour_fichiers(nouveaux_brevets, dernier_numero):
    """Persist the newly found URLs and the resume position.

    Each URL of ``nouveaux_brevets`` is appended, one per line, to both
    ``fichier_brevets`` and ``fichier_brevets_traite``; ``dernier_numero`` then
    overwrites the content of ``fichier_dernier_brevet``.
    """
    # Both URL files receive the same lines, in append mode.
    for chemin in (fichier_brevets, fichier_brevets_traite):
        async with aio_open(chemin, "a") as sortie:
            lignes = [f"{url}\n" for url in nouveaux_brevets]
            await sortie.writelines(lignes)

    # The position file is rewritten from scratch on every call.
    async with aio_open(fichier_dernier_brevet, "w") as sortie:
        await sortie.write(str(dernier_numero))

# --- ASYNCHRONOUS PATENT ACQUISITION ---
async def acquisition_brevets(nombre_a_verifier):
    """Check the next ``nombre_a_verifier`` patent numbers and record matches.

    Starting from the number returned by ``obtenir_dernier_brevet``, each
    candidate ``<number>B2`` that is not already recorded is validated
    concurrently with ``est_brevet_valide``. The URLs that come back are
    stored through ``mettre_a_jour_fichiers`` together with the next start
    number, and a short summary is printed.

    Args:
        nombre_a_verifier: How many consecutive patent numbers to check.
    """
    # perf_counter is monotonic, which makes it the right clock for durations.
    debut_temps = time.perf_counter()

    urls_traitees = await charger_urls_existantes()
    dernier_brevet = await obtenir_dernier_brevet()

    print(f"D√©marrage de la recherche √† partir du num√©ro : {dernier_brevet}")

    async with aiohttp.ClientSession() as session:
        taches = []
        for numero_courant in range(dernier_brevet, dernier_brevet + nombre_a_verifier):
            numero_brevet = f"{numero_courant}B2"

            # Recorded entries are the URLs built from URL_BASE, so an exact
            # set lookup replaces the former substring scan over every URL.
            if URL_BASE.format(numero_brevet) in urls_traitees:
                print(f"üîÅ D√©j√† trait√© pour US{numero_brevet}")
                continue

            taches.append(est_brevet_valide(session, numero_brevet))

        # NOTE(review): every request is started at once; a validation that
        # fails returns None and that number is not retried on later runs
        # because the start number always advances below.
        resultats = await asyncio.gather(*taches)

    nouveaux_brevets = [url for url in resultats if url]
    for url in nouveaux_brevets:
        print(f"‚úÖ Nouveau brevet trouv√© : {url}")

    # Persist the matches and the number to resume from.
    await mettre_a_jour_fichiers(nouveaux_brevets, dernier_brevet + nombre_a_verifier)

    # Run summary
    fin_temps = time.perf_counter()
    print(f"üîç Nombre total de brevets trouv√©s : {len(nouveaux_brevets)}")
    print(f"‚è±Ô∏è Temps total d'ex√©cution : {fin_temps - debut_temps:.2f} secondes")
    print("üöÄ R√©cup√©ration des brevets termin√©e. Fichiers mis √† jour.")

# --- RUN THE SCRIPT ---
# Top-level await: valid in a notebook cell, not in a plain Python module.
await acquisition_brevets(nombre_a_verifier=100)


D√©marrage de la recherche √† partir du num√©ro : 10319059
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319064B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319066B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319069B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319070B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319073B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319075B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319076B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319080B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319086B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319092B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319093B2
‚úÖ Nouveau brevet trouv√© : https://patents.google.com/patent/US10319095B2
‚úÖ Nouveau brevet trouv√© : 






*    <h2> Extraction de données des brevets</h2>

In [None]:
import aiofiles
import pandas as pd
from bs4 import BeautifulSoup


# --- CONSTANTS ---
BASE_URL = "https://patents.google.com"
fichier_brevets = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_urls.txt"
fichier_csv = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_6G.csv"

# --- COLUMNS, IN THE REQUESTED ORDER ---
COLUMNS = [
    "Lien", "Num√©ro Brevet", "Titre", "Date de publication", "Mots-cl√©s",
    "Description", "Domaine Technologique", "Inventeurs", "Titulaire du brevet",
    "Statut du brevet"
]

# --- Stopwords to drop (compared against lower-cased words) ---
STOPWORDS = {"a", "and", "that", "to", "be", "in", "the", "of","an","one","present","some","for","are","is","with","this","provided","herein","or","more","each","includes","include"}

# --- Load the patents already processed ---
def load_processed_patents():
    """Return the distinct non-null patent numbers stored in ``fichier_csv``.

    A missing CSV file yields an empty set.
    """
    try:
        table = pd.read_csv(fichier_csv)
    except FileNotFoundError:
        return set()

    numeros = table["Num√©ro Brevet"].dropna()
    return set(numeros.unique())


# --- Technology field detection ---
def detect_technology_field(title, abstract):
    """Classify a patent into a technology field from its title and abstract.

    Fields are tried in declaration order and the first one with a keyword
    present wins. Keywords are matched case-insensitively as whole words (an
    optional plural "s" is accepted). Plain substring matching produced false
    positives such as "RIS" inside "comprising" or "AI" inside "main".

    Args:
        title: Patent title.
        abstract: Patent abstract.

    Returns:
        The name of the first matching field, or "Autre" when none matches.
    """
    import re  # local import so this block does not depend on the import cell

    keywords = {
        "T√©l√©communications": [
            "5G", "6G", "network", "antenna", "MIMO", "massive MIMO", "ultra-massive MIMO",
            "signal", "terahertz communication", "THz", "6G frequency bands", "advanced beamforming",
            "RIS", "RIS channel modeling", "RIS-network integration", "non-terrestrial networks", "NTN",
            "visible light communication", "VLC", "orbital angular momentum", "OAM", "ultra-high speed",
            "terabit-per-second", "network slicing", "ultra-dense networks", "direct device-to-device communication",
            "heterogeneous networks", "massive Cloud-RAN",
        ],
        "Intelligence Artificielle": [
            "artificial intelligence", "AI", "machine learning", "deep learning", "neural network",
            "artificial general intelligence", "ambient intelligence", "cognitive radio", "self-organizing networks",
            "zero-touch networks",
        ],
        "S√©curit√©": [
            "cryptography", "authentication", "security", "hacking", "protection", "enhanced security and privacy",
            "privacy", "privacy policies", "data", "blockchain", "distributed ledger technology", "DLT",
            "blockchain for network management",
        ],
        "√ânergie": [
            "battery", "energy", "recharge", "autonomy", "energy harvesting", "self-sustaining networks",
        ],
        "Quantique": [
            "quantum", "qubit", "quantum communication", "molecular communication",
        ],
        "IoT et R√©seaux": [
            "massive IoT", "device sensors", "smart cities", "autonomous vehicles", "edge computing",
        ],
        "Communication Avanc√©e": [
            "photonic communication", "holographic communication", "digital twins",
        ],
    }

    # Join with a space so the end of the title and the start of the abstract
    # cannot fuse into a spurious keyword.
    text = f"{title} {abstract}"

    for field, words in keywords.items():
        pattern = r"\b(?:" + "|".join(re.escape(word) for word in words) + r")s?\b"
        if re.search(pattern, text, re.IGNORECASE):
            return field
    return "Autre"


# --- ASYNCHRONOUS PATENT EXTRACTION ---
def _meta_content(soup, name, default="Non trouv√©"):
    """Return the stripped ``content`` of ``<meta name=name>``, else ``default``.

    ``default`` is returned when the tag is absent, has no ``content``
    attribute, or the attribute is empty.
    """
    tag = soup.find("meta", {"name": name})
    content = tag.get("content") if tag else None
    return content.strip() if content else default


async def fetch_patent_details(session, url):
    """Fetch a patent page and extract its main fields.

    Args:
        session: HTTP client whose ``get(url, timeout=...)`` is an async
            context manager yielding a response with ``status`` and
            ``text()`` (the caller passes an ``aiohttp.ClientSession``).
        url: Address of the patent page; its last path segment is used as
            the patent number.

    Returns:
        A dict with the link, patent number, title, publication date,
        keywords, description, technology field, inventors, assignee and
        legal status; ``None`` when the status is not 200 or when any error
        occurs (the error is printed).
    """
    try:
        async with session.get(url, timeout=10) as response:
            if response.status == 200:
                content = await response.text()
                soup = BeautifulSoup(content, "html.parser")

                # Title, publication date and abstract come from Dublin Core
                # style <meta> tags; each lookup is done once and tolerates a
                # missing tag or attribute instead of raising.
                title = _meta_content(soup, "DC.title")
                patent_number = url.split("/")[-1]
                pub_date = _meta_content(soup, "DC.date")
                abstract = _meta_content(soup, "DC.description")

                # Inventors and current assignee(s) are read from itemprop tags.
                inventors = ", ".join(
                    tag.text.strip() for tag in soup.find_all(itemprop="inventor")
                ) or "Non trouv√©"
                assignees = ", ".join(
                    tag.text.strip() for tag in soup.find_all(itemprop="assigneeCurrent")
                ) or "Non trouv√©"

                # Technology field derived from the title and the abstract.
                domain = detect_technology_field(title, abstract)

                # Legal status from the "legalStatusIfi" itemprop.
                status_tag = soup.find(itemprop="legalStatusIfi")
                status = status_tag.text.strip() if status_tag else "Inconnu"

                # Keywords: first ten words of the abstract minus stopwords.
                words = [word for word in abstract.split()[:10] if word.lower() not in STOPWORDS]
                keywords = ", ".join(words)

                return {
                    "Lien": url,
                    "Num√©ro Brevet": patent_number,
                    "Titre": title,
                    "Date de publication": pub_date,
                    "Mots-cl√©s": keywords,
                    "Description": abstract,
                    "Domaine Technologique": domain,
                    "Inventeurs": inventors,
                    "Titulaire du brevet": assignees,
                    "Statut du brevet": status
                }
    except Exception as e:
        # Some exceptions (e.g. timeouts) carry an empty message, which left
        # the log line blank; the exception type is printed as well.
        print(f"‚ùå Erreur lors de l'extraction de {url} : {type(e).__name__}: {e}")
    return None

# --- LOAD THE PATENT URLS FOUND ---
async def load_patent_urls():
    """Return the non-empty, stripped lines of ``fichier_brevets``.

    Returns:
        The list of URLs in file order, or an empty list (after printing a
        message) when the file does not exist.
    """
    try:
        async with aiofiles.open(fichier_brevets, "r") as f:
            lines = await f.readlines()
    except FileNotFoundError:
        # Report the path that was actually opened; the former message named
        # "patent_urls.txt", which is not the file read here.
        print(f"üìÅ Fichier {fichier_brevets} introuvable. V√©rifie le chemin.")
        return []
    return [line.strip() for line in lines if line.strip()]

# --- SAVE TO CSV ---
async def save_to_csv(data):
    """Append ``data`` to ``fichier_csv``, creating the file when needed.

    The new rows are laid out with the ``COLUMNS`` order, concatenated after
    the rows already stored (if the file exists) and the whole table is
    written back without the index.
    """
    new_rows = pd.DataFrame(data, columns=COLUMNS)
    try:
        previous = pd.read_csv(fichier_csv)
    except FileNotFoundError:
        combined = new_rows
    else:
        combined = pd.concat([previous, new_rows], ignore_index=True)

    combined.to_csv(fichier_csv, index=False)
    print(f"‚úÖ {len(data)} brevets ajout√©s √† {fichier_csv}")

# --- START THE EXTRACTION ---
async def extract_patents_data():
    """Extract the details of every listed patent not yet present in the CSV.

    URLs come from ``load_patent_urls``; those whose last path segment is
    already returned by ``load_processed_patents`` are skipped. The remaining
    pages are fetched concurrently and the non-``None`` results are handed to
    ``save_to_csv``.
    """
    all_urls = await load_patent_urls()
    if not all_urls:
        print("‚ö†Ô∏è Aucun brevet √† extraire.")
        return

    already_done = load_processed_patents()
    pending = []
    for link in all_urls:
        patent_id = link.split("/")[-1]
        if patent_id not in already_done:
            pending.append(link)

    if not pending:
        print("‚úÖ Tous les brevets sont d√©j√† trait√©s.")
        return

    print(f"üîç Extraction des donn√©es pour {len(pending)} brevets...")

    async with aiohttp.ClientSession() as session:
        fetched = await asyncio.gather(
            *(fetch_patent_details(session, link) for link in pending)
        )

    extracted = [record for record in fetched if record is not None]
    if not extracted:
        print("‚ö†Ô∏è Aucune donn√©e valide extraite.")
        return
    await save_to_csv(extracted)
# --- RUN THE SCRIPT ---
# Top-level await: valid in a notebook cell, not in a plain Python module.
await extract_patents_data()


üîç Extraction des donn√©es pour 90 brevets...
‚ùå Erreur lors de l'extraction de https://patents.google.com/patent/US10319022B2 : 
‚ùå Erreur lors de l'extraction de https://patents.google.com/patent/US10318882B2 : 
‚ùå Erreur lors de l'extraction de https://patents.google.com/patent/US10318941B2 : 
‚úÖ 87 brevets ajout√©s √† /content/drive/MyDrive/acquisitionbrevet/brevets/brevets_6G.csv


*    <h2> Ajout des colonnes Résumé, Problème et Solution aux données extraites</h2>

In [None]:
import os
import time
import pandas as pd
import asyncio
import json
import aiohttp
import nest_asyncio

nest_asyncio.apply()

# --- CONFIGURATION ---
# The OpenRouter key is read from the environment instead of being hard-coded:
# a secret stored in a notebook is readable by anyone who can open the file.
# NOTE(review): the key that used to be embedded here must be considered
# leaked and should be revoked.
API_KEY = os.environ.get("OPENROUTER_API_KEY", "")
if not API_KEY:
    print("Variable d'environnement OPENROUTER_API_KEY manquante : aucun appel OpenRouter ne pourra aboutir.")
MODEL_ID = "cognitivecomputations/dolphin3.0-r1-mistral-24b:free"
CSV_FILE = "/content/drive/MyDrive/acquisitionbrevet/brevets/brevets_6G.csv"
BATCH_SIZE = 10  # Number of patents processed per run

# Prompt sent to the model; {content} receives the patent description and the
# doubled braces are literal braces of the expected JSON answer.
PROMPT_TEMPLATE = """
Analyze the following patent description and extract:
- Summary of the patent in 2 to 3 sentences.
- Problem addressed
- Solution provided

Description:
{content}

Return the response in JSON format:
{{
  "summary": "...",
  "problem": "...",
  "solution": "..."
}}
"""

# --- ANALYSIS FUNCTION ---
def _parse_model_json(raw):
    """Return the JSON object contained in ``raw``, or ``None``.

    Only the text between the first "{" and the last "}" is parsed, so a
    reply wrapped in Markdown code fences or surrounded by extra text is
    still accepted. Anything that is not a JSON object yields ``None``.
    """
    if not isinstance(raw, str):
        return None
    start = raw.find("{")
    end = raw.rfind("}")
    if start == -1 or end <= start:
        return None
    try:
        parsed = json.loads(raw[start:end + 1])
    except json.JSONDecodeError:
        return None
    return parsed if isinstance(parsed, dict) else None


async def analyze_patent(content):
    """Ask the model for the summary, problem and solution of a patent.

    Args:
        content: Patent description text.

    Returns:
        A ``(summary, problem, solution)`` tuple of strings. Three empty
        strings are returned when ``content`` is empty or missing, when the
        API reply has no ``choices`` entry, when the reply holds no JSON
        object, or when any error occurs (a message is printed in the last
        three cases).
    """
    if not content or pd.isna(content) or content.strip() == "":
        return "", "", ""

    prompt = PROMPT_TEMPLATE.format(content=content)

    headers = {
        "Authorization": f"Bearer {API_KEY}",
        "Content-Type": "application/json"
    }

    payload = {
        "model": MODEL_ID,
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        "max_tokens": 500
    }

    try:
        async with aiohttp.ClientSession() as session:
            async with session.post("https://openrouter.ai/api/v1/chat/completions", json=payload, headers=headers) as response:
                response_json = await response.json()

                if "choices" not in response_json:
                    print(f"‚ö†Ô∏è R√©ponse API invalide: {response_json}")
                    return "", "", ""

                result = response_json["choices"][0]["message"]["content"]

                # The reply is free text: tolerate code fences and chatter
                # around the JSON object instead of requiring bare JSON.
                data = _parse_model_json(result)
                if data is None:
                    print(f"‚ö†Ô∏è Erreur JSON: R√©ponse brute non valide\n{result}")
                    return "", "", ""

                return data.get("summary", ""), data.get("problem", ""), data.get("solution", "")

    except Exception as e:
        # Some exceptions carry an empty message; print the type as well.
        print(f"‚ùå Erreur API: {type(e).__name__}: {e}")
        return "", "", ""

# --- CSV PROCESSING ---
async def process_csv():
    """Load the CSV, analyse the patents still missing results and save.

    At most ``BATCH_SIZE`` rows that have a description and at least one empty
    result column are sent to ``analyze_patent`` concurrently. Only the result
    cells that are still empty are filled in before the file is rewritten.
    """
    try:
        table = pd.read_csv(CSV_FILE, dtype=str).fillna("")
    except FileNotFoundError:
        print(f"‚ùå Erreur: Fichier {CSV_FILE} introuvable.")
        return

    # Create the result columns when the file does not have them yet.
    result_columns = ["R√©sum√©", "Probl√®me", "Solution"]
    for name in result_columns:
        if name not in table.columns:
            table[name] = ""

    # Rows with a description and at least one empty result column.
    has_gap = (table[result_columns] == "").any(axis=1)
    has_description = table["Description"].str.strip() != ""
    batch = table[has_gap & has_description].head(BATCH_SIZE)

    if batch.empty:
        print("‚úÖ Tous les brevets ont d√©j√† √©t√© analys√©s !")
        return

    print(f"üîç Nombre de brevets √† analyser: {len(batch)}")

    pending = []
    row_labels = []
    for label, record in batch.iterrows():
        text = record["Description"]
        if not text.strip():
            print(f"‚ö†Ô∏è Brevet {label} ignor√© car sans description.")
            continue
        pending.append(analyze_patent(text))
        row_labels.append(label)

    # Run the requests concurrently.
    answers = await asyncio.gather(*pending)

    # Fill a cell only when it is still empty.
    for label, answer in zip(row_labels, answers):
        summary, problem, solution = answer
        for name, value in zip(result_columns, (summary, problem, solution)):
            if table.at[label, name] == "":
                table.at[label, name] = value

    # Rewrite the whole table, existing rows included.
    try:
        table.to_csv(CSV_FILE, index=False, encoding='utf-8')
        print(f"‚úÖ Mise √† jour termin√©e ! {len(row_labels)} brevets analys√©s.")
    except Exception as e:
        print(f"‚ùå Erreur d'√©criture du fichier CSV: {e}")

# --- RUN ---
# Top-level await: valid in a notebook cell, not in a plain Python module.
await process_csv()


‚ùå Erreur: Fichier /content/drive/MyDrive/acquisitionbrevet/brevets/brevets_6G.csv introuvable.
