In [17]:
import os
import pandas as pd
import requests
import subprocess
from tqdm import tqdm
import os
import json
from elasticsearch import Elasticsearch


In [6]:
# Créer le répertoire pour stocker les fichiers téléchargés
download_dir = 'bano-data'
os.makedirs(download_dir, exist_ok=True)

def download_file(url, output_path):
    # Vérifier si le fichier existe déjà
    if os.path.exists(output_path):
        print(f"{output_path} already exists, skipping download.")
        return
    
    # Faire la requête pour récupérer la taille du fichier
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get('content-length', 0))

    # Télécharger avec barre de progression
    with open(output_path, 'wb') as file, tqdm(
        desc=output_path,
        total=total_size,
        unit='B',
        unit_scale=True,
        unit_divisor=1024,
    ) as bar:
        for data in response.iter_content(chunk_size=1024):
            file.write(data)
            bar.update(len(data))

def main():
    base_url = "http://bano.openstreetmap.fr/data/"
    depts = list(range(1, 21)) + list(range(21, 96))

    for dept in depts:
        dept_str = f"{dept:02d}"
        print(f"Downloading bano department {dept_str}")
        url = f"{base_url}bano-{dept_str}.csv"
        output_path = os.path.join(download_dir, f"bano-{dept_str}.csv")
        download_file(url, output_path)

if __name__ == "__main__":
    main()

Downloading bano department 01
bano-data\bano-01.csv already exists, skipping download.
Downloading bano department 02
bano-data\bano-02.csv already exists, skipping download.
Downloading bano department 03
bano-data\bano-03.csv already exists, skipping download.
Downloading bano department 04
bano-data\bano-04.csv already exists, skipping download.
Downloading bano department 05
bano-data\bano-05.csv already exists, skipping download.
Downloading bano department 06
bano-data\bano-06.csv already exists, skipping download.
Downloading bano department 07
bano-data\bano-07.csv already exists, skipping download.
Downloading bano department 08
bano-data\bano-08.csv already exists, skipping download.
Downloading bano department 09
bano-data\bano-09.csv already exists, skipping download.
Downloading bano department 10
bano-data\bano-10.csv already exists, skipping download.
Downloading bano department 11
bano-data\bano-11.csv already exists, skipping download.
Downloading bano department 12
b

In [7]:
# Lire la région à partir de la variable d'environnement
region = os.getenv('REGION', 'default_region')

# Charger le fichier CSV
df = pd.read_csv('bano-data/bano-01.csv', sep=',', header=None, names=[
    "id", "number", "street_name", "zipcode", "city", "source", "latitude", "longitude"
])

# Convertir les colonnes latitude et longitude en float
df['latitude'] = df['latitude'].astype(float)
df['longitude'] = df['longitude'].astype(float)

# Renommer les colonnes
df.rename(columns={
    "longitude": "location_lon",
    "latitude": "location_lat",
    "number": "address_number",
    "street_name": "address_street_name",
    "zipcode": "address_zipcode",
    "city": "address_city"
}, inplace=True)

# Ajouter la colonne 'region' avec la valeur de l'environnement REGION
df['region'] = region

# Supprimer les colonnes inutiles
df.drop(columns=["source"], inplace=True)

# Afficher un échantillon des données pour vérifier le traitement
print(df.head())

# Sauvegarder le dataframe traité dans un nouveau fichier CSV si besoin
df.to_csv('processed_data.csv', index=False)

              id address_number  address_street_name  address_zipcode  \
0  010010005-103            103  Impasse des Acacias             1400   
1  010010005-104            104  Impasse des Acacias             1400   
2   010010005-26             26  Impasse des Acacias             1400   
3   010010005-30             30  Impasse des Acacias             1400   
4   010010005-59             59  Impasse des Acacias             1400   

              address_city  location_lat  location_lon          region  
0  L'Abergement-Clémenciat     46.147615      4.924047  default_region  
1  L'Abergement-Clémenciat     46.147662      4.924207  default_region  
2  L'Abergement-Clémenciat     46.146906      4.924205  default_region  
3  L'Abergement-Clémenciat     46.147004      4.924469  default_region  
4  L'Abergement-Clémenciat     46.147161      4.924281  default_region  


In [16]:
# Connexion à Elasticsearch (ajustez l'URL si nécessaire)
es = Elasticsearch("http://localhost:9200")

# Charger le fichier de modèle (template) JSON
template_file = 'bano.json'
with open(template_file, 'r') as f:
    template = json.load(f)

# Nom de la région, récupéré depuis une variable d'environnement
region = os.getenv('REGION', 'default_region')

# Index Elasticsearch basé sur la région
index_name = f".bano-{region}"

# Nom du modèle (template)
template_name = "bano"

# Fonction pour créer ou mettre à jour le modèle (template)
def create_or_update_template(es_client, template_name, template_body):
    es_client.indices.put_template(name=template_name, body=template_body)

# Appliquer ou mettre à jour le modèle (template) sur Elasticsearch
create_or_update_template(es, template_name, template)

# Exemple de document à indexer
document = {
    "id": "12345",
    "address": {
        "number": 10,
        "street_name": "Rue de l'exemple",
        "zipcode": "75001",
        "city": "Paris"
    },
    "location": {
        "lat": 48.8566,
        "lon": 2.3522
    }
}

# Indexer le document avec le document_id basé sur l'ID du document
def index_document(es_client, index_name, document):
    document_id = document.get("id")
    es_client.index(index=index_name, id=document_id, body=document)

# Indexer le document dans Elasticsearch
index_document(es, index_name, document)

print(f"Document indexed in {index_name} with ID {document['id']}")

Document indexed in .bano-default_region with ID 12345


  es_client.indices.put_template(name=template_name, body=template_body)


In [26]:
import os
import subprocess
import requests

# Fonction pour importer une région dans Elasticsearch via Logstash
def import_region(region):
    # Exporter la région comme variable d'environnement (pour être utilisée par Logstash)
    os.environ['REGION'] = region  # Ici, region est directement passé par l'appel à import_region.
    
    # Fichier CSV pour la région donnée
    file_path = os.path.abspath(f'bano-data/bano-{region}.csv')
    
    # Vérifier si le fichier CSV existe
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Le fichier {file_path} est introuvable.")
    
    # Supprimer l'index dans Elasticsearch (équivalent du 'curl -XDELETE')
    es_url = f"http://localhost:9200/.bano-{region}?pretty"
    response = requests.delete(es_url)
    
    if response.status_code == 200:
        print(f"Deleted index .bano-{region} from Elasticsearch.")
    else:
        print(f"Failed to delete index .bano-{region}. Status: {response.status_code}")
    
    # Chemin vers Logstash
    logstash_command = [
        'C:/Users/GRETA/Desktop/logstash-8.15.1/bin/logstash.bat',
        '-f', 'bano-data.conf'
    ]
    
    # Exécuter Logstash avec le fichier de configuration
    with open(file_path, 'r') as file:
        # Envoyer les données du fichier à Logstash via la commande subprocess
        process = subprocess.Popen(logstash_command, stdin=file)
        process.communicate()

# Liste des départements à traiter
def process_departments():
    depts = [f"{i:02d}" for i in range(1, 20)] + [f"{i:02d}" for i in range(21, 96)]
    
    for dept in depts:
        print(f"Processing department {dept}")
        import_region(dept)

if __name__ == "__main__":
    process_departments()

Processing department 01
Failed to delete index .bano-01. Status: 404
Processing department 02
Failed to delete index .bano-02. Status: 404
Processing department 03
Failed to delete index .bano-03. Status: 404
Processing department 04
Failed to delete index .bano-04. Status: 404
Processing department 05
Failed to delete index .bano-05. Status: 404
Processing department 06
Failed to delete index .bano-06. Status: 404
Processing department 07
Failed to delete index .bano-07. Status: 404
Processing department 08
Failed to delete index .bano-08. Status: 404
Processing department 09
Failed to delete index .bano-09. Status: 404
Processing department 10
Failed to delete index .bano-10. Status: 404
Processing department 11
Failed to delete index .bano-11. Status: 404
Processing department 12
Failed to delete index .bano-12. Status: 404
Processing department 13
Failed to delete index .bano-13. Status: 404
Processing department 14
Failed to delete index .bano-14. Status: 404
Processing departmen