In [79]:
import sys
import boto3
import json
import pandas as pd
from io import BytesIO
import argparse
import logging
from pymongo import MongoClient, errors
from bson import ObjectId
import hashlib
from datetime import datetime


In [2]:
# Function to load the secrets from the local file
def load_secrets(file_path):
    """Load the secrets stored as JSON in ``file_path``.

    The file is opened explicitly as UTF-8 so the result does not depend on
    the locale's default text encoding.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        The decoded JSON content (whatever ``json.load`` produces for it).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
# Read the credentials file once, then pull out the three AWS settings
# that the S3 client is built from.
secrets = load_secrets('secrets.json')

aws_access_key_id, aws_secret_access_key, aws_region = (
    secrets[setting]
    for setting in ('AWS_ACCESS_KEY_ID', 'AWS_SECRET_ACCESS_KEY', 'AWS_REGION')
)

In [3]:
# S3 client authenticated with the explicit key pair and region read from the secrets file.
s3 = boto3.client(
    's3',
    region_name=aws_region,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [4]:
# Inside a Jupyter kernel sys.argv holds the kernel's own arguments, so it is
# replaced with a plain command line that argparse can parse.
# Delete if not in Jupyter Notebook
if 'ipykernel' in sys.modules:
    sys.argv = ['jsonl.py', 'InfoClimat']


parser = argparse.ArgumentParser(description="Process an Excel file")
parser.add_argument(
    'file',
    # nargs='?' makes the positional optional; without it argparse ignores
    # `default` and exits with an error when no station is given.
    nargs='?',
    default='InfoClimat',
    help='The name of the station to process. Only accepts InfoClimat'
)

parser.add_argument(
    "--mongodb_address",
    default="mongodb://localhost:27017/",
    help="The MongoDB address (default: mongodb://localhost:27017/)"
)

def upper_case(string):
    """Return ``string`` converted to upper case.

    Used as the argparse ``type`` of the verbosity flag, so the level name
    can be typed in any letter case.
    """
    uppercased = string.upper()
    return uppercased
# Logging level flag: the given value is upper-cased by `upper_case` before
# argparse checks it against the allowed level names.
verbosity_options = dict(
    type=upper_case,
    default="INFO",
    choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
    help="Set the logging verbosity level (default: INFO)",
)
parser.add_argument("-v", "--verbosity", **verbosity_options)

# Parse sys.argv with the arguments registered on the parser.
args = parser.parse_args()


In [5]:
# Module logger; the root logger threshold comes from the -v/--verbosity flag.
logger = logging.getLogger(__name__)
log_level = getattr(logging, args.verbosity)
logging.basicConfig(level=log_level)

In [6]:
# Keep the connection string in its own variable; it is reused for the
# MongoDB client and in later log messages.
mongodb_address = args.mongodb_address

# Echo the parsed arguments so a run can be traced in the logs.
station_message = f"Station: {args.file}"
address_message = f"MongoDB adress: {mongodb_address}"
logger.info(station_message)
logger.info(address_message)

INFO:__main__:Station: InfoClimat
INFO:__main__:MongoDB adress: mongodb://localhost:27017/


In [7]:
# Fall back to the only supported station when none was given.
# The fallback must be the plain string: a list would never match the
# comparison below and `file_key` would stay undefined.
if not args.file:
    args.file = 'InfoClimat'

# Map the station name to the S3 key of its Airbyte export.
if args.file == 'InfoClimat':
    file_key = "greencoop-airbyte/Stations_meteorologiques_du_reseau_InfoClimat_(Bergues,_Hazebrouck,_Armentieres,_Lille-Lesquin)/2025_03_14_1741977939508_0.jsonl"
else:
    # Fail here with a clear message instead of a NameError on `file_key`
    # in a later cell.
    raise ValueError(
        f"Unsupported station {args.file!r}: only 'InfoClimat' is available."
    )

In [8]:
bucket_name = 'greencoop-airbyte'

# Download the export and decode its body as UTF-8 text.
s3_object = s3.get_object(Key=file_key, Bucket=bucket_name)
file_content = s3_object['Body'].read().decode('utf-8')

In [9]:
# Read the JSONL file from S3
response = s3.get_object(Bucket=bucket_name, Key=file_key)
content = response["Body"].read().decode("utf-8")

# Convert JSONL to Pandas DataFrame.
# Records are split on "\n" only: str.splitlines() also breaks on characters
# such as U+2028 or U+0085, which may appear unescaped inside a JSON string
# and would cut a record in two. Blank lines (e.g. a trailing newline) are
# skipped because json.loads() rejects empty input.
json_list = [json.loads(line) for line in content.split("\n") if line.strip()]
df = pd.DataFrame(json_list)

# Show first rows
df.head()


Unnamed: 0,_airbyte_raw_id,_airbyte_extracted_at,_airbyte_meta,_airbyte_generation_id,_airbyte_data
0,5e7a90f2-1e3e-42a6-8c76-ee59f1a8f45b,1741977942000,"{'sync_id': 1, 'changes': []}",1,"{'status': 'OK', 'errors': [], 'data': [], 'st..."


In [10]:
import pandas as pd
from pandas import json_normalize

# Flatten the nested Airbyte payload; nested keys become dotted column names.
df_clean = pd.json_normalize(df["_airbyte_data"])

# Explode the 'stations' lists into one row per entry and flatten each entry.
stations_df = pd.json_normalize(df_clean["stations"].explode())

# Same treatment for every "hourly.*" column, kept in a dict keyed by column name.
hourly_cols = [name for name in df_clean.columns if name.startswith("hourly.")]
hourly_dfs = {
    name: pd.json_normalize(df_clean[name].explode())
    for name in hourly_cols
}

# Display the extracted DataFrames
print("Stations Data:")
print(stations_df.head())

for col, hourly_df in hourly_dfs.items():
    print(f"\nHourly Data ({col}):")
    print(hourly_df.head())


Stations Data:
           id           name  latitude  longitude  elevation    type  \
0       00052    Armentières    50.689      2.877         16  static   
1       000R5        Bergues    50.968      2.441         17  static   
2       07015  Lille-Lesquin    50.575      3.092         47   synop   
3  STATIC0010     Hazebrouck    50.734      2.545         31  static   

       license.license                                        license.url  \
0                CC BY    https://creativecommons.org/licenses/by/2.0/fr/   
1                CC BY    https://creativecommons.org/licenses/by/2.0/fr/   
2  Etalab Open License  https://www.etalab.gouv.fr/licence-ouverte-ope...   
3                CC BY    https://creativecommons.org/licenses/by/2.0/fr/   

                   license.source  \
0                   infoclimat.fr   
1                   infoclimat.fr   
2  Meteo-France via infoclimat.fr   
3                   infoclimat.fr   

                                 license.metadonnees

In [11]:
# Turn each row of df_clean into a plain dict, then list the fields of the first one.
data_to_insert = df_clean.to_dict(orient="records")
data_to_insert[0].keys()

dict_keys(['status', 'errors', 'data', 'stations', 'metadata.temperature', 'metadata.pression', 'metadata.humidite', 'metadata.point_de_rosee', 'metadata.visibilite', 'metadata.vent_moyen', 'metadata.vent_rafales', 'metadata.vent_direction', 'metadata.pluie_3h', 'metadata.pluie_1h', 'metadata.neige_au_sol', 'metadata.nebulosite', 'metadata.temps_omm', 'hourly.07015', 'hourly.00052', 'hourly.000R5', 'hourly.STATIC0010', 'hourly._params'])

In [72]:
# Inspect the first record in full.
data_to_insert[0]

{'status': 'OK',
 'errors': [],
 'data': [],
 'stations': [{'id': '00052',
   'name': 'Armentières',
   'latitude': 50.689,
   'longitude': 2.877,
   'elevation': 16,
   'type': 'static',
   'license': {'license': 'CC BY',
    'url': 'https://creativecommons.org/licenses/by/2.0/fr/',
    'source': 'infoclimat.fr',
    'metadonnees': 'https://www.infoclimat.fr/stations/metadonnees.php?id=00052'}},
  {'id': '000R5',
   'name': 'Bergues',
   'latitude': 50.968,
   'longitude': 2.441,
   'elevation': 17,
   'type': 'static',
   'license': {'license': 'CC BY',
    'url': 'https://creativecommons.org/licenses/by/2.0/fr/',
    'source': 'infoclimat.fr',
    'metadonnees': 'https://www.infoclimat.fr/stations/metadonnees.php?id=000R5'}},
  {'id': '07015',
   'name': 'Lille-Lesquin',
   'latitude': 50.575,
   'longitude': 3.092,
   'elevation': 47,
   'type': 'synop',
   'license': {'license': 'Etalab Open License',
    'url': 'https://www.etalab.gouv.fr/licence-ouverte-open-licence',
    'sourc

In [13]:
# Station entries carried by the first record.
data_to_insert[0]["stations"]

[{'id': '00052',
  'name': 'Armentières',
  'latitude': 50.689,
  'longitude': 2.877,
  'elevation': 16,
  'type': 'static',
  'license': {'license': 'CC BY',
   'url': 'https://creativecommons.org/licenses/by/2.0/fr/',
   'source': 'infoclimat.fr',
   'metadonnees': 'https://www.infoclimat.fr/stations/metadonnees.php?id=00052'}},
 {'id': '000R5',
  'name': 'Bergues',
  'latitude': 50.968,
  'longitude': 2.441,
  'elevation': 17,
  'type': 'static',
  'license': {'license': 'CC BY',
   'url': 'https://creativecommons.org/licenses/by/2.0/fr/',
   'source': 'infoclimat.fr',
   'metadonnees': 'https://www.infoclimat.fr/stations/metadonnees.php?id=000R5'}},
 {'id': '07015',
  'name': 'Lille-Lesquin',
  'latitude': 50.575,
  'longitude': 3.092,
  'elevation': 47,
  'type': 'synop',
  'license': {'license': 'Etalab Open License',
   'url': 'https://www.etalab.gouv.fr/licence-ouverte-open-licence',
   'source': 'Meteo-France via infoclimat.fr',
   'metadonnees': 'https://donneespubliques.mete

In [75]:
# First three entries stored under the 'hourly.STATIC0010' key.
data_to_insert[0]['hourly.STATIC0010'][0:3]

[{'id_station': 'STATIC0010',
  'dh_utc': '2024-10-05 00:00:00',
  'temperature': '4.7',
  'pression': '1020.6',
  'humidite': '97',
  'point_de_rosee': '4.2',
  'vent_moyen': '0',
  'vent_rafales': None,
  'vent_direction': '0',
  'pluie_3h': None,
  'pluie_1h': '0',
  'station': 'Hazebrouck'},
 {'id_station': 'STATIC0010',
  'dh_utc': '2024-10-05 00:10:00',
  'temperature': '4.7',
  'pression': '1020.7',
  'humidite': '97',
  'point_de_rosee': '4.2',
  'vent_moyen': '0',
  'vent_rafales': None,
  'vent_direction': '0',
  'pluie_3h': None,
  'pluie_1h': None,
  'station': 'Hazebrouck'},
 {'id_station': 'STATIC0010',
  'dh_utc': '2024-10-05 00:20:00',
  'temperature': '4.6',
  'pression': '1020.6',
  'humidite': '97',
  'point_de_rosee': '4.1',
  'vent_moyen': '0',
  'vent_rafales': None,
  'vent_direction': '0',
  'pluie_3h': None,
  'pluie_1h': None,
  'station': 'Hazebrouck'}]

In [78]:
# Fourth entry stored under the 'hourly.07015' key.
data_to_insert[0]['hourly.07015'][3]

{'id_station': '07015',
 'dh_utc': '2024-10-05 03:00:00',
 'temperature': '6.2',
 'pression': '1020.1',
 'humidite': '95',
 'point_de_rosee': '5.5',
 'visibilite': '2500',
 'vent_moyen': '3.6',
 'vent_rafales': '7.2',
 'vent_direction': '60',
 'pluie_3h': '0',
 'pluie_1h': '0',
 'neige_au_sol': None,
 'nebulosite': '',
 'temps_omm': '10',
 'station': 'Lille-Lesquin'}

In [15]:
# Print the Python type of every field of one raw 'hourly.07015' entry.
sample_record = data_to_insert[0]['hourly.07015'][3]
for i, j in sample_record.items():
    print(type(j),"est le type de",j)

<class 'str'> est le type de 07015
<class 'str'> est le type de 2024-10-05 03:00:00
<class 'str'> est le type de 6.2
<class 'str'> est le type de 1020.1
<class 'str'> est le type de 95
<class 'str'> est le type de 5.5
<class 'str'> est le type de 2500
<class 'str'> est le type de 3.6
<class 'str'> est le type de 7.2
<class 'str'> est le type de 60
<class 'str'> est le type de 0
<class 'str'> est le type de 0
<class 'NoneType'> est le type de None
<class 'str'> est le type de 
<class 'str'> est le type de 10


In [38]:
# ✅ 1. Connect to MongoDB: database "weather_data", collection "weather_station"
client = MongoClient(mongodb_address)
db = client["weather_data"]
collection = db["weather_station"]

# ✅ 2. The records built earlier are the data to load
weather_data = data_to_insert
first_record = weather_data[0]

# ✅ 3. Extract the station list and every "hourly.*" entry except "hourly._params"
stations = first_record.get("stations", [])
hourly_data = {
    key: value
    for key, value in first_record.items()
    if key.startswith("hourly.") and key != "hourly._params"
}

# ✅ 4. Flatten: chain the per-key lists of records into one single list
flattened_data = [
    record
    for records in hourly_data.values()
    for record in records
]



In [17]:
# Lookup table from station id to station name, built from the first record's station list.
Id_to_station = dict(
    (station['id'], station['name'])
    for station in data_to_insert[0]["stations"]
)
Id_to_station

{'00052': 'Armentières',
 '000R5': 'Bergues',
 '07015': 'Lille-Lesquin',
 'STATIC0010': 'Hazebrouck'}

In [18]:
# Set each record's 'station' from the id lookup table (in place);
# ids missing from the table get the name 'Unknown'.
for doc in flattened_data:
    station_name = Id_to_station.get(doc['id_station'], 'Unknown')
    doc['station'] = station_name

In [19]:
from datetime import datetime
import copy

def process_data(document):
    """Return a typed copy of one raw hourly observation.

    The input dict is deep-copied, so the caller's record is left untouched.
    ``dh_utc`` is parsed with the ``'%Y-%m-%d %H:%M:%S'`` format and every
    measurement field is cast to ``float`` or ``int``. Measurement fields
    absent from the input are added to the copy with their missing value, so
    every returned document carries the same measurement keys.

    A value counts as missing only when it is ``None`` or a blank string;
    a genuine zero (``0``, ``0.0``, ``'0'``) is converted, not discarded.

    Args:
        document: dict holding one observation; must contain ``dh_utc`` as a
            string in the format above.

    Returns:
        A new dict with converted values; missing measurements are ``None``
        (``''`` for ``nebulosite``).

    Raises:
        KeyError: if ``dh_utc`` is absent.
        TypeError: if ``dh_utc`` is not a string.
        ValueError: if ``dh_utc`` or a measurement cannot be parsed.
    """
    def _cast(value, caster, missing=None):
        # Only None and blank strings mean "no measurement"; a plain
        # truthiness test would also throw away real zero readings.
        if value is None or (isinstance(value, str) and not value.strip()):
            return missing
        return caster(value)

    # (field, caster, value used when the measurement is missing).
    # The order fixes where absent fields are appended in the result.
    field_casts = (
        ('temperature', float, None),
        ('pression', float, None),
        ('humidite', int, None),
        ('point_de_rosee', float, None),
        ('visibilite', int, None),
        ('vent_moyen', float, None),
        ('vent_rafales', float, None),
        ('vent_direction', int, None),
        ('pluie_3h', float, None),
        ('pluie_1h', float, None),
        ('neige_au_sol', float, None),
        ('nebulosite', str, ''),
        ('temps_omm', float, None),
    )

    document = copy.deepcopy(document)
    # The timestamp is mandatory: let a missing or malformed value raise.
    document['dh_utc'] = datetime.strptime(document['dh_utc'], '%Y-%m-%d %H:%M:%S')
    for field, caster, missing in field_casts:
        document[field] = _cast(document.get(field), caster, missing)

    return document

In [20]:
def process_data2(document):
    """Return a typed copy of one raw observation, converting only present fields.

    Table-driven variant of ``process_data``: each known field found in the
    document is passed through its converter. Unlike ``process_data``, fields
    absent from the input are not added, and a missing ``dh_utc`` value
    becomes ``None`` instead of raising.

    A value counts as missing only when it is ``None`` or a blank string;
    a genuine zero (``0``, ``0.0``, ``'0'``) is converted, not discarded.

    Args:
        document: dict holding one observation.

    Returns:
        A deep copy of ``document`` with converted values; missing values are
        ``None`` (``''`` for ``nebulosite``).

    Raises:
        ValueError: if a present, non-missing value cannot be parsed.
    """
    def _optional(caster, missing=None):
        # Wrap `caster` so that None / blank strings map to `missing`;
        # zero readings are real values and go through the caster.
        def convert(value):
            if value is None or (isinstance(value, str) and not value.strip()):
                return missing
            return caster(value)
        return convert

    # Define a mapping of fields to their conversion functions
    conversion_map = {
        'dh_utc': _optional(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')),
        'temperature': _optional(float),
        'pression': _optional(float),
        'humidite': _optional(int),
        'point_de_rosee': _optional(float),
        'visibilite': _optional(int),
        'vent_moyen': _optional(float),
        'vent_rafales': _optional(float),
        'vent_direction': _optional(int),
        'pluie_3h': _optional(float),
        'pluie_1h': _optional(float),
        'neige_au_sol': _optional(float),
        'nebulosite': _optional(str, missing=''),
        'temps_omm': _optional(float),
    }

    # Use deepcopy to avoid mutating the original document
    document = copy.deepcopy(document)

    # Apply each converter only to the fields that exist in the document
    for field, conversion_func in conversion_map.items():
        if field in document:
            document[field] = conversion_func(document[field])

    return document


In [21]:
def rename_columns(document):
    """Rename the French source keys of ``document`` to the English, unit-suffixed names.

    The dict is modified in place and also returned. Each renamed key is
    removed and re-inserted, so renamed entries end up after the untouched
    ones, in the order of the table below. Keys not listed are left alone.
    """
    # (source key, target key), applied in this order.
    translations = (
        ('dh_utc', 'datetime'),
        ('temperature', 'temperature_°C'),
        ('pression', 'pressure_hPa'),
        ('humidite', 'humidity_%'),
        ('point_de_rosee', 'dew_point_°C'),
        ('visibilite', 'visibility_m'),  # presumably metres; verify against the source metadata
        ('vent_moyen', 'wind_speed_kph'),
        ('vent_rafales', 'wind_gust_kph'),
        ('vent_direction', 'wind_dir'),
        ('pluie_3h', 'precip_rate_mm/hr (3hrs)'),
        ('pluie_1h', 'precip_rate_mm/hr'),
        ('neige_au_sol', 'snow_depth_mm'),
        ('nebulosite', 'cloud_coverage'),
        # NOTE(review): 'temps_omm' reads like a WMO present-weather code
        # rather than solar irradiance; confirm that this target name is intended.
        ('temps_omm', 'solar_w/m²'),
    )

    for source_key, target_key in translations:
        if source_key not in document:
            continue
        document[target_key] = document.pop(source_key)

    return document

 

In [22]:
# Try the type conversion on a single flattened record and display the result.
document1 = process_data(flattened_data[4])
document1

{'id_station': '07015',
 'dh_utc': datetime.datetime(2024, 10, 5, 4, 0),
 'temperature': 5.5,
 'pression': 1019.9,
 'humidite': 96,
 'point_de_rosee': 4.9,
 'visibilite': 500,
 'vent_moyen': 3.6,
 'vent_rafales': 3.6,
 'vent_direction': 90,
 'pluie_3h': None,
 'pluie_1h': 0.0,
 'neige_au_sol': None,
 'nebulosite': '',
 'temps_omm': 44.0,
 'station': 'Lille-Lesquin'}

In [23]:
# rename_columns modifies its argument in place, so document1 itself carries the new keys after this call.
rename_columns(document1)

{'id_station': '07015',
 'station': 'Lille-Lesquin',
 'datetime': datetime.datetime(2024, 10, 5, 4, 0),
 'temperature_°C': 5.5,
 'pressure_hPa': 1019.9,
 'humidity_%': 96,
 'dew_point_°C': 4.9,
 'visibility_m': 500,
 'wind_speed_kph': 3.6,
 'wind_gust_kph': 3.6,
 'wind_dir': 90,
 'precip_rate_mm/hr (3hrs)': None,
 'precip_rate_mm/hr': 0.0,
 'snow_depth_mm': None,
 'cloud_coverage': '',
 'solar_w/m²': 44.0}

In [24]:
# Keys of document1 after the rename.
document1.keys()

dict_keys(['id_station', 'station', 'datetime', 'temperature_°C', 'pressure_hPa', 'humidity_%', 'dew_point_°C', 'visibility_m', 'wind_speed_kph', 'wind_gust_kph', 'wind_dir', 'precip_rate_mm/hr (3hrs)', 'precip_rate_mm/hr', 'snow_depth_mm', 'cloud_coverage', 'solar_w/m²'])

In [25]:
# Print the Python type of every field of the converted sample document.
for i, j in document1.items():
    print(type(j),"est le type de",j)

<class 'str'> est le type de 07015
<class 'str'> est le type de Lille-Lesquin
<class 'datetime.datetime'> est le type de 2024-10-05 04:00:00
<class 'float'> est le type de 5.5
<class 'float'> est le type de 1019.9
<class 'int'> est le type de 96
<class 'float'> est le type de 4.9
<class 'int'> est le type de 500
<class 'float'> est le type de 3.6
<class 'float'> est le type de 3.6
<class 'int'> est le type de 90
<class 'NoneType'> est le type de None
<class 'float'> est le type de 0.0
<class 'NoneType'> est le type de None
<class 'str'> est le type de 
<class 'float'> est le type de 44.0


In [26]:
# The parsed timestamp, stored under 'datetime' by rename_columns.
document1['datetime']

datetime.datetime(2024, 10, 5, 4, 0)

In [27]:
# ✅ 5.1. Convert into correct types
# process_data returns a converted copy, so each record is processed exactly
# once here (a loop that only rebinds its own variable would discard the result).
processed_data = [process_data(doc) for doc in flattened_data]

# ✅ 5.1.5 Convert the column names into the general ones
# rename_columns works in place on the converted copies built just above.
processed_data = [rename_columns(doc) for doc in processed_data]



In [28]:
# Peek at the second and third processed documents.
processed_data[1:3]

[{'id_station': '07015',
  'station': 'Lille-Lesquin',
  'datetime': datetime.datetime(2024, 10, 5, 1, 0),
  'temperature_°C': 7.5,
  'pressure_hPa': 1020.6,
  'humidity_%': 92,
  'dew_point_°C': 6.3,
  'visibility_m': 7000,
  'wind_speed_kph': 3.6,
  'wind_gust_kph': 7.2,
  'wind_dir': 30,
  'precip_rate_mm/hr (3hrs)': None,
  'precip_rate_mm/hr': 0.0,
  'snow_depth_mm': None,
  'cloud_coverage': '',
  'solar_w/m²': None},
 {'id_station': '07015',
  'station': 'Lille-Lesquin',
  'datetime': datetime.datetime(2024, 10, 5, 2, 0),
  'temperature_°C': 7.4,
  'pressure_hPa': 1020.3,
  'humidity_%': 93,
  'dew_point_°C': 6.3,
  'visibility_m': 4400,
  'wind_speed_kph': 7.2,
  'wind_gust_kph': 7.2,
  'wind_dir': 40,
  'precip_rate_mm/hr (3hrs)': None,
  'precip_rate_mm/hr': 0.0,
  'snow_depth_mm': None,
  'cloud_coverage': '',
  'solar_w/m²': 10.0}]

In [29]:
# Keys present in every processed document: start from the first document's
# keys and intersect with the keys of all the others.
common_keys = set(processed_data[0].keys())
common_keys.intersection_update(*(doc.keys() for doc in processed_data[1:]))
len(common_keys)

16

In [30]:
# Union of the keys of every processed document. If its size matches the
# number of common keys, all documents share exactly the same fields.
all_keys = set().union(*(document.keys() for document in processed_data))
len(all_keys)

16

In [None]:

# Function to generate ObjectId from a unique key
def generate_objectid(unique_str):
    """Derive a deterministic ObjectId from ``unique_str``.

    The UTF-8 bytes of the string are hashed with MD5 and the first 24 hex
    digits of the digest are passed to ``ObjectId``, so the same input always
    produces the same id.
    """
    digest = hashlib.md5(unique_str.encode())
    first_24_hex_digits = digest.hexdigest()[:24]
    return ObjectId(first_24_hex_digits)

# Give every document a deterministic _id derived from its timestamp and
# station name, so re-loading the same observation yields the same _id.
# NOTE(review): two stations sharing a name (e.g. several 'Unknown') at the
# same timestamp would get the same _id; confirm names are unique.
for doc in processed_data:
    unique_str = ''.join((str(doc['datetime']), doc['station']))
    doc['_id'] = generate_objectid(unique_str)



{'id_station': '07015', 'station': 'Lille-Lesquin', 'datetime': datetime.datetime(2024, 10, 5, 0, 0), 'temperature_°C': 7.6, 'pressure_hPa': 1020.7, 'humidity_%': 89, 'dew_point_°C': 5.9, 'visibility_m': 6000, 'wind_speed_kph': 3.6, 'wind_gust_kph': 7.2, 'wind_dir': 90, 'precip_rate_mm/hr (3hrs)': 0.0, 'precip_rate_mm/hr': 0.0, 'snow_depth_mm': None, 'cloud_coverage': '', 'solar_w/m²': None, '_id': ObjectId('6844ef7d013a32b44e5c1b85')}
{'id_station': '07015', 'station': 'Lille-Lesquin', 'datetime': datetime.datetime(2024, 10, 5, 1, 0), 'temperature_°C': 7.5, 'pressure_hPa': 1020.6, 'humidity_%': 92, 'dew_point_°C': 6.3, 'visibility_m': 7000, 'wind_speed_kph': 3.6, 'wind_gust_kph': 7.2, 'wind_dir': 30, 'precip_rate_mm/hr (3hrs)': None, 'precip_rate_mm/hr': 0.0, 'snow_depth_mm': None, 'cloud_coverage': '', 'solar_w/m²': None, '_id': ObjectId('3df7545ee2ade0bcbb51b66d')}
{'id_station': '07015', 'station': 'Lille-Lesquin', 'datetime': datetime.datetime(2024, 10, 5, 2, 0), 'temperature_°C':

In [66]:
# Insert documents
DUPLICATE_KEY_CODE = 11000  # write-error code reported for a duplicate key
VALIDATION_CODE = 121       # write-error code reported for a failed validation
MAX_LOGGED_ERRORS = 3       # how many errors of each kind are logged in detail

if not processed_data:
    # insert_many() raises on an empty list, so report and skip the write.
    logger.warning("No documents to insert; skipping the MongoDB write.")
else:
    try:
        # Insert documents, set 'ordered=False' to continue on duplicate key error
        result = collection.insert_many(processed_data, ordered=False)

        # Log the number of inserted documents
        inserted_count = len(result.inserted_ids)
        logger.info(f"Successfully inserted {inserted_count} documents into MongoDB at {mongodb_address}!")

    except errors.BulkWriteError as e:
        # Extract useful summary info without dumping full error
        inserted_count = e.details.get('nInserted', 0)
        write_errors = e.details.get('writeErrors', [])

        # Separate errors into categories (one pass per category, reused below)
        duplicate_errors = [error for error in write_errors if error.get('code') == DUPLICATE_KEY_CODE]
        validation_errors = [error for error in write_errors if error.get('code') == VALIDATION_CODE]
        duplicate_count = len(duplicate_errors)
        validation_count = len(validation_errors)
        other_count = len(write_errors) - duplicate_count - validation_count

        # Log the counts of different error types
        logger.warning(f"Duplicate key error: {duplicate_count} documents were skipped.")
        logger.warning(f"Validation error: {validation_count} documents failed validation.")
        logger.warning(f"Other errors: {other_count} documents encountered other errors.")

        # Successfully inserted documents
        logger.info(f"{inserted_count} documents were successfully inserted despite this error.")

        # Log the first few duplicate key errors in detail
        for error in duplicate_errors[:MAX_LOGGED_ERRORS]:
            errmsg = error.get('errmsg', 'No detailed message available')
            logger.info(f"Duplicate key error: {errmsg}")

            # Extract the duplicate key information from the error details
            dup_id = error.get('keyValue', {}).get('_id', 'unknown')
            logger.debug(f"Duplicate _id: {dup_id}")

            # Log the failed document data for duplicate key errors
            logger.info(f"Failed document data for duplicate key: {error.get('op', {})}")

        if duplicate_count > MAX_LOGGED_ERRORS:
            logger.info(f"...and {duplicate_count - MAX_LOGGED_ERRORS} more duplicates were skipped.")

        # Log the first few validation errors in detail
        for error in validation_errors[:MAX_LOGGED_ERRORS]:
            errmsg = error.get('errmsg', 'No detailed message available')
            logger.info(f"Validation failed for document: {errmsg}")
            logger.info(f"Failed document data for validation error: {error.get('op', {})}")

        if validation_count > MAX_LOGGED_ERRORS:
            logger.debug(f"...and {validation_count - MAX_LOGGED_ERRORS} more validation errors occurred.")

        # If there are any other errors (non-validation, non-duplicate), log them
        if other_count > 0:
            logger.debug(f"...and {other_count} other errors occurred.")


INFO:__main__:0 documents were successfully inserted despite this error.
INFO:__main__:Duplicate key error: E11000 duplicate key error collection: weather_data.weather_station index: _id_ dup key: { _id: ObjectId('67f2b4be447757d499edca17') }
INFO:__main__:Failed document data for duplicate key: {'id_station': '07015', 'station': 'Lille-Lesquin', 'datetime': datetime.datetime(2024, 10, 5, 0, 0), 'temperature_°C': 7.6, 'pressure_hPa': 1020.7, 'humidity_%': 89, 'dew_point_°C': 5.9, 'visibility_m': 6000, 'wind_speed_kph': 3.6, 'wind_gust_kph': 7.2, 'wind_dir': 90, 'precip_rate_mm/hr (3hrs)': 0.0, 'precip_rate_mm/hr': 0.0, 'snow_depth_mm': None, 'cloud_coverage': '', 'solar_w/m²': None, '_id': ObjectId('67f2b4be447757d499edca17')}
INFO:__main__:Duplicate key error: E11000 duplicate key error collection: weather_data.weather_station index: _id_ dup key: { _id: ObjectId('67f2b4be447757d499edca18') }
INFO:__main__:Failed document data for duplicate key: {'id_station': '07015', 'station': 'Lil

In [69]:
# ✅ 6. Verify Data
print("Sample Data from MongoDB:")

# Stage 1: group the collection by the 'station' field and keep the first
# document met in each group.
group_by_station = {
    "$group": {
        "_id": "$station",
        "first_document": {"$first": "$$ROOT"},
    }
}
# Stage 2: promote that kept document back to the top level.
promote_first_document = {"$replaceRoot": {"newRoot": "$first_document"}}
pipeline = [group_by_station, promote_first_document]

# Run the aggregation and materialise the cursor into a list.
result = collection.aggregate(pipeline)
distinct_documents = list(result)

# Print one sample document per station.
for doc in distinct_documents:
    print(doc)

Sample Data from MongoDB:
{'_id': ObjectId('67f2b4be447757d499edca17'), 'id_station': '07015', 'station': 'Lille-Lesquin', 'datetime': datetime.datetime(2024, 10, 5, 0, 0), 'temperature_°C': 7.6, 'pressure_hPa': 1020.7, 'humidity_%': 89, 'dew_point_°C': 5.9, 'visibility_m': 6000, 'wind_speed_kph': 3.6, 'wind_gust_kph': 7.2, 'wind_dir': 90, 'precip_rate_mm/hr (3hrs)': 0.0, 'precip_rate_mm/hr': 0.0, 'snow_depth_mm': None, 'cloud_coverage': '', 'solar_w/m²': None}
{'_id': ObjectId('67f2b4be447757d499edcbbc'), 'id_station': '000R5', 'station': 'Bergues', 'datetime': datetime.datetime(2024, 10, 5, 0, 0), 'temperature_°C': 8.4, 'pressure_hPa': 1019.3, 'humidity_%': 86, 'dew_point_°C': 6.2, 'visibility_m': None, 'wind_speed_kph': 1.4, 'wind_gust_kph': None, 'wind_dir': 113, 'precip_rate_mm/hr (3hrs)': 0.0, 'precip_rate_mm/hr': 0.0, 'snow_depth_mm': None, 'cloud_coverage': '', 'solar_w/m²': None}
{'_id': ObjectId('05cfc351250674cec2eaef31'), 'station': 'Ichtegem', 'datetime': datetime.datetime