In [1]:
import sys
import json
import re
import argparse
from datetime import datetime
from io import BytesIO
import os
import hashlib
import logging

import pandas as pd
import boto3
from pymongo import MongoClient, errors
from bson import ObjectId


In [2]:
# Configure logging once for the whole notebook/script. At INFO level the
# logger.debug(...) messages emitted further down are suppressed.
logging.basicConfig(level=logging.INFO)  # Change to DEBUG for higher verbosity
# Module-level logger shared by every cell below
logger = logging.getLogger(__name__)

In [3]:
def load_secrets(file_path):
    """Load a JSON secrets file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object decoded by ``json.load``.

    If ``file_path`` does not exist, an error is logged and the process
    exits with status 1.
    """
    if os.path.exists(file_path):
        with open(file_path, 'r') as secrets_file:
            return json.load(secrets_file)

    # Nothing to read: report the problem and stop the run.
    logger.error(f"Error: {file_path} not found!")
    sys.exit(1)
# Read the credentials from secrets.json (path is relative to the working directory)
secrets = load_secrets('secrets.json')

# AWS settings used later to build the S3 client; a missing key raises KeyError here
aws_access_key_id = secrets['AWS_ACCESS_KEY_ID']
aws_secret_access_key = secrets['AWS_SECRET_ACCESS_KEY']
aws_region = secrets['AWS_REGION']

In [4]:
# Delete if not in Jupyter Notebook
# Inside a Jupyter kernel sys.argv holds the kernel's own arguments, so it is
# replaced with a fixed command line before argparse reads it.
if 'ipykernel' in sys.modules:
    sys.argv = ['xlsx.py', 'Ichtegem']


parser = argparse.ArgumentParser(description="Process an Excel file")
parser.add_argument(
    'file',
    # nargs='?' makes the positional optional; without it argparse ignores
    # `default` and always requires the argument.
    nargs='?',
    default='Ichtegem',
    # Reject unknown stations here instead of failing later when the S3 key
    # is looked up.
    choices=['Ichtegem', 'Madeleine'],
    help='The name of the station to process. Only accepts Ichtegem or Madeleine'
)

parser.add_argument(
    "--mongodb_address",
    default="mongodb://localhost:27017/",
    help="The MongoDB address (default: mongodb://localhost:27017/)"
)

args = parser.parse_args()

# Connection string used when the MongoDB client is created
mongodb_address = args.mongodb_address

In [5]:
# Build the S3 client from the credentials and region read from secrets.json
s3_client_options = {
    'aws_access_key_id': aws_access_key_id,
    'aws_secret_access_key': aws_secret_access_key,
    'region_name': aws_region,
}
s3 = boto3.client('s3', **s3_client_options)

In [6]:
bucket_name = 'greencoop-airbyte'

# Object key of each supported station's workbook inside the bucket
station_file_keys = {
    'Ichtegem': "greencoop-airbyte/Ichtegem.xlsx",
    'Madeleine': "greencoop-airbyte/Madeleine.xlsx",
}

# Fail with a clear message for an unknown station; previously `file_key`
# was simply left undefined and the next statement raised NameError.
if args.file not in station_file_keys:
    logger.error(
        f"Unknown station '{args.file}'; expected one of {sorted(station_file_keys)}"
    )
    sys.exit(1)
file_key = station_file_keys[args.file]

# Download the whole workbook into memory
s3_object = s3.get_object(Bucket=bucket_name, Key=file_key)
file_content = s3_object['Body'].read()

# Load the Excel file with pandas
excel_file = pd.ExcelFile(BytesIO(file_content), engine='openpyxl')

In [7]:
# Define column renaming dictionary (with explicit units).
# Keys are the column headers to look for in each worksheet; values are the
# names used from here on. The unit suffixes at this stage are the imperial
# ones (°F, mph, inHg, in); convertToMetric() replaces them further down.
column_mapping = {
    "Time": "time",
    "Temperature": "temperature_°F",
    "Dew Point": "dew_point_°F",
    "Humidity": "humidity_%",
    "Wind": "wind_dir",
    "Speed": "wind_speed_mph",
    "Gust": "wind_gust_mph",
    "Pressure": "pressure_inHg",
    "Precip. Rate.": "precip_rate_in/hr",
    "Precip. Accum.": "precip_accum_in",
    "UV": "uv_index",
    "Solar": "solar_w/m²"
}

def clean_value(value):
    """Extract the first number embedded in a string.

    Args:
        value: A cell value of any type.

    Returns:
        For a string, the first signed integer/decimal number it contains as
        a float, or None when it contains no number. Any non-string value
        (including NaN and None) is returned unchanged.
    """
    if not isinstance(value, str):
        return value

    number = re.search(r"[-+]?\d*\.?\d+", value)
    if number is None:
        return None
    return float(number.group())


# List to store processed DataFrames (one per worksheet)
dfs = []

# Loop through all sheets; the sheet name supplies the date for every row on it
for sheet_name in excel_file.sheet_names:
    # Read the current sheet, treating "", "None", "NA" and "NaN" cells as missing
    df = excel_file.parse(sheet_name, na_values=["", "None", "NA", "NaN"])
    # Rename columns for consistency (headers absent from column_mapping keep their name)
    df.rename(columns=column_mapping, inplace=True)
    
    # Apply cleaning function to all columns (except 'time' and 'wind_dir'):
    # clean_value keeps only the numeric part of string cells
    for col in df.columns:
        if col not in ["time", "wind_dir"]:  # Exclude categorical columns
            df[col] = df[col].apply(clean_value)

    # Drop rows in which every column is missing
    df = df.dropna(how='all')

    # Convert sheet name to a date. The "%d%m%y" format means DDMMYY
    # (two-digit year), e.g. "011024" -> 2024-10-01
    date_formatted = pd.to_datetime(sheet_name, format="%d%m%y")
    
    # Add the date column
    df.insert(0, "date", date_formatted)
    # Parse the HH:MM:SS strings, then join date and time into one timestamp
    df["time"] = pd.to_datetime(df["time"], format="%H:%M:%S").dt.time
    df["datetime"] = pd.to_datetime(df["date"].astype(str) + " " + df["time"].astype(str))
    # The separate date/time columns are redundant once 'datetime' exists
    df = df.drop(columns=["date", "time"])
    # Move 'datetime' to the first position, keeping the other columns in order
    df = df[["datetime"] + [col for col in df.columns if col != "datetime"]]

    # Append cleaned DataFrame
    dfs.append(df)

# Combine all sheets into one DataFrame with a fresh 0..n-1 index
final_df = pd.concat(dfs, ignore_index=True)

In [8]:
# Display unique values for each column
# for col in final_df.columns:
#    unique_values = final_df[col].unique()
#    print(f"Column: {col}")
#    print(f"Unique Values: {unique_values[:100]}")  # Display only the first 100 unique values
#    print("-" * 50)


In [9]:
final_df

Unnamed: 0,datetime,temperature_°F,dew_point_°F,humidity_%,wind_dir,wind_speed_mph,wind_gust_mph,pressure_inHg,precip_rate_in/hr,precip_accum_in,uv_index,solar_w/m²
0,2024-10-01 00:04:00,56.8,53.1,87.0,WSW,8.2,10.4,29.48,0.0,0.00,0.0,0.0
1,2024-10-01 00:09:00,56.8,52.9,87.0,WSW,7.9,9.8,29.47,0.0,0.00,0.0,0.0
2,2024-10-01 00:14:00,57.0,52.8,86.0,West,10.3,12.8,29.47,0.0,0.00,0.0,0.0
3,2024-10-01 00:19:00,57.2,52.7,85.0,WSW,9.7,12.2,29.47,0.0,0.00,0.0,0.0
4,2024-10-01 00:24:00,57.2,52.7,85.0,WSW,9.7,11.9,29.47,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1894,2024-10-07 14:29:00,65.2,56.3,73.0,WSW,3.7,4.8,29.37,0.0,0.01,4.0,414.8
1895,2024-10-07 14:34:00,65.5,56.3,72.0,WSW,3.4,4.2,29.36,0.0,0.01,4.0,416.6
1896,2024-10-07 14:39:00,65.7,55.5,70.0,WSW,4.3,6.1,29.36,0.0,0.01,3.0,338.5
1897,2024-10-07 14:44:00,66.4,56.1,70.0,West,2.9,4.1,29.36,0.0,0.01,3.0,349.6


In [10]:
def convertToMetric(df):
    """
    Convert the imperial measurements to metric and apply a few small
    adjustments so that all data is formatted the same way.

    Adds °C, kph, hPa and mm columns (each rounded to one decimal), casts
    'humidity_%' to int and removes the imperial source columns. The input
    frame is not modified; a converted copy is returned.
    """
    def _fahrenheit_to_celsius(fahrenheit):
        return ((fahrenheit - 32) * 5/9).round(1)

    def _scale(series, factor):
        return (series * factor).round(1)

    metric = df.copy()

    metric['dew_point_°C'] = _fahrenheit_to_celsius(metric['dew_point_°F'])
    metric['temperature_°C'] = _fahrenheit_to_celsius(metric['temperature_°F'])
    metric['wind_speed_kph'] = _scale(metric['wind_speed_mph'], 1.60934)   # miles -> kilometres
    metric['wind_gust_kph'] = _scale(metric['wind_gust_mph'], 1.60934)
    metric['pressure_hPa'] = _scale(metric['pressure_inHg'], 33.8639)      # inHg -> hPa
    metric['precip_rate_mm/hr'] = _scale(metric['precip_rate_in/hr'], 25.4)  # inches -> millimetres
    metric['precip_accum_mm'] = _scale(metric['precip_accum_in'], 25.4)

    metric['humidity_%'] = metric['humidity_%'].astype(int)

    imperial_columns = [
        'temperature_°F', 'dew_point_°F',
        'wind_speed_mph', 'wind_gust_mph',
        'pressure_inHg',
        'precip_rate_in/hr', 'precip_accum_in',
    ]
    return metric.drop(columns=imperial_columns)


In [11]:
# Convert the combined readings to metric units; convertToMetric works on a copy, so final_df is left untouched
final_df2 = convertToMetric(final_df)

In [12]:
final_df2

Unnamed: 0,datetime,humidity_%,wind_dir,uv_index,solar_w/m²,dew_point_°C,temperature_°C,wind_speed_kph,wind_gust_kph,pressure_hPa,precip_rate_mm/hr,precip_accum_mm
0,2024-10-01 00:04:00,87,WSW,0.0,0.0,11.7,13.8,13.2,16.7,998.3,0.0,0.0
1,2024-10-01 00:09:00,87,WSW,0.0,0.0,11.6,13.8,12.7,15.8,998.0,0.0,0.0
2,2024-10-01 00:14:00,86,West,0.0,0.0,11.6,13.9,16.6,20.6,998.0,0.0,0.0
3,2024-10-01 00:19:00,85,WSW,0.0,0.0,11.5,14.0,15.6,19.6,998.0,0.0,0.0
4,2024-10-01 00:24:00,85,WSW,0.0,0.0,11.5,14.0,15.6,19.2,998.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1894,2024-10-07 14:29:00,73,WSW,4.0,414.8,13.5,18.4,6.0,7.7,994.6,0.0,0.3
1895,2024-10-07 14:34:00,72,WSW,4.0,416.6,13.5,18.6,5.5,6.8,994.2,0.0,0.3
1896,2024-10-07 14:39:00,70,WSW,3.0,338.5,13.1,18.7,6.9,9.8,994.2,0.0,0.3
1897,2024-10-07 14:44:00,70,West,3.0,349.6,13.4,19.1,4.7,6.6,994.2,0.0,0.3


In [13]:
def wind_dir_to_angle(df):
    """Return a copy of ``df`` whose 'wind_dir' column holds angles in degrees.

    Compass labels (16-point abbreviations such as 'WSW' plus the spelled-out
    'North', 'South', 'East', 'West') are mapped to degrees, with north = 0,
    east = 90, south = 180 and west = 270.

    Values that are already numeric are kept as they are, so calling the
    function again on converted data (e.g. re-running the notebook cell)
    does not blank the column out. Labels that are neither known nor numeric
    become NaN.

    Args:
        df: DataFrame with a 'wind_dir' column.

    Returns:
        A new DataFrame; the input is not modified.
    """
    df = df.copy()
    dir_to_angle = {
        'N': 0,
        'NNE': 22.5,
        'NE': 45,
        'ENE': 67.5,
        'E': 90,
        'ESE': 112.5,
        'SE': 135,
        'SSE': 157.5,
        'S': 180,
        'SSW': 202.5,
        'SW': 225,
        'WSW': 247.5,
        'W': 270,
        'WNW': 292.5,
        'NW': 315,
        'NNW': 337.5,
        'North': 0,
        'South': 180,
        'East': 90,
        'West': 270
    }
    raw_direction = df['wind_dir']
    mapped = raw_direction.map(dir_to_angle)
    # A plain dict lookup turns every already-converted float into NaN, so
    # fall back to the value's own numeric reading where the lookup missed.
    already_numeric = pd.to_numeric(raw_direction, errors='coerce')
    df['wind_dir'] = mapped.fillna(already_numeric)
    return df


In [14]:
# Replace the compass labels in 'wind_dir' with numeric angles
final_df2 = wind_dir_to_angle(final_df2)

In [15]:
final_df2["precip_rate_mm/hr"].unique()

array([0. , 1.8, 1.3, 3. , 7.9, 6.1, 8.9, 4.8])

In [16]:
final_df2["precip_accum_mm"].unique()

array([0. , 0.3, 0.5, 0.8, 1. , 1.3, 1.5, 1.8, 3. , 3.6, 3.8, 4.1, 4.3,
       4.6, 2.3])

In [17]:
final_df2["wind_dir"].unique()   

array([247.5, 270. , 225. , 202.5, 180. , 157.5,  90. , 135. , 292.5,
       112.5,  67.5,  45. ,  22.5,   0. , 337.5,   nan, 315. ])

In [18]:
final_df2

Unnamed: 0,datetime,humidity_%,wind_dir,uv_index,solar_w/m²,dew_point_°C,temperature_°C,wind_speed_kph,wind_gust_kph,pressure_hPa,precip_rate_mm/hr,precip_accum_mm
0,2024-10-01 00:04:00,87,247.5,0.0,0.0,11.7,13.8,13.2,16.7,998.3,0.0,0.0
1,2024-10-01 00:09:00,87,247.5,0.0,0.0,11.6,13.8,12.7,15.8,998.0,0.0,0.0
2,2024-10-01 00:14:00,86,270.0,0.0,0.0,11.6,13.9,16.6,20.6,998.0,0.0,0.0
3,2024-10-01 00:19:00,85,247.5,0.0,0.0,11.5,14.0,15.6,19.6,998.0,0.0,0.0
4,2024-10-01 00:24:00,85,247.5,0.0,0.0,11.5,14.0,15.6,19.2,998.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1894,2024-10-07 14:29:00,73,247.5,4.0,414.8,13.5,18.4,6.0,7.7,994.6,0.0,0.3
1895,2024-10-07 14:34:00,72,247.5,4.0,416.6,13.5,18.6,5.5,6.8,994.2,0.0,0.3
1896,2024-10-07 14:39:00,70,247.5,3.0,338.5,13.1,18.7,6.9,9.8,994.2,0.0,0.3
1897,2024-10-07 14:44:00,70,270.0,3.0,349.6,13.4,19.1,4.7,6.6,994.2,0.0,0.3


In [19]:
# Tag every row with the station name given on the command line
final_df2['station'] = args.file

In [20]:
# Derive a deterministic ObjectId from a unique key
def generate_objectid(unique_str):
    """Build an ObjectId from the MD5 digest of ``unique_str``.

    The string is hashed with MD5 and the first 24 hexadecimal characters of
    the digest are passed to ``ObjectId``, so equal inputs always give equal
    ids.
    """
    digest = hashlib.md5(unique_str.encode()).hexdigest()
    return ObjectId(digest[:24])  # ObjectId expects exactly 24 hex characters

# Derive each row's _id from its timestamp and station name, so the same
# reading always gets the same ObjectId.
final_df2['_id'] = final_df2.apply(lambda row: generate_objectid(str(row['datetime']) + row['station']), axis=1)

final_df2

Unnamed: 0,datetime,humidity_%,wind_dir,uv_index,solar_w/m²,dew_point_°C,temperature_°C,wind_speed_kph,wind_gust_kph,pressure_hPa,precip_rate_mm/hr,precip_accum_mm,station,_id
0,2024-10-01 00:04:00,87,247.5,0.0,0.0,11.7,13.8,13.2,16.7,998.3,0.0,0.0,Ichtegem,05cfc351250674cec2eaef31
1,2024-10-01 00:09:00,87,247.5,0.0,0.0,11.6,13.8,12.7,15.8,998.0,0.0,0.0,Ichtegem,653906ab4ca02c0973318d93
2,2024-10-01 00:14:00,86,270.0,0.0,0.0,11.6,13.9,16.6,20.6,998.0,0.0,0.0,Ichtegem,11b2154e5d15c8d8e82b68ac
3,2024-10-01 00:19:00,85,247.5,0.0,0.0,11.5,14.0,15.6,19.6,998.0,0.0,0.0,Ichtegem,024abe9c63090bfa96083df0
4,2024-10-01 00:24:00,85,247.5,0.0,0.0,11.5,14.0,15.6,19.2,998.0,0.0,0.0,Ichtegem,935b443032bbaeac73e87365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1894,2024-10-07 14:29:00,73,247.5,4.0,414.8,13.5,18.4,6.0,7.7,994.6,0.0,0.3,Ichtegem,6982882297d2a0044dfdb263
1895,2024-10-07 14:34:00,72,247.5,4.0,416.6,13.5,18.6,5.5,6.8,994.2,0.0,0.3,Ichtegem,8cc49767eaced4dda9252893
1896,2024-10-07 14:39:00,70,247.5,3.0,338.5,13.1,18.7,6.9,9.8,994.2,0.0,0.3,Ichtegem,0a5b3af8e07cdb5b85605237
1897,2024-10-07 14:44:00,70,270.0,3.0,349.6,13.4,19.1,4.7,6.6,994.2,0.0,0.3,Ichtegem,572f641ab5406d346effad07


In [21]:
# Column order of the stored documents: identifying fields first, then the
# measurements, with '_id' last
output_columns = [
    'station',
    'datetime',
    'temperature_°C',
    'dew_point_°C',
    'humidity_%',
    'wind_dir',
    'wind_speed_kph',
    'wind_gust_kph',
    'pressure_hPa',
    'precip_rate_mm/hr',
    'precip_accum_mm',
    'uv_index',
    'solar_w/m²',
    '_id',
]
final_df2 = final_df2[output_columns]

In [22]:
final_df2

Unnamed: 0,station,datetime,temperature_°C,dew_point_°C,humidity_%,wind_dir,wind_speed_kph,wind_gust_kph,pressure_hPa,precip_rate_mm/hr,precip_accum_mm,uv_index,solar_w/m²,_id
0,Ichtegem,2024-10-01 00:04:00,13.8,11.7,87,247.5,13.2,16.7,998.3,0.0,0.0,0.0,0.0,05cfc351250674cec2eaef31
1,Ichtegem,2024-10-01 00:09:00,13.8,11.6,87,247.5,12.7,15.8,998.0,0.0,0.0,0.0,0.0,653906ab4ca02c0973318d93
2,Ichtegem,2024-10-01 00:14:00,13.9,11.6,86,270.0,16.6,20.6,998.0,0.0,0.0,0.0,0.0,11b2154e5d15c8d8e82b68ac
3,Ichtegem,2024-10-01 00:19:00,14.0,11.5,85,247.5,15.6,19.6,998.0,0.0,0.0,0.0,0.0,024abe9c63090bfa96083df0
4,Ichtegem,2024-10-01 00:24:00,14.0,11.5,85,247.5,15.6,19.2,998.0,0.0,0.0,0.0,0.0,935b443032bbaeac73e87365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1894,Ichtegem,2024-10-07 14:29:00,18.4,13.5,73,247.5,6.0,7.7,994.6,0.0,0.3,4.0,414.8,6982882297d2a0044dfdb263
1895,Ichtegem,2024-10-07 14:34:00,18.6,13.5,72,247.5,5.5,6.8,994.2,0.0,0.3,4.0,416.6,8cc49767eaced4dda9252893
1896,Ichtegem,2024-10-07 14:39:00,18.7,13.1,70,247.5,6.9,9.8,994.2,0.0,0.3,3.0,338.5,0a5b3af8e07cdb5b85605237
1897,Ichtegem,2024-10-07 14:44:00,19.1,13.4,70,270.0,4.7,6.6,994.2,0.0,0.3,3.0,349.6,572f641ab5406d346effad07


In [23]:
final_df2.columns

Index(['station', 'datetime', 'temperature_°C', 'dew_point_°C', 'humidity_%',
       'wind_dir', 'wind_speed_kph', 'wind_gust_kph', 'pressure_hPa',
       'precip_rate_mm/hr', 'precip_accum_mm', 'uv_index', 'solar_w/m²',
       '_id'],
      dtype='object')

In [33]:
# MongoDB setup: connect to the address given by --mongodb_address
client = MongoClient(mongodb_address)
# Target database and collection for the weather readings
db = client["weather_data"]
collection = db["weather_station"]

In [35]:
# One dict per row, ready to be stored as MongoDB documents
records = final_df2.to_dict(orient='records')

# Bulk-insert the documents and summarise any per-document failures
try:
    # ordered=False: keep inserting the remaining documents after one fails
    # (e.g. on a duplicate key) instead of stopping at the first error
    result = collection.insert_many(records, ordered=False)

except errors.BulkWriteError as e:
    # Work from the summary in e.details rather than dumping the full error
    details = e.details
    inserted_count = details.get('nInserted', 0)
    write_errors = details.get('writeErrors', [])

    # Split the errors by server error code
    duplicate_errors = [err for err in write_errors if err.get('code') == 11000]   # duplicate key
    validation_errors = [err for err in write_errors if err.get('code') == 121]    # schema validation
    duplicate_count = len(duplicate_errors)
    validation_count = len(validation_errors)
    other_count = len(write_errors) - duplicate_count - validation_count

    # Per-category totals
    logger.warning(f"Duplicate key error: {duplicate_count} documents were skipped.")
    logger.warning(f"Validation error: {validation_count} documents failed validation.")
    logger.warning(f"Other errors: {other_count} documents encountered other errors.")

    # Documents that made it in despite the failures
    logger.info(f"{inserted_count} documents were successfully inserted despite this error.")

    # Details for the first 3 duplicate-key errors only
    for error in duplicate_errors[:3]:
        errmsg = error.get('errmsg', 'No detailed message available')
        logger.info(f"Duplicate key error: {errmsg}")

        # The offending key, when the server reports it
        dup_id = error.get('keyValue', {}).get('_id', 'unknown')
        logger.debug(f"Duplicate _id: {dup_id}")

        # The document that could not be inserted
        logger.info(f"Failed document data for duplicate key: {error.get('op', {})}")

    if duplicate_count > 3:
        logger.info(f"...and {duplicate_count - 3} more duplicates were skipped.")

    # Details for the first 3 validation errors only
    for error in validation_errors[:3]:
        errmsg = error.get('errmsg', 'No detailed message available')
        logger.info(f"Validation failed for document: {errmsg}")
        logger.info(f"Failed document data for validation error: {error.get('op', {})}")

    if validation_count > 3:
        logger.debug(f"...and {validation_count - 3} more validation errors occurred.")

    # Errors that are neither duplicates nor validation failures are only counted
    if other_count > 0:
        logger.debug(f"...and {other_count} other errors occurred.")

else:
    # Every document was accepted
    inserted_count = len(result.inserted_ids)
    logger.info(f"Successfully inserted {inserted_count} documents into MongoDB at {mongodb_address}!")



INFO:__main__:0 documents were successfully inserted despite this error.
INFO:__main__:Duplicate key error: E11000 duplicate key error collection: weather_data.weather_station index: _id_ dup key: { _id: ObjectId('05cfc351250674cec2eaef31') }
INFO:__main__:Failed document data for duplicate key: {'station': 'Ichtegem', 'datetime': Timestamp('2024-10-01 00:04:00'), 'temperature_°C': 13.8, 'dew_point_°C': 11.7, 'humidity_%': 87, 'wind_dir': 247.5, 'wind_speed_kph': 13.2, 'wind_gust_kph': 16.7, 'pressure_hPa': 998.3, 'precip_rate_mm/hr': 0.0, 'precip_accum_mm': 0.0, 'uv_index': 0.0, 'solar_w/m²': 0.0, '_id': ObjectId('05cfc351250674cec2eaef31')}
INFO:__main__:Duplicate key error: E11000 duplicate key error collection: weather_data.weather_station index: _id_ dup key: { _id: ObjectId('653906ab4ca02c0973318d93') }
INFO:__main__:Failed document data for duplicate key: {'station': 'Ichtegem', 'datetime': Timestamp('2024-10-01 00:09:00'), 'temperature_°C': 13.8, 'dew_point_°C': 11.6, 'humidity

In [26]:
# Distinct UV-index readings, as a quick sanity check of the value range.
# (The name and label previously said "temperatures" although the column
# inspected is 'uv_index'.)
unique_uv_values = final_df2['uv_index'].unique()

print("Unique UV index values:", unique_uv_values)

Unique temperatures: [0. 1. 2. 5. 3. 4.]


In [27]:
print(final_df2.dtypes)

station                      object
datetime             datetime64[ns]
temperature_°C              float64
dew_point_°C                float64
humidity_%                    int64
wind_dir                    float64
wind_speed_kph              float64
wind_gust_kph               float64
pressure_hPa                float64
precip_rate_mm/hr           float64
precip_accum_mm             float64
uv_index                    float64
solar_w/m²                  float64
_id                          object
dtype: object


In [28]:
final_df2

Unnamed: 0,station,datetime,temperature_°C,dew_point_°C,humidity_%,wind_dir,wind_speed_kph,wind_gust_kph,pressure_hPa,precip_rate_mm/hr,precip_accum_mm,uv_index,solar_w/m²,_id
0,Ichtegem,2024-10-01 00:04:00,13.8,11.7,87,247.5,13.2,16.7,998.3,0.0,0.0,0.0,0.0,05cfc351250674cec2eaef31
1,Ichtegem,2024-10-01 00:09:00,13.8,11.6,87,247.5,12.7,15.8,998.0,0.0,0.0,0.0,0.0,653906ab4ca02c0973318d93
2,Ichtegem,2024-10-01 00:14:00,13.9,11.6,86,270.0,16.6,20.6,998.0,0.0,0.0,0.0,0.0,11b2154e5d15c8d8e82b68ac
3,Ichtegem,2024-10-01 00:19:00,14.0,11.5,85,247.5,15.6,19.6,998.0,0.0,0.0,0.0,0.0,024abe9c63090bfa96083df0
4,Ichtegem,2024-10-01 00:24:00,14.0,11.5,85,247.5,15.6,19.2,998.0,0.0,0.0,0.0,0.0,935b443032bbaeac73e87365
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1894,Ichtegem,2024-10-07 14:29:00,18.4,13.5,73,247.5,6.0,7.7,994.6,0.0,0.3,4.0,414.8,6982882297d2a0044dfdb263
1895,Ichtegem,2024-10-07 14:34:00,18.6,13.5,72,247.5,5.5,6.8,994.2,0.0,0.3,4.0,416.6,8cc49767eaced4dda9252893
1896,Ichtegem,2024-10-07 14:39:00,18.7,13.1,70,247.5,6.9,9.8,994.2,0.0,0.3,3.0,338.5,0a5b3af8e07cdb5b85605237
1897,Ichtegem,2024-10-07 14:44:00,19.1,13.4,70,270.0,4.7,6.6,994.2,0.0,0.3,3.0,349.6,572f641ab5406d346effad07


In [29]:
# NOTE(review): this cell repeats the bulk insert performed earlier in the
# notebook; when both run, every document here is reported as a duplicate.
records = final_df2.to_dict(orient='records')

# Insert the documents
try:
    # Insert documents, set 'ordered=False' to continue on duplicate key error
    result = collection.insert_many(records, ordered=False)

    # Log the number of inserted documents
    inserted_count = len(result.inserted_ids)
    logger.info(f"Successfully inserted {inserted_count} documents into MongoDB at {mongodb_address}!")

except errors.BulkWriteError as e:
    # Extract useful summary info without dumping full error
    inserted_count = e.details.get('nInserted', 0)
    write_errors = e.details.get('writeErrors', [])

    # Separate errors into categories by server error code
    duplicate_errors = [err for err in write_errors if err.get('code') == 11000]   # duplicate key
    validation_errors = [err for err in write_errors if err.get('code') == 121]    # schema validation
    duplicate_count = len(duplicate_errors)
    validation_count = len(validation_errors)
    other_count = len(write_errors) - duplicate_count - validation_count

    # Log the counts of different error types
    logger.warning(f"Duplicate key error: {duplicate_count} documents were skipped.")
    logger.warning(f"Validation error: {validation_count} documents failed validation.")
    logger.warning(f"Other errors: {other_count} documents encountered other errors.")

    # Successfully inserted documents
    logger.info(f"{inserted_count} documents were successfully inserted despite this error.")

    # Show the first 3 errors of EACH category. Slicing the combined list
    # (write_errors[:3]) hid every validation error whenever the first three
    # failures happened to be duplicates.
    max_shown = 3

    for error in duplicate_errors[:max_shown]:
        dup_id = error.get('keyValue', {}).get('_id', 'unknown')
        logger.debug(f"Duplicate _id: {dup_id}")

    # Log detailed information about validation errors
    for error in validation_errors[:max_shown]:
        errmsg = error.get('errmsg', 'No detailed message available')
        logger.debug(f"Validation failed for document: {errmsg}")
        logger.debug(f"Failed document data: {error.get('op', {})}")

    if duplicate_count > max_shown:
        logger.debug(f"...and {duplicate_count - max_shown} more duplicates were skipped.")

    if validation_count > max_shown:
        logger.debug(f"...and {validation_count - max_shown} more validation errors occurred.")

    # No "other" error is ever printed individually, so report the full
    # count rather than "count - 3".
    if other_count > 0:
        logger.debug(f"...and {other_count} other errors occurred.")

INFO:__main__:0 documents were successfully inserted despite this error.
