In [4]:
#!/usr/bin/env python
# coding: utf-8

import os
import pandas as pd
from pymongo import MongoClient
from dotenv import load_dotenv

# Read the local .env file into the process environment, then pick up the
# MongoDB connection string from it.
load_dotenv()
MONGO_URI = os.environ.get("MONGO_URI")

# A missing or empty URI is a configuration error: stop before connecting.
if MONGO_URI is None or MONGO_URI == "":
    raise ValueError("MONGO_URI is not set in your .env file.")

# Open the MongoDB client; any timeout tuning is expected to be carried in
# MONGO_URI itself rather than passed here.
client = MongoClient(MONGO_URI)
# Database that receives the collections (the name is arbitrary).
db = client["my_database"]

# Directory holding the cleaned CSV exports (relative to the working directory).
base_path = "../data/processed/"

# Target collection name -> cleaned CSV file name inside base_path.
files = dict(
    covid_vacc_death_rate="covid-vaccinations-vs-covid-death-rate_cleaned.csv",
    covid_vacc_manufacturer="covid-vaccine-doses-by-manufacturer_cleaned.csv",
    oecd_health_expenditure="OECD_health_expenditure_cleaned.csv",
    us_death_rates="united-states-rates-of-covid-19-deaths-by-vaccination-status_cleaned.csv",
)

# Number of documents sent per insert call; kept small to prevent timeouts.
batch_size = 2000

# Load each cleaned CSV into its own MongoDB collection, replacing whatever
# the collection held before. Missing files are skipped; empty files leave
# the collection cleared. Insert errors are reported per collection and
# summarised at the end so a partial load is not mistaken for success.
failed_collections = []

for collection_name, filename in files.items():
    file_path = os.path.join(base_path, filename)

    if not os.path.exists(file_path):
        print(f"File {file_path} not found. Skipping {collection_name}.")
        continue

    print(f"Loading {filename} into DataFrame...")
    try:
        df = pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        # A zero-byte file has no header to parse; treat it like a CSV with
        # no rows instead of aborting the whole run.
        df = pd.DataFrame()

    collection = db[collection_name]

    if df.empty:
        print(f"No data found in {filename}. Collection '{collection_name}' will be empty.")
        # Still clear the collection to ensure no stale data
        collection.delete_many({})
        continue

    # One dict per CSV row, keyed by column name
    data_dicts = df.to_dict("records")

    # Clear existing data so the collection mirrors the CSV exactly
    collection.delete_many({})
    print(f"Collection '{collection_name}' cleared.")

    # Insert data in batches
    total_docs = len(data_dicts)
    print(f"Inserting {total_docs} documents into '{collection_name}' in batches of {batch_size}...")
    try:
        batch_starts = range(0, total_docs, batch_size)
        for batch_number, start in enumerate(batch_starts, start=1):
            batch = data_dicts[start:start + batch_size]
            collection.insert_many(batch, ordered=False)
            print(f"  Inserted batch {batch_number} with {len(batch)} docs.")
    except Exception as e:
        # Best effort: report the failure and move on to the next file. The
        # collection was already cleared, so it may now be partially filled.
        print(f"Error inserting into '{collection_name}': {e}")
        failed_collections.append(collection_name)
    else:
        print(f"All {total_docs} documents inserted successfully into '{collection_name}'.")

if failed_collections:
    print(f"Data storage finished with errors in: {', '.join(failed_collections)}.")
else:
    print("Data storage complete. Check the inserted collections in MongoDB.")


Loading covid-vaccinations-vs-covid-death-rate_cleaned.csv into DataFrame...
Collection 'covid_vacc_death_rate' cleared.
Inserting 447729 documents into 'covid_vacc_death_rate' in batches of 2000...
  Inserted batch 1 with 2000 docs.
  Inserted batch 2 with 2000 docs.
  Inserted batch 3 with 2000 docs.
  Inserted batch 4 with 2000 docs.
  Inserted batch 5 with 2000 docs.
  Inserted batch 6 with 2000 docs.
  Inserted batch 7 with 2000 docs.
  Inserted batch 8 with 2000 docs.
  Inserted batch 9 with 2000 docs.
  Inserted batch 10 with 2000 docs.
  Inserted batch 11 with 2000 docs.
  Inserted batch 12 with 2000 docs.
  Inserted batch 13 with 2000 docs.
  Inserted batch 14 with 2000 docs.
  Inserted batch 15 with 2000 docs.
  Inserted batch 16 with 2000 docs.
  Inserted batch 17 with 2000 docs.
  Inserted batch 18 with 2000 docs.
  Inserted batch 19 with 2000 docs.
  Inserted batch 20 with 2000 docs.
  Inserted batch 21 with 2000 docs.
  Inserted batch 22 with 2000 docs.
  Inserted batch 2