In [1]:
import importlib.metadata
import importlib.resources
import os
import subprocess
import sys
from pathlib import Path

# Banner for the notebook run; `end` supplies the blank line after the heading.
print("=== Environment Setup Check ===", end="\n\n")

=== Environment Setup Check ===



In [2]:
# Show which interpreter backs this kernel (version string, then binary path).
print("Python version:", sys.version)
print("Python executable:", sys.executable)

Python version: 3.9.23 (main, Jun  5 2025, 13:25:08) [MSC v.1929 64 bit (AMD64)]
Python executable: c:\Users\Jordan\miniconda3\envs\env_data_processor\python.exe


In [3]:
# Check if we're in the correct conda environment
def check_conda_environment(expected_env='env_data_processor'):
    """Report whether the kernel appears to run in the expected conda environment.

    Two indicators are consulted: the ``CONDA_DEFAULT_ENV`` environment
    variable, and a substring test on ``sys.executable`` that acts as a
    fallback when the variable is unset or does not match. ``CONDA_PREFIX``
    is printed for diagnostics only and does not influence the result.

    Args:
        expected_env: Name of the conda environment the project requires.

    Returns:
        bool: True if either indicator matches ``expected_env``, False
        otherwise (after printing a warning).
    """
    conda_env = os.environ.get('CONDA_DEFAULT_ENV')
    conda_prefix = os.environ.get('CONDA_PREFIX', '')

    print(f"CONDA_DEFAULT_ENV: {conda_env}")
    print(f"CONDA_PREFIX: {conda_prefix}")

    # Compare against the expected name; an empty or missing variable must
    # not count as a match on its own.
    if conda_env == expected_env or expected_env in sys.executable:
        print(f"✅ Running in {expected_env} conda environment")
        return True

    print("⚠️  Warning: May not be running in the correct conda environment")
    print(f"Expected: {expected_env}")
    print(f"Current environment: {conda_env}")
    # Explicit False so callers get a real boolean rather than None.
    return False

env_check = check_conda_environment()

CONDA_DEFAULT_ENV: env_data_processor
CONDA_PREFIX: C:\Users\Jordan\miniconda3\envs\env_data_processor
✅ Running in env_data_processor conda environment


In [5]:
# Distribution names of the packages this project needs.
required_packages = [
    'pandas',
    'numpy',
    'matplotlib',
    'seaborn',
    'jupyter',
    'openpyxl',
    'xlrd',
    'PyPDF2',
    'sqlalchemy',
    'psycopg2',
    'pymongo',
    'pdfplumber',
    'tabula-py'
]


def get_installed_version(package):
    """Return the installed version of a distribution, or None if it is absent.

    Uses the standard-library ``importlib.metadata`` lookup, so nothing
    beyond the top-of-notebook imports is required.

    Args:
        package: Distribution name to look up.

    Returns:
        str | None: Version string, or None when no such distribution is
        installed in the current environment.
    """
    try:
        return importlib.metadata.version(package)
    except importlib.metadata.PackageNotFoundError:
        return None


print("\n=== Package Verification ===")
missing_packages = []

for package in required_packages:
    version = get_installed_version(package)
    if version is None:
        print(f"❌ {package}: Not installed")
        missing_packages.append(package)
    else:
        print(f"✅ {package}: {version}")

if missing_packages:
    print(f"\n⚠️  Missing packages: {', '.join(missing_packages)}")
    print("Install missing packages using:")
    print(f"conda install -c conda-forge {' '.join(missing_packages)}")
else:
    print("\n✅ All required packages are installed!")



=== Package Verification ===
✅ pandas: 2.3.1
✅ numpy: 1.26.4
❌ matplotlib: Not installed
❌ seaborn: Not installed
❌ jupyter: Not installed
✅ openpyxl: 3.1.5
✅ xlrd: 2.0.1
✅ PyPDF2: 3.0.1
❌ sqlalchemy: Not installed
❌ psycopg2: Not installed
❌ pymongo: Not installed
✅ pdfplumber: 0.10.3
✅ tabula-py: 2.6.0

⚠️  Missing packages: matplotlib, seaborn, jupyter, sqlalchemy, psycopg2, pymongo
Install missing packages using:
conda install -c conda-forge matplotlib seaborn jupyter sqlalchemy psycopg2 pymongo


In [6]:
print("\n=== Creating Project Structure ===")

# Folder layout for the project, relative to the current working directory.
project_dirs = [
    'data/raw/excel',
    'data/raw/pdf',
    'data/processed',
    'data/clean',
    'src/extractors',
    'src/processors',
    'src/utils',
    'config',
    'logs'
]

# makedirs builds any missing parents; exist_ok=True tolerates folders that
# are already present, so the cell can be re-run.
for folder in project_dirs:
    os.makedirs(folder, exist_ok=True)
    print(f"✅ Created directory: {folder}")


=== Creating Project Structure ===
✅ Created directory: data/raw/excel
✅ Created directory: data/raw/pdf
✅ Created directory: data/processed
✅ Created directory: data/clean
✅ Created directory: src/extractors
✅ Created directory: src/processors
✅ Created directory: src/utils
✅ Created directory: config
✅ Created directory: logs


In [8]:
# Package marker files that make `src` and its subfolders importable.
init_files = [
    'src/__init__.py',
    'src/extractors/__init__.py',
    'src/processors/__init__.py',
    'src/utils/__init__.py'
]
# Backward-compatible alias: this list was previously misspelled `nit_files`.
nit_files = init_files

for init_file in init_files:
    init_path = Path(init_file)
    # touch() does not create parent folders, so make them here; this keeps
    # the cell working even if the directory-creation cell was not run.
    init_path.parent.mkdir(parents=True, exist_ok=True)
    init_path.touch(exist_ok=True)
    print(f"✅ Created: {init_file}")

✅ Created: src/__init__.py
✅ Created: src/extractors/__init__.py
✅ Created: src/processors/__init__.py
✅ Created: src/utils/__init__.py


In [9]:
# Create .env template
env_template = """# Database Configuration
DATABASE_URL=postgresql://username:password@localhost:5432/environmental_db
MONGODB_URI=mongodb://localhost:27017/environmental_db

# File Paths
RAW_DATA_PATH=./data/raw
PROCESSED_DATA_PATH=./data/processed
LOG_PATH=./logs

# Processing Configuration
BATCH_SIZE=1000
MAX_FILE_SIZE_MB=100
"""


def write_env_template(path='.env', content=env_template):
    """Write the .env template unless a file already exists at ``path``.

    Args:
        path: Destination file path.
        content: Text to write into a newly created file.

    Returns:
        bool: True if the file was created, False if it already existed and
        was left untouched.
    """
    try:
        # Mode 'x' refuses to open an existing file, so a .env that the user
        # has already filled in is never truncated by re-running this cell.
        with open(path, 'x', encoding='utf-8') as f:
            f.write(content)
    except FileExistsError:
        return False
    return True


if write_env_template():
    print("✅ Created .env template file")
else:
    print("ℹ️  .env already exists; left unchanged so existing settings are not overwritten")

print("\n=== Setup Complete ===")
print("Your conda environment is ready for environmental data processing!")
print("\nNext steps:")
print("1. Update the .env file with your actual database credentials")
print("2. Place your Excel and PDF files in the appropriate data/raw/ subdirectories")
print("3. Run the data extraction notebook (01_data_extraction.ipynb)")

✅ Created .env template file

=== Setup Complete ===
Your conda environment is ready for environmental data processing!

Next steps:
1. Update the .env file with your actual database credentials
2. Place your Excel and PDF files in the appropriate data/raw/ subdirectories
3. Run the data extraction notebook (01_data_extraction.ipynb)
