In [118]:
# !pip install pymongo requests beautifulsoup4

In [119]:
#pip install pymongo pyshp

In [120]:

import zipfile
import os
import json
import requests
from pymongo import MongoClient, errors
import time
from urllib.parse import urljoin
import config  # Import the config dictionary
from bs4 import BeautifulSoup
import shapefile

In [121]:
import config  # Import the config dictionary


In [122]:

def list_zip_files(base_url):
    """Lists all ZIP files in the directory at the given base URL."""
    response = requests.get(base_url)
    response.raise_for_status()
    html_content = response.text

    # Assuming the server lists files as links in the HTML
    soup = BeautifulSoup(html_content, 'html.parser')
    zip_files = [urljoin(base_url, link.get('href')) for link in soup.find_all('a') if link.get('href').endswith('.zip')]
    return zip_files

def download_zip(url, save_path):
    """Downloads a ZIP file from a URL to the specified local path."""
    response = requests.get(url)
    with open(save_path, 'wb') as file:
        file.write(response.content)

def extract_zip(zip_path, extract_to):
    """Extracts the ZIP file to the specified directory."""
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
        
def shape_record_to_geojson(shape_record):
    """Convert a shapefile record to a GeoJSON feature."""
    geom = shape_record.shape.__geo_interface__
    properties = shape_record.record.as_dict()
    feature = {
        "type": "Feature",
        "geometry": geom,
        "properties": properties
    }
    return feature

def read_shapefiles(directory):
    """Reads all shapefiles in the specified directory and returns their contents as GeoJSON."""
    data = []
    for filename in os.listdir(directory):
        if filename.endswith('.shp'):
            shp_path = os.path.join(directory, filename)
            print(f"Reading shapefile: {shp_path}")  # Debug statement
            reader = shapefile.Reader(shp_path)
            features = [shape_record_to_geojson(sr) for sr in reader.shapeRecords()]
            data.extend(features)
    return data

def insert_data_to_mongodb(data, db_name, collection_name, mongo_uri, retries=3, delay=5):
    for attempt in range(retries):
        try:
            client = MongoClient(mongo_uri, serverSelectionTimeoutMS=5000)
            # Check connection by pinging the server
            client.admin.command('ping')
            print("Connected to MongoDB successfully.")

            db = client[db_name]

            # Create collection if it does not exist
            if collection_name not in db.list_collection_names():
                db.create_collection(collection_name)
                print(f"Created collection '{collection_name}' in database '{db_name}'.")

            collection = db[collection_name]

            if isinstance(data, list):
                collection.insert_many(data)
                print(f"Inserted {len(data)} documents into the collection '{collection_name}' in database '{db_name}'.")
            else:
                collection.insert_one(data)
                print(f"Inserted one document into the collection '{collection_name}' in database '{db_name}'.")

            break  # If successful, break out of the loop
        except errors.ConnectionFailure as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("All attempts failed. Please check your connection details and try again.")
        except Exception as e:
            print(f"An error occurred: {e}")
            break
                
def filter_zip_files_by_year(zip_files, start_year, end_year):
    """Filters ZIP files based on the year range in their names."""
    filtered_files = []
    for file_url in zip_files:
        # Extract year from file name, assuming format 'YYYY_...'
        file_name = os.path.basename(file_url)
        try:
            year = int(file_name.split('_')[0])
            if start_year <= year <= end_year:
                filtered_files.append(file_url)
        except ValueError:
            pass
    return filtered_files



In [123]:

# Parameters
base_url = config.base_url
extract_to = config.extract_to
db_name = config.db_name
collection_name = config.db_name
mongo_uri = config.mongo_uri
start_year = config.start_year
end_year = config.end_year

# Steps
zip_files = list_zip_files(base_url)
filtered_zip_files = filter_zip_files_by_year(zip_files, start_year, end_year)

for zip_url in filtered_zip_files:
    zip_path = os.path.join('downloads', os.path.basename(zip_url))
    os.makedirs('downloads', exist_ok=True)
    download_zip(zip_url, zip_path)
    extract_zip(zip_path, extract_to) 
    data = read_shapefiles(extract_to)
     # Print first record as JSON
    if len(data) > 0:
        print("First record:")
        print(json.dumps(data[0], indent=2)) 
    insert_data_to_mongodb(data, db_name, collection_name, mongo_uri)
    os.remove(zip_path)
    


Reading shapefile: extracted_data\2013_hotspots.shp
Reading shapefile: extracted_data\2014_hotspots.shp
First record:
{
  "type": "Feature",
  "geometry": {
    "type": "Point",
    "coordinates": [
      -21936.408456150224,
      1039492.9271111382
    ]
  },
  "properties": {
    "LAT": 58.479,
    "LON": -95.386,
    "REP_DATE": "2013-06-30 19:35:00",
    "UID": 6156137,
    "SOURCE": "NASA",
    "SENSOR": "MODIS",
    "SATELLITE": "Aqua",
    "AGENCY": "MB",
    "TEMP": 24.8,
    "RH": 42,
    "WS": 14.8,
    "WD": 240,
    "PCP": 0.0,
    "FFMC": null,
    "DMC": null,
    "DC": null,
    "ISI": null,
    "BUI": null,
    "FWI": 0.0,
    "FUEL": "C2",
    "ROS": 0.0,
    "SFC": 0.0,
    "TFC": 0.0,
    "BFC": 0.0,
    "HFI": 0,
    "CFB": null,
    "AGE": 12137,
    "ESTAREA": 73.33,
    "POLYID": 5822612,
    "PCURING": 50,
    "CFACTOR": 0.100577,
    "GREENUP": 1,
    "ELEV": null
  }
}
Connected to MongoDB successfully.
Attempt 1 failed: ac-wzg1yil-shard-00-01.vmiph7u.mongodb