In [1]:
# Import Libraries
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from pymongo import MongoClient
from dotenv import load_dotenv
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import math

In [2]:
# Add the 'scripts' directory to the Python path
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))

In [3]:
import extract_to_mongodb as etm
import db_utils as dbu
import pickle
import time
from tqdm import tqdm

In [4]:
collection_name = os.getenv('COLLECTION_NAME_CLEANED')
naturalearth_lowres = os.getenv('NATURALEARTH_SHAPEFILE_PATH')
# CACHE_FILE = 'geocodecache.pkl'

In [5]:
print(f"Collection Name: {collection_name}")

Collection Name: wildfire_cleaned_data


Load the Data

In [6]:
# Load the cleaned data
geo_wfp = dbu.load_all_data_from_mongodb(collection_name)

In [None]:
geo_wfp.shape

(450342, 35)

In [None]:
geo_wfp.head(5)

Unnamed: 0,_id,lat,lon,rep_date,source,sensor,satellite,agency,temp,rh,...,cfb,pcuring,greenup,elev,sfl,cfl,tfc0,ecozone,sfc0,geometry
0,6669b15b44a41007063ba9da,62.1163,-161.826,2020/06/02 12:54:00.000,NASA4,IBAND,S-NPP,UAK,15.917,69,...,0.0,50.0,1.0,90,1.01145,0.0,0.35,2.0,0.35,"{'type': 'Point', 'coordinates': [-161.826, 62..."
1,6669b15e44a41007063ba9db,61.6291,-164.449,2020/06/11 12:36:00.000,NASA5,IBAND,NOAA-20,UAK,26.064,52,...,0.0,50.0,1.0,24,0.43536,0.0,0.1,2.0,0.1,"{'type': 'Point', 'coordinates': [-164.449, 61..."
2,6669b15e44a41007063ba9dc,47.474,-69.938,2020/06/20 06:30:00.000,USFS,IBAND,JPSS1,QC,24.666,54,...,0.0,29.0,1.0,27,-1.0,0.0,1.36,,1.36,"{'type': 'Point', 'coordinates': [-69.938, 47...."
3,6669b15e44a41007063ba9dd,32.392,-110.804,2020/06/22 08:24:00.000,NASA6,IBAND,S-NPP,UAZ,32.754,11,...,0.0,50.0,1.0,1534,0.2128,0.00053,0.35,12.0,0.35,"{'type': 'Point', 'coordinates': [-110.804, 32..."
4,6669b15e44a41007063ba9de,33.472,-91.3332,2020/06/13 07:06:00.000,NASA7,IBAND,NOAA-20,UAR,31.359,37,...,0.0,50.0,1.0,38,0.102444,0.000499,0.35,8.0,0.35,"{'type': 'Point', 'coordinates': [-91.3332, 33..."


In [None]:
geo_wfp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 450342 entries, 0 to 450341
Data columns (total 35 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   _id        450342 non-null  object 
 1   lat        450342 non-null  float64
 2   lon        450342 non-null  float64
 3   rep_date   450342 non-null  object 
 4   source     450342 non-null  object 
 5   sensor     450342 non-null  object 
 6   satellite  450342 non-null  object 
 7   agency     450342 non-null  object 
 8   temp       450342 non-null  float64
 9   rh         450342 non-null  int64  
 10  ws         450342 non-null  float64
 11  wd         450342 non-null  int64  
 12  pcp        450342 non-null  float64
 13  ffmc       450342 non-null  float64
 14  dmc        450342 non-null  float64
 15  dc         450342 non-null  float64
 16  isi        450342 non-null  float64
 17  bui        450342 non-null  float64
 18  fwi        450342 non-null  float64
 19  fuel       450342 non-n

In [None]:
# Normalize the data to ensure consistency
geo_wfp['rep_date'] = pd.to_datetime(geo_wfp['rep_date']).dt.strftime('%Y-%m-%d')

In [None]:
# Normalize the data to ensure consistency
geo_wfp['rep_date'] = pd.to_datetime(geo_wfp['rep_date']).dt.strftime('%Y-%m-%d')

# Drop duplicates based on 'lat', 'lon', and 'rep_date' and keep only the first occurrence
unique_geo_wfp = geo_wfp.drop_duplicates(subset=['lat', 'lon', 'rep_date'], keep='first')

In [None]:
unique_geo_wfp.shape

(449139, 35)

In [None]:
unique_geo_wfp.to_csv('unique_geo_wfp.csv', index=False)

In [None]:
# Save the cleaned data to mongodb
dbu.insert_dataframe_to_mongodb(unique_geo_wfp, 'wildfire_cleaned_data_unique_geo_wfp')

An error occurred: 'dict' object has no attribute '__geo_interface__'
Connection closed.


In [None]:

# Calculate number of chunks (30,000 records per chunk)
chunk_size = 30000
num_chunks = math.ceil(len(unique_geo_wfp) / chunk_size)

# Split the data into chunks
for i in range(num_chunks):
    start = i * chunk_size
    end = (i + 1) * chunk_size
    chunk = unique_geo_wfp[start:end]
    chunk.to_csv(f'../data/chunks_unique_geolocation/geocoding_chunk_{i}.csv', index=False)

print(f"Created {num_chunks} chunks of approximately 30,000 records each.")
print("Distribute these files to your team members.")

