In [1]:
# Import Libraries
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from pymongo import MongoClient
from dotenv import load_dotenv
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import requests

In [2]:
# Add the sibling 'scripts' directory to the Python path so the project helper
# modules can be imported. The membership check keeps the cell idempotent:
# re-running it does not stack duplicate entries onto sys.path.
SCRIPTS_DIR = os.path.abspath(os.path.join('..', 'scripts'))
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

In [3]:
import extract_to_mongodb as etm
import db_utils as dbu
import pickle
import time
from tqdm import tqdm

In [None]:
# print(dir(dbu))

In [4]:
# Notebook configuration. The two names below come from environment variables
# and are None when the variable is not set; the cache path is a fixed,
# working-directory-relative pickle file.
CACHE_FILE = 'geocode_cache.pkl'
collection_name = os.environ.get('COLLECTION_NAME_CLEANED')
naturalearth_lowres = os.environ.get('NATURALEARTH_SHAPEFILE_PATH')

In [5]:
# Sanity check: show which collection name was read from the environment
# (prints "None" if COLLECTION_NAME_CLEANED is not set).
print(f"Collection Name: {collection_name}")

Collection Name: wildfire_feature_engineered_data


Load the Data

In [7]:
# Load the cleaned data from the MongoDB collection named by `collection_name`.
# NOTE(review): the helper lives in db_utils; it presumably returns a pandas
# DataFrame (the notebook calls .head() and .progress_apply() on it) - confirm.
geo_wfp = dbu.load_all_data_from_mongodb(collection_name)

In [8]:
# Preview the first five rows to confirm the load worked and inspect the columns.
geo_wfp.head(5)

Unnamed: 0,_id,temp,rh,ws,wd,pcp,ffmc,dmc,dc,isi,...,tfc0,sfc0,year,month,day,lat_sin,lat_cos,lon_sin,lon_cos,year_month
0,66846cd755d3554c96b02c66,-1.006741,2.094934,-0.649149,320,0.43,82.976,30.078,161.161,2.68,...,0.35,0.35,2020,6,2,0.883899,0.467678,-0.311904,-0.950114,2020-6
1,66846cd755d3554c96b02c67,0.423696,0.99279,-0.700308,145,1.237,68.466,0.0,294.02,0.977,...,0.1,0.1,2020,6,11,0.87989,0.475177,-0.268096,-0.963392,2020-6
2,66846cd755d3554c96b02c68,0.226618,1.122454,-0.86797,30,0.591,88.685,55.743,202.448,5.536,...,1.36,1.36,2020,6,20,0.736971,0.675925,-0.939322,0.343037,2020-6
3,66846cd755d3554c96b02c69,1.366794,-1.665321,-0.84315,271,0.0,98.652,290.568,841.23,22.181,...,0.35,0.35,2020,6,22,0.535709,0.844403,-0.934801,-0.355172,2020-6
4,66846cd755d3554c96b02c6a,1.170139,0.020311,-0.000958,50,0.001,91.66,18.664,102.62,10.972,...,0.35,0.35,2020,6,13,0.551529,0.834155,-0.999729,-0.023267,2020-6


In [9]:
# Application identity used to build the User-Agent header for geocoding
# requests. GeocodingCache.reverse_geocode_nominatim reads the upper-case names
# APP_NAME and CONTACT_EMAIL, so they must exist as globals; both are None when
# the corresponding environment variable is not set.
APP_NAME = os.getenv('APP_NAME')
CONTACT_EMAIL = os.getenv('CONTACT_EMAIL')

# Lower-case aliases kept for any cell that still uses the original names.
app_name = APP_NAME
contact_email = CONTACT_EMAIL

In [None]:
class RateLimiter:
    """Blocking sliding-window rate limiter.

    Allows at most ``max_calls`` calls to :meth:`wait` to return within any
    window of ``time_frame`` seconds; further calls sleep until the oldest
    recorded call has left the window.

    Args:
        max_calls: Maximum number of calls permitted per window (must be >= 1).
        time_frame: Window length in seconds.

    Raises:
        ValueError: If ``max_calls`` is smaller than 1 (such a limiter could
            never let a call through).
    """

    def __init__(self, max_calls, time_frame):
        if max_calls < 1:
            raise ValueError("max_calls must be at least 1")
        self.max_calls = max_calls
        self.time_frame = time_frame
        # Timestamps (monotonic clock) of the calls still inside the window,
        # oldest first.
        self.calls = []

    def _drop_expired(self, now):
        """Forget recorded calls that are no longer inside the window."""
        self.calls = [call for call in self.calls if now - call < self.time_frame]

    def wait(self):
        """Block until another call is allowed, then record it."""
        # A monotonic clock is used so that system clock adjustments (NTP,
        # DST, manual changes) cannot shorten or stretch the computed delay.
        now = time.monotonic()
        self._drop_expired(now)
        if len(self.calls) >= self.max_calls:
            sleep_time = self.time_frame - (now - self.calls[0])
            if sleep_time > 0:
                time.sleep(sleep_time)
            # Re-read the clock and purge again so the call that just expired
            # is not kept around until the next invocation.
            now = time.monotonic()
            self._drop_expired(now)
        self.calls.append(now)

class GeocodingCache:
    """Disk-backed cache of reverse-geocoding lookups against Nominatim.

    Results are stored in a dict keyed by the string ``"{lat},{lon}"`` with
    ``(city, postal_code)`` tuples as values. The dict is pickled to
    ``cache_file`` after every successful lookup, so an interrupted run can
    resume without repeating requests.

    Only successful responses are cached. A request that fails (network
    error, timeout, HTTP error, unparsable body) yields ``(None, None)`` for
    that call but is not stored, so it is retried the next time the same
    coordinates are requested.

    Args:
        cache_file: Path of the pickle file backing the cache.
        user_agent: Value for the ``User-Agent`` request header. When omitted
            it is built as ``"<APP_NAME> (<CONTACT_EMAIL>)"`` from the
            environment variables of those names.
    """

    def __init__(self, cache_file, user_agent=None):
        self.cache_file = cache_file
        self.cache = self.load_cache()
        # At most one outgoing request per second.
        self.rate_limiter = RateLimiter(max_calls=1, time_frame=1)
        self.user_agent = user_agent or self._default_user_agent()

    @staticmethod
    def _default_user_agent():
        """Build the User-Agent string from APP_NAME / CONTACT_EMAIL env vars."""
        return f"{os.getenv('APP_NAME')} ({os.getenv('CONTACT_EMAIL')})"

    def load_cache(self):
        """Return the cache dict stored in ``cache_file``, or ``{}`` if absent."""
        if os.path.exists(self.cache_file):
            print(f"Loading existing cache from {self.cache_file}")
            # SECURITY NOTE: unpickling executes arbitrary code embedded in
            # the file. Only load cache files this notebook wrote itself.
            with open(self.cache_file, 'rb') as f:
                return pickle.load(f)
        print(f"Cache file {self.cache_file} not found. Creating a new cache.")
        return {}

    def save_cache(self):
        """Persist the cache dict to ``cache_file``.

        The data is written to a temporary sibling file and then moved over
        the real file, so an interruption mid-write cannot leave a truncated
        (unloadable) cache behind.
        """
        tmp_path = f"{self.cache_file}.tmp"
        with open(tmp_path, 'wb') as f:
            pickle.dump(self.cache, f)
        os.replace(tmp_path, self.cache_file)

    def get_location_info(self, lat, lon):
        """Return ``(city, postal_code)`` for the coordinates, using the cache.

        On a cache miss the rate limiter is consulted, the service is queried
        and a successful answer is stored and persisted. If the query fails,
        the error is printed and ``(None, None)`` is returned without caching.
        """
        key = f"{lat},{lon}"
        if key in self.cache:
            return self.cache[key]

        self.rate_limiter.wait()
        try:
            result = self._request_location(lat, lon)
        except Exception as e:  # best-effort: one bad row must not abort the run
            print(f"Error geocoding {lat}, {lon}: {e}")
            return None, None

        self.cache[key] = result
        self.save_cache()
        return result

    def reverse_geocode_nominatim(self, lat, lon):
        """Query Nominatim once, bypassing the cache and the rate limiter.

        Returns:
            ``(city, postal_code)``; either element may be ``None`` when the
            response lacks it. ``(None, None)`` is returned, after printing
            the error, if the request or the parsing fails.
        """
        try:
            return self._request_location(lat, lon)
        except Exception as e:
            print(f"Error geocoding {lat}, {lon}: {e}")
            return None, None

    def _request_location(self, lat, lon):
        """Perform the HTTP request and parse it; exceptions propagate."""
        url = 'https://nominatim.openstreetmap.org/reverse'
        params = {
            'format': 'json',
            'lat': lat,
            'lon': lon,
            'zoom': 18,
            'addressdetails': 1
        }
        headers = {
            'User-Agent': self.user_agent
        }

        response = requests.get(url, params=params, headers=headers, timeout=5)
        response.raise_for_status()
        address = response.json().get('address', {})

        # Take the first populated place-type field, from most to least specific.
        place_keys = ['city', 'town', 'village', 'hamlet', 'municipality', 'county']
        city = next((address[k] for k in place_keys if address.get(k)), None)
        postal_code = address.get('postcode')

        return city, postal_code

In [None]:
def process_dataframe(df, cache):
    """Attach 'city' and 'postal_code' columns to ``df`` via reverse geocoding.

    Each row's 'latitude' and 'longitude' values are passed to
    ``cache.get_location_info``; the two-element result is expanded into the
    new columns. A tqdm progress bar labelled "Geocoding" tracks the rows.

    Args:
        df: DataFrame with 'latitude' and 'longitude' columns. It is modified
            in place.
        cache: Object exposing ``get_location_info(lat, lon)``.

    Returns:
        The same ``df`` object, now carrying the two added columns.
    """
    # Registers DataFrame.progress_apply on pandas objects.
    tqdm.pandas(desc="Geocoding")
    geocoded = df.progress_apply(
        lambda record: cache.get_location_info(record['latitude'], record['longitude']),
        axis=1,
        result_type='expand',
    )
    df[['city', 'postal_code']] = geocoded
    return df

In [None]:
# Initialize the cache (reads any previously pickled lookups from CACHE_FILE)
cache = GeocodingCache(CACHE_FILE)

# Process the DataFrame. process_dataframe assigns the 'city' and 'postal_code'
# columns onto the frame it is given and returns that same frame, so geo_wfp
# itself is modified and result_df refers to the same object.
start_time = time.time()
result_df = process_dataframe(geo_wfp, cache)
end_time = time.time()

# Report the wall-clock duration of the geocoding pass, in seconds.
print(f"Processing completed in {end_time - start_time} seconds")



In [None]:
#analysis only, this part can be removed
#geo_wfp.to_csv('engineered_wildfire_data.csv', index=False)

In [None]:
# Save the feature-engineered CSV data to MongoDB
#dbu.insert_data_to_mongodb('engineered_wildfire_data.csv', os.getenv('COLLECTION_NAME_FEATUREENGINEERED'))


In [None]:
# Save the DataFrame of feature-engineered data to MongoDB. geo_wfp is the frame
# that was passed to process_dataframe above, so it carries the 'city' and
# 'postal_code' columns added there. The target collection name is read from
# the COLLECTION_NAME_CLEANED_WITH_CITY environment variable.
dbu.insert_df_only_to_mongodb(geo_wfp, os.getenv('COLLECTION_NAME_CLEANED_WITH_CITY'))