In [15]:
# Import Libraries
import os
import sys
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
from pymongo import MongoClient
from dotenv import load_dotenv
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import requests

In [16]:
# Add the 'scripts' directory to the Python path
sys.path.append(os.path.abspath(os.path.join('..', 'scripts')))

In [17]:
import extract_to_mongodb as etm
import db_utils as dbu

In [18]:
# print(dir(dbu))

In [19]:
collection_name = os.getenv('COLLECTION_NAME_FEATUREENGINEERED')
naturalearth_lowres = os.getenv('NATURALEARTH_SHAPEFILE_PATH')

In [20]:
print(f"Collection Name: {collection_name}")

Collection Name: wildfire_feature_engineered_data


Load the Data

In [21]:
# Load the cleaned data
geo_wfp = dbu.load_all_data_from_mongodb(collection_name)

In [22]:
geo_wfp.head(5)

Unnamed: 0,_id,temp,rh,ws,wd,pcp,ffmc,dmc,dc,isi,...,tfc0,sfc0,year,month,day,lat_sin,lat_cos,lon_sin,lon_cos,year_month
0,66846cd755d3554c96b02c66,-1.006741,2.094934,-0.649149,320,0.43,82.976,30.078,161.161,2.68,...,0.35,0.35,2020,6,2,0.883899,0.467678,-0.311904,-0.950114,2020-6
1,66846cd755d3554c96b02c67,0.423696,0.99279,-0.700308,145,1.237,68.466,0.0,294.02,0.977,...,0.1,0.1,2020,6,11,0.87989,0.475177,-0.268096,-0.963392,2020-6
2,66846cd755d3554c96b02c68,0.226618,1.122454,-0.86797,30,0.591,88.685,55.743,202.448,5.536,...,1.36,1.36,2020,6,20,0.736971,0.675925,-0.939322,0.343037,2020-6
3,66846cd755d3554c96b02c69,1.366794,-1.665321,-0.84315,271,0.0,98.652,290.568,841.23,22.181,...,0.35,0.35,2020,6,22,0.535709,0.844403,-0.934801,-0.355172,2020-6
4,66846cd755d3554c96b02c6a,1.170139,0.020311,-0.000958,50,0.001,91.66,18.664,102.62,10.972,...,0.35,0.35,2020,6,13,0.551529,0.834155,-0.999729,-0.023267,2020-6


In [23]:
app_name = os.getenv('APP_NAME')
contact_email = os.getenv('CONTACT_EMAIL')

In [24]:
def reverse_geocode_nominatim(lat, lon):
    url = 'https://nominatim.openstreetmap.org/reverse'
    params = {
        'format': 'json',
        'lat': lat,
        'lon': lon,
        'zoom': 10,
        'addressdetails': 1
    }
    headers = {
        'User-Agent': f'{app_name} ({contact_email})'
    }
    response = requests.get(url, params=params, headers=headers)
    if response.status_code == 200:
        data = response.json()
        address = data.get('address', {})
        city = address.get('city', None)
        if not city:
            city = address.get('town', None)
        if not city:
            city = address.get('village', None)
        if not city:
            city = address.get('county', None)
        return city
    else:
        print(f"Error: {response.status_code}")
        return None

In [25]:
# Calculate latitude and longitude from sine and cosine values
def calculate_lat_lon(row):
    lat = np.arcsin(row.lat_sin) * 180 / np.pi
    lon = np.arctan2(row.lon_sin, row.lon_cos) * 180 / np.pi
    return lat, lon




In [None]:
# # Apply reverse geocoding to the first 5 records using computed latitude and longitude
# df_first_5 = df.iloc[:5].copy()
# df_first_5['lat'], df_first_5['lon'] = zip(*df_first_5.apply(calculate_lat_lon, axis=1))
# df_first_5['city'], df_first_5['country'] = zip(*df_first_5.apply(lambda row: reverse_geocode_nominatim(row.lat, row.lon), axis=1))

# # Print the first 5 records with the city and country columns
# print(df_first_5[['lat', 'lon', 'city', 'country']])

In [None]:
# Apply reverse geocoding to the first 5 records using longitude and latitude
# geo_wfp_first_5 = geo_wfp.iloc[:5].copy()
# geo_wfp_first_5['city'] = geo_wfp_first_5.apply(lambda row: reverse_geocode_nominatim(row.lat, row.lon), axis=1)

# # Merge the city data back into the original DataFrame
# geo_wfp = geo_wfp.merge(geo_wfp_first_5[['city']], left_index=True, right_index=True, how='left')

# # Apply reverse geocoding to all records using computed latitude and longitude
# geo_wfp['lat'], geo_wfp['lon'] = zip(*geo_wfp.apply(calculate_lat_lon, axis=1))
# geo_wfp['city'] = geo_wfp.apply(lambda row: reverse_geocode_nominatim(row.lat, row.lon), axis=1)


In [30]:
# Limit to the first 5000 records
geo_wfp = geo_wfp.iloc[:5000].copy()

# Calculate lat and lon for the first 5000 records
geo_wfp['lat'], geo_wfp['lon'] = zip(*geo_wfp.apply(calculate_lat_lon, axis=1))

# Apply reverse geocoding to the first 5000 records
geo_wfp['city'], geo_wfp['country'] = zip(*geo_wfp.apply(lambda row: reverse_geocode_nominatim(row.lat, row.lon), axis=1))

# Print the first 10 records with the city and country columns
print(geo_wfp[['lat', 'lon', 'city', 'country']].head(10))


In [None]:
geo_wfp.head(5)

Unnamed: 0,_id,temp,rh,ws,wd,pcp,ffmc,dmc,dc,isi,...,year,month,day,lat_sin,lat_cos,lon_sin,lon_cos,year_month,lat,lon
0,66846cd755d3554c96b02c66,-1.006741,2.094934,-0.649149,320,0.43,82.976,30.078,161.161,2.68,...,2020,6,2,0.883899,0.467678,-0.311904,-0.950114,2020-6,62.1163,-161.826
1,66846cd755d3554c96b02c67,0.423696,0.99279,-0.700308,145,1.237,68.466,0.0,294.02,0.977,...,2020,6,11,0.87989,0.475177,-0.268096,-0.963392,2020-6,61.6291,-164.449
2,66846cd755d3554c96b02c68,0.226618,1.122454,-0.86797,30,0.591,88.685,55.743,202.448,5.536,...,2020,6,20,0.736971,0.675925,-0.939322,0.343037,2020-6,47.474,-69.938
3,66846cd755d3554c96b02c69,1.366794,-1.665321,-0.84315,271,0.0,98.652,290.568,841.23,22.181,...,2020,6,22,0.535709,0.844403,-0.934801,-0.355172,2020-6,32.392,-110.804
4,66846cd755d3554c96b02c6a,1.170139,0.020311,-0.000958,50,0.001,91.66,18.664,102.62,10.972,...,2020,6,13,0.551529,0.834155,-0.999729,-0.023267,2020-6,33.472,-91.3332


In [None]:
geo_wfp.drop(columns=['lon, lat'], inplace=True)

In [None]:
#analysis only, this part can be removed
#geo_wfp.to_csv('engineered_wildfire_data.csv', index=False)

In [None]:
# Save the csv featured engineering data to mongodb
#dbu.insert_data_to_mongodb('engineered_wildfire_data.csv', os.getenv('COLLECTION_NAME_FEATUREENGINEERED'))


In [None]:
# Save the dataframe -  featured engineering data to mongodb
dbu.insert_df_only_to_mongodb(geo_wfp, os.getenv('COLLECTION_NAME_FEATUREENGINEERED_WITH_CITY'))