# Preparing the Dataset

In [1]:
import numpy as np
import pandas as pd
import os
import pymongo
from dtime import is_leap_year, ymd
import datetime

In [2]:
# Relative path (under the working directory) of the saved NumPy file that is loaded next.
data_dir, data_name = "data", "Fire_Data.npy"
npy_file = os.path.join(data_dir, data_name)

In [3]:
# Read the array stored at `npy_file` into memory.
# NOTE(review): allow_pickle=True lets NumPy unpickle arbitrary Python objects, which can
# execute code embedded in the file — only load files from a trusted source.
# NOTE(review): the variable name `pickle` would shadow the stdlib module if it were imported.
pickle = np.load(file=npy_file, allow_pickle=True)

In [4]:
# Column labels applied to the loaded array, in positional order
# (assumed to match how the .npy file was written — TODO confirm).
fire_columns = [
    'FPA_ID', 'FIRE_NAME', 'FIRE_YEAR',
    'DISCOVERY_DOY', 'DISCOVERY_TIME',
    'CONT_DOY', 'CONT_TIME',
    'FIRE_SIZE', 'FIRE_SIZE_CLASS',
    'LATITUDE', 'LONGITUDE',
    'STATE', 'COUNTY', 'FIPS_CODE', 'FIPS_NAME',
    'STAT_CAUSE_DESCR',
    'OWNER_CODE', 'OWNER_DESCR',
    'DAYS_TO_CONT',
]
fires_df = pd.DataFrame(data=pickle, columns=fire_columns)

In [5]:
# Preview the first five rows of the freshly built frame.
fires_df.head(n=5)

Unnamed: 0,FPA_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_DOY,DISCOVERY_TIME,CONT_DOY,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,COUNTY,FIPS_CODE,FIPS_NAME,STAT_CAUSE_DESCR,OWNER_CODE,OWNER_DESCR,DAYS_TO_CONT
0,FS-1418826,FOUNTAIN,2005,33,1300,33,1730,0.1,A,40.0369,-121.006,CA,63,63,Plumas,Miscellaneous,5,USFS,0
1,FS-1418827,PIGEON,2004,133,845,133,1530,0.25,A,38.9331,-120.404,CA,61,61,Placer,Lightning,5,USFS,0
2,FS-1418835,SLACK,2004,152,1921,152,2024,0.1,A,38.9842,-120.736,CA,17,17,El Dorado,Debris Burning,13,STATE OR PRIVATE,0
3,FS-1418845,DEER,2004,180,1600,185,1400,0.1,A,38.5592,-119.913,CA,3,3,Alpine,Lightning,5,USFS,5
4,FS-1418847,STEVENOT,2004,180,1600,185,1200,0.1,A,38.5592,-119.933,CA,3,3,Alpine,Lightning,5,USFS,5


In [6]:
def _wrap_negative_days(days):
    """Return `days` unchanged when it is greater than -1, otherwise `days + 365`."""
    if days > -1:
        return days
    return days + 365


# NOTE(review): negative values presumably mean containment fell in the following
# calendar year; the fixed 365 offset does not account for leap years — confirm.
fires_df["DAYS_TO_CONT"] = fires_df["DAYS_TO_CONT"].transform(_wrap_negative_days)

In [7]:
# Swap the literal text "None" (presumably a missing discovery time) for the
# placeholder '0'; every other value passes through untouched.
fires_df['DISCOVERY_TIME'] = fires_df['DISCOVERY_TIME'].map(
    lambda raw: '0' if raw == "None" else raw
)

In [8]:
# Make every discovery-time value a string so it can be sliced by position later.
discovery_time_text = fires_df['DISCOVERY_TIME'].astype('str')
fires_df['DISCOVERY_TIME'] = discovery_time_text

In [9]:
# Swap the literal text "None" (presumably a missing containment time) for the
# placeholder '0'; every other value passes through untouched.
fires_df['CONT_TIME'] = fires_df['CONT_TIME'].map(
    lambda raw: '0' if raw == "None" else raw
)

In [10]:
# Empty containment-time strings also become the placeholder '0'.
fires_df['CONT_TIME'] = fires_df['CONT_TIME'].map(
    lambda raw: '0' if raw == "" else raw
)

In [11]:
# Make every containment-time value a string so it can be sliced by position later.
cont_time_text = fires_df['CONT_TIME'].astype('str')
fires_df['CONT_TIME'] = cont_time_text

In [12]:
# Hour part: the first two characters of the discovery-time string.
fires_df['DISCOVERY_HOUR'] = fires_df['DISCOVERY_TIME'].str[:2]

In [14]:
# Minute part: everything after the first two characters of the discovery-time string
# (an empty string when the value has two characters or fewer).
fires_df['DISCOVERY_MINUTE'] = fires_df['DISCOVERY_TIME'].str[2:]

In [16]:
# Hour part: the first two characters of the containment-time string.
fires_df['CONT_HOUR'] = fires_df['CONT_TIME'].str[:2]

In [15]:
# Minute part: everything after the first two characters of the containment-time string
# (an empty string when the value has two characters or fewer).
fires_df['CONT_MINUTE'] = fires_df['CONT_TIME'].str[2:]

In [17]:
# fires_df['DISCOVERY_DATE_PD'] = pd.to_datetime(fires_df['FIRE_YEAR'] * 1000 + fires_df['DISCOVERY_DOY'], format='%Y%j') 


In [13]:
# fires_df['DISCOVERY_DATE_PD'] = pd.to_datetime(fires_df['FIRE_YEAR'] * 31,556,952 + fires_df['DISCOVERY_DOY'], units='s', format='%Y%j%X') 


In [18]:
# Preview the first five rows, now including the split hour/minute columns.
fires_df.head(n=5)

Unnamed: 0,FPA_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_DOY,DISCOVERY_TIME,CONT_DOY,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,...,FIPS_CODE,FIPS_NAME,STAT_CAUSE_DESCR,OWNER_CODE,OWNER_DESCR,DAYS_TO_CONT,DISCOVERY_HOUR,DISCOVERY_MINUTE,CONT_MINUTE,CONT_HOUR
0,FS-1418826,FOUNTAIN,2005,33,1300,33,1730,0.1,A,40.0369,...,63,Plumas,Miscellaneous,5,USFS,0.0,13,0,30,17
1,FS-1418827,PIGEON,2004,133,845,133,1530,0.25,A,38.9331,...,61,Placer,Lightning,5,USFS,0.0,8,45,30,15
2,FS-1418835,SLACK,2004,152,1921,152,2024,0.1,A,38.9842,...,17,El Dorado,Debris Burning,13,STATE OR PRIVATE,0.0,19,21,24,20
3,FS-1418845,DEER,2004,180,1600,185,1400,0.1,A,38.5592,...,3,Alpine,Lightning,5,USFS,5.0,16,0,0,14
4,FS-1418847,STEVENOT,2004,180,1600,185,1200,0.1,A,38.5592,...,3,Alpine,Lightning,5,USFS,5.0,16,0,0,12


In [19]:
# fires_df['DISCOVERY_TIME_NS'] = pd.to_datetime(fires_df['DISCOVERY_HOUR'].replace('',0).astype('int') * 3.6e12 + fires_df['DISCOVERY_MINUTE'].replace('',0).astype('int') * 6.0e10, format='ns')

In [20]:
# Print each column with its 1-based position in the frame.
ctr = 0
for ctr, col in enumerate(fires_df.columns, start=1):
    print(ctr, col)


1 FPA_ID
2 FIRE_NAME
3 FIRE_YEAR
4 DISCOVERY_DOY
5 DISCOVERY_TIME
6 CONT_DOY
7 CONT_TIME
8 FIRE_SIZE
9 FIRE_SIZE_CLASS
10 LATITUDE
11 LONGITUDE
12 STATE
13 COUNTY
14 FIPS_CODE
15 FIPS_NAME
16 STAT_CAUSE_DESCR
17 OWNER_CODE
18 OWNER_DESCR
19 DAYS_TO_CONT
20 DISCOVERY_HOUR
21 DISCOVERY_MINUTE
22 CONT_MINUTE
23 CONT_HOUR


In [21]:
# Number of records per stated fire cause, most frequent first.
fires_df["STAT_CAUSE_DESCR"].value_counts()

Debris Burning       429028
Miscellaneous        323805
Arson                281455
Lightning            278468
Missing/Undefined    166723
Equipment Use        147612
Campfire              76139
Children              61167
Smoking               52869
Railroad              33455
Powerline             14448
Fireworks             11500
Structure              3796
Name: STAT_CAUSE_DESCR, dtype: int64

In [61]:
# Build datetime objects for the DISCOVERY and CONT (containment) dates and collect
# them in two lists (one entry per row) that later become new columns.
#
# Fields are read by column NAME instead of by tuple position. The positional version
# (row[22] as containment hour, row[23] as containment minute) silently swaps the two
# values whenever CONT_MINUTE is created before CONT_HOUR — which is the column order
# shown by the column listing printed earlier in this notebook.
def _minute_or_zero(text):
    """Return int(text), or 0 when text is not a valid integer (e.g. an empty string)."""
    try:
        return int(text)
    except ValueError:
        return 0


disc_dates = []
cont_dates = []
for row in fires_df.itertuples():
    disc_year = int(row.FIRE_YEAR)
    disc_doy = int(row.DISCOVERY_DOY)
    cont_doy = int(row.CONT_DOY)
    # A containment day-of-year earlier than the discovery day-of-year is treated as
    # falling in the following calendar year.
    cont_year = disc_year + 1 if cont_doy < disc_doy else disc_year

    # Hours are parsed strictly (a bad value raises); minutes fall back to 0.
    disc_hour = int(row.DISCOVERY_HOUR)
    cont_hour = int(row.CONT_HOUR)
    disc_minute = _minute_or_zero(row.DISCOVERY_MINUTE)
    cont_minute = _minute_or_zero(row.CONT_MINUTE)

    # ymd (imported from dtime) is called with (year, day-of-year) and its result is
    # unpacked as year, month, day.
    DYr, DMo, DDa = ymd(disc_year, disc_doy)
    CYr, CMo, CDa = ymd(cont_year, cont_doy)
    disc_dates.append(datetime.datetime(DYr, DMo, DDa, disc_hour, disc_minute))
    cont_dates.append(datetime.datetime(CYr, CMo, CDa, cont_hour, cont_minute))

In [62]:
# Sanity check: both lists should hold one entry per row of fires_df.
n_disc, n_cont = len(disc_dates), len(cont_dates)
print(n_disc, n_cont)

1880465 1880465


In [63]:
# Attach the discovery datetimes as a new column, aligned row-for-row with the frame.
fires_df['DISCOVERY_DATE'] = pd.Series(disc_dates, index=fires_df.index)

In [64]:
# Attach the containment datetimes as a new column, aligned row-for-row with the frame.
fires_df['CONT_DATE'] = pd.Series(cont_dates, index=fires_df.index)

In [65]:
# Preview the first five rows, now including the two datetime columns.
fires_df.head(n=5)

Unnamed: 0,FPA_ID,FIRE_NAME,FIRE_YEAR,DISCOVERY_DOY,DISCOVERY_TIME,CONT_DOY,CONT_TIME,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,...,STAT_CAUSE_DESCR,OWNER_CODE,OWNER_DESCR,DAYS_TO_CONT,DISCOVERY_HOUR,DISCOVERY_MINUTE,CONT_HOUR,CONT_MINUTE,DISCOVERY_DATE,CONT_DATE
0,FS-1418826,FOUNTAIN,2005,33,1300,33,1730,0.1,A,40.0369,...,Miscellaneous,5,USFS,0,13,0,17,30,2005-02-02 13:00:00,2005-02-02 17:30:00
1,FS-1418827,PIGEON,2004,133,845,133,1530,0.25,A,38.9331,...,Lightning,5,USFS,0,8,45,15,30,2004-05-12 08:45:00,2004-05-12 15:30:00
2,FS-1418835,SLACK,2004,152,1921,152,2024,0.1,A,38.9842,...,Debris Burning,13,STATE OR PRIVATE,0,19,21,20,24,2004-05-31 19:21:00,2004-05-31 20:24:00
3,FS-1418845,DEER,2004,180,1600,185,1400,0.1,A,38.5592,...,Lightning,5,USFS,5,16,0,14,0,2004-06-28 16:00:00,2004-07-03 14:00:00
4,FS-1418847,STEVENOT,2004,180,1600,185,1200,0.1,A,38.5592,...,Lightning,5,USFS,5,16,0,12,0,2004-06-28 16:00:00,2004-07-03 12:00:00


In [73]:
# Columns kept for the export frame: the raw day-of-year / time columns and the
# intermediate hour/minute columns are left out in favour of the two datetime columns.
viz_columns = [
    'FPA_ID', 'FIRE_NAME', 'FIRE_YEAR',
    'FIRE_SIZE', 'FIRE_SIZE_CLASS',
    'LATITUDE', 'LONGITUDE',
    'STATE', 'FIPS_CODE', 'FIPS_NAME',
    'STAT_CAUSE_DESCR',
    'OWNER_CODE', 'OWNER_DESCR',
    'DAYS_TO_CONT',
    'DISCOVERY_DATE', 'CONT_DATE',
]
Viz_df = fires_df[viz_columns]

In [74]:
# Preview the first ten rows of the export frame.
Viz_df.head(n=10)

Unnamed: 0,FPA_ID,FIRE_NAME,FIRE_YEAR,FIRE_SIZE,FIRE_SIZE_CLASS,LATITUDE,LONGITUDE,STATE,FIPS_CODE,FIPS_NAME,STAT_CAUSE_DESCR,OWNER_CODE,OWNER_DESCR,DAYS_TO_CONT,DISCOVERY_DATE,CONT_DATE
0,FS-1418826,FOUNTAIN,2005,0.1,A,40.0369,-121.006,CA,63,Plumas,Miscellaneous,5,USFS,0.0,2005-02-02 13:00:00,2005-02-02 17:30:00
1,FS-1418827,PIGEON,2004,0.25,A,38.9331,-120.404,CA,61,Placer,Lightning,5,USFS,0.0,2004-05-12 08:45:00,2004-05-12 15:30:00
2,FS-1418835,SLACK,2004,0.1,A,38.9842,-120.736,CA,17,El Dorado,Debris Burning,13,STATE OR PRIVATE,0.0,2004-05-31 19:21:00,2004-05-31 20:24:00
3,FS-1418845,DEER,2004,0.1,A,38.5592,-119.913,CA,3,Alpine,Lightning,5,USFS,5.0,2004-06-28 16:00:00,2004-07-03 14:00:00
4,FS-1418847,STEVENOT,2004,0.1,A,38.5592,-119.933,CA,3,Alpine,Lightning,5,USFS,5.0,2004-06-28 16:00:00,2004-07-03 12:00:00
5,FS-1418849,HIDDEN,2004,0.1,A,38.6353,-120.104,CA,5,Amador,Lightning,5,USFS,1.0,2004-06-30 18:00:00,2004-07-01 16:00:00
6,FS-1418851,FORK,2004,0.1,A,38.6883,-120.153,CA,17,El Dorado,Lightning,5,USFS,1.0,2004-07-01 18:00:00,2004-07-02 14:00:00
7,FS-1418854,SLATE,2005,0.8,B,40.9681,-122.434,CA,67,,Debris Burning,13,STATE OR PRIVATE,0.0,2005-03-08 13:00:00,2005-03-08 16:00:00
8,FS-1418856,SHASTA,2005,1.0,B,41.2336,-122.283,CA,67,,Debris Burning,13,STATE OR PRIVATE,0.0,2005-03-15 12:00:00,2005-03-15 17:00:00
9,FS-1418859,TANGLEFOOT,2004,0.1,A,38.5483,-120.149,CA,5,Amador,Lightning,5,USFS,1.0,2004-07-01 18:00:00,2004-07-02 18:00:00


In [24]:
## Setup Mongo DB
# Create the client with pymongo's default connection settings (no host, port or URI
# is passed) — presumably a MongoDB server on the local machine; confirm for your setup.
client = pymongo.MongoClient()

In [25]:
## Build DataBase
# Handles to the `Project_2_db` database and its `fires` collection.
db = client["Project_2_db"]
fires = db["fires"]

In [77]:
# Turn the export frame into one dict per row and bulk-insert them into the collection.
# NOTE(review): nothing here clears the collection first, so re-running this cell
# inserts the same records again.
fire_records = Viz_df.to_dict('records')
fires.insert_many(fire_records)

<pymongo.results.InsertManyResult at 0x2327cc84688>

In [26]:
# Non-unique ascending index on FIRE_YEAR, named 'year'.
year_keys = [("FIRE_YEAR", pymongo.ASCENDING)]
fires.create_index(year_keys, name='year', unique=False)

'year'

In [27]:
# Non-unique ascending index on STAT_CAUSE_DESCR, named 'cause'.
cause_keys = [("STAT_CAUSE_DESCR", pymongo.ASCENDING)]
fires.create_index(cause_keys, name='cause', unique=False)

'cause'

In [28]:
# Non-unique ascending index on STATE, named 'state'.
state_keys = [("STATE", pymongo.ASCENDING)]
fires.create_index(state_keys, name='state', unique=False)

'state'

In [29]:
# Non-unique ascending index on DISCOVERY_DATE, named 'disc_date'.
disc_date_keys = [("DISCOVERY_DATE", pymongo.ASCENDING)]
fires.create_index(disc_date_keys, name='disc_date', unique=False)

'disc_date'

In [30]:
# Non-unique ascending index on CONT_DATE, named 'cont_date'.
cont_date_keys = [("CONT_DATE", pymongo.ASCENDING)]
fires.create_index(cont_date_keys, name='cont_date', unique=False)

'cont_date'