
<img width= "300" src="https://cdn.discordapp.com/attachments/392490318798389248/945695517004951633/jeanettes_fireart.png" alt="US Fires Logo">

# Analyzing US Wildfires: Final Report

### by Lori Ainslie, Jeanette Schulz, Kristine Cabanela, and Sophia Stewart 
### Last Updated: 23 February 2022

In [1]:
# Data Science Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Group-made functions
from wrangle import *

# Blocking Warning Boxes
# NOTE(review): "ignore" with no category silences every warning raised after
# this point, not only cosmetic ones - consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

# Remove Limits On Viewing Dataframes
# (max_columns=None makes pandas display every column instead of truncating)
pd.set_option('display.max_columns', None)


---
# Acquire
---

# Information on where we found the data

In [2]:
# Acquiring the data from our wrangle
# acquire_fires is presumably supplied by the star-import of the group's
# wrangle module (it is not defined in this notebook) - TODO confirm.
# The trailing df.head() displays the first five rows as the cell output.
df = acquire_fires()
df.head()

Unnamed: 0,fire_year,discovery_date,general_cause,containment_date,fire_size,latitude,longitude,state
0,2005,2005-02-02,Power generation/transmission/distribution,2005-02-02,0.1,40.036944,-121.005833,CA
1,2004,2004-05-12,Natural,2004-05-12,0.25,38.933056,-120.404444,CA
2,2004,2004-05-31,Debris and open burning,2004-05-31,0.1,38.984167,-120.735556,CA
3,2004,2004-06-28,Natural,2004-07-03,0.1,38.559167,-119.913333,CA
4,2004,2004-06-28,Natural,2004-07-03,0.1,38.559167,-119.933056,CA


---
# Prepare
---

We started by preparing the data as a team. All decisions made were as a group to ensure our wrangled data would benefit each of our future exploration needs.

We started by dropping columns with a large number of null values:

- LOCAL_FIRE_REPORT_ID
- LOCAL_INCIDENT_ID
- FIRE_CODE
- FIRE_NAME
- ICS_209_PLUS_INCIDENT_JOIN_ID
- ICS_209_PLUS_COMPLEX_JOIN_ID
- MTBS_ID
- MTBS_FIRE_NAME
- COMPLEX_NAME
- DISCOVERY_TIME
- NWCG_CAUSE_AGE_CATEGORY
- CONT_TIME
- COUNTY
- FIPS_CODE
- FIPS_NAME

Next, this data set came with a lot of columns that had additional information. This included columns such as the `NWCG_REPORTING_AGENCY` (superfluous information) and `DISCOVERY_DOY` (unnecessary if we keep `DISCOVERY_DATE`). Thus, we dropped these additional columns as follows:  

- FOD_ID
- FPA_ID
- SOURCE_SYSTEM_TYPE
- SOURCE_SYSTEM
- NWCG_REPORTING_AGENCY
- NWCG_REPORTING_UNIT_ID
- NWCG_REPORTING_UNIT_NAME
- SOURCE_REPORTING_UNIT
- SOURCE_REPORTING_UNIT_NAME
- FIRE_SIZE_CLASS
- NWCG_CAUSE_CLASSIFICATION
- CONT_DOY
- DISCOVERY_DOY
- OWNER_DESCR

Now that we had the columns we wanted to keep, it was time to make sure they were the correct dtype and format. We made sure `DISCOVERY_DATE` and `CONT_DATE` were datetime dtypes. In addition, we renamed `nwcg_general_cause` and `cont_date` to `general_cause` and `containment_date`. Then, finally, we changed all the names of the columns to be lowercase so they would be easier to code.   
Here is what our cleaned dataframe looked like:

In [3]:
# Replace the acquired frame with the cleaned one returned by wrangle_fires().
# NOTE(review): the output recorded for this cell is a KeyError
# ("... not found in axis") naming the null-heavy columns listed above, and the
# cells after it have no execution count - TODO confirm wrangle_fires() matches
# the columns of the acquired data before re-running the notebook.
df = wrangle_fires()
df.head()

KeyError: "['LOCAL_FIRE_REPORT_ID' 'LOCAL_INCIDENT_ID' 'FIRE_CODE' 'FIRE_NAME'\n 'ICS_209_PLUS_INCIDENT_JOIN_ID' 'ICS_209_PLUS_COMPLEX_JOIN_ID' 'MTBS_ID'\n 'MTBS_FIRE_NAME' 'COMPLEX_NAME' 'DISCOVERY_TIME'\n 'NWCG_CAUSE_AGE_CATEGORY' 'CONT_TIME' 'COUNTY' 'FIPS_CODE' 'FIPS_NAME'] not found in axis"

---
# Explore
---

In [None]:
# Time-series view of the fires: indexed by discovery date and sorted on that index
time_df = df.set_index('discovery_date').sort_index()
time_df.head()

In [None]:
# Bar chart with one bar per fire_year showing how many fires were recorded
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(x='fire_year', data=df, ax=ax)
# An empty, frameless legend keeps the axes free of any legend box
ax.legend([], [], frameon=False)
ax.set_title('Number of Fires per Year', fontdict={'fontsize': 24});


In [None]:
# Tag every fire with the name of the month it was discovered in
# (time_df is indexed by discovery date), then count fires per month name
time_df['month'] = time_df.index.month_name()

fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(x='month', data=time_df, ax=ax)
ax.legend([], [], frameon=False)
ax.set_title('Total Fires for each Month', fontdict={'fontsize': 24});

In [None]:
# Horizontal bars, one per state, ordered by value_counts() (most fires first)
states_by_fire_count = df['state'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 20))
sns.countplot(y='state', order=states_by_fire_count, data=df, ax=ax)
ax.set_title('Total Fires for each State', fontdict={'fontsize': 24});

In [None]:
# Count the fires in each monthly bin (non-null fire_year entries per month)
num_fires_by_month = time_df['fire_year'].resample('M').count()

# Average those monthly counts inside 6-month bins and plot one point per bin
six_month_average = num_fires_by_month.resample('6M').mean()

fig, ax = plt.subplots(figsize=(20, 8))
six_month_average.plot(marker='o', ax=ax)
ax.set_title('Monthly Average Count of Fires by Year', fontdict={'fontsize': 24});

In [None]:
# Average the monthly fire counts within each year and plot one point per year
yearly_average = num_fires_by_month.resample('Y').mean()

fig, ax = plt.subplots(figsize=(20, 8))
yearly_average.plot(marker='o', ax=ax)
ax.set_title('Yearly Average Count of Fires by Year', fontdict={'fontsize': 24});

In [None]:
# Largest acres burned in a month?
# Sum fire_size within each monthly bin, then report both WHICH month had the
# largest total and HOW LARGE that total was. A notebook cell only displays
# its final expression, so the two values are shown together as one tuple;
# as two separate statements the idxmax() result was silently discarded.
size_fires_by_month = time_df.resample('M').fire_size.sum()
size_fires_by_month.idxmax(), size_fires_by_month.max()

In [None]:
# Line chart of the summed fire_size for every fire_year
acres_by_year = df.groupby('fire_year')['fire_size'].sum()

fig, ax = plt.subplots(figsize=(20, 8))
acres_by_year.plot(ax=ax)
ax.set_title('Acreage Burned By Wildfires', fontdict={'fontsize': 24});

In [None]:
# Lori
# Make a list of top five wildfire states
top_five_wildfire_states = df.state.value_counts().head(5).index.to_list()

# Create the dataframe for the chart: one row per (year, state) holding the
# number of fires recorded. size() counts rows directly, so the result does
# not depend on which column happens to come first after the group keys, nor
# on null values in that column (the former .count().iloc[:, 0:3] did both).
in_top_states = time_df.state.isin(top_five_wildfire_states)
num_fires_by_year = (
    time_df[in_top_states]
    .groupby(['fire_year', 'state'])
    .size()
    .reset_index(name='num_fires')
    .rename(columns={'fire_year': 'year'})
)

# Plot findings
plt.figure(figsize = (20,8))
sns.lineplot(data=num_fires_by_year, x='year', y='num_fires', hue='state')
plt.title('Number of Wildfires in the Top Five Wildfire States by Year', fontdict={'fontsize': 24});

In [None]:
# Lori
# Year of discovery for every fire (time_df is indexed by discovery date)
time_df['year'] = time_df.index.year
top_causes_of_wildfires = time_df.general_cause.value_counts().head()

# Keep the four most frequent causes that are actually specified. The
# unspecified label is removed by name rather than by assuming it is ranked
# first (the former positional [1:] slice relied on that).
unspecified_cause = 'Missing data/not specified/undetermined'
specified_top_causes = [
    cause for cause in top_causes_of_wildfires.index if cause != unspecified_cause
][:4]

# create a new dataframe that groups by year and gets a count by year of the top causes for wildfires
# size() counts rows per (year, cause) independently of column order and nulls.
num_causes_by_year = (
    time_df[time_df.general_cause.isin(specified_top_causes)]
    .groupby(['year', 'general_cause'])
    .size()
    .reset_index(name='num_occurrences')
)
num_causes_by_year


In [None]:
# Lori
# One line per cause in num_causes_by_year: yearly number of occurrences
# (top specified causes only; unspecified-cause observations are excluded upstream)
fig, ax = plt.subplots(figsize=(20, 8))
sns.lineplot(x='year', y='num_occurrences', hue='general_cause', data=num_causes_by_year, ax=ax)
ax.set_title('Number of Wildfires By Cause', fontdict={'fontsize': 24});


In [None]:
# Stacked bar chart: yearly number of fires for each of the four top causes.

def _counts_for(cause, years):
    """Return the num_causes_by_year rows for *cause*, indexed by year.

    The rows are reindexed onto *years* and a missing count becomes 0, so
    every cause shares the same x positions and the layers can be summed
    safely even when a cause has no rows for some year.
    """
    rows = num_causes_by_year[num_causes_by_year.general_cause == cause]
    return rows.set_index("year").reindex(years).fillna({"num_occurrences": 0})


# Common x axis: every year that appears for any of the plotted causes
years = sorted(num_causes_by_year.year.unique())

natural = _counts_for("Natural", years)
arson = _counts_for("Arson/incendiarism", years)
debris = _counts_for("Debris and open burning", years)
equipment = _counts_for("Equipment and vehicle use", years)

fig, ax = plt.subplots(figsize = (20,8))

# Draw the layers bottom-up; each layer sits on the running total of the
# layers drawn before it.
layers = [
    (arson, "Arson/incendiarism"),
    (natural, "Natural Causes"),
    (debris, "Debris & Open Burning"),
    (equipment, "Equipment & Vehicle Use"),
]
running_total = 0
for frame, label in layers:
    ax.bar(years, frame.num_occurrences, label=label, bottom=running_total)
    running_total = running_total + frame.num_occurrences

ax.legend()
ax.set_ylabel("n occurrences")
ax.set_title("Top Causes Of Wildfires Across The U.S.", fontdict={'fontsize': 24});


#### Number of Months in a year, that had an average days_fire_existed greater than zero
- 1992: 1
- 1993: 0 
- 1994: 3
- 1995: 1
- 1996: 3
- 1997: 1
- 1998: 2
- 1999: 3
- 2000: 3
- 2001: 2
- 2002: 5
- 2003: 4
- 2004: 4
- 2005: 7
- 2006: 6
- 2007: 3
- 2008: 9
- 2009: 6
- 2010: 5
- 2011: 4
- 2012: 7
- 2013: 6
- 2014: 5
- 2015: 4
- 2016: 4
- 2017: 6
- 2018: 6

### While this could certainly be explained better as a chart, this list of numbers represents an increase in the length of time it takes to contain a fire. In 1992, the length of time it took to contain a fire was averaged. This data shows that in 1992, most fires were contained the same day they were discovered (represented as an average of 0 days). There was only one month out of 1992 that had an average above this, wherein the average was 1 day. This means for that month, it took more than a day to contain a fire (on average). Notice in the list, this is what the '1' represents for 1992.
### Now as you move down the list through the years, the number of months in a year with an average above 0 days increases. By 2005, there is a month with an average of 10 days. This means, in that month, the average time it took to contain a fire was 10 days!
### The takeaway from this list, is that fires are taking much longer for firemen to contain when comparing 1992 to 2018. In 1992, there was only one month of that year that had a fire containment average above zero days. By 2018, we see there are six months of that year that had a fire containment average above zero days.  

In [None]:
# Jeanette
# Dropping data that has nulls in containment_date
# .copy() makes clean_df an independent frame, so the column added below is
# not written into a filtered slice of df.
clean_df = df[df.containment_date.notnull()].copy()

# Creating a column to see how many days before a fire was contained
# (computed from clean_df itself so both operands cover exactly the same rows)
clean_df['days_fire_existed'] = clean_df.containment_date - clean_df.discovery_date

# 2005 had fires that took much longer to contain. Idaho shows some fires lasting YEARS!
# Select fires discovered during August 2005 that stayed uncontained for more
# than 29 days. Comparing the datetime column with the string '2005-08' only
# matches the single timestamp 2005-08-01, so the year and month are compared
# explicitly instead.
discovered_aug_2005 = (
    (clean_df.discovery_date.dt.year == 2005)
    & (clean_df.discovery_date.dt.month == 8)
)
lasted_over_29_days = clean_df.days_fire_existed > pd.Timedelta(days=29)
clean_df[discovered_aug_2005 & lasted_over_29_days]

### While investigating a certain date, we found that there were some fires that lasted well over a year. It's difficult to say why these fires were considered uncontained for so long, and yet burned very few acres. There is a chance these Idaho fires represent the coal mines that caught on fire and were left to burn; however, there is no way to prove whether these fires represent those fires or a different set of fires entirely. 

In [None]:
# Jeanette
# Making a texas dataframe
# .copy() gives an independent frame so columns can be added to it later
# without writing into a filtered slice of df.
texas = df[df.state == 'TX'].copy()

# Let's look at how many fires Texas has had over the years
# hue repeats the x variable only to colour each year differently;
# dodge=False keeps every bar full-width and centred on its tick instead of
# splitting each x slot among all hue levels.
plt.figure(figsize = (20,8))
sns.countplot(data= texas , x= 'fire_year', hue='fire_year', dodge=False)
plt.legend([],[], frameon=False)
plt.title('Texas Wildfires have Increased in Number', fontdict={'fontsize': 24});

### The chart above shows that the number of fires in Texas was clearly less before 2005. There is a significant increase in the fires between 2005 and 2011. After 2011, it seems the number of fires in a year decreases, but we don't see the smaller numbers as seen before 2005. The "new normal" of fires per year in Texas seems to be between 7,500 and 10,000. This could be a result of climate change getting worse. Unfortunately, it could also be a result of poor reporting in Texas before the year 2005.

In [None]:
# Jeanette
# Line chart of the summed fire_size per fire_year for the Texas subset
texas_acres_by_year = texas.groupby('fire_year')['fire_size'].sum()

fig, ax = plt.subplots(figsize=(20, 8))
texas_acres_by_year.plot(ax=ax)
ax.set_title('Texas Acres Burned', fontdict={'fontsize': 24});

### This is another visualization for Texas fires, showing the number of acres burned. When compared with the visual above showing the number of fires, it's no surprise to see that the most acres burned was between 2005 and 2012. While the number of fires after 2012 is higher than in the 1990s, this chart shows the acres burned is still averaging the same with only a slight increase. Perhaps this means a better response time by firemen? Despite the increase in number of fires, the acres burned seems to be managed. 

In [None]:
# Jeanette
# Are humans the main cause of wildfires?
# Cause labels treated as human-caused
human_cause = [
    'Debris and open burning',
    'Arson/incendiarism',
    'Equipment and vehicle use',
    'Recreation and ceremony',
    'Misuse of fire by a minor',
    'Smoking',
    'Power generation/transmission/distribution',
    'Fireworks',
    'Railroad operations and maintenance',
    'Firearms and explosives use',
]

# Every remaining label: natural, unspecified, or "other"
other_cause = [
    'Missing data/not specified/undetermined',
    'Natural',
    'Other causes',
]

# Tally the fires in clean_df whose cause falls in each group
cause_of_each_fire = clean_df.general_cause
human_fires = int(cause_of_each_fire.isin(human_cause).sum())
other_fires = int(cause_of_each_fire.isin(other_cause).sum())

print('Total Human Caused Fires:', human_fires)
print('Total Other Caused Fires:', other_fires)


### A lot of these fires have their `general_cause` labeled as 'Missing data/not specified/undetermined'. Despite this, the data clearly shows that humans are the main cause of wildfires, not nature. While climate change plays a big role in making these fires worse, the cause of the fire is still in the hands of humans. 

In [None]:
# Kristine
# Bar chart of the number of fires attributed to each cause (value_counts
# order, i.e. most frequent first); the figure is also written to Dist.png
fires_per_cause = df['general_cause'].value_counts()

fig, ax = plt.subplots(figsize=(10, 8))
fires_per_cause.plot.bar(width=.7, ec='black', color='red', ax=ax)
ax.set_title('# of Fires vs cause')
ax.set_ylabel('Number of Fires')
ax.set_xlabel('cause')
fig.savefig("Dist.png")

#### * The chart above shows a visual of the distribution of fires categorized by cause from largest to smallest number of fires in each category. 

- Aside from missing/undetermined causes, Debris and open burning, Natural, and Arson are the top three causes for most wildfires.  However, human interaction is prevalent in the majority of causes overall.

*Largest cause of wildfires is "missing data/not specified/undetermined"*

In [None]:
# Kristine
# One red bar per fire_year; seaborn's barplot aggregates fire_size within
# each year using its default estimator
fig, firesizeyear = plt.subplots(figsize=(20, 8))
sns.barplot(data=df, x="fire_year", y="fire_size", color='red', ax=firesizeyear)
firesizeyear.set_title('Fire Size Per Year', fontdict={'fontsize': 24})
# Turn the year labels sideways so they do not overlap
firesizeyear.set_xticklabels(firesizeyear.get_xticklabels(), rotation=90);

#### * The chart above shows a visual of the distribution of fire size over each recorded year*

- Although fire size changes from highs and lows as time progresses, the overall size after 2000 seems to have an upward trajectory.

In [None]:
# add new column for day of the week
# (the weekday name of each fire's discovery date)
# NOTE(review): the next cell recomputes this same column, so this cell only
# adds the printed preview of it.
df['day_of_week']= df['discovery_date'].dt.day_name()
print(df['day_of_week'])

In [None]:
# Kristine
# Weekday name on which each fire was discovered
df['day_of_week'] = df['discovery_date'].dt.day_name()

# Number of fires per weekday, arranged from the fewest to the most fires
fires_per_weekday = df.groupby(['day_of_week']).size()
day_of_week = fires_per_weekday.reset_index(name='count').sort_values('count')

# Bar chart of those counts, in the sorted order built above
fig, g = plt.subplots(figsize=(20, 8))
sns.barplot(x='day_of_week', y='count', data=day_of_week, ax=g)
g.set_xlabel('Day of the week')
g.set_ylabel('Number of wildfires')
g.set_title('Number of Fires for Day of the Week', fontsize=20);

In [None]:
# Kristine
# Restrict to fires whose cause is "Debris and open burning", then count them
# per weekday and arrange the weekdays from the fewest to the most fires
debris_fires = df[df['general_cause'] == 'Debris and open burning']
debris_per_weekday = debris_fires.groupby(['day_of_week']).size()
debris_over_weekday = debris_per_weekday.reset_index(name='count').sort_values('count')

# Bar chart of the debris-caused fire counts for each weekday
fig, g = plt.subplots(figsize=(20, 8))
sns.barplot(x='day_of_week', y='count', data=debris_over_weekday, ax=g)
g.set_xlabel('Day of week')
g.set_ylabel('Number of wildfire cases due to Debris')
g.set_title('Number of Wildfire Per Day of Week Caused by Debris and Open Burning', fontsize=24);

#### * As predicted, the weekends are the most vulnerable days for wildfires to occur. Saturday, Sunday, and Monday have the highest numbers of recorded fires. Furthermore, fires caused by Debris and open burning are most prevalent on Saturdays*

In [None]:
# # Sophia
# # plot fires in Texas
# plt.figure(figsize=(15,15))
# sns.scatterplot(x='longitude', y='latitude', data=df[df.state=='TX'], size='fire_size_class', hue='fire_year', palette='flare', alpha=0.5);
# # light to dark, oldest to newest


In [None]:
# Sophia
# create new column: time between discovery and containment of each Texas fire.
# assign() returns a new frame that is bound back to the name, so the column
# is not written into texas in place (texas is a filtered selection of df,
# which makes in-place column assignment a chained assignment).
texas = texas.assign(days_uncontrolled=texas.containment_date - texas.discovery_date)

# look at value counts for uncontrolled windows
texas.days_uncontrolled.value_counts().sort_index()

In [None]:
# Sophia
# Scatter every fire at its longitude/latitude; marker size follows fire_size
# and colour follows fire_year ('flare' palette, semi-transparent markers)
fig, ax = plt.subplots(figsize=(20, 12))
sns.scatterplot(
    data=df,
    x='longitude',
    y='latitude',
    hue='fire_year',
    size='fire_size',
    palette='flare',
    alpha=0.5,
    ax=ax,
)
ax.set_title('All Fires in the US', fontdict={'fontsize': 24});


In [None]:
# Sophia
# print max and min for num fires/year
# For every year, tally the fires per state once and reuse that tally for all
# four printed values (it used to be rebuilt from df for each of them).
for year in range(1992, 2019):
    print(year)
    fires_per_state = df[df.fire_year == year].groupby('state').fire_size.count()
    print('Fewest fires:', fires_per_state.idxmin(), fires_per_state.min())
    print('Most fires:', fires_per_state.idxmax(), fires_per_state.max())
    print()
    

---
# Conclusion
---

Summary goes here