In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime

import wrangle
import explore

import warnings
warnings.filterwarnings('ignore')

In [2]:
# set style defaults
plt.rc('figure', figsize=(13, 7))
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and the old
# names were removed afterwards, so prefer the new name and fall back on older installs
darkgrid_style = 'seaborn-v0_8-darkgrid'
if darkgrid_style not in plt.style.available:
    darkgrid_style = 'seaborn-darkgrid'
plt.style.use(darkgrid_style)

In [3]:
# change setting to view all columns in jupyter notebook
pd.set_option('display.max_columns', None)

In [4]:
# use a function to pull in the data
fires = wrangle.wrangle_fires()
fires.head()

Unnamed: 0,fire_year,discovery_date,general_cause,containment_date,fire_size,latitude,longitude,state,state_size,region,region_size,fire_size_cat
0,2005,2005-02-02,Power generation/transmission/distribution,2005-02-02,0.1,40.036944,-121.005833,CA,101676000.0,west,1008831000.0,small
1,2004,2004-05-12,Natural,2004-05-12,0.25,38.933056,-120.404444,CA,101676000.0,west,1008831000.0,small
2,2004,2004-05-31,Debris and open burning,2004-05-31,0.1,38.984167,-120.735556,CA,101676000.0,west,1008831000.0,small
3,2004,2004-06-28,Natural,2004-07-03,0.1,38.559167,-119.913333,CA,101676000.0,west,1008831000.0,small
4,2004,2004-06-28,Natural,2004-07-03,0.1,38.559167,-119.933056,CA,101676000.0,west,1008831000.0,small


# Univariate Analysis

In [None]:
# check info
fires.info()

In [None]:
# set discovery date as index
df = fires.set_index('discovery_date').sort_index()
df.head(2)

In [None]:
# check oldest and newest discovery dates in the dataset
print(df.index.min(), df.index.max())

### We have 27 years of data.

In [None]:
# create a year and month column
df['year'] = df.index.year
df['month'] = df.index.month_name()
df.head()

In [None]:
# check counts for each cause
df.general_cause.value_counts()

### It looks like a good chunk of our data has no specified cause.

In [None]:
# plot fire size
df.fire_size.hist()

In [None]:
# get statistics for fire size
df.fire_size.describe().apply(lambda x: format(x, 'f'))

### The data contains some outliers when it comes to fire size

In [None]:
# # create a scatter plot of fire by latitude and longitude
# sns.scatterplot(x='longitude', y='latitude', hue='state', data=df)

In [None]:
# check value counts by state
df.state.value_counts()

### States in the northeastern part of the United States have the fewest wildfires. Not surprisingly, California has the most. The other states with the most wildfires seem to be primarily in the South.

# How has the occurrence of wildfires changed over time? How does wildfire seasonality vary by location?

### What is the number of fires per year for all years we have in the data?

In [None]:
# plot counts by year
sns.countplot(data=df, x='year')

### 2006, 2007, and 2011 had the most wildfires. There almost appears to be a 5-6 year cycle in the number of wildfires.

### Are wildfires more prevalent at certain times of year?

In [None]:
# plot counts by month
sns.countplot(data=df, x='month')

### Not surprisingly, July & August tend to have more fires which is likely due to heat, camping, and dry weather. It is surprising to see that March & April have a higher number of fires as well.

### Get visualization of value counts by state

In [None]:
# plot counts by state
plt.figure(figsize=(10,20))
sns.countplot(data=df, y='state', order=df.state.value_counts().index)

In [None]:
# find out what percentage of fires occur in each state
df.state.value_counts(normalize=True)

In [None]:
# fraction of each year's fires that occurred in each state:
# per-(year, state) fire counts divided by per-year fire counts
# NOTE(review): relies on pandas aligning the two groupby results on their shared
# 'year' index level — confirm the ratios look right before using them
x = pd.DataFrame(df.groupby(['year', 'state']).fire_year.count()/df.groupby('year').fire_year.count())
x.index

### What does the number of fires and size of fires look like by month?

In [None]:
# resample by month and get count of wildfires by month
num_fires_by_month = df.resample('M').fire_year.count()
num_fires_by_month.head()

In [None]:
# check number of rows
num_fires_by_month.shape

In [None]:
# check which month and year combination had the most fires
num_fires_by_month.idxmax()

In [None]:
# check how many fires happened that month
num_fires_by_month.max()

### March 2006 had the highest number of fires within the dataset with slightly over 19K fires occurring in that month.

In [None]:
# plot monthly number of fires over the years
num_fires_by_month.plot()

In [None]:
# plot number of fires over the years resampled by 6 months
num_fires_by_month.resample('6M').mean().plot(marker='o')

In [None]:
# plot number of fires over the years resampled by a year
num_fires_by_month.resample('Y').mean().plot(marker='o')

In [None]:
df.head()

In [None]:
# resample by month and get summed size of wildfires by month
size_fires_by_month = df.resample('M').fire_size.sum()
size_fires_by_month.head()

In [None]:
# check which month and year combination had the largest summed size of wildfires
size_fires_by_month.idxmax()

In [None]:
# check total sum of fire sizes that month
size_fires_by_month.max()

### Figure out how much this is and determine a good reference so audience can understand the impact

In [None]:
# plot summed size of fires per year (grouped by the 'year' column, not resampled by month)
df.groupby('year').fire_size.sum().plot()

In [None]:
# plot monthly average size of fires over the years
df.resample('M').fire_size.mean().plot()

In [None]:
# plot yearly average size of fires over the years
df.resample('Y').fire_size.mean().plot()

### When resampling by year, there is no clear trend in the *number* of wildfires; however, we do see that the average *size* of wildfires is on an upward trend.

In [None]:
df.head()

In [None]:
# get statistics for fire size
df.fire_size.describe().apply(lambda x: format(x, 'f'))

In [None]:
df[df.fire_size > 100]

In [None]:
df.head()

In [None]:
top_five_wildfire_states = df.state.value_counts().head(5).index.to_list()
top_five_wildfire_states

In [None]:
# yearly fire counts for the five states with the most fires overall.
# fire_year is counted by name (instead of slicing the first three columns by position)
# so the result does not depend on the column order of df
num_fires_by_year = (
    df[df.state.isin(top_five_wildfire_states)]
    .groupby(['year', 'state'])
    .fire_year.count()
    .reset_index(name='num_fires')
)
num_fires_by_year

In [None]:

sns.lineplot(data=num_fires_by_year, x='year', y='num_fires', hue='state')

In [None]:

top_causes_of_wildfires = df.general_cause.value_counts().head()
top_causes_of_wildfires

In [None]:
top_causes_of_wildfires[1:].index.to_list()

In [None]:
# create a new dataframe that groups by year and gets a count by year of the top causes for wildfires
# [1:] skips the single most frequent entry of top_causes_of_wildfires
# NOTE(review): presumably that entry is the "cause not specified" bucket — verify
# against the value_counts output above
# fire_year is counted by name (instead of slicing the first three columns by position)
# so the result does not depend on the column order of df
num_causes_by_year = (
    df[df.general_cause.isin(top_causes_of_wildfires[1:].index.to_list())]
    .groupby(['year', 'general_cause'])
    .fire_year.count()
    .reset_index(name='num_occurrences')
)
num_causes_by_year

In [None]:
# plot trends over the years by the top four causes of wildfires, excluding observations where cause was not specified
sns.lineplot(data=num_causes_by_year, x='year', y='num_occurrences', hue='general_cause')

In [None]:
num_causes_by_year.head()

In [None]:
num_causes_by_year.to_csv('num_causes_by_year')

In [None]:
num_causes_by_year.shape

In [None]:
# one frame per cause, indexed by year so every layer of the stack shares the same x positions
cause_col = num_causes_by_year.general_cause
natural = num_causes_by_year[cause_col == "Natural"].set_index("year")
arson = num_causes_by_year[cause_col == "Arson/incendiarism"].set_index("year")
debris = num_causes_by_year[cause_col == "Debris and open burning"].set_index("year")
equipment = num_causes_by_year[cause_col == "Equipment and vehicle use"].set_index("year")

fig, ax = plt.subplots()
labels = arson.index

# (frame, legend label) pairs in stacking order, bottom layer first
layers = [
    (arson, "Arson/incendiarism"),
    (natural, "Natural Causes"),
    (debris, "Debris & Open Burning"),
    (equipment, "Equipment & Vehicle Use"),
]

# running total of the layers already drawn; None until the first layer is placed
stack_height = None
for cause_frame, legend_label in layers:
    if stack_height is None:
        ax.bar(labels, cause_frame.num_occurrences, label=legend_label)
        stack_height = cause_frame.num_occurrences
    else:
        ax.bar(labels, cause_frame.num_occurrences, label=legend_label, bottom=stack_height)
        stack_height = cause_frame.num_occurrences + stack_height

ax.legend()
ax.set_ylabel("n occurrences")
ax.set_title("Top Causes Of Wildfires Across The U.S.")

In [None]:
# plot wildfires caused by natural causes by year
sns.countplot(data=df[df.general_cause == 'Natural'], y='year')

### In recent years, we have seen a decrease in number of wildfires that were caused by natural causes

In [None]:
# plot wildfires caused by Debris and open burning by year
sns.countplot(data=df[df.general_cause == 'Debris and open burning'], y='year')

### There appears to be a slight upward trend in debris and open burning as a cause of wildfires

In [None]:
# plot wildfires caused by Arson/incendiarism by year
sns.countplot(data=df[df.general_cause == 'Arson/incendiarism'], y='year')

### There are some spikes but otherwise a downward trend in arson/incendiarism as a cause for wildfires

In [None]:
# plot wildfires caused by Equipment and vehicle use by year
sns.countplot(data=df[df.general_cause == 'Equipment and vehicle use'], y='year')

### There are no obvious trends or anything that really stands out in this plot for equipment and vehicle use as a cause for wildfires. 

### There's no one cause for the higher number of fires in 2006. All causes have higher numbers for this year.

In [None]:
# get statistics for fire size
df.fire_size.describe().apply(lambda x: format(x, 'f'))

### I want to get a better understanding of range of wildfire sizes

In [None]:
# print selected percentiles of fire size, from the 10th up to the maximum (100th);
# each pair is (label shown in the message, quantile level passed to pandas)
percentile_levels = [
    (10, .1), (20, .2), (25, .25), (30, .3), (40, .4), (50, .5),
    (60, .6), (70, .7), (75, .75), (80, .8), (90, .9), (100, 1),
]
for percent, level in percentile_levels:
    print(f'The {percent}th percentile is: {df.fire_size.quantile(level)}')

In [None]:
df.fire_size.hist()

In [None]:
df[df.fire_size > 100000].shape

In [None]:
# check 
df[df.fire_size > 100000].fire_size.hist()

In [None]:
# plot counts of large fires by year
sns.countplot(data=df[df.fire_size > 100000], x='year')

### I want to graph the average size of wildfires per decade to see if I can clearly represent the trend to our audience.

In [None]:
# create another column categorizing decade
# conditions are evaluated in order, so each row gets the first label whose bound it satisfies
conditions = [df.year < 2000, df.year <2010, df.year <2020]
choices = ['1992 - 1999', '2000 - 2009', '2010 - ']
# np.select fills unmatched rows with the integer 0 unless told otherwise; that mixes an
# int into string labels (newer NumPy releases raise a TypeError for it), so pass an
# explicit string default for any year outside the ranges above
df['decade'] = np.select(conditions, choices, default='unknown')
df.head()

In [None]:
# plot average fire size by decade
sns.barplot(data=df, x='decade', y='fire_size')

### I am also going to try to graph average fire size per five-year period (quinquennium) between 1996 and 2015 to see if that also represents this trend

In [None]:
# create another dataframe where I can chunk the data into 5-year periods
# .copy() makes df2 an independent frame, so adding columns to it later does not
# operate on a slice of df (the source of pandas' SettingWithCopyWarning)
df2 = df[(df.year >=1996) & (df.year <=2015)].copy()
df2.year.value_counts().sort_index()

In [None]:
# create another column categorizing quinquennial
# conditions are evaluated in order, so each row gets the first label whose bound it satisfies
conditions = [df2.year <= 2000, df2.year <= 2005, df2.year <= 2010, df2.year <= 2015]
choices = ['1996 - 2000', '2001 - 2005', '2006 - 2010', '2011 - 2015']
# explicit string default: np.select otherwise fills unmatched rows with the integer 0,
# which does not share a dtype with the string labels (newer NumPy raises a TypeError).
# assign() returns a new frame instead of writing into what may be a slice of df
df2 = df2.assign(quinquennial=np.select(conditions, choices, default='unknown'))
df2.head()

In [None]:
# plot average fire size by quinquennial
sns.barplot(data=df2, x='quinquennial', y='fire_size')

In [None]:
yearly_mean_fire_size = df.groupby('year').fire_size.mean().reset_index()

In [None]:
# # plot scatterplot with regression line
# # sns.color_palette("rocket", as_cmap=True)
# sns.lmplot(x='year', y='fire_size', data=yearly_mean_fire_size, height=8, markers='x', seed=321, robust=True)
# plt.title('Annual Average Acres Burned Is Increasing', size=14, color_palette='rocket')
# plt.xlabel('Year')
# plt.ylabel('Acres Burned');

In [None]:
# yearly means of the numeric columns; numeric_only=True skips text columns such as
# state and general_cause, which pandas 2.x refuses to average (TypeError)
df.groupby('year').mean(numeric_only=True)

The smallest wildfire is only about 4.5 square feet

25% of fires are less than 1/10 of an acre

50% are less than one acre

75% are less than three acres

20% of fires are greater than 5 acres, which is equivalent to a little under four football fields.

The largest wildfire is larger than 500,000 football fields or 85% of Rhode Island, our smallest state

---

#### I am going to create a column categorizing the fires as:

* small: up to the size of two tennis courts
* medium: up to the size of a football field
* large: up to the size of 4 football fields
* extra_lg: anything larger than this

In [None]:
# create another column categorizing fire size
# thresholds are compared against fire_size in order; the first matching condition wins
conditions = [df.fire_size <=.13, df.fire_size <=1.3, df.fire_size <=5.2, df.fire_size >5.2]
choices = ['small', 'medium', 'large', 'extra_lg']
# rows matching no condition (e.g. a missing fire_size) would otherwise receive np.select's
# integer default 0, which does not share a dtype with the string labels (newer NumPy
# raises a TypeError); label them explicitly instead
df['size_category'] = np.select(conditions, choices, default='unknown')
df.head()

In [None]:
df.size_category.value_counts(normalize=True)

In [None]:
# plot size of fires by latitude and longitude
sns.scatterplot(x='longitude', y='latitude', hue='size_category', data=df)

### With all the fires plotted across all the years, we see that the large majority seems to be smaller fires. Alaska does have a large amount of extra-large fires

In [None]:
# checking to see what the fire size and location looks like for 2006
sns.scatterplot(x='longitude', y='latitude', hue='size_category', data=df[df.year == 2006])

### It appears that the majority of the largest wildfires in 2006 happened in the central area of the US

In [None]:
df.head()

In [None]:
df.groupby('state').fire_size.sum().sort_values(ascending=False).head()

In [None]:
# ten states with the largest summed fire_size, rescaled to millions and rounded
state_totals = df.groupby('state').fire_size.sum()
y = state_totals.sort_values(ascending=False).head(10).reset_index()
y['fire_size'] = (y['fire_size'] / 1_000_000).round()
y

In [None]:
sns.barplot(x='state', y='fire_size', data=y)

In [None]:
df[df.state == 'AK'].fire_size.sum()

### I am going to further explore extra-large fires to see if there's anything there.

In [None]:
lg_df = df[df.size_category == 'extra_lg']
lg_df.info()

In [None]:

lg_df.fire_size.describe()

In [None]:
lg_df.general_cause.value_counts()

In [None]:
x = df[df.fire_size > 9]
x.general_cause.value_counts()

In [None]:

df[df.general_cause == 'Debris and open burning'].fire_size.describe()

In [None]:

df[df.general_cause == 'Debris and open burning'].fire_size.hist()

In [None]:

df[(df.general_cause == 'Debris and open burning') & (df.fire_size > 50_000)].info()

In [None]:

df[df.general_cause == 'Natural'].fire_size.describe()

In [None]:

df[df.general_cause == 'Natural'].fire_size.hist()

In [None]:
df[(df.general_cause == 'Natural') & (df.fire_size > 50_000)].info()

In [None]:

df[df.general_cause == 'Arson/incendiarism'].fire_size.describe()

In [None]:
df[df.general_cause == 'Arson/incendiarism'].fire_size.hist()

In [None]:

df[(df.general_cause == 'Arson/incendiarism') & (df.fire_size > 50_000)].info()

### Below is a list of the states with the most and fewest fires for each year in our dataset. Washington DC is most often the place with the fewest fires per year. Apart from DC, the states with the fewest fires per year include Northeastern and Mid-Atlantic states such as Maryland, Vermont, Delaware, and Massachusetts; Puerto Rico had the fewest fires in 1996. California, Georgia, and Texas are the states with the greatest number of fires for every year except 2018, when Arizona had the most with 9,738 fires.

In [None]:
# # Sophia
# # print max and min for num fires/year
# for year in list(range(1992, 2019)):
#     print(year)
#     print('Fewest fires:', df[df.fire_year == year].groupby('state').fire_size.count().idxmin(), df[df.fire_year == year].groupby('state').fire_size.count().min())
#     print('Most fires:',df[df.fire_year == year].groupby('state').fire_size.count().idxmax(), df[df.fire_year == year].groupby('state').fire_size.count().max())
#     print()

### A teammate discovered that the annual number of wildfires recorded for TX in our database almost doubled for each year from 2005 onward compared with 1992 - 2004. I am going to do a little exploring to see if I can discover why there's such a difference.

In [None]:
# create separate dataframes for each period
texas_92 = df[(df.state == 'TX') & (df.fire_year < 2005)]
texas_05 = df[(df.state == 'TX') & (df.fire_year >= 2005)]
texas_05.head()

In [None]:
# plot counts by year for period prior to 2005
sns.countplot(data=texas_92, x='year')

In [None]:
# plot counts by year for period 2005 and after
sns.countplot(data=texas_05, x='year')

### If we look at the numbers on the y-axis, the pre-2005 year with the most fires (1996) has fewer fires than the year with the fewest fires (2007) in the period 2005 and after. This is a huge jump and slightly suspicious. Climate change is causing an increase in wildfires, but not with that much of a difference from one year to the next.

In [None]:
# check value counts for cause and normalize
texas_92.general_cause.value_counts(normalize=True)

In [None]:
# check value counts for cause and normalize
texas_05.general_cause.value_counts(normalize=True)

Debris and open burning as a cause is down 6%

Missing data as a cause is up 100%

Equipment and vehicle use as a cause is up 7%

### There's nothing here that stands out as the reason for more fires. The significant increase in missing data makes me wonder if there may just have been a change in recording wildfires.

In [None]:
# check stats for wildfire size to see if there's a difference
texas_92.fire_size.describe()

In [None]:
# check stats for wildfire size to see if there's a difference
texas_05.fire_size.describe()

### The fact that there are many more smaller fires reaffirms my suspicions. It could be that prior to 2005 many of the smaller fires that were quickly contained were not documented, and that 2005 was when they started being more comprehensive in their documentation of fires.

In [None]:
df[(df.state == 'TX') & (df.fire_year == 2004)].general_cause.value_counts(normalize=True)

In [None]:
df[(df.state == 'TX') & (df.fire_year == 2005)].general_cause.value_counts(normalize=True)

In [None]:
df[(df.state == 'TX') & (df.fire_year == 2004)].fire_size.describe()

In [None]:
df[(df.state == 'TX') & (df.fire_year == 2005)].fire_size.describe()

In [None]:
df = pd.read_csv('fires.csv', index_col=0)
df.head()

In [None]:
df[df.FIRE_SIZE_CLASS == 'G'].sort_values('FIRE_SIZE')

In [None]:
df.FIRE_SIZE.sum()

In [None]:
df[df.FIRE_SIZE_CLASS == 'G'].FIRE_SIZE.sum()

In [None]:
df[df.FIRE_SIZE_CLASS != 'G'].FIRE_SIZE.sum()

In [None]:
df[df.FIRE_SIZE_CLASS == 'G'].FIRE_SIZE.sum() / df.FIRE_SIZE.sum()

In [None]:
df[df.FIRE_SIZE_CLASS == 'G'].NWCG_GENERAL_CAUSE.value_counts()

In [None]:
# plot counts by year
sns.countplot(data=df[df.FIRE_SIZE_CLASS == 'G'], x='FIRE_YEAR')

In [None]:
# plot counts by month
sns.countplot(data=df, x='month')