In [1]:
# !pip install pandas
# !pip install openpyxl
# !pip install mlxtend


import pandas as pd
from mlxtend.frequent_patterns import fpgrowth

In [2]:
# url = "https://open.alberta.ca/dataset/a221e7a0-4f46-4be7-9c5a-e29de9a3447e/resource/80480824-0c50-456c-9723-f9d4fc136141/download/fp-historical-wildfire-data-2006-2023.xlsx"

# data = pd.read_excel(url)

data = pd.read_excel('fp-historical-wildfire-data-2006-2023.xlsx')
data.shape

(25321, 50)

In [3]:
data.head()

Unnamed: 0,fire_year,fire_number,fire_name,current_size,size_class,fire_location_latitude,fire_location_longitude,fire_origin,general_cause_desc,industry_identifier_desc,...,distance_from_water_source,first_bucket_drop_date,bh_fs_date,bh_hectares,uc_fs_date,uc_hectares,to_fs_date,to_hectares,ex_fs_date,ex_hectares
0,2006,PWF001,,0.1,A,56.249956,-117.18196,Private Land,Resident,,...,,,2006-04-02 22:00:00,0.01,2006-04-02 22:00:00,0.01,,,2006-04-03 10:20:00,0.1
1,2006,EWF002,,0.2,B,53.606367,-115.915733,Provincial Land,Incendiary,,...,,,2006-04-03 13:20:00,0.2,2006-04-03 13:20:00,0.2,,,2006-04-03 14:00:00,0.2
2,2006,EWF001,,0.5,B,53.610933,-115.594267,Provincial Land,Incendiary,,...,,,2006-04-03 13:23:00,0.5,2006-04-03 13:23:00,0.5,,,2006-04-03 15:00:00,0.5
3,2006,EWF003,,0.01,A,53.608867,-115.609467,Provincial Land,Incendiary,,...,,,2006-04-03 14:08:00,0.01,2006-04-03 14:08:00,0.01,,,2006-04-03 15:05:00,0.01
4,2006,PWF002,,0.1,A,56.249956,-117.050249,Provincial Land,Other Industry,Waste Disposal,...,,,2006-04-03 19:57:00,0.1,2006-04-03 20:19:00,0.1,2006-04-03 20:20:00,0.1,2006-04-05 10:18:00,0.1


In [4]:
data.columns

Index(['fire_year', 'fire_number', 'fire_name', 'current_size', 'size_class',
       'fire_location_latitude', 'fire_location_longitude', 'fire_origin',
       'general_cause_desc', 'industry_identifier_desc',
       'responsible_group_desc', 'activity_class', 'true_cause',
       'fire_start_date', 'det_agent_type', 'det_agent', 'discovered_date',
       'discovered_size', 'reported_date', 'dispatched_resource',
       'dispatch_date', 'start_for_fire_date', 'assessment_resource',
       'assessment_datetime', 'assessment_hectares', 'fire_spread_rate',
       'fire_type', 'fire_position_on_slope', 'weather_conditions_over_fire',
       'temperature', 'relative_humidity', 'wind_direction', 'wind_speed',
       'fuel_type', 'initial_action_by', 'ia_arrival_at_fire_date',
       'ia_access', 'fire_fighting_start_date', 'fire_fighting_start_size',
       'bucketing_on_fire', 'distance_from_water_source',
       'first_bucket_drop_date', 'bh_fs_date', 'bh_hectares', 'uc_fs_date',
       'u

In [5]:
sub_dataset_columns = [
    'temperature',
    'wind_speed',
    'fire_position_on_slope',
    'wind_direction',
    'relative_humidity',
    'fire_type',
    'weather_conditions_over_fire',
    'current_size'
]

sub_data = data[sub_dataset_columns]
sub_data.head()

Unnamed: 0,temperature,wind_speed,fire_position_on_slope,wind_direction,relative_humidity,fire_type,weather_conditions_over_fire,current_size
0,18.0,2.0,Flat,SW,10.0,Surface,Clear,0.1
1,12.0,10.0,Lower 1/3,SW,22.0,Surface,Clear,0.2
2,12.0,10.0,Bottom,SW,22.0,Surface,Clear,0.5
3,12.0,10.0,Flat,SW,22.0,Surface,Clear,0.01
4,6.0,2.0,Flat,SW,37.0,Surface,Clear,0.1


In [6]:
for column in sub_dataset_columns:
    if sub_data[column].isnull().any():
        print(f"The column '{column}' contains null or NA values.")
    else:
        print(f"The column '{column}' does not contain null or NA values.")

The column 'temperature' contains null or NA values.
The column 'wind_speed' contains null or NA values.
The column 'fire_position_on_slope' contains null or NA values.
The column 'wind_direction' contains null or NA values.
The column 'relative_humidity' contains null or NA values.
The column 'fire_type' contains null or NA values.
The column 'weather_conditions_over_fire' contains null or NA values.
The column 'current_size' does not contain null or NA values.


In [7]:
sub_dataset_cleaned = sub_data.dropna(subset=sub_dataset_columns)
sub_dataset_cleaned.shape

(22486, 8)

In [8]:
cat_col = [
    'fire_position_on_slope',
    'wind_direction',
    'fire_type',
    'weather_conditions_over_fire'
]
cat_subdata = sub_dataset_cleaned[cat_col]
cat_subdata.head()

Unnamed: 0,fire_position_on_slope,wind_direction,fire_type,weather_conditions_over_fire
0,Flat,SW,Surface,Clear
1,Lower 1/3,SW,Surface,Clear
2,Bottom,SW,Surface,Clear
3,Flat,SW,Surface,Clear
4,Flat,SW,Surface,Clear


In [11]:
[cat_subdata.fire_position_on_slope.unique(),
cat_subdata.wind_direction.unique(),
cat_subdata.fire_type.unique(),
cat_subdata.weather_conditions_over_fire.unique()]

[array(['Flat', 'Lower 1/3', 'Bottom', 'Upper 1/3', 'Middle 1/3'],
       dtype=object),
 array(['SW', 'S', 'W', 'E', 'NW', 'CLM', 'N', 'SE', 'NE'], dtype=object),
 array(['Surface', 'Ground', 'Crown'], dtype=object),
 array(['Clear', 'Cloudy', 'CB Wet', 'Rainshowers', 'CB Dry'], dtype=object)]

In [10]:
cat_subdata.wind_direction[cat_subdata.wind_direction == ' S'] = 'S'
cat_subdata.wind_direction[cat_subdata.wind_direction == ' NW'] = 'NW'
cat_subdata.fire_type[cat_subdata.fire_type == '   Surface'] = 'Surface'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_subdata.wind_direction[cat_subdata.wind_direction == ' S'] = 'S'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_subdata.wind_direction[cat_subdata.wind_direction == ' NW'] = 'NW'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_subdata.fire_type[cat_subdata.fire_type == '   Surface'] = 'Surface'


In [12]:
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(cat_subdata.to_numpy()).transform(cat_subdata.to_numpy())
cat_subdata_trans = pd.DataFrame(te_ary, columns=te.columns_)
cat_subdata_trans

Unnamed: 0,Bottom,CB Dry,CB Wet,CLM,Clear,Cloudy,Crown,E,Flat,Ground,...,N,NE,NW,Rainshowers,S,SE,SW,Surface,Upper 1/3,W
0,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,False,True,True,False,False
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
2,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
3,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,False,True,True,False,False
4,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22481,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
22482,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
22483,False,False,False,False,False,True,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
22484,False,False,False,False,False,True,False,False,True,False,...,False,True,False,False,False,False,False,True,False,False


In [13]:
frequent_itemsets = fpgrowth(cat_subdata_trans, min_support=0.08, use_colnames=True)

frequent_itemsets['support'] = frequent_itemsets['support'].round(4)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8166,(Flat)
1,0.7351,(Surface)
2,0.4674,(Clear)
3,0.1206,(SW)
4,0.3608,(Cloudy)
5,0.2268,(W)
6,0.0809,(E)
7,0.2202,(Ground)
8,0.1193,(NW)
9,0.1445,(CLM)


In [9]:
# quan_col = [
#     'temperature',
#     'wind_speed',
#     'relative_humidity'
# ]

# quan_subdata = sub_dataset_cleaned[quan_col]
# quan_subdata.head()

Unnamed: 0,temperature,wind_speed,relative_humidity
0,18.0,2.0,10.0
1,12.0,10.0,22.0
2,12.0,10.0,22.0
3,12.0,10.0,22.0
4,6.0,2.0,37.0


In [32]:
frequent_itemsets = fpgrowth(cat_subdata_trans, min_support=0.5, use_colnames=True)

frequent_itemsets['support'] = frequent_itemsets['support'].round(2)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.82,(Flat)
1,0.74,(Surface)
2,0.6,"(Flat, Surface)"
