In [25]:
# !pip install pandas
# !pip install openpyxl
# !pip install mlxtend


import pandas as pd
from mlxtend.frequent_patterns import fpgrowth

In [26]:
# url = "https://open.alberta.ca/dataset/a221e7a0-4f46-4be7-9c5a-e29de9a3447e/resource/80480824-0c50-456c-9723-f9d4fc136141/download/fp-historical-wildfire-data-2006-2023.xlsx"

# data = pd.read_excel(url)

data = pd.read_excel('fp-historical-wildfire-data-2006-2023.xlsx')
data.shape

KeyboardInterrupt: 

In [None]:
data.head()

Unnamed: 0,fire_year,fire_number,fire_name,current_size,size_class,fire_location_latitude,fire_location_longitude,fire_origin,general_cause_desc,industry_identifier_desc,...,distance_from_water_source,first_bucket_drop_date,bh_fs_date,bh_hectares,uc_fs_date,uc_hectares,to_fs_date,to_hectares,ex_fs_date,ex_hectares
0,2006,PWF001,,0.1,A,56.249956,-117.18196,Private Land,Resident,,...,,,2006-04-02 22:00:00,0.01,2006-04-02 22:00:00,0.01,,,2006-04-03 10:20:00,0.1
1,2006,EWF002,,0.2,B,53.606367,-115.915733,Provincial Land,Incendiary,,...,,,2006-04-03 13:20:00,0.2,2006-04-03 13:20:00,0.2,,,2006-04-03 14:00:00,0.2
2,2006,EWF001,,0.5,B,53.610933,-115.594267,Provincial Land,Incendiary,,...,,,2006-04-03 13:23:00,0.5,2006-04-03 13:23:00,0.5,,,2006-04-03 15:00:00,0.5
3,2006,EWF003,,0.01,A,53.608867,-115.609467,Provincial Land,Incendiary,,...,,,2006-04-03 14:08:00,0.01,2006-04-03 14:08:00,0.01,,,2006-04-03 15:05:00,0.01
4,2006,PWF002,,0.1,A,56.249956,-117.050249,Provincial Land,Other Industry,Waste Disposal,...,,,2006-04-03 19:57:00,0.1,2006-04-03 20:19:00,0.1,2006-04-03 20:20:00,0.1,2006-04-05 10:18:00,0.1


In [None]:
data.columns

Index(['fire_year', 'fire_number', 'fire_name', 'current_size', 'size_class',
       'fire_location_latitude', 'fire_location_longitude', 'fire_origin',
       'general_cause_desc', 'industry_identifier_desc',
       'responsible_group_desc', 'activity_class', 'true_cause',
       'fire_start_date', 'det_agent_type', 'det_agent', 'discovered_date',
       'discovered_size', 'reported_date', 'dispatched_resource',
       'dispatch_date', 'start_for_fire_date', 'assessment_resource',
       'assessment_datetime', 'assessment_hectares', 'fire_spread_rate',
       'fire_type', 'fire_position_on_slope', 'weather_conditions_over_fire',
       'temperature', 'relative_humidity', 'wind_direction', 'wind_speed',
       'fuel_type', 'initial_action_by', 'ia_arrival_at_fire_date',
       'ia_access', 'fire_fighting_start_date', 'fire_fighting_start_size',
       'bucketing_on_fire', 'distance_from_water_source',
       'first_bucket_drop_date', 'bh_fs_date', 'bh_hectares', 'uc_fs_date',
       'u

In [None]:
sub_dataset_columns = [
    'temperature',
    'wind_speed',
    'fire_position_on_slope',
    'wind_direction',
    'relative_humidity',
    'fire_type',
    'weather_conditions_over_fire',
    'current_size'
]

sub_data = data[sub_dataset_columns]
sub_data.head()

Unnamed: 0,temperature,wind_speed,fire_position_on_slope,wind_direction,relative_humidity,fire_type,weather_conditions_over_fire,current_size
0,18.0,2.0,Flat,SW,10.0,Surface,Clear,0.1
1,12.0,10.0,Lower 1/3,SW,22.0,Surface,Clear,0.2
2,12.0,10.0,Bottom,SW,22.0,Surface,Clear,0.5
3,12.0,10.0,Flat,SW,22.0,Surface,Clear,0.01
4,6.0,2.0,Flat,SW,37.0,Surface,Clear,0.1


In [None]:
for column in sub_dataset_columns:
    if sub_data[column].isnull().any():
        print(f"The column '{column}' contains null or NA values.")
    else:
        print(f"The column '{column}' does not contain null or NA values.")

The column 'temperature' contains null or NA values.
The column 'wind_speed' contains null or NA values.
The column 'fire_position_on_slope' contains null or NA values.
The column 'wind_direction' contains null or NA values.
The column 'relative_humidity' contains null or NA values.
The column 'fire_type' contains null or NA values.
The column 'weather_conditions_over_fire' contains null or NA values.
The column 'current_size' does not contain null or NA values.


In [None]:
sub_dataset_cleaned = sub_data.dropna(subset=sub_dataset_columns)
sub_dataset_cleaned.shape

(22486, 8)

In [None]:
cat_col = [
    'fire_position_on_slope',
    'wind_direction',
    'fire_type',
    'weather_conditions_over_fire'
]
cat_subdata = sub_dataset_cleaned[cat_col]
cat_subdata.head()

Unnamed: 0,fire_position_on_slope,wind_direction,fire_type,weather_conditions_over_fire
0,Flat,SW,Surface,Clear
1,Lower 1/3,SW,Surface,Clear
2,Bottom,SW,Surface,Clear
3,Flat,SW,Surface,Clear
4,Flat,SW,Surface,Clear


In [None]:
[cat_subdata.fire_position_on_slope.unique(),
cat_subdata.wind_direction.unique(),
cat_subdata.fire_type.unique(),
cat_subdata.weather_conditions_over_fire.unique()]

[array(['Flat', 'Lower 1/3', 'Bottom', 'Upper 1/3', 'Middle 1/3'],
       dtype=object),
 array(['SW', 'S', 'W', 'E', 'NW', 'CLM', 'N', 'SE', 'NE'], dtype=object),
 array(['Surface', 'Ground', 'Crown'], dtype=object),
 array(['Clear', 'Cloudy', 'CB Wet', 'Rainshowers', 'CB Dry'], dtype=object)]

In [None]:
cat_subdata.wind_direction[cat_subdata.wind_direction == ' S'] = 'S'
cat_subdata.wind_direction[cat_subdata.wind_direction == ' NW'] = 'NW'
cat_subdata.fire_type[cat_subdata.fire_type == '   Surface'] = 'Surface'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_subdata.wind_direction[cat_subdata.wind_direction == ' S'] = 'S'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_subdata.wind_direction[cat_subdata.wind_direction == ' NW'] = 'NW'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cat_subdata.fire_type[cat_subdata.fire_type == '   Surface'] = 'Surface'


In [None]:
from mlxtend.preprocessing import TransactionEncoder

te = TransactionEncoder()
te_ary = te.fit(cat_subdata.to_numpy()).transform(cat_subdata.to_numpy())
cat_subdata_trans = pd.DataFrame(te_ary, columns=te.columns_)
cat_subdata_trans

Unnamed: 0,Bottom,CB Dry,CB Wet,CLM,Clear,Cloudy,Crown,E,Flat,Ground,...,N,NE,NW,Rainshowers,S,SE,SW,Surface,Upper 1/3,W
0,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,False,True,True,False,False
1,False,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
2,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
3,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,False,True,True,False,False
4,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,False,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22481,True,False,False,False,True,False,False,False,False,False,...,False,False,False,False,True,False,False,True,False,False
22482,False,False,False,False,True,False,False,False,True,False,...,False,False,False,False,False,True,False,True,False,False
22483,False,False,False,False,False,True,False,True,True,True,...,False,False,False,False,False,False,False,False,False,False
22484,False,False,False,False,False,True,False,False,True,False,...,False,True,False,False,False,False,False,True,False,False


In [None]:
frequent_itemsets = fpgrowth(cat_subdata_trans, min_support=0.08, use_colnames=True)

frequent_itemsets['support'] = frequent_itemsets['support'].round(4)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.8166,(Flat)
1,0.7351,(Surface)
2,0.4674,(Clear)
3,0.1206,(SW)
4,0.3608,(Cloudy)
5,0.2268,(W)
6,0.0809,(E)
7,0.2202,(Ground)
8,0.1193,(NW)
9,0.1445,(CLM)


In [None]:
# quan_col = [
#     'temperature',
#     'wind_speed',
#     'relative_humidity'
# ]

# quan_subdata = sub_dataset_cleaned[quan_col]
# quan_subdata.head()

Unnamed: 0,temperature,wind_speed,relative_humidity
0,18.0,2.0,10.0
1,12.0,10.0,22.0
2,12.0,10.0,22.0
3,12.0,10.0,22.0
4,6.0,2.0,37.0


In [None]:
frequent_itemsets = fpgrowth(cat_subdata_trans, min_support=0.5, use_colnames=True)

frequent_itemsets['support'] = frequent_itemsets['support'].round(2)

frequent_itemsets

Unnamed: 0,support,itemsets
0,0.82,(Flat)
1,0.74,(Surface)
2,0.6,"(Flat, Surface)"


In [6]:
from wildfire_data_processor import WildfireDataProcessor
from fp_growth_processor import FP_Growth_Processor
from window_algorithm import WindowAlgorithmProcessor
from kmeans_processor import KMeansProcessor
import warnings

warnings.simplefilter(action='ignore')

# Initialize WildfireDataProcessor
wildfire_processor = WildfireDataProcessor('fp-historical-wildfire-data-2006-2023.xlsx')
wildfire_processor.load_data()
wildfire_processor.preprocess_data()

# Initialize FPGrowthProcessor
fp_processor = FP_Growth_Processor(wildfire_processor, support_threshold=0.1)
fp_processor.generate_frequent_itemsets()
fp_processor.create_sub_dfs()

sub_dfs = fp_processor.get_sub_dfs()
sub_dfs_list = list(sub_dfs.keys())

# Get the sub dataset (eg: the first sub dataset with index 0)
#print(sub_dfs[sub_dfs_list[0]])
#a = sub_dfs[sub_dfs_list[-2]]
#print(a['temperature'].head())
#print(list(sub_dfs.keys())) #list of sub dataset names




Data loaded with shape: (25321, 50)


In [4]:
quantitative_column = 'temperature'
window_processor = WindowAlgorithmProcessor(sub_dfs, chosen_quantitative_column =  quantitative_column)
all_rules = window_processor.process_all_dfs()

In [5]:
all_rules

{'df_CLM_Surface': ['CLM, Surface, temperature Range: [16.7, 35.0] ==> current_size: 22.047 | Matching Rows: 1454',
  'CLM, Surface, temperature Range: [22.2, 35.0] ==> current_size: 25.47 | Matching Rows: 612',
  'CLM, Surface, temperature Range: [25.5, 35.0] ==> current_size: 27.857 | Matching Rows: 238',
  'CLM, Surface, temperature Range: [28.0, 35.0] ==> current_size: 29.662 | Matching Rows: 106',
  'CLM, Surface, temperature Range: [30.0, 35.0] ==> current_size: 31.148 | Matching Rows: 48',
  'CLM, Surface, temperature Range: [31.5, 34.5] ==> current_size: 32.525 | Matching Rows: 16',
  'CLM, Surface, temperature Range: [33.0, 34.5] ==> current_size: 33.417 | Matching Rows: 6',
  'CLM, Surface, temperature Range: [34.0, 34.5] ==> current_size: 34.25 | Matching Rows: 2'],
 'df_CLM_Flat': ['CLM, Flat, temperature Range: [18.6, 36.0] ==> current_size: 23.123 | Matching Rows: 1332',
  'CLM, Flat, temperature Range: [23.2, 36.0] ==> current_size: 26.144 | Matching Rows: 550',
  'CLM, 

In [7]:
from z_test_processor import Z_Test_Processor
# z_test_processor = z_test_processor(wildfire_processor.sub_dataset_cleaned, all_rules)
# z_test_processor.z_test()

In [8]:
z_test_processor = Z_Test_Processor(wildfire_processor.sub_dataset_cleaned, all_rules)
z_test_processor.z_test()

KeyError: 'df_CLM_Surface'

In [88]:
import scipy.stats as stats
import numpy as np

def z_test(n, X_bar, mu, sigma):
    # Compute Z-score
    Z = (X_bar - mu) / (sigma / np.sqrt(n))
    
    # Compute two-tailed p-value
    p_value = 2 * (1 - stats.norm.cdf(abs(Z)))
    
    return Z, p_value

In [93]:
mu = wildfire_processor.sub_dataset_cleaned.current_size.mean()
sigma = wildfire_processor.sub_dataset_cleaned.current_size.std()
alpha = 0.25

In [92]:
if bool(all_rules):
    for keys in all_rules:
        for val in all_rules[keys]:
            X_bar = float(val.split(" | ")[0].split(" ")[-1])
            n = float(val.split(" | ")[1].split(" ")[-1])
            print(z_test(n, X_bar, mu, sigma)[1] < alpha)
else: print("No interesting rule")

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [3]:
d1 = {'a': 10, 'b': 8}
d2 = {'d': 6, 'c': 4}

d2.update(d1)

# This returns None
print(d2)

{'d': 6, 'c': 4, 'a': 10, 'b': 8}


In [12]:
all_rules = {'kmeans_df_Surface_Flat': ['Surface,Flat,temperature Range: [-30.0, 38.0], wind_speed Range: [0.0, 75.0], relative_humidity Range: [50.0, 100.0] ==> current_size: 77.88298407806882 | Matching Rows: 3894'], 'kmeans_df_Clear_Flat': ['Clear,Flat,temperature Range: [-34.0, 38.1], wind_speed Range: [0.0, 89.0], relative_humidity Range: [0.0, 56.0] ==> current_size: 477.44580383724775 | Matching Rows: 6046'], 'kmeans_df_Surface_Clear': [], 'kmeans_df_Surface_Clear_Flat': [], 'kmeans_df_SW_Flat': ['SW,Flat,temperature Range: [-10.0, 39.0], wind_speed Range: [1.0, 90.0], relative_humidity Range: [0.0, 50.0] ==> current_size: 565.5552889447237 | Matching Rows: 1592'], 'kmeans_df_Cloudy_Flat': ['Cloudy,Flat,temperature Range: [-22.0, 32.0], wind_speed Range: [0.0, 70.0], relative_humidity Range: [50.0, 100.0] ==> current_size: 25.930667861945317 | Matching Rows: 2231'], 'kmeans_df_Surface_Cloudy': ['Surface,Cloudy,temperature Range: [-20.0, 31.0], wind_speed Range: [0.0, 70.0], relative_humidity Range: [50.0, 100.0] ==> current_size: 13.56125956144824 | Matching Rows: 1961'], 'kmeans_df_Surface_Cloudy_Flat': ['Surface,Cloudy,Flat,temperature Range: [-20.0, 31.0], wind_speed Range: [0.0, 70.0], relative_humidity Range: [52.0, 100.0] ==> current_size: 16.409156777014363 | Matching Rows: 1601'], 'kmeans_df_Flat_W': [], 'kmeans_df_Surface_W': [], 'kmeans_df_Clear_W': [], 'kmeans_df_Surface_Flat_W': [], 'kmeans_df_Ground_Flat': ['Ground,Flat,temperature Range: [-35.0, 32.0], wind_speed Range: [0.0, 45.0], relative_humidity Range: [49.0, 100.0] ==> current_size: 23.93641452344932 | Matching Rows: 1322'], 'kmeans_df_Clear_Ground': [], 'kmeans_df_Surface_CLM': ['Surface,CLM,temperature Range: [-11.0, 36.0], wind_speed Range: [0.0, 0.0], relative_humidity Range: [0.0, 57.0] ==> current_size: 22.973289855072466 | Matching Rows: 1380'], 'kmeans_df_CLM_Flat': ['CLM,Flat,temperature Range: [-34.0, 36.0], wind_speed Range: [0.0, 0.0], relative_humidity Range: [0.0, 57.0] ==> current_size: 26.997760716570696 | Matching Rows: 1563', 'CLM,Flat,temperature Range: [-19.0, 32.0], wind_speed Range: [0.0, 0.0], relative_humidity Range: [56.0, 100.0] ==> current_size: 6.1915528531337705 | Matching Rows: 1069']} 

In [10]:
from z_test_processor import Z_Test_Processor
z_test_processor = Z_Test_Processor(wildfire_processor.sub_dataset_cleaned, all_rules)
print(z_test_processor.z_test())

KeyError: 'df_Surface_CLM_temperature'

In [16]:
wind = {'window_df_Surface_CLM_temperature': ['Surface, CLM, temperature Range: [16.7, 35.0] ==> current_size: 22.047 | Matching Rows: 1454'], 'window_df_Flat_CLM_temperature': ['Flat, CLM, temperature Range: [18.6, 36.0] ==> current_size: 23.123 | Matching Rows: 1332'], 'window_df_Surface_CLM_relative_humidity': ['Surface, CLM, relative_humidity Range: [17.0, 100.0] ==> current_size: 53.86 | Matching Rows: 2283'], 'window_df_Flat_CLM_relative_humidity': ['Flat, CLM, relative_humidity Range: [19.0, 100.0] ==> current_size: 54.425 | Matching Rows: 2603']}

In [17]:
all_rules = {'df_Surface_Flat': ['Surface,Flat,temperature Range: [-30.0, 38.0], wind_speed Range: [0.0, 75.0], relative_humidity Range: [50.0, 100.0] ==> current_size: 77.88298407806882 | Matching Rows: 3894'], 'df_Clear_Flat': ['Clear,Flat,temperature Range: [-34.0, 38.1], wind_speed Range: [0.0, 89.0], relative_humidity Range: [0.0, 56.0] ==> current_size: 477.44580383724775 | Matching Rows: 6046'], 'df_SW_Flat': ['SW,Flat,temperature Range: [-10.0, 39.0], wind_speed Range: [1.0, 90.0], relative_humidity Range: [0.0, 50.0] ==> current_size: 565.5552889447237 | Matching Rows: 1592'], 'df_Cloudy_Flat': ['Cloudy,Flat,temperature Range: [-22.0, 32.0], wind_speed Range: [0.0, 70.0], relative_humidity Range: [50.0, 100.0] ==> current_size: 25.930667861945317 | Matching Rows: 2231'], 'df_Surface_Cloudy': ['Surface,Cloudy,temperature Range: [-20.0, 31.0], wind_speed Range: [0.0, 70.0], relative_humidity Range: [50.0, 100.0] ==> current_size: 13.56125956144824 | Matching Rows: 1961'], 'df_Surface_Cloudy_Flat': ['Surface,Cloudy,Flat,temperature Range: [-20.0, 31.0], wind_speed Range: [0.0, 70.0], relative_humidity Range: [52.0, 100.0] ==> current_size: 16.409156777014363 | Matching Rows: 1601'], 'df_Ground_Flat': ['Ground,Flat,temperature Range: [-35.0, 32.0], wind_speed Range: [0.0, 45.0], relative_humidity Range: [49.0, 100.0] ==> current_size: 23.93641452344932 | Matching Rows: 1322'], 'df_Surface_CLM': ['Surface,CLM,temperature Range: [-11.0, 36.0], wind_speed Range: [0.0, 0.0], relative_humidity Range: [0.0, 57.0] ==> current_size: 22.973289855072466 | Matching Rows: 1380'], 'df_Flat_CLM': ['Flat,CLM,temperature Range: [-34.0, 36.0], wind_speed Range: [0.0, 0.0], relative_humidity Range: [0.0, 57.0] ==> current_size: 26.997760716570696 | Matching Rows: 1563', 'Flat,CLM,temperature Range: [-19.0, 32.0], wind_speed Range: [0.0, 0.0], relative_humidity Range: [56.0, 100.0] ==> current_size: 6.1915528531337705 | Matching Rows: 1069']}

In [18]:
all_rules

{'df_Surface_Flat': ['Surface,Flat,temperature Range: [-30.0, 38.0], wind_speed Range: [0.0, 75.0], relative_humidity Range: [50.0, 100.0] ==> current_size: 77.88298407806882 | Matching Rows: 3894'],
 'df_Clear_Flat': ['Clear,Flat,temperature Range: [-34.0, 38.1], wind_speed Range: [0.0, 89.0], relative_humidity Range: [0.0, 56.0] ==> current_size: 477.44580383724775 | Matching Rows: 6046'],
 'df_SW_Flat': ['SW,Flat,temperature Range: [-10.0, 39.0], wind_speed Range: [1.0, 90.0], relative_humidity Range: [0.0, 50.0] ==> current_size: 565.5552889447237 | Matching Rows: 1592'],
 'df_Cloudy_Flat': ['Cloudy,Flat,temperature Range: [-22.0, 32.0], wind_speed Range: [0.0, 70.0], relative_humidity Range: [50.0, 100.0] ==> current_size: 25.930667861945317 | Matching Rows: 2231'],
 'df_Surface_Cloudy': ['Surface,Cloudy,temperature Range: [-20.0, 31.0], wind_speed Range: [0.0, 70.0], relative_humidity Range: [50.0, 100.0] ==> current_size: 13.56125956144824 | Matching Rows: 1961'],
 'df_Surface_C

In [19]:
wind

{'window_df_Surface_CLM_temperature': ['Surface, CLM, temperature Range: [16.7, 35.0] ==> current_size: 22.047 | Matching Rows: 1454'],
 'window_df_Flat_CLM_temperature': ['Flat, CLM, temperature Range: [18.6, 36.0] ==> current_size: 23.123 | Matching Rows: 1332'],
 'window_df_Surface_CLM_relative_humidity': ['Surface, CLM, relative_humidity Range: [17.0, 100.0] ==> current_size: 53.86 | Matching Rows: 2283'],
 'window_df_Flat_CLM_relative_humidity': ['Flat, CLM, relative_humidity Range: [19.0, 100.0] ==> current_size: 54.425 | Matching Rows: 2603']}