In [1]:
# Mount Google Drive inside the Colab runtime so files under /content/drive are reachable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
! pip install pycaret



In [3]:
import pandas as pd
from pycaret.regression import *
from sklearn.preprocessing import LabelEncoder

In [4]:
# Load the historical wildfire records from the mounted Drive folder.
file_path = "/content/drive/MyDrive/Weather-health/fp-historical-wildfire-data-2006-2023.csv"
data = pd.read_csv(file_path)

# Keep only columns with fewer than 3000 missing values, skipping every column
# whose name contains 'hectares'; then drop the rows that still have a gap.
missing_data_count = data.isna().sum()
valid_columns = [
    name
    for name, n_missing in missing_data_count.items()
    if n_missing < 3000 and 'hectares' not in name
]
data_clean = data[valid_columns].dropna()


In [5]:
# Show the column names that passed the missing-value filter.
print("Valid columns: ")
valid_columns

Valid columns: 


['fire_year',
 'fire_number',
 'current_size',
 'size_class',
 'fire_location_latitude',
 'fire_location_longitude',
 'fire_origin',
 'general_cause_desc',
 'fire_start_date',
 'det_agent_type',
 'det_agent',
 'reported_date',
 'dispatched_resource',
 'dispatch_date',
 'start_for_fire_date',
 'assessment_resource',
 'assessment_datetime',
 'fire_spread_rate',
 'fire_type',
 'fire_position_on_slope',
 'weather_conditions_over_fire',
 'temperature',
 'relative_humidity',
 'wind_direction',
 'wind_speed',
 'initial_action_by',
 'bh_fs_date',
 'uc_fs_date',
 'ex_fs_date']

In [6]:
# Season lookup used to derive the `season` feature.
def get_season(month):
    """Return the season label for a calendar month number.

    Months 3-5 map to 'Spring', 6-8 to 'Summer' and 9-10 to 'Autumn'.
    Every other value (including 11, 12, 1, 2 and anything outside 1-12)
    maps to 'Winter'.

    NOTE(review): November is labelled 'Winter', not 'Autumn' - confirm this
    is the intended season boundary.
    """
    season_months = (
        ('Spring', (3, 4, 5)),
        ('Summer', (6, 7, 8)),
        ('Autumn', (9, 10)),
    )
    for label, months in season_months:
        if month in months:
            return label
    # Fallback bucket for everything not listed above.
    return 'Winter'

# Feature engineering: season, fire region, and integer codes for text columns.
#
# Start from a fresh frame built from the raw data so this cell can be executed
# repeatedly with the same result. Previously the cell mutated `data_clean` in
# place, so a second execution raised KeyError on the already-dropped
# 'fire_number' column.
data_clean = data[valid_columns].dropna()

# Parse start_for_fire_date, derive the season from its month, then drop the
# raw date column.
if 'start_for_fire_date' in data_clean.columns:
    start_dates = pd.to_datetime(data_clean['start_for_fire_date'])
    data_clean = (
        data_clean
        .assign(season=start_dates.dt.month.apply(get_season))
        .drop(columns=['start_for_fire_date'])
    )

# The first letter of fire_number is translated into a region name.
region_map = {
    'C': 'Calgary', 'E': 'Edson', 'H': 'High Level', 'G': 'Grande Prairie',
    'L': 'Lac La Biche', 'M': 'Fort McMurray', 'P': 'Peace River',
    'R': 'Rocky', 'S': 'Slave Lake', 'W': 'Whitecourt'
}
# NOTE(review): a first letter that is not a key of region_map becomes NaN here -
# confirm whether such rows exist and how they should be handled.
region_codes = data_clean['fire_number'].str[0]
data_clean = (
    data_clean
    .assign(fire_region=region_codes.map(region_map))
    .drop(columns=['fire_number'])
)

# Replace every object-dtype column with integer labels and keep the fitted
# encoder per column so the codes can be mapped back later.
# NOTE(review): the object columns appear to include raw date strings
# (fire_start_date, reported_date, ...); label-encoding them yields arbitrary
# integer codes - consider parsing them as datetimes instead.
categorical_columns = data_clean.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data_clean[col] = le.fit_transform(data_clean[col])
    label_encoders[col] = le


In [7]:
# Show the object-dtype columns that were label-encoded.
categorical_columns

Index(['size_class', 'fire_origin', 'general_cause_desc', 'fire_start_date',
       'det_agent_type', 'det_agent', 'reported_date', 'dispatched_resource',
       'dispatch_date', 'assessment_resource', 'assessment_datetime',
       'fire_type', 'fire_position_on_slope', 'weather_conditions_over_fire',
       'wind_direction', 'initial_action_by', 'bh_fs_date', 'uc_fs_date',
       'ex_fs_date', 'season', 'fire_region'],
      dtype='object')

In [8]:
# Display the processed frame (pandas truncates the rendering of long frames).
data_clean

Unnamed: 0,fire_year,current_size,size_class,fire_location_latitude,fire_location_longitude,fire_origin,general_cause_desc,fire_start_date,det_agent_type,det_agent,...,temperature,relative_humidity,wind_direction,wind_speed,initial_action_by,bh_fs_date,uc_fs_date,ex_fs_date,season,fire_region
0,2006,0.10,0,56.249956,-117.181960,4,11,58,4,0,...,18.0,10.0,9,2.0,9,1,1,0,1,6
1,2006,0.20,1,53.606367,-115.915733,5,3,61,4,0,...,12.0,22.0,9,10.0,3,2,2,1,1,1
2,2006,0.50,1,53.610933,-115.594267,5,3,62,4,0,...,12.0,22.0,9,10.0,3,3,3,2,1,1
3,2006,0.01,0,53.608867,-115.609467,5,3,61,4,107,...,12.0,22.0,9,10.0,8,5,5,3,1,1
4,2006,0.10,0,56.249956,-117.050249,5,6,64,4,78,...,6.0,37.0,9,2.0,3,7,7,8,1,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25316,2023,0.41,1,56.456850,-116.077467,1,11,17708,3,97,...,15.0,46.0,7,10.0,3,20692,20660,20433,1,6
25317,2023,0.10,0,54.966217,-115.570850,5,7,17792,3,65,...,23.0,28.0,8,15.0,4,20790,20755,20505,1,8
25318,2023,0.30,1,55.350571,-117.119320,5,4,17970,4,0,...,16.0,45.0,3,10.0,9,20978,20934,20723,1,3
25319,2023,0.01,0,56.177685,-116.799293,5,10,18315,2,99,...,21.0,63.0,5,4.0,1,21365,21321,21038,2,6


In [9]:
# Print per-column non-null counts and dtypes of the processed frame.
data_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22344 entries, 0 to 25320
Data columns (total 29 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   fire_year                     22344 non-null  int64  
 1   current_size                  22344 non-null  float64
 2   size_class                    22344 non-null  int64  
 3   fire_location_latitude        22344 non-null  float64
 4   fire_location_longitude       22344 non-null  float64
 5   fire_origin                   22344 non-null  int64  
 6   general_cause_desc            22344 non-null  int64  
 7   fire_start_date               22344 non-null  int64  
 8   det_agent_type                22344 non-null  int64  
 9   det_agent                     22344 non-null  int64  
 10  reported_date                 22344 non-null  int64  
 11  dispatched_resource           22344 non-null  int64  
 12  dispatch_date                 22344 non-null  int64  
 13  assess

In [10]:
# Initialise the PyCaret regression experiment: `current_size` is the target,
# 80% of the rows are used for training, and the session seed is fixed at 123.
# NOTE(review): `size_class` stays in the feature set; presumably it is derived
# from the fire size, which would leak the target - verify before trusting scores.
model_setup = setup(
    data=data_clean,
    target='current_size',
    train_size=0.8,
    session_id=123,
)

# Train and cross-validate the candidate regressors with default settings and
# keep the top-ranked one (see the PyCaret docs for the default sort metric).
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,current_size
2,Target type,Regression
3,Original data shape,"(22344, 29)"
4,Transformed data shape,"(22344, 29)"
5,Transformed train set shape,"(17875, 29)"
6,Transformed test set shape,"(4469, 29)"
7,Numeric features,28
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
knn,K Neighbors Regressor,196.947,39201087.5188,4551.6103,0.2978,1.0086,299.0116,0.337
par,Passive Aggressive Regressor,425.4534,40068732.0282,4872.7099,0.0792,4.6956,10282.6599,0.107
en,Elastic Net,611.9738,40525607.1953,4939.1708,0.0251,5.2371,14061.8228,0.558
lasso,Lasso Regression,761.6983,40483847.0605,4954.1356,-0.0011,5.5743,18931.649,0.847
huber,Huber Regressor,199.8068,41407462.9226,5007.0243,-0.002,1.0621,23.2123,0.271
dummy,Dummy Regressor,387.8083,41375760.25,5005.2559,-0.0021,4.9638,9325.1558,0.033
omp,Orthogonal Matching Pursuit,864.0868,38724491.5972,4896.1413,-0.0879,5.6472,25864.415,0.049
br,Bayesian Ridge,1083.5614,37685183.9873,4876.4451,-0.1531,5.9965,36633.178,0.092
llar,Lasso Least Angle Regression,1110.4786,37681032.2092,4883.8229,-0.1656,6.0337,37872.1069,0.043
ridge,Ridge Regression,1111.9768,37680983.5911,4884.2164,-0.1662,6.0357,37945.4993,0.087


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [11]:
# Run PyCaret's hyper-parameter search on the model picked by compare_models()
# and keep the result under a separate name.
best_tune = tune_model(best_model)

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,207.3135,12805873.811,3578.5296,0.1458,1.0723,700.5357
1,386.7561,181657704.401,13478.0453,0.0324,0.9062,97.2177
2,115.9786,2982972.8287,1727.1285,0.6279,1.001,99.5209
3,173.1382,14122417.7505,3757.9805,0.4161,1.0844,451.5265
4,49.864,245030.2585,495.0053,0.9217,1.0227,395.4801
5,71.2557,387781.7962,622.7213,0.8139,1.0809,101.4972
6,82.3197,2346260.113,1531.7507,0.0172,1.1454,172.736
7,163.6986,6247835.9749,2499.5672,0.3332,1.062,83.6278
8,370.149,132852424.5109,11526.1626,0.001,1.0953,67.124
9,213.7504,25850062.972,5084.2957,0.0999,1.0221,226.5844


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [12]:
# Display the estimator returned by compare_models().
best_model

In [13]:
# Display the estimator returned by tune_model().
best_tune

In [14]:
# Open PyCaret's interactive plot selector for the tuned model.
evaluate_model(best_tune)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…