In [3]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import sqlite3
import joblib

database_path = 'dataset/FPA_FOD_20170508.sqlite'
connection = sqlite3.connect(database_path)
data = pd.read_sql_query("SELECT * FROM Fires", connection)
data = data.sample(n=2000, random_state=42)

def preprocess_data(data, target_col, numerical_cols):
    X = data.drop(columns=[target_col])
    y = data[target_col]
    non_numeric_cols = X.select_dtypes(include=['object']).columns
    
    for col in non_numeric_cols:
        le = LabelEncoder()
        X[col] = X[col].fillna('')
        X[col] = X[col].apply(lambda x: str(x).encode('ascii', 'ignore').decode('ascii'))
        X[col] = le.fit_transform(X[col])
    
    X = X.fillna(0)
    min_max_scaler = MinMaxScaler()
    X[numerical_cols] = min_max_scaler.fit_transform(X[numerical_cols])
    std_scaler = StandardScaler()
    X[numerical_cols] = std_scaler.fit_transform(X[numerical_cols])
    
    smote = SMOTE(random_state=42)
    X_balanced, y_balanced = smote.fit_resample(X, y)
    
    return train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42), X, y

numerical_cols = ['DISCOVERY_DATE', 'DISCOVERY_DOY', 'CONT_DATE', 'CONT_DOY',
                  'FIRE_SIZE', 'LATITUDE', 'LONGITUDE', 'OWNER_CODE']

(X_train, X_test, y_train, y_test), X_original, y_original = preprocess_data(data, 'STAT_CAUSE_CODE', numerical_cols)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

joblib.dump((X_original, y_original, y_test, y_pred), "pickle_files/results_class_balance.pkl")
joblib.dump(classifier, "pickle_files/model_class_balance.pkl")

Accuracy: 0.96


['pickle_files/model_class_balance.pkl']

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import sqlite3
import joblib

database_path = 'dataset/FPA_FOD_20170508.sqlite'
connection = sqlite3.connect(database_path)
data = pd.read_sql_query("SELECT * FROM Fires", connection)
data = data.sample(n=2000, random_state=42)

def preprocess_data(data, target_col, numerical_cols):
    X = data.drop(columns=[target_col])
    y = data[target_col]
    non_numeric_cols = X.select_dtypes(include=['object']).columns
    
    for col in non_numeric_cols:
        le = LabelEncoder()
        X[col] = X[col].fillna('')
        X[col] = X[col].apply(lambda x: str(x).encode('ascii', 'ignore').decode('ascii'))
        X[col] = le.fit_transform(X[col])
    
    X = X.fillna(0)
    min_max_scaler = MinMaxScaler()
    X[numerical_cols] = min_max_scaler.fit_transform(X[numerical_cols])
    std_scaler = StandardScaler()
    X[numerical_cols] = std_scaler.fit_transform(X[numerical_cols])
    
    
    return train_test_split(X, y, test_size=0.2, random_state=42), X, y

numerical_cols = ['DISCOVERY_DATE', 'DISCOVERY_DOY', 'CONT_DATE', 'CONT_DOY',
                  'FIRE_SIZE', 'LATITUDE', 'LONGITUDE', 'OWNER_CODE']

(X_train, X_test, y_train, y_test), X_original, y_original = preprocess_data(data, 'STAT_CAUSE_CODE', numerical_cols)
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

joblib.dump((X_original, y_original, y_test, y_pred), "pickle_files/results.pkl")
joblib.dump(classifier, "pickle_files/model.pkl")

Accuracy: 0.89


['pickle_files/model.pkl']