### Initializing libraries and declaring some global libraries.

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
import sqlite3
import joblib

DATABASE_PATH = 'dataset/FPA_FOD_20170508.sqlite'
NUMERICAL_COLS = ['DISCOVERY_DATE', 'DISCOVERY_DOY', 'CONT_DATE', 'CONT_DOY',
                  'FIRE_SIZE', 'LATITUDE', 'LONGITUDE', 'OWNER_CODE']
TARGET_COL = 'STAT_CAUSE_CODE'
DATA_SAMPLE_SIZE = 2000
RANDOM_STATE = 42

### Loading data and then preprocessing it.

In [19]:
def load_data(database_path, table_name, sample_size, random_state):
    connection = sqlite3.connect(database_path)
    data = pd.read_sql_query(f"SELECT * FROM {table_name}", connection)
    connection.close()
    return data.sample(n=sample_size, random_state=random_state)

def preprocess_data(data, target_col, numerical_cols, balance_classes=False):
    X = data.drop(columns=[target_col])
    y = data[target_col]
    non_numeric_cols = X.select_dtypes(include=['object']).columns
    
    for col in non_numeric_cols:
        le = LabelEncoder()
        X[col] = X[col].fillna('')
        X[col] = X[col].apply(lambda x: str(x).encode('ascii', 'ignore').decode('ascii'))
        X[col] = le.fit_transform(X[col])
    
    X = X.fillna(0)
    min_max_scaler = MinMaxScaler()
    X[numerical_cols] = min_max_scaler.fit_transform(X[numerical_cols])
    std_scaler = StandardScaler()
    X[numerical_cols] = std_scaler.fit_transform(X[numerical_cols])
    
    if balance_classes:
        smote = SMOTE(random_state=RANDOM_STATE)
        X, y = smote.fit_resample(X, y)
    return train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE), X, y

### Training the model and saving them as pkl files.

In [20]:
def train_and_evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    return model, y_pred, accuracy

def save_results_and_models(X, y, y_test, y_preds, models, prefix):
    joblib.dump((X, y, y_test, y_preds), f"pickle_files/{prefix}_results.pkl")
    for model_name, model in models.items():
        joblib.dump(model, f"pickle_files/{model_name.lower()}_model.pkl")

### Main code execution.

In [22]:
if __name__ == "__main__":
    data = load_data(DATABASE_PATH, "Fires", DATA_SAMPLE_SIZE, RANDOM_STATE)
    models = {
        "RandomForest": RandomForestClassifier(random_state=RANDOM_STATE),
        "SVM": SVC(random_state=RANDOM_STATE),
        "KNN": KNeighborsClassifier()
    }
    
    y_preds = {}
    for balance_classes in [True, False]:
        scenario = "class_balance" if balance_classes else "no_balance"
        (X_train, X_test, y_train, y_test), X_processed, y_processed = preprocess_data(
            data, TARGET_COL, NUMERICAL_COLS, balance_classes=balance_classes
        )
        
        for model_name, model in models.items():
            trained_model, y_pred, accuracy = train_and_evaluate_model(
                model, X_train, X_test, y_train, y_test
            )
            models[model_name] = trained_model
            y_preds[model_name] = y_pred
            print(f"Accuracy ({model_name}, {scenario}): {accuracy:.2f}")
        save_results_and_models(X_processed, y_processed, y_test, y_preds, models, scenario)


Accuracy (RandomForest, class_balance): 0.96
Accuracy (SVM, class_balance): 0.16
Accuracy (KNN, class_balance): 0.48
Accuracy (RandomForest, no_balance): 0.89
Accuracy (SVM, no_balance): 0.21
Accuracy (KNN, no_balance): 0.38
