# Best Model
The purpose of this notebook is to take the top-performing models and run them on what we believe to be the most effective data. We then search through the hyperparameter space of each model to try to optimise the F1 measure.

In [None]:
from os import path

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

## Load and Process Data

In [None]:
# Path to the full zoonosis dataset, built from its components so the
# separator matches the host operating system.
_data_file_parts = ("..", "data", "zoonosis_dataset_full.csv")
data_file = path.join(*_data_file_parts)

# Name of the column that holds the class label.
target_column = "label"

In [None]:
def prepare_dataframe_for_ml(df, target_column=None, one_hot_encode=True):
    """
    Prepare a pandas DataFrame for machine learning algorithms.

    Steps, in order:
    - Pulls out the target column (if present) and encodes the labels
      "nz" -> 0 and "hzoon" -> 1; any other label value is kept as-is
    - Fills missing numerical values with the column median
    - Fills missing categorical values with the column mode
    - Standardizes numerical features with StandardScaler
    - Optionally one-hot encodes categorical features
    - Re-attaches the encoded target as the last column

    Only int64/float64 columns are treated as numerical and only
    object/category/bool columns as categorical; columns of any other dtype
    pass through untouched.

    Note: medians, modes and scaling statistics are computed over every row
    of ``df``. If ``df`` contains rows that will later be used for testing,
    those rows influence the imputation and scaling of the training rows.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame to prepare. It is copied, never modified.
    target_column : str, optional
        Name of the target column. Ignored if falsy or not present in ``df``.
    one_hot_encode : bool, optional
        Whether to one-hot encode categorical features

    Returns:
    --------
    df_processed: pandas.DataFrame
        The processed DataFrame, including the encoded target column when
        ``target_column`` was found in ``df``.
    """

    # Create a copy of the dataframe to avoid modifying the original
    df_processed = df.copy()

    # Separate target if specified
    y = None
    if target_column and target_column in df_processed.columns:
        label_codes = {"nz": 0, "hzoon": 1}
        # map() with a fallback keeps unknown labels unchanged (like replace)
        # but infers the result dtype directly, instead of relying on the
        # implicit object->int downcast that replace() is deprecating.
        y = df_processed[target_column].map(
            lambda value: label_codes.get(value, value)
        )
        df_processed = df_processed.drop(columns=[target_column])

    # Identify numerical and categorical columns
    numerical_cols = df_processed.select_dtypes(
        include=["int64", "float64"]
    ).columns.tolist()
    categorical_cols = df_processed.select_dtypes(
        include=["object", "category", "bool"]
    ).columns.tolist()

    # Handle missing values
    if numerical_cols:
        df_processed[numerical_cols] = df_processed[numerical_cols].fillna(
            df_processed[numerical_cols].median()
        )
    for col in categorical_cols:
        modes = df_processed[col].mode()
        # mode() is empty when every value in the column is missing; there is
        # nothing sensible to fill with, so leave the column as it is.
        if not modes.empty:
            df_processed[col] = df_processed[col].fillna(modes.iloc[0])

    # Normalize numerical features
    if numerical_cols:
        scaler = StandardScaler()
        df_processed[numerical_cols] = scaler.fit_transform(
            df_processed[numerical_cols]
        )

    # One-hot encode categorical features
    if categorical_cols and one_hot_encode:
        df_processed = pd.get_dummies(
            df_processed, columns=categorical_cols, drop_first=False
        )

    # If we have a target column, add it back to the processed dataframe
    if y is not None:
        df_processed[target_column] = y

    return df_processed

In [None]:
def remove_unimportant_columns(dataframe, columns_to_keep):
    """
    Return the part of ``dataframe`` selected by ``columns_to_keep``.

    Parameters:
    -----------
    dataframe : pandas.DataFrame
        The frame to select from
    columns_to_keep : list of str
        Column labels to retain, in the order they should appear

    Returns:
    --------
    The result of indexing ``dataframe`` with ``columns_to_keep``
    """
    selected = dataframe[columns_to_keep]
    return selected

## Run Models

### XGBoost

In [None]:
# XGBoost classifier created with no arguments, i.e. the library's default hyperparameters.
xgboost = XGBClassifier()

In [None]:
# Load the raw CSV, run the shared preprocessing, then hold out 20% of the rows for testing.
data = pd.read_csv(data_file)
processed_data = prepare_dataframe_for_ml(data, target_column=target_column)
# NOTE(review): prepare_dataframe_for_ml imputes and scales using statistics
# from all rows, so the rows that end up in X_test have already influenced
# the features in X_train. Confirm this is acceptable for the reported scores.

# Select the label via `target_column` rather than a literal so this cell
# stays in sync with the config cell and the preprocessing call above.
y = processed_data[target_column]
X = processed_data.drop(columns=[target_column])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### LightGBM

In [None]:
# LightGBM classifier created with no arguments, i.e. the library's default hyperparameters.
lightGBM = LGBMClassifier()

### Random Forest

In [None]:
# Seed the forest (same value as the train/test split) so its bootstrap
# sampling and feature selection are repeatable from one run to the next.
random_forest = RandomForestClassifier(random_state=42)

## Plot Results
Show results of training across various values in grid search

## Compare Model Performance
On temporally split data

In [None]:
# load test data

## Get False Positives