In [63]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from os import path
import numpy as np

In [64]:
target_column = "label"
one_hot_encode = True
data_file = "H5N1.csv"

In [65]:
def prepare_dataframe_for_ml(df, target_column=None, one_hot_encode=True):
    """
    Prepare a pandas DataFrame for machine learning algorithms.
    - Normalizes numerical features
    - Optionally one-hot encodes categorical features
    - Optionally separates target variable
    
    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame to prepare
    target_column : str, optional
        Name of the target column to separate
    one_hot_encode : bool, optional
        Whether to one-hot encode categorical features
    
    Returns:
    --------
    df_processed: pandas.DataFrame
        The processed DataFrame
    """

    # Create a copy of the dataframe to avoid modifying the original
    df_processed = df.copy()

    # Separate target if specified
    y = None
    if target_column and target_column in df_processed.columns:
        y = df_processed[target_column].replace({"nz": 0, "hzoon": 1})
        df_processed = df_processed.drop(columns=[target_column])

    # Identify numerical and categorical columns
    numerical_cols = df_processed.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df_processed.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    # Handle missing values
    df_processed[numerical_cols] = df_processed[numerical_cols].fillna(df_processed[numerical_cols].median())
    for col in categorical_cols:
        df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])

    # Normalize numerical features
    if numerical_cols:
        scaler = StandardScaler()
        df_processed[numerical_cols] = scaler.fit_transform(df_processed[numerical_cols])

    # One-hot encode categorical features
    if categorical_cols and one_hot_encode:
        df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=False)

    # If we have a target column, add it back to the processed dataframe
    if target_column and y is not None:
        df_processed[target_column] = y

    return df_processed

In [62]:
data = pd.read_csv(path.join("..", "split_data", "subtypes", data_file))
data.columns

Index(['Unnamed: 0', 'Species', 'Location', 'gid', 'title', 'date', 'subtype',
       'label', 'src', 'TT_p2_Bias_HA',
       ...
       'PAAC_Xc1.A_PB2', 'PAAC_Xc1.P_PB2', 'PAAC_Xc2.lambda3_PB2',
       'PAAC_Xc1.F_PB2', 'PAAC_Xc1.C_PB2', 'PAAC_Xc1.I_PB2', 'PAAC_Xc1.W_PB2',
       'PAAC_Xc1.R_PB2', 'PAAC_Xc1.H_PB2', 'SpeciesBins'],
      dtype='object', length=970)

In [67]:
data = data.drop(
    columns=["Unnamed: 0", "Species", "gid", "title", "date", "subtype", "src"]
)

In [68]:
processed_data = prepare_dataframe_for_ml(data, target_column=target_column, one_hot_encode=one_hot_encode)
processed_data.to_csv(path.join("..", "cleaned_data", f"processed_ohe_{one_hot_encode}_{data_file}"), index=False)

  y = df_processed[target_column].replace({"nz": 0, "hzoon": 1})
