In [18]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from os import path
import numpy as np

In [19]:
target_column = "label"
one_hot_encode = True
data_file = "selected_columns_44_zoonosis_dataset_full_full.csv"

In [20]:
def prepare_dataframe_for_ml(df, target_column=None, one_hot_encode=True):
    """
    Prepare a pandas DataFrame for machine learning algorithms.
    - Normalizes numerical features
    - Optionally one-hot encodes categorical features
    - Optionally separates target variable
    
    Parameters:
    -----------
    df : pandas.DataFrame
        The input DataFrame to prepare
    target_column : str, optional
        Name of the target column to separate
    one_hot_encode : bool, optional
        Whether to one-hot encode categorical features
    
    Returns:
    --------
    df_processed: pandas.DataFrame
        The processed DataFrame
    """

    # Create a copy of the dataframe to avoid modifying the original
    df_processed = df.copy()

    # Separate target if specified
    y = None
    if target_column and target_column in df_processed.columns:
        y = df_processed[target_column].replace({"nz": 0, "hzoon": 1})
        df_processed = df_processed.drop(columns=[target_column])

    # Identify numerical and categorical columns
    numerical_cols = df_processed.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_cols = df_processed.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    # Handle missing values
    df_processed[numerical_cols] = df_processed[numerical_cols].fillna(df_processed[numerical_cols].median())
    for col in categorical_cols:
        df_processed[col] = df_processed[col].fillna(df_processed[col].mode()[0])

    # Normalize numerical features
    if numerical_cols:
        scaler = StandardScaler()
        df_processed[numerical_cols] = scaler.fit_transform(df_processed[numerical_cols])

    # One-hot encode categorical features
    if categorical_cols and one_hot_encode:
        df_processed = pd.get_dummies(df_processed, columns=categorical_cols, drop_first=False)

    # If we have a target column, add it back to the processed dataframe
    if target_column and y is not None:
        df_processed[target_column] = y

    return df_processed

In [21]:
data = pd.read_csv(path.join("..", "cleaned_data", data_file))
data.columns

Index(['label', 'AC_NA', 'DPC_AS_PB2', 'CTriad_g1.g1.g5_PB2',
       'CTDD_charge.3.residue75_PB2', 'CTDC_hydrophobicity_ARGP820101.G1_PB2',
       'PAAC_Xc1.Y_NA', 'TT_PA', 'CTriad_g1.g5.g1_PB2',
       'CTDD_hydrophobicity_PONP930101.2.residue50_HA',
       'CTDD_hydrophobicity_ZIMJ680101.1.residue50_M1',
       'CTDD_hydrophobicity_PONP930101.1.residue50_HA',
       'CTDT_secondarystruct.Tr1331_HA', 'CTriad_g4.g1.g1_HA',
       'CTDT_hydrophobicity_ENGD860101.Tr2332_HA', 'AT_PA',
       'CTDT_secondarystruct.Tr1221_NS1', 'PAAC_Xc1.P_M1',
       'CTDC_solventaccess.G1_HA', 'CTDT_hydrophobicity_ZIMJ680101.Tr2332_PB2',
       'CTDT_polarizability.Tr2332_PB2',
       'CTDT_hydrophobicity_ZIMJ680101.Tr1221_PB2',
       'CTDD_polarizability.1.residue50_PB2', 'CC_HA',
       'CTDD_hydrophobicity_ARGP820101.1.residue75_PB2',
       'CTDD_hydrophobicity_ARGP820101.2.residue50_PA',
       'CTDD_hydrophobicity_PONP930101.1.residue75_HA',
       'CTDD_hydrophobicity_ENGD860101.3.residue75_PB2',

In [22]:
# data = data.drop(
#     columns=["Unnamed: 0", "Species", "gid", "title", "date", "subtype", "src"]
# )

In [23]:
processed_data = prepare_dataframe_for_ml(data, target_column=target_column, one_hot_encode=one_hot_encode)
processed_data.to_csv(path.join("..", "cleaned_data", f"processed_ohe_{one_hot_encode}_{data_file}"), index=False)

  y = df_processed[target_column].replace({"nz": 0, "hzoon": 1})
