<h1>Data Preparation</h1><h2 align="center">Variable Encoding</h2>

<h3>Ordinal Encoding</h3>

In [None]:
from pandas import read_csv, DataFrame
from dslabs_functions import get_variable_types, encode_cyclic_variables, dummify

# Load the stroke dataset, indexed by "id"; empty strings are read as missing.
data: DataFrame = read_csv("data/stroke_mvi.csv", index_col="id", na_values="")
# presumably maps a variable-type name (e.g. "symbolic") to the matching column
# names -- confirm against dslabs_functions.get_variable_types
vars: dict[str, list] = get_variable_types(data)

# Replacement tables for the two-valued variables; both capitalisations of
# yes/no are accepted.
yes_no: dict[str, int] = dict(no=0, No=0, yes=1, Yes=1)
residence_type_values: dict[str, int] = dict(Rural=0, Urban=1)

# Column name -> replacement table. Every yes/no column shares the same table.
encoding: dict[str, dict[str, int]] = {
    "Residence_type": residence_type_values,
    **dict.fromkeys(
        ["hypertension", "heart_disease", "ever_married", "stroke"], yes_no
    ),
}

# replace() builds a new frame by default, so the raw `data` stays untouched.
df: DataFrame = data.replace(encoding)
df.head()

In [None]:
# Show the distinct raw values of each symbolic variable, to decide how the
# remaining (non-binary) categories should be ordered.
for v in vars["symbolic"]:
    distinct_values = data[v].unique()
    print(v, distinct_values)

In [None]:
# Ordinal codes: each category is numbered by its position in the ordered list.
gender_values: dict[str, int] = {
    label: code for code, label in enumerate(["Female", "Other", "Male"])
}
work_values: dict[str, int] = {
    label: code
    for code, label in enumerate(
        ["children", "Never_worked", "Self-employed", "Private", "Govt_job"]
    )
}
status_values: dict[str, int] = {
    label: code
    for code, label in enumerate(["never smoked", "formerly smoked", "smokes"])
}

# Column name -> replacement table for the multi-valued symbolic variables.
encoding: dict[str, dict[str, int]] = dict(
    gender=gender_values,
    work_type=work_values,
    smoking_status=status_values,
)

# Applied on top of the already binary-encoded frame; replace() returns a new
# frame by default.
df: DataFrame = df.replace(encoding)
df.head()

<h3>Cyclic variables</h3>

In [None]:
from math import pi, sin, cos

# Load the algae samples indexed by their parsed "date" column. Dates are read
# day-first, the same way this notebook reads the file for one-hot encoding.
# `infer_datetime_format` is omitted: pandas deprecated it in 2.0, where
# inferring one consistent format became the default behaviour.
data: DataFrame = read_csv(
    "data/algae.csv",
    index_col="date",
    na_values="",
    parse_dates=True,
    dayfirst=True,
)

# Seasons as consecutive positions on the yearly cycle. encode_cyclic_variables
# turns a value x into the angle 2*pi*x/max(x), so 1..4 land on four distinct
# quarter-turn angles. Feeding it radians (0, pi/2, pi, -pi/2) instead doubles
# every angle and collapses spring/autumn and summer/winter onto the same
# (sin, cos) pair.
season_val: dict[str, float] = {
    "spring": 1,
    "summer": 2,
    "autumn": 3,
    "winter": 4,
}
# Ordinal scale shared by the low/medium/high variables.
lov: dict[str, int] = {"low": 0, "medium": 1, "high": 2}
encoding: dict[str, dict] = {
    "river_depth": lov,
    "fluid_velocity": lov,
    "season": season_val,
}

data = data.replace(encoding)
data.head()

In [None]:
def encode_cyclic_variables(data: DataFrame, vars: list[str]) -> None:
    """Add sine/cosine encodings of cyclic variables to ``data`` in place.

    For every column ``v`` in ``vars``, each value ``x`` is mapped to the angle
    ``2 * pi * x / x_max``, where ``x_max`` is the largest non-missing value of
    the column. Two columns are then written to ``data``: ``v + "_sin"`` and
    ``v + "_cos"``, holding the sine and cosine of that angle rounded to 3
    decimals. Missing values stay missing in both new columns.

    The column maximum acts as the period, so the value 0 and the maximum are
    mapped to the same angle, and a column whose maximum is 0 cannot be
    encoded (the angle would require a division by zero).

    Args:
        data: frame holding the numeric columns to encode; modified in place.
        vars: names of the columns to encode.

    Returns:
        None. The new columns are added to ``data`` itself.
    """
    for v in vars:
        # Series.max() skips NaN. The builtin max() compares element by
        # element, so a leading NaN would become the "maximum" and silently
        # turn both new columns entirely into NaN.
        x_max: float = float(data[v].max())
        # Bind the period as a default argument so the lambda does not depend
        # on the loop variable's later values.
        angles = data[v].apply(lambda x, period=x_max: 2 * pi * x / period)
        data[v + "_sin"] = angles.apply(lambda a: round(sin(a), 3))
        data[v + "_cos"] = angles.apply(lambda a: round(cos(a), 3))
    return


# encode_cyclic_variables adds the new columns to `data` in place and returns
# None, so its result must not be assigned back to `data` (that would discard
# the frame and leave nothing to display).
encode_cyclic_variables(data, ["season"])
data.head()

<h3>Dummification or One-hot Encoding</h3>

In [None]:
from numpy import ndarray
from pandas import DataFrame, read_csv, concat
from sklearn.preprocessing import OneHotEncoder


def dummify(df: DataFrame, vars_to_dummify: list[str]) -> DataFrame:
    """One-hot encode the selected columns of ``df``.

    The columns in ``vars_to_dummify`` are replaced by boolean indicator
    columns produced by scikit-learn's ``OneHotEncoder`` and named by its
    ``get_feature_names_out``. With ``drop="if_binary"`` a variable with
    exactly two categories yields a single indicator column. All other
    columns are kept unchanged and come first in the result.

    Args:
        df: input frame; it is not modified.
        vars_to_dummify: names of the columns to one-hot encode.

    Returns:
        A new frame with the untouched columns followed by the indicator
        columns, sharing the index of ``df``.
    """
    if len(vars_to_dummify) == 0:
        # Nothing to encode: do not fit an encoder on a zero-column frame, and
        # still hand back a new object as the general path does.
        return df.copy()

    other_vars: list[str] = [c for c in df.columns if c not in vars_to_dummify]

    enc = OneHotEncoder(
        handle_unknown="ignore", sparse_output=False, dtype="bool", drop="if_binary"
    )
    trans: ndarray = enc.fit_transform(df[vars_to_dummify])

    new_vars: ndarray = enc.get_feature_names_out(vars_to_dummify)
    # Reuse the original index so the indicator columns align row by row.
    dummy = DataFrame(trans, columns=new_vars, index=df.index)

    final_df: DataFrame = concat([df[other_vars], dummy], axis=1)
    return final_df


# Read the algae file again so the categorical columns hold their original
# labels (the cyclic-encoding cells replaced them with numbers in `data`).
data: DataFrame = read_csv(
    "data/algae.csv",
    index_col="date",
    na_values="",
    parse_dates=True,
    dayfirst=True,
)

# Symbolic columns to expand into indicator columns.
vars: list[str] = ["river_depth", "fluid_velocity", "season"]
df: DataFrame = dummify(data, vars)
df.head()