# In [2]:
import pandas as pd
from ipywidgets.widgets import HTML
from sklearn.linear_model import LinearRegression
import numpy as np


# from ipynb.fs.full.regression deterministic import deterministic_regression
def deterministic_regression(df: pd.DataFrame, columns_to_impute: [str], stochastic_reg=False) -> pd.DataFrame:
    """
    Impute missing values by linear regression (deterministic) using scikit-learn.

    Only use on numerical data; the method is not meant for ordinal data such as categories.
    The columns that should be imputed must be listed in ``columns_to_impute``.

    How it works:

    1. A helper column ``time`` is derived from the index (every index label is passed
       through ``pd.Timestamp`` and converted to an integer) and used as an extra predictor.
    2. For every column to impute, a ``<column>_imp`` copy is created whose missing entries
       are filled with random draws (with replacement) from the observed values of that column.
    3. For every column to impute, a ``LinearRegression`` is fitted with ``<column>_imp`` as
       target and all remaining columns (``time``, the non-imputed columns and the ``_imp``
       columns of the other imputed columns) as predictors. The originally missing entries
       are replaced by the model's predictions.

    Difference with stochastic: no noise is added to the predictions. Note that the result
    still depends on NumPy's global random state, because the regression is fitted on the
    random fills from step 2; seed ``np.random`` for reproducible output.

    NOTE: an input column that is itself called ``time`` is overwritten by the helper column
    and is not part of the result.

    Args:
        df: Frame to impute. It is not modified; the function works on a copy.
        columns_to_impute: Names of the columns whose missing values should be filled.
        stochastic_reg: Internal switch used by ``stochastic_regression``. When true, only
            steps 1 and 2 are performed and the frame is returned with the original columns
            untouched plus the randomly filled ``<column>_imp`` columns.

    Returns:
        A new frame. With ``stochastic_reg=False`` it has the same columns as ``df`` with the
        missing values of ``columns_to_impute`` replaced by regression predictions.

    Raises:
        ValueError: If a column to impute has missing values but no observed value at all.
        KeyError: If a name in ``columns_to_impute`` is not a column of ``df``.
    """
    # Work on a copy so the helper columns added below never leak into the caller's frame.
    df = df.copy()

    # Integer representation of the index, used as an additional predictor.
    # "int64" is spelled out because plain ``int`` is platform dependent.
    df["time"] = df.index.to_series().apply(lambda x: pd.Timestamp(x)).astype("int64")

    # Random fill: gives every column to impute a complete "<column>_imp" twin that can serve
    # as regression target and as predictor for the other columns.
    for feature in columns_to_impute:
        missing = df[feature].isnull()
        number_missing = int(missing.sum())
        observed_values = df.loc[~missing, feature].to_numpy()
        if number_missing and observed_values.size == 0:
            raise ValueError(
                "Cannot impute column {!r}: it has no observed values to draw from.".format(feature)
            )
        df[feature + "_imp"] = df[feature]
        if number_missing:
            df.loc[missing, feature + "_imp"] = np.random.choice(
                observed_values, number_missing, replace=True
            )

    if stochastic_reg:
        # The stochastic variant fits its own models on the randomly filled columns,
        # so the deterministic fit would be wasted work here.
        return df.drop(columns=["time"])

    to_impute = set(columns_to_impute)
    deter_data = {}
    for feature in columns_to_impute:
        imp_name = feature + "_imp"
        filled = df[imp_name].copy()
        missing = df[feature].isnull().to_numpy()
        if missing.any():
            # Keep the frame's own column order so the design matrix is identical between runs
            # (iterating a set of strings is not stable across interpreter sessions).
            parameters = [
                column for column in df.columns
                if column not in to_impute and column != imp_name
            ]

            dreg_model = LinearRegression()
            dreg_model.fit(X=df[parameters], y=df[imp_name])

            # Only the originally missing rows receive a prediction; observed values stay as is.
            filled.loc[missing] = dreg_model.predict(df[parameters])[missing]
        deter_data[feature] = filled

    # Write the results back only after all models are fitted, so that every fit sees the
    # same predictor values.
    for feature, filled in deter_data.items():
        df[feature] = filled

    return df.drop(columns=[feature + "_imp" for feature in columns_to_impute] + ["time"])


# REFERENCE: https://www.kaggle.com/shashankasubrahmanya/missing-data-imputation-using-regression
def stochastic_regression(df: pd.DataFrame, columns_to_impute: [str]) -> pd.DataFrame:
    """
    Impute missing values by linear regression (stochastic) using scikit-learn.

    Only use on numerical data; the method is not meant for ordinal data such as categories.
    The columns that should be imputed must be listed in ``columns_to_impute``.

    Difference with deterministic: the prediction for a missing entry is not used directly.
    Instead a value is drawn from a normal distribution centred on the prediction, whose
    standard deviation is the standard deviation of the model's residuals on the observed rows.

    Only strictly positive draws are accepted. A missing entry whose draw is not positive keeps
    the random fill (a value sampled from the observed values of the same column) that
    ``deterministic_regression(..., stochastic_reg=True)`` stored in ``<column>_imp``.

    The result depends on NumPy's global random state; seed ``np.random`` for reproducible output.

    Args:
        df: Frame to impute. A copy is handed to the helper, so ``df`` itself is not modified.
        columns_to_impute: Names of the columns whose missing values should be filled.

    Returns:
        A new frame with the same columns as ``df`` in which the missing values of
        ``columns_to_impute`` are filled.
    """
    # Hand over a copy so the helper columns created during imputation never end up in the
    # caller's frame.
    df = deterministic_regression(df.copy(), columns_to_impute, stochastic_reg=True)

    to_impute = set(columns_to_impute)
    random_data = {}
    for feature in columns_to_impute:
        imp_name = feature + "_imp"
        missing = df[feature].isnull().to_numpy()
        observed = ~missing

        # Keep the frame's own column order so the design matrix is identical between runs
        # (iterating a set of strings is not stable across interpreter sessions).
        parameters = [
            column for column in df.columns
            if column not in to_impute and column != imp_name
        ]

        sreg_model = LinearRegression()
        sreg_model.fit(X=df[parameters], y=df[imp_name])
        predict = sreg_model.predict(df[parameters])

        # Residual spread on the rows that were really observed.
        std_error = (predict[observed] - df.loc[observed, imp_name]).std()

        random_predict = np.random.normal(size=df[feature].shape[0], loc=predict, scale=std_error)

        # Replace the random fill only where the value was missing AND the draw is positive.
        accepted = missing & (random_predict > 0)
        filled = df[imp_name].copy()
        if accepted.any():
            filled.loc[accepted] = random_predict[accepted]
        random_data[feature] = filled

    # Write the results back only after all models are fitted, so that every fit sees the
    # same predictor values.
    for feature, filled in random_data.items():
        df[feature] = filled

    return df.drop(columns=[feature + "_imp" for feature in columns_to_impute])