In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from statsmodels.formula.api import ols
import scipy.stats as stats

# Display option: allow up to 500 columns to be shown before pandas truncates
# the rendering of wide frames.
pd.set_option("display.max_columns", 500)

# Fixed seed for NumPy's legacy global random state, so that code drawing from
# that global state (i.e. without passing its own generator) is repeatable
# when the notebook is run top to bottom.
random_seed = 2024
np.random.seed(random_seed)

In [None]:
def apply_z_score(col_name, z_score_col_name):
    """Add a standardised copy of ``col_name`` to the train and test frames.

    The mean and standard deviation (pandas ``Series.std`` default) are taken
    from the global ``bottle_df_train`` only; the same two statistics are then
    used to standardise the global ``bottle_df_test``.

    Parameters
    ----------
    col_name
        Label of the existing column to standardise.
    z_score_col_name
        Label of the column that receives ``(value - mean) / std`` in both
        frames.

    Returns
    -------
    None
        Both global frames are modified in place.
    """
    train_values = bottle_df_train[col_name]
    center = train_values.mean()
    spread = train_values.std()

    # Train first, then test, both scaled with the train-derived statistics.
    for frame in (bottle_df_train, bottle_df_test):
        frame[z_score_col_name] = (frame[col_name] - center) / spread

In [None]:
def impute_mean(col_name):
    """Replace missing values of ``col_name`` with the training-set mean.

    The mean is computed from the global ``bottle_df_train`` only and is
    written into the null positions of ``col_name`` in both
    ``bottle_df_train`` and ``bottle_df_test``.

    Parameters
    ----------
    col_name
        Label of the column to impute.

    Returns
    -------
    None
        Both global frames are modified in place.
    """
    fill_value = bottle_df_train[col_name].mean()

    targets = (bottle_df_train, bottle_df_test)
    # Count the gaps in both frames before anything is written.
    gap_counts = [frame[col_name].isnull().sum() for frame in targets]

    for frame, n_missing in zip(targets, gap_counts):
        frame.loc[frame[col_name].isnull(), col_name] = np.repeat(fill_value, n_missing)

In [None]:
def impute_uniform(col_name, quantile_from=0.25, quantile_to=0.75, random_state=None):
    """Fill missing values of ``col_name`` with uniform random draws.

    The draw interval is bounded by two quantiles of ``col_name`` in the
    global ``bottle_df_train``; the same interval is used to fill the null
    positions in both ``bottle_df_train`` and ``bottle_df_test``.

    Parameters
    ----------
    col_name
        Label of the column to impute.
    quantile_from : float, default 0.25
        Quantile of the train column used as the lower bound of the interval.
    quantile_to : float, default 0.75
        Quantile of the train column used as the upper bound of the interval.
    random_state : None, int, numpy.random.RandomState or numpy.random.Generator
        Source of randomness handed to ``scipy.stats.uniform.rvs``. ``None``
        (the default) keeps the previous behaviour of drawing from NumPy's
        global random state, so the result depends on ``np.random.seed`` and
        on what has been drawn before this call. Pass a seed or a generator
        to make the imputation reproducible on its own.

    Returns
    -------
    None
        Both global frames are modified in place.
    """
    lower_quart = bottle_df_train[col_name].quantile(quantile_from)
    upper_quart = bottle_df_train[col_name].quantile(quantile_to)

    # Turn an integer seed into a single RandomState up front: passing the raw
    # integer to each rvs call would restart the stream both times, so the
    # train and test fills would begin with the very same values.
    if isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    frames = (bottle_df_train, bottle_df_test)
    # Null masks are taken once, before any draw or write.
    null_masks = [frame[col_name].isnull() for frame in frames]

    # Draw for train first, then for test, so the default (global-state) path
    # consumes random numbers in the same order as before.
    fills = [
        stats.uniform.rvs(
            loc=lower_quart,
            scale=upper_quart - lower_quart,
            size=mask.sum(),
            random_state=random_state,
        )
        for mask in null_masks
    ]

    for frame, mask, fill in zip(frames, null_masks, fills):
        frame.loc[mask, col_name] = fill