## Load data

In [98]:
import kagglehub
import numpy as np
import pandas as pd

def print_stats(df, names, desc):
    """Print the distinct values and their counts for selected columns.

    Args:
        df: DataFrame holding the columns to summarise.
        names: Iterable of column names of ``df``; one output row per name.
        desc: Caption printed on its own line before the table.

    Prints ``desc`` followed by a markdown table with columns ``Name``,
    ``Vals`` (sorted unique values) and ``Counts`` (occurrences of each value),
    then a blank line. Returns ``None``.
    """
    print(desc)
    # Collect every row first and build the frame once, rather than enlarging
    # it row by row through ``.loc[len(df_cat)]``.
    rows = []
    for ft in names:
        vals, counts = np.unique(df[ft], return_counts=True)
        rows.append({"Name": ft, "Vals": vals, "Counts": counts})
    df_cat = pd.DataFrame(rows, columns=["Name", "Vals", "Counts"])
    print(df_cat.to_markdown(), end="\n\n")

# Split the feature columns into numerical and categorical groups.
# Ordered, count-like columns (bathrooms, bedrooms, stories, parking) are
# deliberately kept in the numerical group instead of being one-hot encoded.
num_features = ['area', 'bathrooms', 'bedrooms', 'stories', 'parking']
cat_features = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea', 'furnishingstatus']

# Download the dataset through kagglehub and read its CSV file.
path = kagglehub.dataset_download("yasserh/housing-prices-dataset")
df = pd.read_csv(f"{path}/Housing.csv")

# Every column must be accounted for: the listed features plus one extra
# column (presumably the `price` target - only the count is checked here).
assert len(num_features) + len(cat_features) + 1 == len(df.columns)

# Drop exact duplicates and "similar" objects: rows that share both area and
# price are treated as the same listing and only the first one is kept.
df = df.drop_duplicates(subset=["area", "price"])

# Sanity checks: no missing values, strictly positive price and area.
assert df.isna().sum().sum() == 0
assert ((df["price"] > 0) & (df.area > 0)).all()

print_stats(df, cat_features, "Cat features distributions")

# num_features[1:] skips 'area' (the first entry), leaving only the
# count-like columns for the value/count summary.
print_stats(
    df, num_features[1:],
    "Cat features converted to numerical (avoid overfitting on rare samples)"
)

print("DataFrame:")
print(df.head(3).to_markdown())

Cat features distributions
|    | Name             | Vals                                         | Counts        |
|---:|:-----------------|:---------------------------------------------|:--------------|
|  0 | mainroad         | ['no' 'yes']                                 | [ 76 456]     |
|  1 | guestroom        | ['no' 'yes']                                 | [437  95]     |
|  2 | basement         | ['no' 'yes']                                 | [346 186]     |
|  3 | hotwaterheating  | ['no' 'yes']                                 | [507  25]     |
|  4 | airconditioning  | ['no' 'yes']                                 | [364 168]     |
|  5 | prefarea         | ['no' 'yes']                                 | [406 126]     |
|  6 | furnishingstatus | ['furnished' 'semi-furnished' 'unfurnished'] | [137 223 172] |

Cat features converted to numerical (avoid overfitting on rare samples)
|    | Name      | Vals          | Counts                    |
|---:|:----------|:--------------|:-

### Additional dataset

In [127]:
def load_add_data(main_df, main_num_features, main_cat_features):
    """Load the additional housing dataset, reduced to what it shares with the main one.

    Args:
        main_df: Main dataframe; its unique values of the count-like columns
            (every numerical feature returned here except ``area``) define
            which rows of the additional data are kept.
        main_num_features: Numerical feature names of the main dataframe.
        main_cat_features: Categorical feature names of the main dataframe.

    Returns:
        ``(df, num_features, cat_features)``: the filtered integer-typed
        dataframe with columns renamed to the main dataframe's names, and the
        numerical / categorical feature names available in it.

    Raises:
        AssertionError: if the hard-coded feature lists do not match the main
            feature lists, if ids are still duplicated after de-duplication,
            or if the result contains NaNs or non-positive price/area.
    """
    path_add = kagglehub.dataset_download("sukhmandeepsinghbrar/housing-price-dataset")

    # Source column name -> name of the matching column in the main dataframe.
    rename_columns = {
        "bedrooms": "bedrooms",
        "bathrooms": "bathrooms",
        "sqft_lot": "area",
        "floors": "stories",
        "price": "price",
    }
    num_features = ["area", "bedrooms", "bathrooms", "stories"]
    cat_features = []

    # The lists above must be exactly the main features this dataset can supply.
    available = set(rename_columns.values())
    assert sorted(num_features) == sorted(available.intersection(main_num_features))
    assert sorted(cat_features) == sorted(available.intersection(main_cat_features))

    df = pd.read_csv(f"{path_add}/Housing.csv")

    # Keep a single row per id.
    if df["id"].duplicated().any():
        print(f"Find duplicates. Dropping them...")
        df = df.drop_duplicates(subset=["id"])
        assert df["id"].is_unique

    # Restrict to the shared columns, adopt the main names, cast everything to int.
    df = df.loc[:, list(rename_columns)].rename(columns=rename_columns).astype(int)

    # Every numerical feature except the first one ('area') is count-like.
    count_features = num_features[1:]
    print_stats(df, count_features, "Numerical features before removing extra values")

    # Keep only rows whose count-like values all occur in the main dataframe.
    keep = pd.Series(True, index=df.index)
    for name in count_features:
        keep = keep & df[name].isin(main_df[name].unique())
    df = df[keep]

    print_stats(df, count_features, "Final numerical features")

    # Final sanity checks: no NaNs, strictly positive price and area.
    assert not df.isna().any().any()
    positive_price = df["price"] > 0
    positive_area = df["area"] > 0
    assert (positive_price & positive_area).all()

    return df, num_features, cat_features

# Load the additional dataset, restricted to the features it shares with `df`,
# and show its first rows.
df_add, num_features_add, cat_features_add = load_add_data(df, num_features, cat_features)
print("Additional DataFrame")
print(df_add.head(3).to_markdown())

Find duplicates. Dropping them...
Numerical features before removing extra values
|    | Name      | Vals                                     | Counts                                                             |
|---:|:----------|:-----------------------------------------|:-------------------------------------------------------------------|
|  0 | bedrooms  | [ 0  1  2  3  4  5  6  7  8  9 10 11 33] | [  13  194 2737 9730 6849 1586  265   38   13    6    3    1    1] |
|  1 | bathrooms | [0 1 2 3 4 5 6 7 8]                      | [   85  8254 10481  2217   335    48    12     2     2]            |
|  2 | stories   | [1 2 3]                                  | [12447  8370   619]                                                |

Final numerical features
|    | Name      | Vals          | Counts                          |
|---:|:----------|:--------------|:--------------------------------|
|  0 | bedrooms  | [1 2 3 4 5 6] | [ 164 2709 9714 6828 1561  252] |
|  1 | bathrooms | [1 2 3 4]  

In [128]:
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.preprocessing import OneHotEncoder

def preprocess_data(df, num_features, cat_features):
    """Build a dense 2-D array: numerical columns followed by one-hot columns.

    Args:
        df: Source dataframe.
        num_features: Column names copied through unchanged (left part).
        cat_features: Column names that are one-hot encoded (right part).

    Returns:
        ``numpy.ndarray`` with one row per row of ``df``.

    NOTE(review): a new ``OneHotEncoder`` is fitted on every call, so two
    frames with different category sets would presumably get differently
    laid-out one-hot columns - confirm before passing non-empty
    ``cat_features`` for frames that must share a layout.
    """
    encoder = OneHotEncoder()
    onehot_block = encoder.fit_transform(df[cat_features]).toarray()
    numeric_block = df[num_features].to_numpy()
    return np.concatenate((numeric_block, onehot_block), axis=1)

def fill_features(df, df_pre, df_add, df_add_pre, column_names, is_cat=True):
    """Impute columns that ``df_add`` lacks using Ridge models trained on ``df``.

    For each name in ``column_names`` a fresh ``Ridge`` model is fitted on
    ``df_pre`` against ``df[col_name]`` and evaluated on ``df_add_pre``; the
    prediction is stored as a new column of a copy of ``df_add``.

    Args:
        df: Main dataframe containing the target columns.
        df_pre: Feature matrix for ``df`` (one row per row of ``df``).
        df_add: Dataframe to extend; it is copied, not modified.
        df_add_pre: Feature matrix for ``df_add``, laid out like ``df_pre``.
        column_names: Names of the columns of ``df`` to impute.
        is_cat: If True, targets are one-hot encoded and predictions decoded
            back to category labels; otherwise targets are regressed directly
            and converted to integers within the range observed in ``df``
            (the column is expected to be integer-valued).

    Returns:
        A copy of ``df_add`` with one added (or overwritten) column per name.
    """
    df_add = df_add.copy()
    for col_name in column_names:
        if is_cat:
            enc = OneHotEncoder().fit(df[[col_name]])
            model = Ridge().fit(df_pre, enc.transform(df[[col_name]]).toarray())
            # The regression output is a score per category; decoding relies on
            # inverse_transform mapping each row back to a single label.
            df_add[col_name] = enc.inverse_transform(model.predict(df_add_pre)).reshape(-1)
        else:
            model = Ridge().fit(df_pre, df[col_name])
            lower, upper = df[col_name].min(), df[col_name].max()
            # Clip while still in float space, then round to the nearest
            # integer. A bare ``astype(int)`` truncates toward zero, which
            # biases imputed counts downward (e.g. 1.9 -> 1).
            bounded = np.clip(model.predict(df_add_pre), a_min=lower, a_max=upper)
            df_add[col_name] = np.rint(bounded).astype(int)
    return df_add

# Feature matrices built from price plus the features both dataframes share.
df_pre = preprocess_data(df, ["price"] + num_features_add, cat_features_add)
df_add_pre = preprocess_data(df_add, ["price"] + num_features_add, cat_features_add)

# Features present in the main dataframe but absent from the additional one.
# List comprehensions keep the order of the original feature lists; a set
# difference iterates in hash order, which for strings can change between
# kernel sessions and would reorder the imputed columns of `df_add`.
missing_cat_features = [ft for ft in cat_features if ft not in cat_features_add]
missing_num_features = [ft for ft in num_features if ft not in num_features_add]

df_add = fill_features(df, df_pre, df_add, df_add_pre, missing_cat_features)
df_add = fill_features(df, df_pre, df_add, df_add_pre, missing_num_features, is_cat=False)

print(df_add.head(3).to_markdown())

### Synthetic data

In [None]:
def generate_synt(df, num_features, cat_features, num_samples, seed=None):
    """Generate synthetic rows by sampling every feature independently.

    Args:
        df: Reference dataframe supplying the per-column statistics.
        num_features: Columns sampled from a normal distribution with the
            column's mean and (pandas, sample) standard deviation. Draws are
            unbounded floats, so they may fall outside the observed range.
        cat_features: Columns sampled from their observed values with
            probabilities proportional to the observed frequencies.
        num_samples: Number of rows to generate.
        seed: Optional seed for a private ``RandomState``. When ``None`` the
            global ``np.random`` state is used, as before.

    Returns:
        DataFrame with the same columns (and column order) as ``df``. Columns
        listed in neither feature list are left entirely NaN.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    generated = {}
    for ft in num_features:
        generated[ft] = rng.normal(
            loc=df[ft].mean(), scale=df[ft].std(), size=num_samples
        )
    for ft in cat_features:
        vals, counts = np.unique(df[ft], return_counts=True)
        # Categorical draw. The previous binomial call could not broadcast the
        # probability vector against `size`, and a binomial draw is a success
        # count in 0..len(vals), not a valid index into `vals`.
        generated[ft] = rng.choice(vals, size=num_samples, p=counts / counts.sum())

    return pd.DataFrame(generated, columns=df.columns)

# Build a 5000-row synthetic dataframe from the main dataframe's numerical and
# categorical feature lists.
# NOTE(review): no seed is set in this cell, so the sample presumably differs
# between runs - confirm whether a global seed is set elsewhere.
df_synt = generate_synt(df, num_features, cat_features, num_samples=5000)

## Metrics