In [62]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import sklearn
from geopy.geocoders import Nominatim
from icecream import ic
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [34]:
DATA_PATH = "data"
FILE_NAME = "armslengthsales_2024_valid.csv"

In [35]:
file_path = os.path.join(DATA_PATH, FILE_NAME)

In [36]:
df = pd.read_csv(file_path)
df.shape

(1417, 20)

In [37]:
CAT_FEATURES = ["PropType", "District", "Style"]
NUM_FEATURES = ["Stories", "Year_Built", "Units", "FinishedSqft"]  # , "CondoProject"]
FEATURES = CAT_FEATURES + NUM_FEATURES
TARGET = "Sale_price"

In [38]:
class NanStrTransformer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def __init__(self, nan_str_value: str = "NAN"):
        self.nan_str_value = nan_str_value
        self.col_names = None

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        self.col_names = X.columns.tolist()
        return X.fillna(self.nan_str_value)

    def get_feature_names_out(self, input_features=None):
        return self.col_names


class RareTransformer(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def __init__(self, rare_threshold: float = 0.02):
        self.rare_threshold = rare_threshold
        self.col_names = None
        super().__init__()

    def fit(self, X, y=None):
        return self

    def transform_rare_values(
        self, data: pd.Series, threshold: int = 0.02, rare_label: str = "RARE"
    ) -> pd.Series:
        label_counts = data.value_counts() / len(data)
        return data.apply(lambda x: x if label_counts[x] > threshold else rare_label)

    def transform(self, X, y=None):
        self.col_names = X.columns.tolist()
        for column in X.columns:
            X.loc[:, column] = self.transform_rare_values(
                X[column], threshold=self.rare_threshold
            )

        return X

    def get_feature_names_out(self, input_features=None):
        return self.col_names

In [39]:
X = df[FEATURES]
y = df[TARGET]

In [40]:
numeric_preprocessor = Pipeline(
    [
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [57]:
categorical_preprocessor = Pipeline(
    [
        (
            "imputation_constant",
            SimpleImputer(fill_value="missing", strategy="constant"),
        ),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

In [58]:
preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_preprocessor, CAT_FEATURES),
        ("numerical", numeric_preprocessor, NUM_FEATURES),
    ]
)

In [61]:
pd.DataFrame(preprocessor.fit_transform(X, y)).to_csv("pipeline_output.csv")

In [43]:
pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
pipe

In [44]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [48]:
# Fit the pipeline to your training data
pipe.fit(X_train, y_train)

# Make predictions on the test set
predictions = pipe.predict(X_test)
pipe.score(X_test, y_test)

0.017605633802816902