In [None]:
import pandas as pd

# Show every column when a wide frame is displayed.
pd.set_option("display.max_columns", None)

# Load the listings export that the rest of the notebook works on.
df = pd.read_csv("aqar.csv")

# List the available column names.
df.columns


- Use Y-data
- Try a multi-layer perceptron (loss function: MSE, optimizer: AdamW, activation: Leaky ReLU)
- Try a layered (stacked) ML model: use three different ML models whose outputs feed a fourth


In [None]:
# Histogram of every numeric column in the loaded frame.
df.hist(figsize=(20, 15), bins=50)

In [None]:

# Column groups used for modelling. "location" is derived further down.
num_feature_cols = ["area_sqm", "num_bathrooms", "num_bedrooms", "num_rooms"]
cat_feature_cols = ["location"]
bool_feature_cols = ["lift"]
target_col = ["price"]

# Keep sale listings only: both rental flags must be False and the sale type
# must be neither 'rent' nor 'daily'.
not_rental = df["is_rental"].eq(False) & df["is_daily_rental"].eq(False)
not_rent_sale_type = ~df["sale_type"].isin(["rent", "daily"])
df = df.loc[not_rental & not_rent_sale_type].copy()
pd.set_option('future.no_silent_downcasting', True)

# Drop listings of land without buildings.
df = df.loc[df["category_ga_property_category"].ne("land")].copy()

# Drop listings of commercial buildings.
commercial_types = ["office", "store", "warehouse", "lounge"]
df = df.loc[~df["category_ga_listing_type"].isin(commercial_types)].copy()

# Encode the boolean flags as integers.
df = df.astype({flag_col: int for flag_col in bool_feature_cols})

# Combine city and district into a single location label.
df["location"] = df["city"] + "_" + df["district"]

# Preview the modelling columns.
model_cols = num_feature_cols + cat_feature_cols + bool_feature_cols + target_col
df[model_cols].head(10)


In [None]:
# Histograms restricted to the modelling columns (non-numeric ones are skipped by pandas).
hist_columns = num_feature_cols + cat_feature_cols + bool_feature_cols + target_col
df[hist_columns].hist(figsize=(20, 15), bins=50)


In [None]:
from ydata_profiling import ProfileReport
# Build an explorative profiling report over the whole filtered frame
# and write it next to the notebook as HTML.
profile = ProfileReport(
    df,
    explorative=True,
    title="All Aqar Data Profiling Report",
)
profile.to_file("raw_aqar_data_profiling_report.html")

In [None]:
# Missing values per modelling column (num_bedrooms is one of the columns shown).
# NOTE(review): an earlier note recorded 4502 rows in total and 1933 missing
# values for num_bathrooms -- verify both figures against the table below.
df[num_feature_cols + cat_feature_cols + bool_feature_cols + target_col].isnull().sum()


In [None]:
# Profile only the columns that feed the models.
profiled_columns = num_feature_cols + cat_feature_cols + bool_feature_cols + target_col
profile = ProfileReport(
    df[profiled_columns].copy(),
    title="Aqar Dataset Profiling Report",
)
profile.to_file("subset_aqar_data_profiling_report.html")


In [None]:
# Model Training and Evaluation before preprocessing pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class RareLabelGrouper(BaseEstimator, TransformerMixin):
    """Replace infrequent category labels with a single placeholder label.

    During ``fit`` the relative frequency of every label is computed per
    column; labels whose share is at least ``tol`` are remembered. During
    ``transform`` every value that is not one of the remembered labels
    (including labels never seen in ``fit`` and missing values) is replaced
    with ``replace_with``.

    Parameters
    ----------
    tol : float, default=0.01
        Minimum relative frequency (share of non-missing values) a label needs
        in the fitted data to be kept.
    replace_with : str, default='Other'
        Value written in place of every label that is not kept.

    Attributes
    ----------
    frequent_labels_ : dict
        Maps each fitted column to the index of labels that were kept.
    columns_ : list
        Column labels seen during ``fit``, in order.
    """

    def __init__(self, tol=0.01, replace_with='Other'):
        # Only hyper-parameters are stored here; learned state is created in
        # fit() so that an unfitted instance is recognisable as unfitted.
        self.tol = tol
        self.replace_with = replace_with

    def fit(self, X, y=None):
        """Learn, per column, which labels are frequent enough to keep.

        ``X`` may be a DataFrame or any 2-D array-like; ``y`` is ignored.
        Returns ``self``.
        """
        # Accept ndarray input as well (e.g. the output of a preceding
        # transformer that is not configured for pandas output).
        X = pd.DataFrame(X)
        self.columns_ = list(X.columns)
        self.frequent_labels_ = {}
        for col in X.columns:
            # Relative frequency of each label (missing values are excluded).
            counts = X[col].value_counts(normalize=True)
            # Keep labels that reach the tolerance.
            self.frequent_labels_[col] = counts[counts >= self.tol].index
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-frequent labels replaced.

        Raises ``NotFittedError`` when called before ``fit``.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "frequent_labels_")

        if not isinstance(X, pd.DataFrame):
            # Re-attach the fitted column labels so the lookup below matches.
            X = pd.DataFrame(X, columns=self.columns_)
        X = X.copy()  # never modify the caller's data

        for col in X.columns:
            known_labels = self.frequent_labels_.get(col, [])
            # Anything that is not a known frequent label becomes the placeholder.
            X[col] = X[col].where(X[col].isin(known_labels), self.replace_with)
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the output column names, which equal the input names.

        scikit-learn only makes ``set_output`` available on transformers that
        define this method, so it is required for this class to be usable
        inside a pipeline on which ``set_output(transform="pandas")`` is called.
        """
        if input_features is not None:
            return np.asarray(input_features, dtype=object)
        return np.asarray(self.columns_, dtype=object)

In [None]:

# Data Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, RobustScaler, TargetEncoder, PolynomialFeatures

def _build_preprocessor(extra_cat_steps=()):
    """Build one ColumnTransformer with the shared imputers.

    Numeric columns get a median imputer, categorical columns a most-frequent
    imputer followed by ``extra_cat_steps`` (a sequence of ``(name, estimator)``
    pairs), and boolean columns a constant-0 imputer.
    """
    return ColumnTransformer([
        ("num_pipeline", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
        ]), num_feature_cols),
        ("cat_pipeline", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            *extra_cat_steps,
        ]), cat_feature_cols),
        ("bool_pipeline", Pipeline([
            ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
        ]), bool_feature_cols),
    ])


# Rare-label grouping followed by one-hot encoding.
pipe1 = _build_preprocessor([
    ("rare_grouper", RareLabelGrouper(tol=0.02, replace_with='other')),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])

# Target encoding without rare-label grouping.
pipe2 = _build_preprocessor([
    ("target_enc", TargetEncoder()),
])

# Rare-label grouping followed by target encoding.
pipe3 = _build_preprocessor([
    ("rare_grouper", RareLabelGrouper(tol=0.02, replace_with='other')),
    ("target_enc", TargetEncoder()),
])

# Configured exactly like pipe2, but a separate object.
pipe4 = _build_preprocessor([
    ("target_enc", TargetEncoder()),
])

# Imputation only: the categorical column is left unencoded.
catboost_pipe = _build_preprocessor()


# Define features and target (the target is modelled as log1p(price)).
feature_columns = num_feature_cols + cat_feature_cols + bool_feature_cols
X = df[feature_columns]
y = np.log1p(df[target_col])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [None]:
from catboost import CatBoostRegressor, Pool

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor,HistGradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate
import numpy as np

from sklearn.model_selection import cross_validate, GridSearchCV
import numpy as np

# CatBoost regressor used by the "CatBoost" entry below.
cb = CatBoostRegressor(
    loss_function="RMSE",
    eval_metric="R2",
    random_seed=42,
    verbose=False,
)

# Hyper-parameter grids; the "model__" prefix addresses the "model" pipeline step.
hist_gb_grid = {
    'model__max_iter': [300, 500],
    'model__max_depth': [5, 8],
    'model__learning_rate': [0.03, 0.05],
}
xgboost_grid = {
    'model__n_estimators': [300],
    'model__max_depth': [5, 7],
    'model__learning_rate': [0.05],
}
random_forest_grid = {
    'model__n_estimators': [200],
    'model__max_depth': [15, 20],
    'model__min_samples_leaf': [3, 5],
}
catboost_grid = {
    "model__iterations": [500, 800],
    "model__depth": [5, 6, 8],
    "model__learning_rate": [0.03, 0.05],
    "model__l2_leaf_reg": [3, 5, 7],
}

# Each entry pairs an estimator with the grid searched for it.
model_configs = {
    "HistGB": {
        'model': HistGradientBoostingRegressor(random_state=42),
        'params': hist_gb_grid,
    },
    "XGBoost": {
        'model': XGBRegressor(
            random_state=42,
            objective='reg:squarederror',
            tree_method='hist',
        ),
        'params': xgboost_grid,
    },
    "RandomForest": {
        'model': RandomForestRegressor(random_state=42),
        'params': random_forest_grid,
    },
    "CatBoost": {
        "model": cb,
        "params": catboost_grid,
    },
}

# Preprocessors to compare, keyed by the name reported in the results.
pipelines = dict(
    pipe1_no_scaling=pipe1,
    pipe2_no_grouper=pipe2,
    pipe3_with_grouper=pipe3,
    pipe4_target_enc=pipe4,
    catboost_pipe=catboost_pipe,
)

# Accumulators filled while training: one result row and one fitted model per pair.
results, best_models = [], {}

# Grid-search every compatible (preprocessor, model) pair, cross-validate the
# winner and record its metrics. All metrics here are on the log1p(price) scale.
y_train_1d = y_train.values.ravel()  # estimators expect a flat target array

for pipe_name, pipe in pipelines.items():
    # Configure the preprocessor (in place) to emit DataFrames so that
    # feature names are available after preprocessing.
    pipe.set_output(transform="pandas")
    uses_catboost_pipe = pipe_name == "catboost_pipe"

    for model_name, config in model_configs.items():
        # CatBoost is paired only with its dedicated preprocessor, and that
        # preprocessor only with CatBoost. Check this before announcing the
        # run so skipped pairs are not logged as "Training ...".
        if (model_name == "CatBoost") != uses_catboost_pipe:
            print(f"Skipping {model_name} with {pipe_name} due to incompatible preprocessing.")
            continue

        print(f"\nTraining {model_name} with {pipe_name}...")

        # Create pipeline
        full_pipeline = Pipeline([
            ("preprocessing", pipe),
            ("model", config['model'])
        ])

        fit_params = {}
        if model_name == "CatBoost":
            # Fit the preprocessor once to learn its output column names.
            pipe.fit(X_train, y_train_1d)
            feature_names = pipe.get_feature_names_out()

            # Positions of the columns produced by the 'cat_pipeline' branch
            # of the ColumnTransformer.
            cat_features_idx = [
                i for i, col in enumerate(feature_names)
                if "cat_pipeline__" in col
            ]

            # Forwarded to the "model" step's fit() as cat_features.
            fit_params = {
                'model__cat_features': cat_features_idx,
            }

        # Tune hyperparameters if a grid is defined
        if config['params']:
            grid_search = GridSearchCV(
                full_pipeline,
                config['params'],
                cv=5,
                scoring='r2',
                n_jobs=-1,
                verbose=1,
            )
            grid_search.fit(X_train, y_train_1d, **fit_params)
            best_pipeline = grid_search.best_estimator_
            best_params = grid_search.best_params_
        else:
            # No grid to search: fit the pipeline as-is so that the model
            # stored in best_models can be used for prediction later.
            best_pipeline = full_pipeline.fit(X_train, y_train_1d, **fit_params)
            best_params = {}

        # Cross-validation with best model (collects train scores and extra metrics)
        cv_results = cross_validate(
            best_pipeline,
            X_train,
            y_train_1d,
            cv=5,
            scoring=['r2', 'neg_root_mean_squared_error', 'neg_mean_absolute_error'],
            return_train_score=True,
            params=fit_params
        )

        # Store results (error metrics are negated back to positive values)
        results.append({
            'Pipeline': pipe_name,
            'Model': model_name,
            'Train R2': cv_results['train_r2'].mean(),
            'Test R2': cv_results['test_r2'].mean(),
            'Test R2 Std': cv_results['test_r2'].std(),
            'Test RMSE': -cv_results['test_neg_root_mean_squared_error'].mean(),
            'Test MAE': -cv_results['test_neg_mean_absolute_error'].mean(),
            'Best Params': str(best_params)
        })

        # Store best model under "<pipeline name>_<model name>"
        best_models[f"{pipe_name}_{model_name}"] = best_pipeline

# Create results DataFrame, best cross-validated R2 first
results_df = pd.DataFrame(results)
results_df = results_df.sort_values('Test R2', ascending=False)

print("\n=== Model Comparison Results (with Hyperparameter Tuning) ===")
print(results_df.to_string(index=False))

# Find best combination. These cross-validation metrics are computed on the
# log1p(price) target the models were trained on.
best = results_df.iloc[0]
print("\n=== Best Model ===")
print(f"Pipeline: {best['Pipeline']}")
print(f"Model: {best['Model']}")
print(f"Test R2: {best['Test R2']:.4f} (±{best['Test R2 Std']:.4f})")
# The stored value is the root mean squared error, so label it as RMSE.
print(f"Test RMSE: {best['Test RMSE']:.2f}")
print(f"Test MAE: {best['Test MAE']:.2f}")
print(f"Best Params: {best['Best Params']}")

# Final evaluation on the held-out test set. Predictions and targets are
# mapped back with expm1, so these metrics are on the original price scale.
best_model_key = f"{best['Pipeline']}_{best['Model']}"
final_model = best_models[best_model_key]
y_pred = np.expm1(final_model.predict(X_test))
test_r2 = r2_score(np.expm1(y_test), y_pred)
test_mse = mean_squared_error(np.expm1(y_test), y_pred)

print("\n=== Final Test Set Performance ===")
print(f"Test R2: {test_r2:.4f}")
print(f"Test MSE: {test_mse:.2f}")


results_df

In [None]:
# Persist the comparison table in two formats: a CSV without the index
# column and a JSON array with one object per result row.
results_df.to_json(path_or_buf="model_comparison_results.json", orient="records")
results_df.to_csv(path_or_buf="model_comparison_results.csv", index=False)



In [None]:
from sklearn.base import clone
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import Ridge

# Base learners come from the tuned pipelines collected by the training loop,
# which stores them under "<pipeline name>_<model name>".
#
# StackingRegressor refits clones of its estimators and fit() is called below
# without fit parameters, so the categorical columns have to be declared on the
# CatBoost model itself. They are referenced by the names the preprocessor
# gives them ("cat_pipeline__<column>"). A clone is configured so the stored
# model in best_models is left untouched.
catboost_base = clone(best_models['catboost_pipe_CatBoost'])
catboost_base.set_params(
    model__cat_features=[f"cat_pipeline__{col}" for col in cat_feature_cols]
)

estimators = [
    ('cat', catboost_base),
    ('xgb', best_models['pipe1_no_scaling_XGBoost']),    # one-hot preprocessing
    ('hist', best_models['pipe4_target_enc_HistGB'])     # target-encoding preprocessing
]

stacking_reg = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(alpha=1.0), # A simple linear model to combine predictions
    cv=5
)

# Fit this manually at the end
print("Training Stacking Ensemble...")
stacking_reg.fit(X_train, y_train.values.ravel())
# R2 on the held-out set, measured on the log1p(price) target.
score = stacking_reg.score(X_test, y_test.values.ravel())
print(f"Stacking R2: {score}")

In [None]:

# Profiling-only preprocessing: impute everything, scale the numeric columns,
# and leave the categorical column unencoded (no one-hot encoding).
numeric_profiling_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
categorical_profiling_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
])
boolean_profiling_steps = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
])

profiling_pipe = ColumnTransformer(
    [
        ("num_pipeline", numeric_profiling_steps, num_feature_cols),
        ("cat_pipeline", categorical_profiling_steps, cat_feature_cols),
        ("bool_pipeline", boolean_profiling_steps, bool_feature_cols),
    ],
    remainder='drop',
)
profiling_pipe.set_output(transform="pandas")

# Transform the training features and attach the training target.
# Note: y_train holds log1p(price), so the 'price' column is on that scale.
X_train_for_profiling = profiling_pipe.fit_transform(X_train)
X_train_for_profiling['price'] = y_train.to_numpy()

# Generate and save the profile.
profile_preprocessed = ProfileReport(
    X_train_for_profiling,
    title="Aqar Preprocessed Data Profiling Report (No OHE)",
)
profile_preprocessed.to_file("aqar_preprocessed_data_profiling_report.html")

print("Preprocessed data profiling report saved!")
print(f"\nProfiled data shape: {X_train_for_profiling.shape}")
print("Categorical columns preserved for easier interpretation")

In [None]:
# Histograms of the preprocessed training features (and the attached target column).
X_train_for_profiling.hist(figsize=(20, 15), bins=50)