In [None]:
import pandas as pd

# Load the Aqar CSV; the later cells all work from this frame.
df = pd.read_csv("aqar.csv")

# Remove the cap on displayed columns so wide frames are not truncated.
pd.options.display.max_columns = None

# Last expression of the cell: show the column names.
df.columns


- Use Y-data
- Try a multi-layer perceptron (loss function: MSE, optimizer: AdamW, activation: Leaky ReLU)
- Try a layered (stacked) ML model: use three different ML models whose outputs feed into a fourth


In [None]:
# Histogram grid over the raw frame: 50 bins per plot on a 20x15 figure.
df.hist(figsize=(20, 15), bins=50)

In [None]:

# Column groups used by the modelling cells. "city_grouped" does not exist yet;
# it is derived from "city" further down in this cell.
num_feature_cols = ["area_sqm", "num_bedrooms", "num_rooms"]
cat_feature_cols = ["district", "city_grouped"]
bool_feature_cols = ["lift"]
target_col = ["price"]

pd.set_option('future.no_silent_downcasting', True)

# Rows to keep, expressed as three independent boolean masks:
#   * not flagged as rental / daily rental and sale_type is neither 'rent' nor 'daily'
#   * property category is not bare land
#   * listing type is not one of the commercial kinds
for_sale = (
    df['is_rental'].eq(False)
    & df['is_daily_rental'].eq(False)
    & ~df['sale_type'].isin(['rent', 'daily'])
)
has_building = df['category_ga_property_category'].ne('land')
non_commercial = ~df["category_ga_listing_type"].isin(
    ["office", "store", "warehouse", "lounge"]
)
df: pd.DataFrame = df[for_sale & has_building & non_commercial].copy()

# Store the boolean feature columns as integers.
df[bool_feature_cols] = df[bool_feature_cols].astype(int)

# Cities holding less than 2% of the remaining rows are collapsed into the
# single label 'other'; all other values are kept as they are.
city_counts = df['city'].value_counts(normalize=True)
rare_cities = city_counts[city_counts < 0.02].index
df['city_grouped'] = df['city'].where(~df['city'].isin(rare_cities), 'other')

# Preview the columns that feed the models.
df[num_feature_cols + cat_feature_cols + bool_feature_cols + target_col].head()


In [None]:
# Histogram grid restricted to the modelling columns (features plus target).
df[num_feature_cols + cat_feature_cols + bool_feature_cols + target_col].hist(figsize=(20, 15), bins=50)


In [None]:
# Missing-value audit for the modelling columns.
# NOTE(review): the note that used to sit here quoted 4502 rows and 1933 missing
# values for "num_bathrooms", but the code inspects "num_bedrooms" — confirm
# which column was meant. The figures are printed live so they cannot go stale;
# previously the bedroom count was a non-final expression and never displayed.
print(f"rows: {len(df)}, missing num_bedrooms: {df['num_bedrooms'].isna().sum()}")

# Last expression of the cell: per-column null counts.
df[num_feature_cols + cat_feature_cols + bool_feature_cols + target_col].isnull().sum()


In [None]:
# Profile Data
from ydata_profiling import ProfileReport

# Build the report from a detached copy of the modelling columns and save it
# as a standalone HTML file next to the notebook.
profile = ProfileReport(
    df.loc[:, num_feature_cols + cat_feature_cols + bool_feature_cols + target_col].copy(),
    title="Aqar Dataset Profiling Report",
)
profile.to_file("aqar_data_profiling_report.html")


In [None]:
# Model Training and Evaluation before preprocessing pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Baseline without the preprocessing pipeline: rather than imputing, discard
# every row that is missing any modelling column (features or target).
droppy = df.dropna(subset=num_feature_cols + cat_feature_cols + bool_feature_cols + target_col)

# OHE for categorical features (drop_first removes one level per category)
df_encoded = pd.get_dummies(droppy[num_feature_cols + cat_feature_cols + bool_feature_cols + target_col], drop_first=True)
X = df_encoded.drop("price", axis=1)
y = df_encoded["price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize and train the model
model = LinearRegression()
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the hold-out metrics here: `mse` and `r2` are reassigned by the random
# forest cell, so without this the baseline linear results are never shown.
print(f"Baseline Linear Regression Mean Squared Error: {mse}")
print(f"Baseline Linear Regression R^2 Score: {r2}")


In [None]:

# Hyper-parameter grid shared by both random forest searches in this notebook.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# 3-fold grid search (scored by R^2) on the baseline training split.
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, n_jobs=-1, scoring='r2')
grid_search.fit(X_train, y_train)

# Evaluate the winning configuration on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report the metrics; previously they were computed and then silently dropped.
print(f"Baseline Random Forest best params: {grid_search.best_params_}")
print(f"Baseline Random Forest Mean Squared Error: {mse}")
print(f"Baseline Random Forest R^2 Score: {r2}")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R^2 on the baseline training split, linear model first.
linear_cross_val_scores = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
print(f"Cross-validated R^2 scores: {linear_cross_val_scores}")
print(f"Average Cross-validated R^2 score: {linear_cross_val_scores.mean()}")

# Same protocol for the random forest picked by the grid search.
rf_cross_val_scores = cross_val_score(
    estimator=best_model,
    X=X_train,
    y=y_train,
    scoring='r2',
    cv=5,
)
print(f"Random Forest Cross-validated R^2 scores: {rf_cross_val_scores}")
print(f"Random Forest Average Cross-validated R^2 score: {rf_cross_val_scores.mean()}")

In [None]:
# Data Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# One sub-pipeline per feature group:
#   numeric     -> median imputation, then standardisation
#   categorical -> most-frequent imputation, then one-hot encoding
#                  (categories unseen at fit time are ignored at transform time)
#   boolean     -> missing values filled with 0
numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])
categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])
boolean_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=0)),
])

pipe = ColumnTransformer(transformers=[
    ("num_pipeline", numeric_steps, num_feature_cols),
    ("cat_pipeline", categorical_steps, cat_feature_cols),
    ("bool_pipeline", boolean_steps, bool_feature_cols),
])

# Define features and target (target_col is a list, so y is a one-column frame)
X = df[num_feature_cols + cat_feature_cols + bool_feature_cols]
y = df[target_col]

# 80/20 split, stratified on the grouped city label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=df['city_grouped'],
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing on the training split only, then apply it to both splits.
X_train_prepared = pipe.fit_transform(X_train)
X_test_prepared = pipe.transform(X_test)
(X_train_prepared, X_test_prepared)

In [None]:

# Profiling-only variant of the preprocessing: same imputation and scaling as
# the modelling pipeline but no one-hot step. Output is switched to pandas so
# the result is a DataFrame.
profiling_pipe = ColumnTransformer(
    [
        (
            "num_pipeline",
            Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ]),
            num_feature_cols,
        ),
        (
            "cat_pipeline",
            Pipeline([("imputer", SimpleImputer(strategy="most_frequent"))]),
            cat_feature_cols,
        ),
        (
            "bool_pipeline",
            Pipeline([("imputer", SimpleImputer(strategy="constant", fill_value=0))]),
            bool_feature_cols,
        ),
    ],
    remainder='drop',
).set_output(transform="pandas")

# Transform the training features and attach the target for profiling.
X_train_for_profiling = profiling_pipe.fit_transform(X_train)
X_train_for_profiling['price'] = y_train.values

# Write the profiling report to HTML and summarise what was profiled.
profile_preprocessed = ProfileReport(
    X_train_for_profiling,
    title="Aqar Preprocessed Data Profiling Report (No OHE)",
)
profile_preprocessed.to_file("aqar_preprocessed_data_profiling_report.html")
print("Preprocessed data profiling report saved!")
print(f"\nProfiled data shape: {X_train_for_profiling.shape}")
print("Categorical columns preserved for easier interpretation")

In [None]:
# Histogram grid of the imputed / scaled training features plus the target.
X_train_for_profiling.hist(figsize=(20, 15), bins=50)

In [None]:

# Fit a linear regression on the pipeline-prepared training matrix
# (fit() returns the estimator itself, so construction and fitting are chained).
model = LinearRegression().fit(X_train_prepared, y_train)

# Predict on the prepared hold-out split and score the predictions.
y_pred = model.predict(X_test_prepared)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

In [None]:

# Repeat the random forest grid search (same grid, 3 folds, R^2 scoring), this
# time on the pipeline-prepared training matrix.
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring='r2',
    cv=3,
    n_jobs=-1,
)
# y_train is a one-column frame; flatten it to the 1-D target the forest expects.
grid_search.fit(X_train_prepared, y_train.to_numpy().ravel())

In [None]:
# GridSearchCV is constructed without `refit`, so its default (refit=True)
# applies: best_estimator_ has already been retrained on the full prepared
# training set. The extra model_rf.fit(...) call that used to follow repeated
# that training on identical data with the same random_state, so it is dropped.
model_rf = grid_search.best_estimator_

# Score the tuned forest on the prepared hold-out split.
y_pred_rf = model_rf.predict(X_test_prepared)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest Mean Squared Error: {mse_rf}")
print(f"Random Forest R^2 Score: {r2_rf}")

In [None]:
# Cross-validate preprocessing + estimator as one Pipeline on the RAW training
# features. Scoring on X_train_prepared would reuse an imputer / scaler /
# encoder fitted on the whole training split, so every validation fold would
# have influenced its own preprocessing. cross_val_score clones the pipeline
# for each fold, so the already-fitted `pipe` and models are left untouched.
linear_with_preprocessing = Pipeline([("preprocess", pipe), ("model", model)])
pipeline_linear_cross_val_scores = cross_val_score(
    linear_with_preprocessing, X_train, y_train.values.ravel(), cv=5, scoring='r2'
)
print(f"Cross-validated R^2 scores: {pipeline_linear_cross_val_scores}")
print(f"Average Cross-validated R^2 score: {pipeline_linear_cross_val_scores.mean()}")

rf_with_preprocessing = Pipeline([("preprocess", pipe), ("model", model_rf)])
pipeline_rf_cross_val_scores = cross_val_score(
    rf_with_preprocessing, X_train, y_train.values.ravel(), cv=5, scoring='r2'
)
print(f"Random Forest Cross-validated R^2 scores: {pipeline_rf_cross_val_scores}")
print(f"Random Forest Average Cross-validated R^2 score: {pipeline_rf_cross_val_scores.mean()}")


In [None]:
# Compare mean cross-validated R^2 with and without the preprocessing pipeline.
# Inputs: linear_cross_val_scores, rf_cross_val_scores,
#         pipeline_linear_cross_val_scores, pipeline_rf_cross_val_scores
# NOTE(review): the non-pipelined scores were computed on rows that survived
# dropna, while the pipelined ones use imputed rows, so the two sets of scores
# are not measured on the same data — treat the differences as indicative only.
linear_mean = linear_cross_val_scores.mean()
rf_mean = rf_cross_val_scores.mean()
pipeline_linear_mean = pipeline_linear_cross_val_scores.mean()
pipeline_rf_mean = pipeline_rf_cross_val_scores.mean()

both_improved = (pipeline_linear_mean > linear_mean) and (pipeline_rf_mean > rf_mean)

if not both_improved:
    print("The preprocessing pipeline did not improve the cross-validation scores for both models.")
    print(f"Linear Regression change: {pipeline_linear_mean - linear_mean}")
    print(f"Random Forest change: {pipeline_rf_mean - rf_mean}")
else:
    print("The preprocessing pipeline has improved the cross-validation scores for both Linear Regression and Random Forest models.")
    print(f"Linear Regression improvement: {pipeline_linear_mean - linear_mean}")
    print(f"Random Forest improvement: {pipeline_rf_mean - rf_mean}")

# Always finish with the raw per-fold scores of all four runs.
print("Model training and evaluation complete.")
print(f"Pipelined Cross-Validated R^2 scores: {pipeline_linear_cross_val_scores}")
print(f"Pipelined Random Forest Cross-Validated R^2 scores: {pipeline_rf_cross_val_scores}")
print(f"non-pipelined Linear Regression Cross-Validated R^2 scores: {linear_cross_val_scores}")
print(f"non-pipelined Random Forest Cross-Validated R^2 scores: {rf_cross_val_scores}")