In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [5]:
df = pd.read_csv('../etl/data/property_ml_ready.csv')
df.tail(40)

Unnamed: 0,property_id,deal_type,user_type,district,type_name,size_sqm,rooms,floor,year_built,post_date,sell_date,renovation_status,estimated_saleprice,estimated_rentprice
1396,2895,Sale,Agent,Nor Nork,House,159.2,3,9,2015,2024-01-23,2025-03-12,Partially Renovated,221314.41,885.26
1397,2897,Rent,Owner,Avan,House,165.7,3,2,2008,2024-06-12,,Newly Renovated,311521.58,1246.09
1398,2906,Sale,Owner,Avan,House,177.5,5,12,1975,2025-05-02,2025-05-01,Not Renovated,357039.18,1428.16
1399,2908,Rent,Agent,Kentron,House,43.7,1,4,2002,2025-01-26,2025-05-01,Not Renovated,54413.03,217.65
1400,2911,Rent,Agent,Shengavit,House,172.8,4,8,1982,2023-07-23,2025-05-01,Partially Renovated,324534.92,1298.14
1401,2912,Sale,Owner,Arabkir,House,156.7,2,9,2016,2024-10-27,2025-03-01,Not Renovated,198701.03,794.8
1402,2914,Rent,Owner,Nor Nork,House,124.9,1,11,1970,2025-03-11,,Newly Renovated,184750.99,739.0
1403,2919,Rent,Owner,Avan,House,175.9,6,9,1977,2022-08-11,2025-04-03,Partially Renovated,254913.55,1019.65
1404,2921,Sale,Buyer,Kentron,House,155.4,2,8,1983,2023-09-14,,Not Renovated,197412.08,789.65
1405,2925,Rent,Owner,Erebuni,House,100.4,2,9,2013,2024-11-17,,Not Renovated,173953.78,695.82


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1423 entries, 0 to 1422
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   property_id          1423 non-null   int64  
 1   deal_type            1423 non-null   object 
 2   user_type            1423 non-null   object 
 3   district             1423 non-null   object 
 4   type_name            1423 non-null   object 
 5   size_sqm             1423 non-null   float64
 6   status               1423 non-null   object 
 7   rooms                1423 non-null   int64  
 8   floor                1423 non-null   int64  
 9   year_built           1423 non-null   int64  
 10  renovation_status    1423 non-null   object 
 11  estimated_saleprice  1423 non-null   float64
 12  estimated_rentprice  1423 non-null   float64
dtypes: float64(3), int64(4), object(6)
memory usage: 144.7+ KB


In [4]:
df.isnull().sum()

property_id            0
deal_type              0
user_type              0
district               0
type_name              0
size_sqm               0
status                 0
rooms                  0
floor                  0
year_built             0
renovation_status      0
estimated_saleprice    0
estimated_rentprice    0
dtype: int64

In [5]:
df.duplicated().sum().any()

np.False_

In [6]:
df.dropna(inplace=True)

In [7]:
df.describe()

Unnamed: 0,property_id,size_sqm,rooms,floor,year_built,estimated_saleprice,estimated_rentprice
count,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0
mean,1502.613493,114.713212,3.537597,6.465214,1993.24877,221190.432586,884.761862
std,876.017436,50.204404,1.690076,3.39728,17.006906,116219.309695,464.877256
min,1.0,25.0,1.0,1.0,1965.0,28081.37,112.33
25%,730.5,69.85,2.0,4.0,1978.0,128306.335,513.225
50%,1481.0,116.8,4.0,6.0,1993.0,208308.93,833.24
75%,2275.0,158.5,5.0,9.0,2008.0,291172.465,1164.69
max,2997.0,200.0,6.0,12.0,2023.0,683244.03,2732.98


In [8]:
df.shape

(1423, 13)

In [9]:
# Define categorical columns
categorical_cols = ['type_name', 'district', 'status', 'renovation_status']

# One-hot encode categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Features to exclude from modeling
exclude_columns = ['estimated_rent_price', 'estimated_sales_price', 'id', 'property_id',
 'deal_type',
 'user_type',]

# Determine which columns are features
all_columns = df_encoded.columns.tolist()
feature_columns = [col for col in all_columns if col not in exclude_columns]
feature_columns

['size_sqm',
 'rooms',
 'floor',
 'year_built',
 'estimated_saleprice',
 'estimated_rentprice',
 'district_Arabkir',
 'district_Avan',
 'district_Davtashen',
 'district_Erebuni',
 'district_Kanaker-Zeytun',
 'district_Kentron',
 'district_Malatia-Sebastia',
 'district_Nor Nork',
 'district_Nork-Marash',
 'district_Nubarashen',
 'district_Shengavit',
 'renovation_status_Not Renovated',
 'renovation_status_Partially Renovated']

In [1]:
def evaluate_model(model, X_test, y_test, label=""):
    y_pred = model.predict(X_test)
    print(f"\n--- {label} Model Evaluation ---")
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R² Score:", r2_score(y_test, y_pred))


In [11]:
# ----------------- Rent Price Model -----------------
X_rent = df_encoded[feature_columns]
y_rent = df_encoded['estimated_rentprice']

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_rent, y_rent, test_size=0.2, random_state=42)

model_rent = RandomForestRegressor(n_estimators=100, random_state=42)
model_rent.fit(X_train_r, y_train_r)

evaluate_model(model_rent, X_test_r, y_test_r, label="Rent Price")


--- Rent Price Model Evaluation ---
MAE: 1.8065270175438903
RMSE: 12.818789033484327
R² Score: 0.9992433165259985


In [12]:
# ----------------- Sales Price Model -----------------
X_sales = df_encoded[feature_columns]
y_sales = df_encoded['estimated_saleprice']
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_sales, y_sales, test_size=0.2, random_state=42)

model_sales = RandomForestRegressor(n_estimators=100, random_state=42)
model_sales.fit(X_train_s, y_train_s)

evaluate_model(model_sales, X_test_s, y_test_s, label="Sales Price")


--- Sales Price Model Evaluation ---
MAE: 452.0154473684318
RMSE: 2903.74201899575
R² Score: 0.9993787654497182


In [13]:
import joblib 

joblib.dump(model_sales, 'models/sales_price_model.pkl')
joblib.dump(model_rent, 'models/rent_price_model.pkl')

['models/rent_price_model.pkl']

In [14]:
df_encoded["predicted_rentprice"] = model_rent.predict(X_rent)
df_encoded["predicted_saleprice"] = model_sales.predict(X_sales)

In [15]:
prediction_df = df[['property_id']].copy()
prediction_df["actual_rentprice"] = df['estimated_rentprice']
prediction_df["predicted_rentprice"] = df_encoded["predicted_rentprice"]
prediction_df["actual_saleprice"] = df['estimated_saleprice']
prediction_df["predicted_saleprice"] = df_encoded["predicted_saleprice"]

In [16]:
# Save to file
output_path = "property_predictions.csv"
prediction_df.to_csv(output_path, index=False)
print(f"\n✅ Prediction CSV saved: {output_path} | Shape: {prediction_df.shape}")


✅ Prediction CSV saved: property_predictions.csv | Shape: (1423, 5)


In [6]:
ready_df = pd.read_csv('output/predictions.csv')
ready_df.head(90)

Unnamed: 0,predicted_sell_price,predicted_rent_price,prob_sold_within_5_months
0,182613.2781,724.5796,0.178413
1,136315.8913,549.0840,0.147066
2,323056.0785,1272.4737,0.131100
3,227694.6124,932.0888,0.122786
4,231190.0352,920.3851,0.195324
...,...,...,...
85,266422.0331,1050.4118,0.166737
86,220536.6284,880.6428,0.168164
87,373398.0834,1491.2522,0.134159
88,244711.4238,978.0669,0.158210
