In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [7]:
df = pd.read_csv('../etl/data/property_ml_ready.csv')
df.head(20)

Unnamed: 0,property_id,deal_type,user_type,district,type_name,size_sqm,rooms,floor,year_built,post_date,sell_date,renovation_status,estimated_saleprice,estimated_rentprice
0,2,Sale,Buyer,Arabkir,Apartment,130.7,4,12,2018,2023-01-25,2024-11-27,Newly Renovated,410075.66,1640.3
1,8,Rent,Buyer,Ajapnyak,Apartment,74.4,4,4,2013,2023-08-15,,Not Renovated,155361.41,621.45
2,10,Sale,Agent,Shengavit,Apartment,55.5,5,7,1971,2021-01-23,2025-01-12,Not Renovated,109457.83,437.83
3,11,Sale,Agent,Erebuni,Apartment,139.4,6,6,1998,2023-06-06,2024-12-26,Not Renovated,315209.53,1260.84
4,12,Sale,Agent,Avan,Apartment,65.8,3,11,1967,2024-08-22,2025-03-22,Not Renovated,124375.76,497.5
5,13,Rent,Owner,Malatia-Sebastia,Apartment,133.2,4,8,2021,2025-04-17,2025-01-03,Not Renovated,148426.72,593.71
6,14,Rent,Agent,Avan,Apartment,90.4,2,7,2008,2025-12-17,2025-04-06,Newly Renovated,159511.66,638.05
7,21,Rent,Buyer,Arabkir,Apartment,65.6,2,1,1980,2024-05-05,,Not Renovated,72694.77,290.78
8,23,Rent,Buyer,Malatia-Sebastia,Apartment,129.8,1,5,2017,2025-02-03,2025-03-28,Partially Renovated,290719.47,1162.88
9,25,Sale,Owner,Kentron,Apartment,189.9,5,6,1982,2024-07-24,2025-05-04,Newly Renovated,382383.77,1529.54


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1423 entries, 0 to 1422
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   property_id          1423 non-null   int64  
 1   deal_type            1423 non-null   object 
 2   user_type            1423 non-null   object 
 3   district             1423 non-null   object 
 4   type_name            1423 non-null   object 
 5   size_sqm             1423 non-null   float64
 6   status               1423 non-null   object 
 7   rooms                1423 non-null   int64  
 8   floor                1423 non-null   int64  
 9   year_built           1423 non-null   int64  
 10  renovation_status    1423 non-null   object 
 11  estimated_saleprice  1423 non-null   float64
 12  estimated_rentprice  1423 non-null   float64
dtypes: float64(3), int64(4), object(6)
memory usage: 144.7+ KB


In [4]:
df.isnull().sum()

property_id            0
deal_type              0
user_type              0
district               0
type_name              0
size_sqm               0
status                 0
rooms                  0
floor                  0
year_built             0
renovation_status      0
estimated_saleprice    0
estimated_rentprice    0
dtype: int64

In [5]:
df.duplicated().sum().any()

np.False_

In [6]:
df.dropna(inplace=True)

In [7]:
df.describe()

Unnamed: 0,property_id,size_sqm,rooms,floor,year_built,estimated_saleprice,estimated_rentprice
count,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0,1423.0
mean,1502.613493,114.713212,3.537597,6.465214,1993.24877,221190.432586,884.761862
std,876.017436,50.204404,1.690076,3.39728,17.006906,116219.309695,464.877256
min,1.0,25.0,1.0,1.0,1965.0,28081.37,112.33
25%,730.5,69.85,2.0,4.0,1978.0,128306.335,513.225
50%,1481.0,116.8,4.0,6.0,1993.0,208308.93,833.24
75%,2275.0,158.5,5.0,9.0,2008.0,291172.465,1164.69
max,2997.0,200.0,6.0,12.0,2023.0,683244.03,2732.98


In [8]:
df.shape

(1423, 13)

In [9]:
# Define categorical columns
categorical_cols = ['type_name', 'district', 'status', 'renovation_status']

# One-hot encode categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Features to exclude from modeling
exclude_columns = ['estimated_rent_price', 'estimated_sales_price', 'id', 'property_id',
 'deal_type',
 'user_type',]

# Determine which columns are features
all_columns = df_encoded.columns.tolist()
feature_columns = [col for col in all_columns if col not in exclude_columns]
feature_columns

['size_sqm',
 'rooms',
 'floor',
 'year_built',
 'estimated_saleprice',
 'estimated_rentprice',
 'district_Arabkir',
 'district_Avan',
 'district_Davtashen',
 'district_Erebuni',
 'district_Kanaker-Zeytun',
 'district_Kentron',
 'district_Malatia-Sebastia',
 'district_Nor Nork',
 'district_Nork-Marash',
 'district_Nubarashen',
 'district_Shengavit',
 'renovation_status_Not Renovated',
 'renovation_status_Partially Renovated']

In [1]:
def evaluate_model(model, X_test, y_test, label=""):
    y_pred = model.predict(X_test)
    print(f"\n--- {label} Model Evaluation ---")
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R² Score:", r2_score(y_test, y_pred))


In [11]:
# ----------------- Rent Price Model -----------------
X_rent = df_encoded[feature_columns]
y_rent = df_encoded['estimated_rentprice']

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_rent, y_rent, test_size=0.2, random_state=42)

model_rent = RandomForestRegressor(n_estimators=100, random_state=42)
model_rent.fit(X_train_r, y_train_r)

evaluate_model(model_rent, X_test_r, y_test_r, label="Rent Price")


--- Rent Price Model Evaluation ---
MAE: 1.8065270175438903
RMSE: 12.818789033484327
R² Score: 0.9992433165259985


In [12]:
# ----------------- Sales Price Model -----------------
X_sales = df_encoded[feature_columns]
y_sales = df_encoded['estimated_saleprice']
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_sales, y_sales, test_size=0.2, random_state=42)

model_sales = RandomForestRegressor(n_estimators=100, random_state=42)
model_sales.fit(X_train_s, y_train_s)

evaluate_model(model_sales, X_test_s, y_test_s, label="Sales Price")


--- Sales Price Model Evaluation ---
MAE: 452.0154473684318
RMSE: 2903.74201899575
R² Score: 0.9993787654497182


In [13]:
import joblib 

joblib.dump(model_sales, 'models/sales_price_model.pkl')
joblib.dump(model_rent, 'models/rent_price_model.pkl')

['models/rent_price_model.pkl']

In [14]:
df_encoded["predicted_rentprice"] = model_rent.predict(X_rent)
df_encoded["predicted_saleprice"] = model_sales.predict(X_sales)

In [15]:
prediction_df = df[['property_id']].copy()
prediction_df["actual_rentprice"] = df['estimated_rentprice']
prediction_df["predicted_rentprice"] = df_encoded["predicted_rentprice"]
prediction_df["actual_saleprice"] = df['estimated_saleprice']
prediction_df["predicted_saleprice"] = df_encoded["predicted_saleprice"]

In [16]:
# Save to file
output_path = "property_predictions.csv"
prediction_df.to_csv(output_path, index=False)
print(f"\n✅ Prediction CSV saved: {output_path} | Shape: {prediction_df.shape}")


✅ Prediction CSV saved: property_predictions.csv | Shape: (1423, 5)


In [7]:
ready_df = pd.read_csv('../output/property_predictions_final.csv')
ready_df.tail(30)

Unnamed: 0,property_id,post_date,actual_rent_price,actual_sales_price,actual_sold,actual_days_on_market,predicted_rent_price,predicted_sales_price,will_sell,sell_probability,predicted_days_to_sale,predicted_sale_date
1419,2952,2021-08-22,857.95,214488.5,1,1312.0,863.3616,214976.8542,1,0.86,1163.62,2024-10-28
1420,2954,2021-09-18,897.46,224364.64,0,1565.0,840.793,210730.033,1,0.7,738.13,2023-09-26
1421,2956,2024-10-08,422.31,105576.29,1,88.0,509.3758,125786.7641,1,0.93,363.25,2025-10-06
1422,2957,2024-05-23,963.62,240904.2,0,587.0,1038.3753,255121.655,1,0.52,821.26,2026-08-22
1423,2958,2025-12-28,444.35,111086.58,1,-334.0,415.9908,104592.9347,1,0.83,11.35,2026-01-08
1424,2962,2021-05-04,311.88,77969.05,0,1702.0,307.9885,76140.8558,0,0.24,,The probability of being sold is very low
1425,2966,2025-10-28,1420.37,355091.89,0,64.0,1463.2301,358457.3733,0,0.32,,The probability of being sold is very low
1426,2967,2025-01-20,1073.29,268322.5,0,345.0,1146.654,287869.1718,0,0.26,,The probability of being sold is very low
1427,2968,2024-08-05,785.07,196267.66,0,513.0,761.0142,190968.4148,0,0.23,,The probability of being sold is very low
1428,2973,2024-04-14,455.26,113814.19,1,300.0,425.1671,106280.4336,1,0.87,445.0,2025-07-03
