In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df = pd.read_csv('property_ml_ready.csv')
df.head(20)

Unnamed: 0,property_id,deal_type,user_type,district,type_name,size_sqm,status,rooms,floor,year_built,renovation_status,estimated_saleprice,estimated_rentprice
0,1,Sale,Buyer,Davtashen,,99.8,Available,4,8,2003,Newly Renovated,269242.07,1076.97
1,2,Sale,Owner,Kentron,House,96.4,Available,1,4,1993,Newly Renovated,164102.48,656.41
2,3,Rent,Buyer,Erebuni,House,53.1,Available,1,11,1994,Newly Renovated,78494.18,313.98
3,4,Sale,Buyer,Nubarashen,,60.8,Available,2,11,1966,Partially Renovated,115714.62,462.86
4,5,Sale,Agent,Ajapnyak,House,81.7,Available,2,7,2000,Partially Renovated,148955.74,595.82
5,6,Rent,Owner,Ajapnyak,House,44.0,Available,1,12,1965,Newly Renovated,102406.77,409.63
6,7,Rent,Buyer,Kentron,,160.2,Available,3,11,2014,Not Renovated,173065.03,692.26
7,8,Sale,Agent,Kanaker-Zeytun,,193.1,Available,3,3,2009,Not Renovated,406418.67,1625.67
8,9,Sale,Buyer,Shengavit,,69.5,Available,4,11,1997,Newly Renovated,112919.03,451.68
9,10,Sale,Owner,Davtashen,House,138.9,Available,6,5,1979,Partially Renovated,292136.9,1168.55


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   property_id          3000 non-null   int64  
 1   deal_type            3000 non-null   object 
 2   user_type            2935 non-null   object 
 3   district             2941 non-null   object 
 4   type_name            1488 non-null   object 
 5   size_sqm             3000 non-null   float64
 6   status               3000 non-null   object 
 7   rooms                3000 non-null   int64  
 8   floor                3000 non-null   int64  
 9   year_built           3000 non-null   int64  
 10  renovation_status    3000 non-null   object 
 11  estimated_saleprice  3000 non-null   float64
 12  estimated_rentprice  3000 non-null   float64
dtypes: float64(3), int64(4), object(6)
memory usage: 304.8+ KB


In [4]:
df.isnull().sum()

property_id               0
deal_type                 0
user_type                65
district                 59
type_name              1512
size_sqm                  0
status                    0
rooms                     0
floor                     0
year_built                0
renovation_status         0
estimated_saleprice       0
estimated_rentprice       0
dtype: int64

In [5]:
df.duplicated().sum().any()

np.False_

In [6]:
df.dropna(inplace=True)

In [7]:
df.describe()

Unnamed: 0,property_id,size_sqm,rooms,floor,year_built,estimated_saleprice,estimated_rentprice
count,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0,1419.0
mean,1503.539112,112.116984,3.457364,6.432699,1993.882311,218506.100042,874.024397
std,863.466208,51.318134,1.695046,3.502601,16.90685,116310.459081,465.241911
min,2.0,25.0,1.0,1.0,1965.0,28514.03,114.06
25%,759.5,67.25,2.0,3.0,1979.0,123114.365,492.46
50%,1504.0,111.9,4.0,6.0,1994.0,202476.84,809.91
75%,2236.5,156.3,5.0,9.0,2009.0,293232.075,1172.93
max,3000.0,200.0,6.0,12.0,2023.0,584340.11,2337.36


In [8]:
df.shape

(1419, 13)

In [9]:
# Define categorical columns
categorical_cols = ['type_name', 'district', 'status', 'renovation_status']

# One-hot encode categorical columns
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Features to exclude from modeling
exclude_columns = ['estimated_rent_price', 'estimated_sales_price', 'id', 'property_id',
 'deal_type',
 'user_type',]

# Determine which columns are features
all_columns = df_encoded.columns.tolist()
feature_columns = [col for col in all_columns if col not in exclude_columns]
feature_columns

['size_sqm',
 'rooms',
 'floor',
 'year_built',
 'estimated_saleprice',
 'estimated_rentprice',
 'district_Arabkir',
 'district_Avan',
 'district_Davtashen',
 'district_Erebuni',
 'district_Kanaker-Zeytun',
 'district_Kentron',
 'district_Malatia-Sebastia',
 'district_Nor Nork',
 'district_Nork-Marash',
 'district_Nubarashen',
 'district_Shengavit',
 'renovation_status_Not Renovated',
 'renovation_status_Partially Renovated']

In [10]:
def evaluate_model(model, X_test, y_test, label=""):
    y_pred = model.predict(X_test)
    print(f"\n--- {label} Model Evaluation ---")
    print("MAE:", mean_absolute_error(y_test, y_pred))
    print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))
    print("R² Score:", r2_score(y_test, y_pred))


In [11]:
# ----------------- Rent Price Model -----------------
X_rent = df_encoded[feature_columns]
y_rent = df_encoded['estimated_rentprice']

X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X_rent, y_rent, test_size=0.2, random_state=42)

model_rent = RandomForestRegressor(n_estimators=100, random_state=42)
model_rent.fit(X_train_r, y_train_r)

evaluate_model(model_rent, X_test_r, y_test_r, label="Rent Price")


--- Rent Price Model Evaluation ---
MAE: 1.153401408450708
RMSE: 2.07876475147437
R² Score: 0.9999793886633994


In [12]:
# ----------------- Sales Price Model -----------------
X_sales = df_encoded[feature_columns]
y_sales = df_encoded['estimated_saleprice']
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_sales, y_sales, test_size=0.2, random_state=42)

model_sales = RandomForestRegressor(n_estimators=100, random_state=42)
model_sales.fit(X_train_s, y_train_s)

evaluate_model(model_sales, X_test_s, y_test_s, label="Sales Price")


--- Sales Price Model Evaluation ---
MAE: 303.81806725354227
RMSE: 725.2501642421555
R² Score: 0.9999598586888828


In [13]:
import joblib 

joblib.dump(model_sales, 'models/sales_price_model.pkl')
joblib.dump(model_rent, 'models/rent_price_model.pkl')

['models/rent_price_model.pkl']

In [14]:
df_encoded["predicted_rentprice"] = model_rent.predict(X_rent)
df_encoded["predicted_saleprice"] = model_sales.predict(X_sales)

In [15]:
prediction_df = df[['property_id']].copy()
prediction_df["actual_rentprice"] = df['estimated_rentprice']
prediction_df["predicted_rentprice"] = df_encoded["predicted_rentprice"]
prediction_df["actual_saleprice"] = df['estimated_saleprice']
prediction_df["predicted_saleprice"] = df_encoded["predicted_saleprice"]

In [17]:
# Save to file
output_path = "property_predictions.csv"
prediction_df.to_csv(output_path, index=False)
print(f"\n✅ Prediction CSV saved: {output_path} | Shape: {prediction_df.shape}")


✅ Prediction CSV saved: property_predictions.csv | Shape: (1419, 5)


In [18]:
ready_df = pd.read_csv(output_path)
ready_df.head(20)

Unnamed: 0,property_id,actual_rentprice,predicted_rentprice,actual_saleprice,predicted_saleprice
0,2,656.41,657.2355,164102.48,164299.6101
1,3,313.98,313.8998,78494.18,78492.5449
2,5,595.82,595.7556,148955.74,148951.4556
3,6,409.63,409.6667,102406.77,102454.056
4,10,1168.55,1168.1662,292136.9,292282.5111
5,11,1012.72,1012.9251,253180.07,253200.903
6,12,388.35,388.2076,97087.14,97071.6266
7,13,1719.75,1721.6485,429936.6,430408.3042
8,14,1155.1,1154.5774,288774.6,288612.4814
9,16,849.52,849.476,212380.67,212382.172
