In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from math import ceil
import arabic_reshaper
from bidi.algorithm import get_display

In [2]:
def process_arabic_text(text):
    """Prepare a string for display by reshaping it and applying bidi ordering.

    Strings are passed through ``arabic_reshaper.reshape`` and then
    ``get_display``; the result of ``get_display`` is returned.
    Any non-string value is returned unchanged.
    """
    # Guard clause: non-string values are passed through unchanged.
    if not isinstance(text, str):
        return text
    return get_display(arabic_reshaper.reshape(text))

In [3]:
# Read the listings file and discard its first column in one chain.
# NOTE(review): the first column is presumably a saved row index — confirm.
df = (
    pd.read_csv("Apts_Updated.csv", encoding="utf-8-sig")
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)
df.head()

Unnamed: 0,السعر بالشيكل,المدينة,عدد الغرف,عدد الحمامات,مفروشة,مساحة البناء,الطابق,عمر البناء,العقار مرهون,طريقة الدفع,مصعد,موقف سيارات,السعر_الاصلي
0,173.669824,نابلس,3,2,0,138,2,2,False,0,True,True,397476
1,174.110113,رام الله والبيرة,3,2,0,105,3,0,False,0,False,False,400000
2,131.617115,جنين,3,2,0,165,1,1,False,2,False,False,198738
3,316.393336,بيت لحم,3,3,2,300,11,3,False,0,True,True,1780608
4,194.854459,رام الله والبيرة,3,3,0,185,2,3,False,0,True,True,530000


In [4]:
# Rows per city in the loaded data, largest first.
before_counts = (
    df["المدينة"]
    .value_counts()
    .sort_values(ascending=False)
)
print(
    "\nCity counts BEFORE augmentation:",
    before_counts.to_string(),
    sep="\n",
)


City counts BEFORE augmentation:
المدينة
رام الله والبيرة    418
نابلس                75
بيت لحم              37
طولكرم               25
جنين                 20
الخليل               18
القدس                14
قلقيلية               3
أريحا                 3
طوباس                 1
غزة                   1


In [5]:
# ---- Augmentation settings ----

# Reproducibility
random_seed = 42

# Size goal: desired minimum rows per city after augmentation.
# NOTE(review): no visible cell reads this value; decide_target uses its own
# literal threshold — confirm whether the two are meant to be linked.
target_per_city = 150

# Relative noise (standard deviation as a fraction of the value)
price_noise_pct = 0.03        # 3% for price
area_noise_pct  = 0.03        # 3% for area

# Probability that a duplicated row has its count moved by +/- 1
rooms_prob = 0.15             # rooms
baths_prob = 0.12             # bathrooms

# Seed both the stdlib and the NumPy global generators.
random.seed(random_seed)
np.random.seed(random_seed)

In [6]:
# Cities with fewer than 15 rows are collapsed into a single "أخرى" bucket.
small_cities = before_counts.loc[before_counts.lt(15)].index.tolist()

# Work on a copy so the loaded frame `df` keeps the original city labels.
df2 = df.copy()
df2["المدينة"] = df2["المدينة"].mask(df2["المدينة"].isin(small_cities), "أخرى")

In [7]:
# Rows per city after the small cities were merged, largest first.
grouped_counts = (
    df2["المدينة"]
    .value_counts()
    .sort_values(ascending=False)
)
print(
    "\nCity counts AFTER grouping small cities into 'أخرى':",
    grouped_counts.to_string(),
    sep="\n",
)


City counts AFTER grouping small cities into 'أخرى':
المدينة
رام الله والبيرة    418
نابلس                75
بيت لحم              37
طولكرم               25
أخرى                 22
جنين                 20
الخليل               18


In [8]:
def decide_target(n, threshold=150, large_target=400, multiplier=4):
    """Return the target row count for a city that currently has ``n`` rows.

    Parameters
    ----------
    n : int
        Current number of rows for the city.
    threshold : int, default 150
        Cities with at least this many rows get the fixed ``large_target``.
    large_target : int, default 400
        Target returned for cities at or above ``threshold``.
    multiplier : int, default 4
        Smaller cities are scaled to ``multiplier * n`` rows.

    Returns
    -------
    int
        ``large_target`` if ``n >= threshold``, otherwise ``multiplier * n``.

    Notes
    -----
    The result can be lower than ``n`` for a large city (e.g. 418 -> 400).
    The augmentation loop in this notebook skips any city whose target is
    not above its current count, so such a city is left unchanged rather
    than down-sampled.
    """
    if n >= threshold:
        return large_target
    return multiplier * n

In [9]:
# Map each city to its current row count and the target from decide_target.
city_targets = {
    name: {"count": int(n_rows), "target": int(decide_target(n_rows))}
    for name, n_rows in grouped_counts.items()
}

print("\nPer-city augmentation targets (city : current -> target):")
for name, plan in city_targets.items():
    print(f" {name}: {plan['count']} -> {plan['target']}")


Per-city augmentation targets (city : current -> target):
 رام الله والبيرة: 418 -> 400
 نابلس: 75 -> 300
 بيت لحم: 37 -> 148
 طولكرم: 25 -> 100
 أخرى: 22 -> 88
 جنين: 20 -> 80
 الخليل: 18 -> 72


In [10]:
# Build synthetic rows: for every city below its target, bootstrap rows from
# that city's real listings and apply small random perturbations to area,
# rooms, bathrooms, age and price.

# Re-seed NumPy's global generator so this cell produces the same synthetic
# rows when it is re-run on its own (the jitter below draws from the global
# np.random state). On a top-to-bottom run this leaves results unchanged as
# long as nothing has drawn from np.random since the parameters cell seeded it.
np.random.seed(random_seed)

# Errors tolerated per field: a missing column, or a value that cannot be
# turned into a finite number. A failed jitter leaves that field exactly as
# sampled; it is tallied below instead of being silently swallowed.
JITTER_ERRORS = (KeyError, TypeError, ValueError, OverflowError)
skipped_jitters = {}  # column name -> number of rows whose jitter was skipped


def _note_skip(column):
    """Record that the jitter for `column` was skipped on one row."""
    skipped_jitters[column] = skipped_jitters.get(column, 0) + 1


rows_to_add = []
for city, info in city_targets.items():
    current = info["count"]
    target = info["target"]
    if target <= current:
        continue  # already at or above target: no augmentation for this city

    deficit = target - current
    source = df2[df2["المدينة"] == city]
    if source.empty:
        continue

    # Sample with replacement so a small city can supply `deficit` rows.
    sampled = source.sample(n=deficit, replace=True, random_state=random_seed)
    for _, r in sampled.iterrows():
        newr = r.copy()

        # jitter area (مساحة البناء): Gaussian noise, floor of 10
        try:
            area = float(newr["مساحة البناء"])
            area_jitter = np.random.normal(loc=area, scale=abs(area)*area_noise_pct)
            newr["مساحة البناء"] = int(max(10, round(area_jitter)))
        except JITTER_ERRORS:
            _note_skip("مساحة البناء")

        # jitter rooms (عدد الغرف): +/- 1 with probability rooms_prob, floor of 1
        try:
            rooms = int(newr["عدد الغرف"])
            if np.random.rand() < rooms_prob:
                rooms += np.random.choice([-1, 1])
            newr["عدد الغرف"] = int(max(1, rooms))
        except JITTER_ERRORS:
            _note_skip("عدد الغرف")

        # jitter baths (عدد الحمامات): +/- 1 with probability baths_prob, floor of 1
        try:
            baths = int(newr["عدد الحمامات"])
            if np.random.rand() < baths_prob:
                baths += np.random.choice([-1, 1])
            newr["عدد الحمامات"] = int(max(1, baths))
        except JITTER_ERRORS:
            _note_skip("عدد الحمامات")

        # jitter age (عمر البناء) mildly, then clamp into 0..5
        try:
            age = int(newr["عمر البناء"])
            age_j = age + np.random.choice([-1, 0, 1], p=[0.1, 0.8, 0.1])
            newr["عمر البناء"] = int(min(max(age_j, 0), 5))
        except JITTER_ERRORS:
            _note_skip("عمر البناء")

        # jitter original price (السعر_الاصلي): Gaussian noise, floor of 1000
        try:
            price = float(newr["السعر_الاصلي"])
            price_j = np.random.normal(loc=price, scale=abs(price)*price_noise_pct)
            newr["السعر_الاصلي"] = int(max(1000, round(price_j)))
            # keep the transformed price (original price ** 0.4) consistent
            # with the jittered price, if that column exists
            if "السعر بالشيكل" in newr.index:
                newr["السعر بالشيكل"] = float(newr["السعر_الاصلي"]) ** 0.4
        except JITTER_ERRORS:
            _note_skip("السعر_الاصلي")

        rows_to_add.append(newr)

print(f"\nTotal synthetic rows to add: {len(rows_to_add)}")

# Surface any fields that could not be jittered (silent in the happy path).
if skipped_jitters:
    print("\nWARNING: jitter skipped (field left as sampled) for:")
    for column, n_rows in skipped_jitters.items():
        print(f" {column}: {n_rows} row(s)")

# Real rows first, synthetic rows appended after them.
aug_df = pd.concat([df2, pd.DataFrame(rows_to_add)], ignore_index=True)


print("\nCity counts AFTER augmentation (top 30):")
after_counts = aug_df["المدينة"].value_counts().sort_values(ascending=False)
print(after_counts.head(30).to_string())

print("\nShapes: original -> grouped -> augmented")
print(" original:", df.shape)
print(" grouped :", df2.shape)
print(" augmented:", aug_df.shape)

# show a few synthetic rows for inspection (they should appear at the tail)
print("\nSample appended rows (last 12 rows):")
display(aug_df.tail(12))


Total synthetic rows to add: 591

City counts AFTER augmentation (top 30):
المدينة
رام الله والبيرة    418
نابلس               300
بيت لحم             148
طولكرم              100
أخرى                 88
جنين                 80
الخليل               72

Shapes: original -> grouped -> augmented
 original: (615, 13)
 grouped : (615, 13)
 augmented: (1206, 13)

Sample appended rows (last 12 rows):


Unnamed: 0,السعر بالشيكل,المدينة,عدد الغرف,عدد الحمامات,مفروشة,مساحة البناء,الطابق,عمر البناء,العقار مرهون,طريقة الدفع,مصعد,موقف سيارات,السعر_الاصلي
1194,158.121657,الخليل,3,3,0,143,11,1,False,0,True,True,314397
1195,161.025526,الخليل,4,2,0,141,6,1,False,0,True,True,329031
1196,186.291763,الخليل,5,3,1,182,4,4,False,2,True,True,473679
1197,154.576517,الخليل,3,2,0,140,11,0,False,0,True,True,297070
1198,180.802199,الخليل,5,3,0,204,1,1,False,2,True,True,439551
1199,198.637891,الخليل,4,4,1,179,4,2,False,0,False,True,556103
1200,178.148755,الخليل,4,3,0,146,3,1,False,0,True,True,423601
1201,154.249435,الخليل,3,2,0,147,11,1,False,0,True,True,295501
1202,159.0173,الخليل,4,2,0,143,6,2,False,0,True,True,318868
1203,299.180275,الخليل,1,1,1,410,4,1,False,0,True,True,1548219


In [11]:
# Summarize the augmented frame: column dtypes, non-null counts, memory usage.
aug_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1206 entries, 0 to 1205
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   السعر بالشيكل  1206 non-null   float64
 1   المدينة        1206 non-null   object 
 2   عدد الغرف      1206 non-null   int64  
 3   عدد الحمامات   1206 non-null   int64  
 4   مفروشة         1206 non-null   int64  
 5   مساحة البناء   1206 non-null   int64  
 6   الطابق         1206 non-null   int64  
 7   عمر البناء     1206 non-null   int64  
 8   العقار مرهون   1206 non-null   bool   
 9   طريقة الدفع    1206 non-null   int64  
 10  مصعد           1206 non-null   bool   
 11  موقف سيارات    1206 non-null   bool   
 12  السعر_الاصلي   1206 non-null   int64  
dtypes: bool(3), float64(1), int64(8), object(1)
memory usage: 97.9+ KB


In [13]:

# Load the modelling dataset; the first CSV column is used as the row index.
# NOTE(review): presumably this file is the augmented frame saved by a cell
# not shown here — confirm it is regenerated on a fresh run.
df_model = pd.read_csv("Augmented_Data.csv", index_col=0)

# Feature Engineering
ratio_columns = ['area_to_rooms_ratio', 'room_to_bathrooms_ratio']

# 1. Area to Rooms Ratio
df_model['area_to_rooms_ratio'] = df_model['مساحة البناء'] / df_model['عدد الغرف']

# 2. Room to Bathrooms Ratio
df_model['room_to_bathrooms_ratio'] = df_model['عدد الغرف'] / df_model['عدد الحمامات']

# A zero denominator produces +/-inf (or NaN for 0/0). Map those to 0, but only
# in the engineered columns: a frame-wide fillna(0) would also zero-fill
# genuinely missing values in unrelated columns and hide them.
df_model[ratio_columns] = (
    df_model[ratio_columns]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

print("Engineered features created:")
print(df_model[ratio_columns].head())

Engineered features created:
   area_to_rooms_ratio  room_to_bathrooms_ratio
0            46.000000                      1.5
1            35.000000                      1.5
2            55.000000                      1.5
3           100.000000                      1.0
4            61.666667                      1.0


In [17]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Target (y): the original price column.
y = df_model['السعر_الاصلي']

# Features (X): everything except the two price columns, so neither the target
# nor its transformed version remains among the inputs.
X = df_model.drop(columns=['السعر_الاصلي', 'السعر بالشيكل'])

# One-hot encode the city column; drop_first omits one dummy column.
X = pd.get_dummies(X, columns=['المدينة'], drop_first=True)

# 80/20 train/test split with a fixed seed.
# NOTE(review): df_model appears to hold the augmented data (1206 rows). A
# random split can put jittered copies of one listing on both sides, which
# would make test metrics optimistic — consider splitting before augmenting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training features shape: {X_train.shape}")
print(f"Testing features shape: {X_test.shape}")

Training features shape: (964, 18)
Testing features shape: (242, 18)
