# Analyzing the Effect of Elevator on Predicted Prices
This notebook is an **excerpt** extracted from the main price prediction project. It depends on variables, preprocessing, and models defined elsewhere (for example `df`, `X`, `final_model`, and `predict_price`), so running it on its own will raise **errors**.

In [19]:
# Recover the city names from the columns of X that carry the city prefix
# (presumably one-hot city indicators -- confirm against the preprocessing step).
city_prefix = 'المدينة_'
city_categories = [
    column.replace(city_prefix, '')
    for column in X.columns
    if column.startswith(city_prefix)
]

# Sample listing WITH an elevator ('مصعد': True); compare against the next cell,
# which uses the same listing without one.
example_input = {
    'عدد الغرف': 5,
    'عدد الحمامات': 2,
    'مفروشة': 0,
    'مساحة البناء': 120,
    'الطابق': 3,
    'عمر البناء': 5,
    'العقار مرهون': False,
    'طريقة الدفع': 0,
    'مصعد': True,
    'موقف سيارات': False,   # only used for post-prediction adjustment
    'المدينة': 'رام الله'
}

# Every column of X except the parking column is passed as a model feature.
parking_col = 'موقف سيارات'
feature_cols_for_prediction = [column for column in X.columns if column != parking_col]

predicted_price = predict_price(
    final_model,
    example_input,
    city_categories,
    feature_columns=feature_cols_for_prediction,
)
print(f"Predicted Price (شيكل): {predicted_price:.2f}")


Predicted Price (شيكل): 247133.34


In [20]:
# Recover the city names from the columns of X that carry the city prefix
# (presumably one-hot city indicators -- confirm against the preprocessing step).
city_prefix = 'المدينة_'
city_categories = [
    column.replace(city_prefix, '')
    for column in X.columns
    if column.startswith(city_prefix)
]

# Same sample listing as the previous cell, but WITHOUT an elevator
# ('مصعد': False), so the two predictions differ only by that flag.
example_input = {
    'عدد الغرف': 5,
    'عدد الحمامات': 2,
    'مفروشة': 0,
    'مساحة البناء': 120,
    'الطابق': 3,
    'عمر البناء': 5,
    'العقار مرهون': False,
    'طريقة الدفع': 0,
    'مصعد': False,
    'موقف سيارات': False,   # only used for post-prediction adjustment
    'المدينة': 'رام الله'
}

# Every column of X except the parking column is passed as a model feature.
parking_col = 'موقف سيارات'
feature_cols_for_prediction = [column for column in X.columns if column != parking_col]

predicted_price = predict_price(
    final_model,
    example_input,
    city_categories,
    feature_columns=feature_cols_for_prediction,
)
print(f"Predicted Price (شيكل): {predicted_price:.2f}")


Predicted Price (شيكل): 201836.84


Currently the elevator feature seems to have an unrealistic effect on the predicted price (about 45,000 شيكل between the two otherwise identical examples above), so we are going to analyze its effect on price more closely. The goal is to apply it as a post-prediction adjustment instead of using it directly in training.

In [21]:
# Count how many listings have each value of the elevator column ('مصعد').
df['مصعد'].value_counts()

مصعد
True     783
False    460
Name: count, dtype: int64

In [22]:
# Summary statistics of the price column ('السعر بالشيكل'), split by the elevator flag.
df.groupby('مصعد')['السعر بالشيكل'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
مصعد,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,460.0,385422.417391,197408.16191,5222.0,234385.0,349865.5,487127.25,1038688.0
True,783.0,449183.720307,275383.160365,6124.0,311197.0,426604.0,558721.5,5007960.0


## Define function to find similar property pairs
This function:
- Splits data into two groups (with an elevator / without an elevator).
- Converts boolean features to integers for easier comparison.
- Can limit comparisons to a single floor.
- Matches each non-elevator sample with an elevator sample having nearly identical features within a given tolerance, falling back to the nearest elevator samples when none is within the tolerance.


In [23]:
def find_similar_pairs_elevator(
    df,
    feature_cols,
    elevator_col="مصعد",
    floor_col="الطابق",
    same_floor=True,
    floor_value=None,
    normalize=True,
    tolerance=0.10,
    max_pairs=100,
    top_k_per_no=3,
    strict_tolerance=False,
):
    """Pair each non-elevator row with its most similar elevator rows.

    Similarity is the L1 (sum of absolute differences) distance over
    ``feature_cols``, excluding ``elevator_col`` and ``floor_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. ``elevator_col`` must be castable to ``int`` (0 / 1).
    feature_cols : list of str
        Candidate comparison columns; the elevator and floor columns are
        removed from this list before distances are computed.
    elevator_col : str
        Column splitting the rows into "no elevator" (0) and "elevator" (1).
    floor_col : str
        Column holding the floor of each row.
    same_floor : bool
        When True and ``floor_value`` is None, a non-elevator row is only
        matched against elevator rows with an equal ``floor_col`` value
        (rows whose floor is NaN therefore match nothing). Ignored when
        ``floor_col`` is not a column of ``df``. When False, the floor is
        not taken into account at all.
    floor_value : scalar or None
        If given, both groups are first filtered to this single floor.
    normalize : bool
        Min-max scale every numeric comparison column, using the min and max
        computed over both groups together, before measuring distance.
    tolerance : float
        Maximum L1 distance for a candidate to count as a close match.
    max_pairs : int
        Stop as soon as this many pairs have been collected. Values <= 0
        yield an empty list.
    top_k_per_no : int
        Maximum number of elevator rows paired with one non-elevator row.
        Values <= 0 yield an empty list.
    strict_tolerance : bool
        When False (default), a non-elevator row with no candidate within
        ``tolerance`` still gets its ``top_k_per_no`` nearest candidates.
        When True, such rows are skipped instead.

    Returns
    -------
    list of (index label, index label, float)
        ``(idx_no, idx_yes, distance)`` tuples, in the row order of the
        non-elevator group and by increasing distance within each row.
    """
    if max_pairs <= 0 or top_k_per_no <= 0:
        return []

    elevator_flag = df[elevator_col].astype(int)
    df_no = df[elevator_flag == 0]
    df_yes = df[elevator_flag == 1]

    # Optionally keep only one floor in both groups.
    if floor_value is not None:
        df_no = df_no[df_no[floor_col] == floor_value]
        df_yes = df_yes[df_yes[floor_col] == floor_value]

    # Nothing to match.
    if len(df_no) == 0 or len(df_yes) == 0:
        return []

    # The elevator flag defines the groups and the floor is handled
    # separately, so neither takes part in the distance.
    comp_cols = [c for c in feature_cols if c not in {elevator_col, floor_col}]
    df_no_comp = df_no[comp_cols].copy()
    df_yes_comp = df_yes[comp_cols].copy()

    # Booleans become 0/1 so they can be subtracted like other numbers.
    for c in df_no_comp.select_dtypes(include=['bool']).columns.tolist():
        df_no_comp[c] = df_no_comp[c].astype(int)
        df_yes_comp[c] = df_yes_comp[c].astype(int)

    if normalize:
        numeric_cols = df_no_comp.select_dtypes(include=[np.number]).columns.tolist()
        combined = pd.concat(
            [df_no_comp[numeric_cols], df_yes_comp[numeric_cols]], axis=0
        )
        mins = combined.min()
        ranges = (combined.max() - mins).replace(0, 1.0)  # avoid divide-by-zero
        for col in numeric_cols:
            df_no_comp[col] = (df_no_comp[col] - mins[col]) / ranges[col]
            df_yes_comp[col] = (df_yes_comp[col] - mins[col]) / ranges[col]

    # Work on positional arrays: label-based .loc lookups would return
    # several rows at once if the index contained duplicate labels.
    no_vals = df_no_comp.to_numpy()
    yes_vals = df_yes_comp.to_numpy()
    yes_index = df_yes_comp.index.to_numpy()

    restrict_to_same_floor = (
        same_floor and floor_value is None and floor_col in df.columns
    )
    if restrict_to_same_floor:
        no_floors = df_no[floor_col].to_numpy()
        yes_floors = df_yes[floor_col].to_numpy()

    pairs = []
    for pos, (idx_no, no_vec) in enumerate(zip(df_no_comp.index, no_vals)):
        dists = np.abs(yes_vals - no_vec).sum(axis=1)

        if restrict_to_same_floor:
            eligible = np.flatnonzero(yes_floors == no_floors[pos])
            if eligible.size == 0:
                continue
        else:
            eligible = np.arange(dists.shape[0])

        # Eligible candidates by increasing distance; the stable sort keeps
        # ties in their original row order.
        ranked = eligible[np.argsort(dists[eligible], kind="stable")]
        within = ranked[dists[ranked] <= tolerance]

        if within.size:
            selected = within[:top_k_per_no]
        elif strict_tolerance:
            continue
        else:
            # No candidate within tolerance: fall back to the nearest ones.
            selected = ranked[:top_k_per_no]

        for sel in selected:
            pairs.append((idx_no, yes_index[sel], float(dists[sel])))
            if len(pairs) >= max_pairs:
                return pairs

    return pairs


In [36]:
# Columns offered to the matcher; it removes the elevator and floor columns
# itself before computing distances.
numeric_feature_cols = [
    "عدد الغرف",
    "عدد الحمامات",
    "مفروشة",
    "مساحة البناء",
    "الطابق",
    "عمر البناء",
    "العقار مرهون",
    "مصعد",
    "موقف سيارات"
]

# Matching settings shared by every floor.
matcher_options = dict(
    elevator_col="مصعد",
    floor_col="الطابق",
    normalize=True,
    tolerance=0.08,
    max_pairs=200,
    top_k_per_no=2,
)

# floor value -> list of (idx_no, idx_yes, distance) tuples
all_pairs_by_floor = {}
for floor in sorted(df["الطابق"].dropna().unique()):
    floor_pairs = find_similar_pairs_elevator(
        df,
        numeric_feature_cols,
        floor_value=floor,
        **matcher_options,
    )
    all_pairs_by_floor[floor] = floor_pairs
    print(floor, "pairs:", len(floor_pairs))


-2 pairs: 16
-1 pairs: 20
0 pairs: 94
1 pairs: 186
2 pairs: 156
3 pairs: 105
4 pairs: 95
5 pairs: 56
6 pairs: 73
7 pairs: 36
8 pairs: 0
9 pairs: 24
10 pairs: 6
11 pairs: 22


In [40]:
price_col = "السعر بالشيكل"
all_stats = {}  # floor value -> summary dict (empty dict when no valid pair)

for f, pairs in all_pairs_by_floor.items():
    percent_diffs = []
    print("="*80)
    print(f"Floor {f} → {len(pairs)} matched pairs")
    print("="*80)

    for idx_no, idx_yes, _dist in pairs:
        actual_no = df.loc[idx_no, price_col]
        actual_yes = df.loc[idx_yes, price_col]

        # A percent change needs a positive baseline and a non-missing
        # elevator price. Reset per pair so an invalid pair neither raises
        # NameError nor re-prints the previous pair's value.
        percent_diff = None
        if actual_no > 0 and pd.notna(actual_yes):
            percent_diff = (actual_yes - actual_no) / actual_no * 100
            percent_diffs.append(percent_diff)

        # Detailed comparison
        print(f"Pair: No elevator idx {idx_no}, Elevator idx {idx_yes}")
        print(f"Actual prices: No elevator {actual_no}, Elevator {actual_yes}")
        print(f"Actual difference: {actual_yes - actual_no:.0f}")
        if percent_diff is None:
            print("Actual percent difference: n/a (invalid baseline or missing price)")
        else:
            print(f"Actual percent difference: {percent_diff:.2f}%")
        print('-'*50)

    if percent_diffs:
        stats = {
            "mean": np.mean(percent_diffs),
            "median": np.median(percent_diffs),
            "min": np.min(percent_diffs),
            "max": np.max(percent_diffs),
            "count": len(percent_diffs)
        }
        all_stats[f] = stats
        print(f"Summary for floor {f}:")
        print(f"  Mean elevator effect:   {stats['mean']:.2f}%")
        print(f"  Median elevator effect: {stats['median']:.2f}%")
        print(f"  Min effect: {stats['min']:.2f}%")
        print(f"  Max effect: {stats['max']:.2f}%")
    else:
        all_stats[f] = {}
        print("⚠️ No valid pairs on this floor.")


Floor -2 → 16 matched pairs
Pair: No elevator idx 8, Elevator idx 272
Actual prices: No elevator 156898, Elevator 352412
Actual difference: 195514
Actual percent difference: 124.61%
--------------------------------------------------
Pair: No elevator idx 8, Elevator idx 304
Actual prices: No elevator 156898, Elevator 250000
Actual difference: 93102
Actual percent difference: 59.34%
--------------------------------------------------
Pair: No elevator idx 16, Elevator idx 272
Actual prices: No elevator 204028, Elevator 352412
Actual difference: 148384
Actual percent difference: 72.73%
--------------------------------------------------
Pair: No elevator idx 16, Elevator idx 192
Actual prices: No elevator 204028, Elevator 463700
Actual difference: 259672
Actual percent difference: 127.27%
--------------------------------------------------
Pair: No elevator idx 18, Elevator idx 272
Actual prices: No elevator 352412, Elevator 352412
Actual difference: 0
Actual percent difference: 0.00%
-----

In [41]:
# One summary row per floor that produced at least one valid pair.
summary_rows = [
    {
        "Floor": f,
        "Pairs": stats["count"],
        "Mean %": round(stats["mean"], 2),
        "Median %": round(stats["median"], 2),
        "Min %": round(stats["min"], 2),
        "Max %": round(stats["max"], 2)
    }
    for f, stats in all_stats.items()
    if stats  # skip empty floors
]

# Passing the columns explicitly keeps "Floor" present even when there are no
# rows, so sort_values does not raise KeyError on an empty summary.
summary_columns = ["Floor", "Pairs", "Mean %", "Median %", "Min %", "Max %"]
summary_df = pd.DataFrame(summary_rows, columns=summary_columns).sort_values("Floor")
display(summary_df)

print("\n=== Aggregated elevator effect across ALL floors ===")
# Unweighted aggregate of the per-floor means: every floor counts equally
# regardless of how many pairs it has, so one outlier floor can dominate.
all_diffs = [stats["mean"] for stats in all_stats.values() if stats]
if all_diffs:
    print(f"Overall mean of floor means:   {np.mean(all_diffs):.2f}%")
    print(f"Overall median of floor means: {np.median(all_diffs):.2f}%")
else:
    print("⚠️ No valid comparisons found.")


Unnamed: 0,Floor,Pairs,Mean %,Median %,Min %,Max %
0,-2,16,50.14,48.69,-28.57,127.27
1,-1,20,1.9,-0.42,-88.02,85.0
2,0,94,542.21,-0.0,-78.96,7839.08
3,1,186,18.92,-21.64,-98.92,1491.39
4,2,156,-5.1,-25.11,-96.32,518.27
5,3,105,56.64,10.0,-69.35,672.41
6,4,95,20.4,1.94,-64.92,287.87
7,5,56,-31.63,-48.23,-85.65,112.96
8,6,73,-6.35,-3.14,-50.14,78.69
9,7,36,30.72,34.68,-46.5,80.83



=== Aggregated elevator effect across ALL floors ===
Overall mean of floor means:   468.22%
Overall median of floor means: 18.92%


## Conclusion

Right now, the analysis shows that the effect of having an elevator on apartment prices is highly inconsistent across floors, with extreme outliers and widely varying median and mean impacts. Because of this unreliability, we will drop the "مصعد" feature from the model.