# Analyzing the Effect of Parking on Predicted Prices
This notebook is an **excerpt** extracted from the main price prediction project. It depends on variables, preprocessing, and models defined elsewhere (for example `X`, `df`, `final_model`, and `predict_price`), so running it on its own will raise **errors**. Its purpose is to analyze the effect of parking on predicted prices by finding nearly identical property pairs that differ only in parking availability, predicting their prices, and measuring the average percentage price increase attributable to parking.

In [93]:
# Example 1: predict the price of a sample apartment WITHOUT parking.
# The city names are recovered from the one-hot encoded columns of X by
# stripping the shared column prefix.
city_categories = [
    column_name.replace('المدينة_', '')
    for column_name in X.columns
    if column_name.startswith('المدينة_')
]

example_input = {
    'عدد الغرف': 3,         # number of rooms
    'عدد الحمامات': 2,      # number of bathrooms
    'مفروشة': 0,            # furnished
    'مساحة البناء': 128,    # building area
    'الطابق': 2,            # floor
    'عمر البناء': 0,        # building age
    'العقار مرهون': False,  # property is mortgaged
    'طريقة الدفع': 2,       # payment method
    'مصعد': True,           # elevator
    'موقف سيارات': False,   # car parking
    'المدينة': 'نابلس',     # city (Nablus)
}

predicted_price = predict_price(
    final_model,
    example_input,
    city_categories,
)
print(f"Predicted Price (شيكل): {predicted_price:.2f}")

Predicted Price (شيكل): 198424.22


In [95]:
# Example 2: predict the price of a sample apartment WITH parking.
# NOTE(review): besides the parking flag, this input also uses floor 1 while
# the no-parking example earlier in the notebook uses floor 2, so the two
# predictions do not isolate the effect of parking alone.
city_categories = [
    column_name.replace('المدينة_', '')
    for column_name in X.columns
    if column_name.startswith('المدينة_')
]

example_input = {
    'عدد الغرف': 3,         # number of rooms
    'عدد الحمامات': 2,      # number of bathrooms
    'مفروشة': 0,            # furnished
    'مساحة البناء': 128,    # building area
    'الطابق': 1,            # floor
    'عمر البناء': 0,        # building age
    'العقار مرهون': False,  # property is mortgaged
    'طريقة الدفع': 2,       # payment method
    'مصعد': True,           # elevator
    'موقف سيارات': True,    # car parking
    'المدينة': 'نابلس',     # city (Nablus)
}

predicted_price = predict_price(
    final_model,
    example_input,
    city_categories,
)
print(f"Predicted Price (شيكل): {predicted_price:.2f}")

Predicted Price (شيكل): 251801.69


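Currently the parking feature seems to have a much higher sensitivity than other features; it looks like the model associates "no parking" with cheaper apartments, and we need to investigate that. Note that the two examples above also differ in floor (`الطابق` is 2 in the first and 1 in the second), so the gap between the two predictions is not due to parking alone.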

In [40]:
# Class balance of the parking flag: number of listings per parking value.
df['موقف سيارات'].value_counts()


موقف سيارات
True     1174
False     336
Name: count, dtype: int64

In [81]:
# Inspect SHAP-based feature importance and how the parking flag shifts predictions.
import shap
import numpy as np
import pandas as pd

explainer = shap.TreeExplainer(final_model)
shap_values = explainer.shap_values(X)

# Global importance: mean absolute SHAP value of every feature, largest first.
feature_importance = pd.DataFrame(
    {
        'feature': X.columns,
        'mean_abs_shap': np.mean(np.abs(shap_values), axis=0),
    }
).sort_values('mean_abs_shap', ascending=False)

print("Top 10 features by mean absolute SHAP value:")
print(feature_importance.head(10))

# Summary statistics of the SHAP column that belongs to the parking feature.
parking_shap = shap_values[:, X.columns.get_loc('موقف سيارات')]
print("\nStatistics for 'موقف سيارات' SHAP values:")
print(f"Mean: {parking_shap.mean():.4f}")
print(f"Std: {parking_shap.std():.4f}")
print(f"Min: {parking_shap.min():.4f}")
print(f"Max: {parking_shap.max():.4f}")

# Average SHAP contribution of parking, split by the value of the parking flag.
parking_true_mask = X['موقف سيارات'].eq(1)
parking_false_mask = X['موقف سيارات'].eq(0)

print(f"\nMean SHAP value when 'موقف سيارات' == True: {parking_shap[parking_true_mask].mean():.4f}")
print(f"Mean SHAP value when 'موقف سيارات' == False: {parking_shap[parking_false_mask].mean():.4f}")

# A handful of raw per-sample SHAP values for a quick sanity check.
print("\nExample SHAP values for 'موقف سيارات' (first 10 samples):")
print(parking_shap[:10])


Top 10 features by mean absolute SHAP value:
                     feature  mean_abs_shap
14  المدينة_رام الله والبيرة      15.220955
9                 area_trans      11.743759
4                 عمر البناء       5.087993
3                     الطابق       4.877995
8                موقف سيارات       4.182355
1               عدد الحمامات       4.040043
6                طريقة الدفع       3.021667
12           المدينة_بيت لحم       2.730191
0                  عدد الغرف       2.493217
2                     مفروشة       2.478566

Statistics for 'موقف سيارات' SHAP values:
Mean: 0.0250
Std: 5.7450
Min: -34.6040
Max: 7.2415

Mean SHAP value when 'موقف سيارات' == True: 2.6892
Mean SHAP value when 'موقف سيارات' == False: -9.2838

Example SHAP values for 'موقف سيارات' (first 10 samples):
[  4.9923735 -11.805341  -10.474586    3.2149081   1.0586965   1.5593865
  -4.3943114  -7.7702556 -11.000396    1.982968 ]


In [80]:
# Descriptive statistics of the listing price, split by the parking flag.
df.groupby('موقف سيارات')['السعر بالشيكل'].describe()


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
موقف سيارات,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
False,336.0,349708.824405,299152.671115,5230.0,199838.75,299096.0,418748.25,3153160.0
True,1174.0,482879.822828,367411.478668,3710.0,310726.25,444642.5,577022.25,5007960.0


In [99]:
# Diagnostics for the parking feature: class balance, price by parking,
# correlation with the other strong features, and per-row SHAP values.
import numpy as np
import pandas as pd

# 1) Check basic distribution
print("\n--- Parking value counts ---")
print(df['موقف سيارات'].value_counts(dropna=False))

print("\n--- Mean and median price by parking ---")
print(df.groupby('موقف سيارات')['السعر بالشيكل'].agg(['mean', 'median', 'count']))

# 2) Correlation with top features
top_features = ["area_trans", "عمر البناء", "الطابق", "عدد الحمامات", "عدد الغرف"]
for feat in top_features:
    # Off-diagonal entry of the 2x2 correlation matrix (parking vs. feat).
    corr = df[['موقف سيارات', feat]].corr().iloc[0, 1]
    print(f"Correlation between موقف سيارات and {feat}: {corr:.3f}")

# 3) Look at SHAP values in detail for parking
# The column order of shap_values follows the training features in X.
feature_names = list(X.columns)

parking_index = feature_names.index('موقف سيارات')
parking_shap = shap_values[:, parking_index]

# Attach the SHAP values to df for analysis. shap_values has one row per row
# of X, so label the values with X's index and let pandas align them with df.
# Assigning the bare numpy array would match rows purely by position and
# silently mis-attribute values if df and X are ordered differently.
# Rows of df that are absent from X get NaN.
# NOTE(review): this mutates df in place; later cells read 'parking_shap'.
df['parking_shap'] = pd.Series(parking_shap, index=X.index)

print("\n--- Top 10 rows with largest positive SHAP for parking ---")
print(df.nlargest(10, 'parking_shap')[['موقف سيارات', 'السعر بالشيكل', 'area_trans', 'المدينة', 'parking_shap']])

print("\n--- Top 10 rows with largest negative SHAP for parking ---")
print(df.nsmallest(10, 'parking_shap')[['موقف سيارات', 'السعر بالشيكل', 'area_trans', 'المدينة', 'parking_shap']])

# 4) Check if the negative SHAPs are all when parking=False
print("\n--- Crosstab of SHAP sign vs parking ---")
print(pd.crosstab(df['موقف سيارات'], df['parking_shap'] > 0))



--- Parking value counts ---
موقف سيارات
True     1174
False     336
Name: count, dtype: int64

--- Mean and median price by parking ---
                      mean    median  count
موقف سيارات                                
False        349708.824405  299096.0    336
True         482879.822828  444642.5   1174
Correlation between موقف سيارات and area_trans: 0.099
Correlation between موقف سيارات and عمر البناء: -0.230
Correlation between موقف سيارات and الطابق: 0.083
Correlation between موقف سيارات and عدد الحمامات: 0.179
Correlation between موقف سيارات and عدد الغرف: 0.009

--- Top 10 rows with largest positive SHAP for parking ---
     موقف سيارات  السعر بالشيكل  area_trans           المدينة  parking_shap
19          True        5007960   12.649111  رام الله والبيرة     44.757790
20         False        3153160   12.449900  رام الله والبيرة     41.787388
739        False        3148319   12.449900  رام الله والبيرة     41.787388
514         True        3709596   12.247449  رام الله 

In [101]:
# 1) Parking x city interaction: mean price of every (city, parking) group,
#    highest first.
print("\n--- Mean price grouped by city and parking ---")
print(
    df.groupby(['المدينة', 'موقف سيارات'])['السعر بالشيكل']
    .mean()
    .sort_values(ascending=False)
    .head(20)
)

# 2) Listings that HAVE parking but whose parking SHAP value is negative;
#    show the first ten of them.
print("\n--- Samples with موقف سيارات=True but negative SHAP ---")
neg_parking_shap_true = df.loc[(df['موقف سيارات'] == True) & (df['parking_shap'] < 0)]
print(
    neg_parking_shap_true.loc[
        :, ['السعر بالشيكل', 'area_trans', 'المدينة', 'parking_shap']
    ].head(10)
)

# 3) Average parking SHAP value per city, highest first.
print("\n--- SHAP mean for parking by city ---")
print(
    df.groupby('المدينة')['parking_shap']
    .mean()
    .sort_values(ascending=False)
)



--- Mean price grouped by city and parking ---
المدينة           موقف سيارات
رام الله والبيرة  True           624325.220751
                  False          534104.474227
بيت لحم           True           478485.513812
الخليل            True           473740.857143
القدس             True           398560.042857
الخليل            False          383234.555556
نابلس             True           364544.505102
طولكرم            True           352916.333333
بيت لحم           False          343803.052632
أخرى              True           313515.379310
طولكرم            False          297973.418605
أخرى              False          290024.318182
القدس             False          273989.900000
نابلس             False          262231.701923
جنين              True           245812.382353
                  False          203352.312500
Name: السعر بالشيكل, dtype: float64

--- Samples with موقف سيارات=True but negative SHAP ---
    السعر بالشيكل  area_trans           المدينة  parking_shap
4          5300

In [103]:
# First ten listings that have parking yet receive a negative parking SHAP
# value, shown together with their main structural features.
print(
    df.loc[
        (df['موقف سيارات'] == True) & (df['parking_shap'] < 0),
        ['السعر بالشيكل', 'area_trans', 'المدينة', 'parking_shap', 'عدد الغرف', 'عدد الحمامات', 'عمر البناء', 'الطابق'],
    ].head(10)
)


    السعر بالشيكل  area_trans           المدينة  parking_shap  عدد الغرف  \
4          530000   13.601471  رام الله والبيرة     -3.365403          3   
5          172588   11.180340            طولكرم     -3.208519          3   
14         612084   13.038405  رام الله والبيرة     -5.039831          3   
17         550000   11.704700           بيت لحم     -1.268223          3   
22         649180   13.674794  رام الله والبيرة     -2.146302          3   
23         500000   12.449900           بيت لحم     -4.386281          3   
24         574988   13.114877  رام الله والبيرة     -4.500825          3   
25         649180   16.340135  رام الله والبيرة     -1.996356          2   
26         593536   12.449900  رام الله والبيرة    -13.558932          3   
28         389508   10.954451  رام الله والبيرة     -2.203974          3   

    عدد الحمامات  عمر البناء  الطابق  
4              3           3       2  
5              2           2       3  
14             2           1       4  
17     

## Step 1: Define function to find similar property pairs
This function:
- Splits data into two groups (with parking / without parking).
- Converts boolean features to integers for easier comparison.
- Matches each non-parking sample with the parking samples whose features are nearly identical (sum of absolute feature differences within a given tolerance), stopping once `max_pairs` pairs have been collected.


In [119]:
def find_similar_pairs(df, feature_cols, parking_col='موقف سيارات', tolerance=0.05, max_pairs=10,
                       max_matches_per_sample=None):
    """Find pairs of rows that differ in parking but are otherwise near-identical.

    Every row without parking is compared against all rows with parking. Two
    rows count as a match when the sum of the absolute differences over
    ``feature_cols`` is at most ``tolerance``. Rows are scanned in index order
    and the search stops as soon as ``max_pairs`` pairs have been collected.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding ``parking_col`` and every column in ``feature_cols``.
        The feature columns must be numeric or boolean.
    feature_cols : list
        Columns compared between the two rows of a pair.
    parking_col : str
        Column holding the parking flag (compared against True / False).
    tolerance : float
        Largest allowed summed absolute feature difference for a match.
    max_pairs : int
        Upper bound on the number of returned pairs; values <= 0 yield an
        empty list.
    max_matches_per_sample : int or None
        Optional cap on how many parking rows may be paired with the same
        no-parking row. ``None`` (default) keeps the original behaviour, in
        which a single no-parking row can contribute many pairs.

    Returns
    -------
    list of tuple
        ``(index_without_parking, index_with_parking)`` index labels of ``df``.

    Raises
    ------
    ValueError
        If ``max_matches_per_sample`` is given but smaller than 1.
    """
    if max_matches_per_sample is not None and max_matches_per_sample < 1:
        raise ValueError("max_matches_per_sample must be None or >= 1")
    # A non-positive budget can never be filled; returning early also avoids
    # the negative slice bound that used to let one stray pair through.
    if max_pairs <= 0:
        return []

    # Cast boolean feature columns to int once so that rows can be subtracted.
    features = df[feature_cols]
    bool_cols = features.select_dtypes(include='bool').columns
    if len(bool_cols) > 0:
        features = features.astype({col: int for col in bool_cols})

    no_parking_features = features[df[parking_col] == False]
    parking_features = features[df[parking_col] == True]

    pairs = []
    # iterrows yields (label, row) directly, so duplicate index labels do not
    # break the lookup the way a label-based .loc[label] would.
    for idx_no, no_row in no_parking_features.iterrows():
        # Summed absolute feature distance to every row that has parking.
        total_diff = parking_features.sub(no_row).abs().sum(axis=1)
        close_matches = total_diff[total_diff <= tolerance].index

        remaining = max_pairs - len(pairs)
        if max_matches_per_sample is not None:
            remaining = min(remaining, max_matches_per_sample)

        pairs.extend((idx_no, idx_yes) for idx_yes in close_matches[:remaining])
        if len(pairs) >= max_pairs:
            break

    return pairs


## Step 2: Prepare feature set and find matching pairs
- Add parking information back into the feature dataset.
- Remove parking from the feature comparison list.
- Use the `find_similar_pairs` function to get up to 10 pairs of nearly identical properties.


In [None]:
# Work on a copy of X whose parking column is taken from the original df;
# it is used only to split the rows into parking / no-parking groups.
X_feat = X.assign(**{'موقف سيارات': df['موقف سيارات']})

# Compare rows on every feature except the parking flag itself.
feature_cols = [name for name in X_feat.columns if name != 'موقف سيارات']

pairs = find_similar_pairs(
    X_feat,
    feature_cols,
    parking_col='موقف سيارات',
    tolerance=0.05,
    max_pairs=10,
)

## Step 3: Predict and compare prices for matched pairs
For each matched pair:
- Predict the price for the property without parking.
- Predict the price for the property with parking.
- Calculate and print the price difference in both absolute and percentage terms.
- Compute summary statistics (mean, median, min, max) of the parking effect across all pairs.

In [121]:
# Predict the price for both rows of every matched pair and summarise the
# relative change in predicted price when parking is present.
import numpy as np

percent_diffs = []

# NOTE(review): find_similar_pairs may return the same no-parking row in
# several pairs, so a single listing can dominate the statistics below.
for idx_no_parking, idx_parking in pairs:
    no_parking_row = df.loc[idx_no_parking]
    parking_row = df.loc[idx_parking]

    # NOTE(review): the row dicts contain every df column (including the
    # actual price); this relies on predict_price ignoring the columns it
    # does not need — confirm.
    pred_no_parking = predict_price(final_model, no_parking_row.to_dict(), city_categories)
    pred_parking = predict_price(final_model, parking_row.to_dict(), city_categories)

    print(f"Pair: No parking idx {idx_no_parking}, Parking idx {idx_parking}")
    print(f"Actual prices: No parking {no_parking_row['السعر بالشيكل']}, Parking {parking_row['السعر بالشيكل']}")
    print(f"Predicted prices: No parking {pred_no_parking:.2f}, Parking {pred_parking:.2f}")
    print(f"Predicted price difference: {pred_parking - pred_no_parking:.2f}")

    if pred_no_parking > 0:
        percent_diff = (pred_parking - pred_no_parking) / pred_no_parking * 100
        percent_diffs.append(percent_diff)
        print(f"Predicted percent difference: {percent_diff:.2f}%")
    else:
        # Without a positive baseline the ratio is undefined. Printing
        # percent_diff here would raise NameError on the first pair or show
        # the stale value left over from the previous pair.
        print("Predicted percent difference: n/a (non-positive baseline prediction)")
    print('-'*50)

if percent_diffs:
    print(f"Mean parking effect (% increase): {np.mean(percent_diffs):.2f}%")
    print(f"Median parking effect (% increase): {np.median(percent_diffs):.2f}%")
    print(f"Min parking effect (%): {np.min(percent_diffs):.2f}%")
    print(f"Max parking effect (%): {np.max(percent_diffs):.2f}%")
else:
    # np.min / np.max raise on an empty sequence and np.mean returns NaN.
    print("No matched pairs with a positive baseline prediction; parking effect could not be estimated.")


Pair: No parking idx 30, Parking idx 95
Actual prices: No parking 556440, Parking 500796
Predicted prices: No parking 513188.31, Parking 507525.78
Predicted price difference: -5662.53
Predicted percent difference: -1.10%
--------------------------------------------------
Pair: No parking idx 30, Parking idx 186
Actual prices: No parking 556440, Parking 463700
Predicted prices: No parking 513188.31, Parking 507525.78
Predicted price difference: -5662.53
Predicted percent difference: -1.10%
--------------------------------------------------
Pair: No parking idx 30, Parking idx 308
Actual prices: No parking 556440, Parking 547500
Predicted prices: No parking 513188.31, Parking 507525.78
Predicted price difference: -5662.53
Predicted percent difference: -1.10%
--------------------------------------------------
Pair: No parking idx 30, Parking idx 344
Actual prices: No parking 556440, Parking 500796
Predicted prices: No parking 513188.31, Parking 507525.78
Predicted price difference: -5662.

# Conclusion

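We see that having parking has a very small effect on the predicted price. To be safe, we will use the estimated 1.1% (the median) increase in price when parking is available in our prediction model. Note, however, that the pairs visible in the output above all reuse the same no-parking listing (index 30) and show a predicted difference of −1.10% (parking slightly *cheaper*), so the sign and robustness of this estimate should be double-checked before it is applied.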