## 1. Advanced Data Engineering

We implement a cleaning pipeline whose outlier thresholds adapt to each city's price-per-m² distribution.

In [None]:
# Load the raw property listings; the path is relative to the notebook's working directory.
df = pd.read_csv('../../../data/raw/source_1/Property-Prices-in-Tunisia.csv')

# --- 1.1 Structural Cleaning ---
def clean_identifiers(s):
    """Collapse a raw listing-type label onto 'Vendre' or 'Louer' when possible.

    The value is converted with ``str`` and stripped of surrounding whitespace.
    If the resulting text contains 'Vendre', that keyword is returned; otherwise,
    if it contains 'Louer', that keyword is returned. Any other text is returned
    stripped but otherwise unchanged. Matching is case-sensitive.
    """
    label = str(s).strip()
    # 'Vendre' is checked first, so it wins when both keywords are present.
    for keyword in ('Vendre', 'Louer'):
        if keyword in label:
            return keyword
    return label

# Collapse every listing-type label onto its canonical keyword.
df['type'] = df['type'].apply(clean_identifiers)

# Cast the categorical text columns to str and trim surrounding whitespace.
for col in ('category', 'city', 'region'):
    df[col] = df[col].astype(str).str.strip()

# In the numeric columns, -1 is treated as a missing-value marker and becomes NaN.
num_cols = ['room_count', 'bathroom_count', 'size']
df[num_cols] = df[num_cols].replace(to_replace=-1, value=np.nan)

# Scope Filter: keep only apartments for sale in the four listed cities.
df = df[
    (df['type'] == 'Vendre') &
    (df['category'] == 'Appartements') &
    (df['city'].isin(['Tunis', 'Ariana', 'Ben arous', 'La manouba']))
].copy()

# --- 1.2 Initial Hard Filters (Sanity Check) ---
# Rows missing any of the numeric features are dropped first.
df = df.dropna(subset=num_cols)
# Keep sizes strictly between 30 and 600 and prices strictly above 50000.
# .copy() (as in the scope filter) makes the result an independent frame, so
# adding columns to it later does not trigger SettingWithCopyWarning.
df = df[(df['size'] > 30) & (df['size'] < 600) & (df['price'] > 50000)].copy()

In [None]:
# --- 1.3 Statistical Outlier Removal (IQR Method) ---
# Price divided by size yields 'price_per_m2', the quantity the outlier
# filter inspects so that anomalies are judged relative to size.
df['price_per_m2'] = df['price'].div(df['size'])

def remove_outliers_iqr(group, column='price_per_m2', k=1.5):
    """Return the rows of ``group`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive), where
    Q1 and Q3 are the 25th and 75th percentiles of ``group[column]``.

    Parameters
    ----------
    group : pandas.DataFrame
        Frame containing ``column``.
    column : str, default 'price_per_m2'
        Name of the numeric column to test.
    k : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of ``group`` inside the fences; the original index labels
        are kept. Rows whose ``column`` value is NaN are excluded.
    """
    values = group[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = k * (q3 - q1)
    # between() is inclusive on both ends, matching a >= lower & <= upper test.
    return group[values.between(q1 - spread, q3 + spread)]

print(f"Entries before IQR cleaning: {len(df)}")
# Apply IQR filtering per City to respect local market realities.
# The groups are iterated explicitly instead of going through groupby.apply:
# recent pandas releases deprecate apply operating on the grouping column,
# and the 'city' column must survive because the plot below uses it.
city_parts = [remove_outliers_iqr(city_df) for _, city_df in df.groupby('city')]
if city_parts:  # pd.concat raises on an empty list; an empty frame is left as is
    df = pd.concat(city_parts)
print(f"Entries after IQR cleaning: {len(df)}")

sns.boxplot(data=df, x='city', y='price_per_m2')
# Title previously contained a mis-encoded superscript ("mÂ²").
plt.title("Price per m² Distribution after Outlier Removal")
plt.show()

In [None]:
# Inspect the distinct region labels present in the current frame.
df['region'].unique()
# Candidate relabelings, currently disabled.
# NOTE(review): confirm the exact spellings against the output above before enabling.
# df['region'] = df['region'].replace('Ariana', 'Ariana Ville')
# df['region'] = df['region'].replace('La Manouba', 'Manouba Ville')