## 1. Advanced Data Engineering

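We implement a cleaning pipeline that adapts to each city: text fields are normalized, the data is restricted to the cities in scope, and price-per-m² outliers are removed separately for each city using that city's own IQR bounds.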

In [None]:
# Load the merged dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv('../../../data/processed/merged.csv')
# Show how many rows each distinct 'city' value has.
df['city'].value_counts()

In [None]:
# List the distinct 'region' values before any cleaning is applied.
df['region'].unique()

In [None]:
# --- 1.1 Structural Cleaning ---
# Normalise the text columns in one pass: cast to str, trim surrounding
# whitespace, then lowercase.
# NOTE(review): on object-dtype columns astype(str) turns missing values into
# the literal string 'nan' - confirm that is acceptable for 'region'.
for col in ['city', 'region']:
    df[col] = df[col].astype(str).str.strip().str.lower()

print("Unique cities:", df['city'].unique())

# Handle numeric missing values: -1 entries in these columns become NaN.
num_cols = ['room_count', 'bathroom_count', 'size']
df[num_cols] = df[num_cols].replace(-1, np.nan)

# Scope filter: keep only the rows whose (lowercased) city is one of the four
# listed below; .copy() detaches the result from the unfiltered frame.
df = df.loc[
    df['city'].isin(['tunis', 'ariana', 'ben arous', 'la manouba'])
].copy()

In [None]:
# --- 1.3 Statistical Outlier Removal (IQR Method) ---
# We derive 'price_per_m2' to detect anomalies relative to size.
# A size of exactly 0 is masked to NaN before dividing: price / 0 would yield
# +/-inf (or NaN), and infinities distort the quantiles that the IQR bounds
# are computed from. Rows with a NaN ratio fail the IQR range test and are
# dropped there, exactly like rows whose size was already missing.
df['price_per_m2'] = df['price'] / df['size'].where(df['size'] != 0)

def remove_outliers_iqr(group, column='price_per_m2', k=1.5):
    """
    Drops the rows of a frame whose value lies outside the IQR fences.

    The fences are [Q1 - k * IQR, Q3 + k * IQR], computed from the values of
    `column` in `group` itself; both fences are inclusive. Rows whose value
    is missing are dropped as well, because NaN never satisfies the range
    test. An empty `group` yields an empty result.

    Parameters:
    -----------
    group : pandas.DataFrame
        The rows to filter (e.g. one group of a groupby)
    column : str
        The numeric column the fences are computed on
        (default: 'price_per_m2')
    k : float
        Multiplier applied to the IQR to place the fences (default: 1.5)

    Returns:
    --------
    pandas.DataFrame
        The rows of `group` inside the fences, all columns preserved and the
        original index labels kept
    """
    values = group[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    # Series.between is inclusive on both ends by default and is False for NaN.
    return group[values.between(q1 - k * iqr, q3 + k * iqr)]

print(f"Entries before IQR cleaning: {len(df)}")
# Apply IQR filtering per City to respect local market realities.
# The groups are iterated explicitly instead of using groupby(...).apply(...):
# relying on the grouping column being present in the frame handed to apply()
# is deprecated since pandas 2.2, and the plot below needs 'city' to survive.
# Iterating a groupby always yields the full sub-frame, grouping column included.
kept_per_city = [
    remove_outliers_iqr(city_rows) for _, city_rows in df.groupby('city')
]
# pd.concat raises on an empty list, so fall back to an empty slice of df.
df = pd.concat(kept_per_city) if kept_per_city else df.iloc[0:0]
print(f"Entries after IQR cleaning: {len(df)}")

sns.boxplot(data=df, x='city', y='price_per_m2')
plt.title("Price per m² Distribution after Outlier Removal")
plt.show()

In [None]:
# Re-inspect the distinct 'region' values now that the city scope filter and
# the outlier removal have been applied.
df['region'].unique()
# Disabled manual renames, kept for reference.
# NOTE(review): they target capitalised names, but 'region' was lowercased in
# section 1.1, so they would not match anything if re-enabled as written.
# df['region'] = df['region'].replace('Ariana', 'Ariana Ville')
# df['region'] = df['region'].replace('La Manouba', 'Manouba Ville')

In [None]:
def clean_region_names(df, region_column='region'):
    """
    Cleans and merges duplicate region names in the dataset.

    Every known spelling of a region is replaced by its canonical name (the
    first entry of its group below). The lookup ignores letter case and
    surrounding whitespace, so values that were lowercased or padded upstream
    (e.g. 'rads', ' Ben Arous ') are still recognised. Values that match no
    known spelling, and non-string cells such as NaN, are left untouched.

    Parameters:
    -----------
    df : pandas.DataFrame
        The input dataset (not modified; a copy is returned)
    region_column : str
        The name of the column containing region names (default: 'region')

    Returns:
    --------
    pandas.DataFrame
        Dataset with standardized region names (all columns preserved)

    Raises:
    -------
    KeyError
        If `region_column` is not a column of `df`
    """

    # Define grouped regions (first item in each list is the canonical name)
    grouped_regions = [
        ["Ariana Ville"],
        ["Jardins D'el Menzah", "Jardins El Menzah"],
        ["Ennasr"],
        ["Autres villes"],
        ["Borj Louzir"],
        ["La Soukra"],
        ["Ghazela"],
        ["Ariana"],
        ["Chotrana"],
        ["Raoued"],
        ["Mnihla"],
        ["Ettadhamen"],
        ["Sidi Thabet"],
        ["Fouchana"],
        ["Mornag"],
        ["Medina Jedida"],
        ["El Mourouj"],
        ["Hammam Chott"],
        ["Ezzahra"],
        ["Boumhel"],
        ["Hammam Lif"],
        ["Radès", "Rads"],
        ["Mégrine", "Mgrine"],
        ["Ben arous", "Ben Arous"],
        ["Manouba Ville"],
        ["Oued Ellil"],
        ["Denden"],
        ["La manouba", "La Manouba"],
        ["Douar Hicher"],
        ["Le Bardo"],
        ["L'aouina", "L Aouina"],
        ["La Marsa"],
        ["La Goulette"],
        ["Carthage"],
        ["Agba"],
        ["Ettahrir"],
        ["Menzah"],
        ["Tunis"],
        ["Sidi Daoud"],
        ["Le Kram"],
        ["El Kabaria"],
        ["El Ouardia"],
        ["Manar"],
        ["Ezzouhour"],
        ["Centre Urbain Nord"],
        ["Médina"],
        ["Centre Ville - Lafayette", "Centre Ville Lafayette"],
        ["Sidi Bou Said"],
        ["Hraïria", "Hraria"],
        ["Sidi Hassine"],
        ["Mutuelleville"],
        ["Ain Zaghouan Nord"],
        ["Chotrana 1"],
        ["Cit Ennasr 2"],
        ["Bab Souika"],
        ["Borj Cedria"],
        ["El Mourouj 5"],
        ["El Menzah 7"],
        ["Jardins De Carthage"],
        ["El Omrane Suprieur"],
        ["Ain Zaghouen", "Ain Zaghouan"],
        ["El Mourouj 6"],
        ["El Mourouj 1"],
        ["El Manar 1"],
        ["Riadh Andalous"],
        ["El Menzah 9"],
        ["Ariana Essoughra"],
        ["Cit Olympique"],
        ["El Menzah 4"],
        ["Les Jardins El_Menzah_2"],
        ["Montplaisir"],
        ["Dar Fadhal"],
        ["El Menzah 5"],
        ["Mohamedia"],
        ["El Manar 2"],
        ["Cit El Khadra"],
        ["Cite Ennkhilet"],
        ["Ain Zaghouan Sud"],
        ["Tunis Belvedere"],
        ["Gammarth"],
        ["Lac 2"],
        ["Ksar Said"],
        ["Cit Hedi Nouira"],
        ["El Menzah 6"],
        ["Les Jardins El_Menzah_1"],
        ["Lac 1"],
        ["El Mourouj 4"],
    ]

    def _lookup_key(name):
        # Case- and whitespace-insensitive key shared by the table and the data.
        return name.strip().casefold()

    # All variants (including the canonical spelling itself) map to the first
    # item of their group, keyed by their normalised form.
    region_mapping = {
        _lookup_key(variant): group[0]
        for group in grouped_regions
        for variant in group
    }

    def _canonical(value):
        # Non-string cells (e.g. NaN) and unknown names pass through unchanged.
        if not isinstance(value, str):
            return value
        return region_mapping.get(_lookup_key(value), value)

    # Work on a copy of the entire dataframe to avoid modifying the original
    df_cleaned = df.copy()

    # Apply the mapping ONLY to the region column
    df_cleaned[region_column] = df_cleaned[region_column].map(_canonical)

    return df_cleaned

# Standardise the region names, then make the cleaned frame the working `df`.
# NOTE(review): after this rebinding `df` and `df_cleaned` hold identical data,
# so any later before/after comparison that passes `df` as the "original"
# compares the cleaned data with itself - keep a reference to the pre-cleaning
# frame if a real comparison is wanted.
df_cleaned = clean_region_names(df, region_column='region')
df = df_cleaned.copy()

In [None]:
def test_region_cleaning(df_original, df_cleaned, region_column='region'):
    """
    Tests whether the region cleaning function worked correctly.

    Three checks are run in order; the first failure is printed and ends the
    run:

    1. The shape is unchanged (this covers both row and column counts, so no
       separate row-count check is needed).
    2. The number of distinct region names did not increase.
    3. None of the known non-canonical spellings is left in the cleaned
       column. Comparison is exact (case-sensitive).

    Parameters:
    -----------
    df_original : pandas.DataFrame
        The original dataset before cleaning
    df_cleaned : pandas.DataFrame
        The cleaned dataset after cleaning
    region_column : str
        The name of the column containing region names

    Returns:
    --------
    bool : True if all tests passed, False otherwise
    """

    # Test 1: Check if dataframe shapes are the same (rows and columns)
    if df_original.shape != df_cleaned.shape:
        print(f"❌ FAILED: Shape changed from {df_original.shape} to {df_cleaned.shape}")
        return False

    # Test 2: Check if duplicates were merged (unique regions should be less or equal)
    original_unique = df_original[region_column].nunique()
    cleaned_unique = df_cleaned[region_column].nunique()

    if cleaned_unique > original_unique:
        print(f"❌ FAILED: Unique regions increased from {original_unique} to {cleaned_unique}")
        return False

    # Test 3: Check specific duplicate pairs were merged.
    # Each pair is (canonical spelling, variant that must disappear).
    duplicate_pairs = [
        ("Radès", "Rads"),
        ("Mégrine", "Mgrine"),
        ("Ben arous", "Ben Arous"),
        ("La manouba", "La Manouba"),
        ("L'aouina", "L Aouina"),
        ("Jardins D'el Menzah", "Jardins El Menzah"),
        ("Hraïria", "Hraria"),
        ("Ain Zaghouen", "Ain Zaghouan"),
        ("Centre Ville - Lafayette", "Centre Ville Lafayette"),
    ]

    # Build the set of cleaned names once instead of scanning the column for
    # every pair.
    cleaned_names = set(df_cleaned[region_column].dropna().unique())

    for canonical, variant in duplicate_pairs:
        # A leftover variant means the merge did not happen, whether or not
        # the canonical spelling was present in the original data.
        if variant in cleaned_names:
            print(f"❌ FAILED: {canonical} and {variant} were not merged")
            return False

    # All tests passed
    print("✅ PASSED: Cleaning successful!")
    print(f"   - Rows preserved: {len(df_cleaned)}")
    print(f"   - Unique regions: {original_unique} → {cleaned_unique}")
    print(f"   - Merged: {original_unique - cleaned_unique} duplicate regions")
    return True


# Despite its name this is a validation helper, not a pytest test: its
# parameters are DataFrames, not fixtures, so tell pytest not to collect it.
test_region_cleaning.__test__ = False


# Usage example:
# df = pd.read_csv('your_data.csv')
# df_cleaned = clean_region_names(df, region_column='region')
# NOTE(review): `df` was already rebound to a copy of `df_cleaned` above, so
# this call compares the cleaned data with itself - confirm that is intended.
is_ok = test_region_cleaning(df, df_cleaned, region_column='region')
# Report the overall outcome in one line.
print("Everything is OK!" if is_ok else "Something went wrong!")
