# Data Cleaning and Preprocessing

This notebook handles:
- Cleaning numerical columns
- Geographic filtering (Grand Tunis focus)
- Property type filtering
- Price normalization
- Outlier removal
- Data validation

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the source CSV into the working DataFrame and report its dimensions
df = pd.read_csv(filepath_or_buffer='/content/data_prices_cleaned.csv')
print("Original DataFrame shape:", df.shape)

## Step 1: Cleaning Numerical Columns

In [None]:
# Section banner: blank line, rule, title, rule (one print, newline-separated)
print("\n" + "=" * 80, "STEP 1: CLEANING NUMERICAL COLUMNS", "=" * 80, sep="\n")

# Function to clean and convert numerical columns
def clean_numeric_column(series: pd.Series) -> pd.Series:
    """Convert a column of loosely formatted numbers to a numeric Series.

    Each value is stringified, all whitespace is removed (so digit groups
    such as ``"1 200"`` collapse to ``"1200"``), and a decimal comma is
    turned into a decimal point before parsing.

    Args:
        series: Column holding numbers as text, possibly mixed with
            non-numeric tokens or missing values.

    Returns:
        A numeric Series with the same index; every value that cannot be
        parsed as a number (including missing values) becomes NaN.
    """
    text = series.astype(str)
    # `\s` also matches non-breaking / narrow spaces (U+00A0, U+202F), which
    # a plain ' ' replacement would leave in place and turn into NaN.
    text = text.str.replace(r'\s+', '', regex=True)
    # Decimal comma -> decimal point
    text = text.str.replace(',', '.', regex=False)
    # errors='coerce' maps every remaining non-numeric token (labels such as
    # 'sale' or '+', the strings 'nan' / 'None', ...) to NaN, so no explicit
    # placeholder list is needed.
    return pd.to_numeric(text, errors='coerce')

# Apply cleaning to numerical columns
# Each raw column is overwritten with its numeric version and its resulting
# dtype is reported. 'price' is only cleaned here; no rescaling happens in
# this step, so the message must not claim a conversion to thousands.
for column in ('superficie', 'chambres', 'salles_de_bains', 'price'):
    df[column] = clean_numeric_column(df[column])
    print(f"✓ Cleaned '{column}' - Type: {df[column].dtype}")

In [None]:
# Show the DataFrame after numeric cleaning (a bare trailing expression is
# what the notebook renders as the cell output).
df

## Step 2: Filtering Data

In [None]:
print("\n" + "="*80)
print("STEP 2: FILTERING DATA")
print("="*80)

print(f"Before filtering: {len(df):,} records")

# Rename columns as requested
df = df.rename(columns={
    'superficie': 'size',
    'chambres': 'room_count',
    'salles_de_bains': 'bathroom_count'
})

# Filter for Grand Tunis apartments for sale
grand_tunis_states = ['Ben Arous', 'Tunis', 'La Manouba', 'Ariana']
df = df[
    (df['transaction'] == 'sale') &
    (df['category'] == 'Appartements') &
    (df['state'].isin(grand_tunis_states))
].copy()

# Rescale price to thousands; every price threshold below is in that unit.
df['price'] = df['price'] / 1000

# Plausibility filters. A comparison against NaN is False, so the positive
# filters (size, price, room_count, bathroom_count) also drop rows whose
# value in that column is missing.
df = df[(df['size'] < 500) & (df['size'] >= 24)]
df = df[df['price'] > 20]
# Drop rows whose price per unit of size exceeds 6.
df = df[~(df['price'] / df['size'] > 6)]

# Drop larger units (size > 70) with an implausibly low price (< 70).
df = df[~((df['size'] > 70) & (df['price'] < 70))]
# Drop smaller units (size < 90) with a price above 1000.
# The comparison must sit inside the parentheses: `&` binds tighter than `>`,
# so `(a) & (price) > 1000` compares the combined mask with 1000 instead of
# comparing the price. With the per-size cap above already applied this acts
# as a safeguard only (size < 90 and price/size <= 6 imply price < 540).
df = df[~((df['size'] < 90) & (df['price'] > 1000))]
df = df[(df['room_count'] > 0) & (df['room_count'] < 10)]
df = df[df['bathroom_count'] >= 0]


# Drop multiple columns
df = df.drop(columns=['contact', 'category', 'location', 'descriptions', 'currency' , 'date','transaction','titles','shops','profiles'])


print(f"After filtering (Grand Tunis apartments for sale): {len(df):,} records")

In [None]:
# Show the DataFrame after filtering (a bare trailing expression is what the
# notebook renders as the cell output).
df

## Step 3: Removing Invalid Rows

In [None]:
# Section banner: blank line, rule, title, rule
print("\n" + "=" * 80, "STEP 3: REMOVING INVALID ROWS", "=" * 80, sep="\n")

print(f"Before removing NaN values: {len(df):,} records")

# Keep only rows where every key numerical column has a value
required_columns = ['price', 'size', 'room_count', 'bathroom_count']
df = df.dropna(subset=required_columns)

print(f"After removing NaN values: {len(df):,} records")

## Final Data Summary

In [None]:
# Shape and dtypes of the final DataFrame
print("\n" + "=" * 80, "FINAL DATAFRAME INFO", "=" * 80, sep="\n")
print(f"Final shape: {df.shape}")
print("\nData types:")
print(df.dtypes)

# describe() output for the four key numerical columns
print("\n" + "=" * 80, "SUMMARY STATISTICS", "=" * 80, sep="\n")
print("\nDescriptive statistics for key numerical columns (price in kTND):")
key_columns = ['price', 'size', 'room_count', 'bathroom_count']
print(df[key_columns].describe())

# Individual statistics of the 'size' column
print("\n" + "=" * 80, "SIZE STATISTICS", "=" * 80, sep="\n")
sizes = df['size']
print(f"Mean: {sizes.mean():.2f} m²")
print(f"Median: {sizes.median():.2f} m²")
print(f"Min: {sizes.min():.2f} m²")
print(f"Max: {sizes.max():.2f} m²")
print(f"Std Dev: {sizes.std():.2f} m²")
print(f"Count: {sizes.count():,} records")

print("\n" + "=" * 80, "DATA READY FOR ANALYSIS!", "=" * 80, sep="\n")