# Feature Distribution Analysis

This notebook analyzes:
- Property size distributions
- Room count distributions
- Bathroom count distributions
- Geographic distributions

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the cleaned listings export into `df`.
# Prerequisite: run 02_Data_Cleaning_and_Preprocessing.ipynb first.
# NOTE(review): this is an absolute path (looks like a Colab location) — confirm
# it matches the environment the notebook is executed in.
CLEANED_DATA_PATH = '/content/data_prices_cleaned.csv'
df = pd.read_csv(CLEANED_DATA_PATH)

# Apply the same preprocessing steps
def clean_numeric_column(series):
    """Parse a column of loosely formatted numbers into numeric values.

    Every value is cast to ``str``, plain spaces are removed, commas are
    turned into dots, a fixed list of known non-numeric tokens is replaced by
    ``pd.NA``, and anything that still cannot be parsed is coerced to NaN.

    Parameters
    ----------
    series : pandas.Series
        Raw column to convert; values are stringified before parsing.

    Returns
    -------
    pandas.Series
        Numeric series; entries that could not be parsed are missing.
    """
    # Tokens are compared AFTER spaces were stripped, so entries that contain
    # a space cannot match here; such values are still turned into NaN by the
    # `errors='coerce'` parse below.
    non_numeric_tokens = [
        'À Vendre', 'À Louer', 'Location', '+', 'Ref924a',
        'IFC Marsa', 'sale', 'nan', 'None',
    ]
    normalised = (
        series.astype(str)
        .str.replace(' ', '', regex=False)   # drop plain-space separators
        .str.replace(',', '.', regex=False)  # decimal comma -> decimal point
        .replace(non_numeric_tokens, pd.NA)
    )
    return pd.to_numeric(normalised, errors='coerce')

# Parse the raw text columns into numbers.
for raw_col in ('superficie', 'chambres', 'salles_de_bains', 'price'):
    df[raw_col] = clean_numeric_column(df[raw_col])

# Switch to the English column names used by the rest of the notebook.
df = df.rename(columns={'superficie': 'size', 'chambres': 'room_count', 'salles_de_bains': 'bathroom_count'})

# Keep only apartments for sale located in the listed states.
grand_tunis_states = ['Ben Arous', 'Tunis', 'La Manouba', 'Ariana']
df = df[
    (df['transaction'] == 'sale')
    & (df['category'] == 'Appartements')
    & (df['state'].isin(grand_tunis_states))
].copy()

# Rescale price to thousands (later cells print it as kTND).
df['price'] = df['price'] / 1000

# Outlier / sanity filters. Comparisons against NaN are False, so rows with a
# missing size, price, room count or bathroom count are dropped by these masks.
df = df[(df['size'] < 500) & (df['size'] >= 24)]
df = df[df['price'] > 20]
# Drop rows whose price-to-size ratio exceeds 6.
df = df[~(df['price'] / df['size'] > 6)]
# Drop large-but-very-cheap listings.
df = df[~((df['size'] > 70) & (df['price'] < 70))]
# Drop small-but-very-expensive listings.
# The previous form `(size < 90) & (price) > 1000` bound `> 1000` to the
# result of the `&` rather than to the price, so it never expressed this rule.
df = df[~((df['size'] < 90) & (df['price'] > 1000))]
df = df[(df['room_count'] > 0) & (df['room_count'] < 10)]
df = df[df['bathroom_count'] >= 0]

# Remove columns that are not used in the analysis below.
df = df.drop(columns=['contact', 'category', 'location', 'descriptions', 'currency' , 'date','transaction','titles','shops','profiles'])

# Final guard against missing values in the analysed columns (reassigned
# instead of mutated in place so the cell reads as a single pipeline).
df = df.dropna(subset=['price', 'size', 'room_count', 'bathroom_count'])

print(f"Data loaded: {df.shape}")

## Size Distribution Analysis

In [None]:
# Summary statistics, then a 30-bin histogram with a KDE overlay of sizes.
print("Descriptive statistics for 'size' column:")
display(df['size'].describe())

plt.figure(figsize=(10, 6))
# `palette` was removed: seaborn only applies it when a `hue` variable is
# mapped, so it had no effect on this single-series histogram.
sns.histplot(df['size'], bins=30, kde=True)
plt.title('Distribution of Property Sizes')
plt.xlabel('Size (square meters)')
plt.ylabel('Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Number of apartments whose size exceeds 250 m².
count = int((df['size'] > 250).sum())
print(f"Apartments with size > 250 m²: {count}")

In [None]:
# Number of apartments whose size exceeds 400 m².
count = int((df['size'] > 400).sum())
print(f"Apartments with size > 400 m²: {count}")

In [None]:
# Show the listings whose size exceeds 400 for manual inspection.
df.loc[df['size'] > 400]

## Room Count Distribution

In [None]:
# Bar chart of the number of listings for each room count.
# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed seaborn version.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='room_count', palette='viridis', ax=ax)
ax.set_title('Distribution of Properties by Room Count')
ax.set_xlabel('Number of Rooms')
ax.set_ylabel('Number of Properties')
fig.tight_layout()
plt.show()

In [None]:
# Per-room-count summary statistics of the price column.
print("Descriptive statistics of price by room count (in kTND):")
price_by_rooms = df.groupby('room_count')['price'].describe()
display(price_by_rooms)

## Bathroom Count Distribution

In [None]:
# Summary statistics for the bathroom count, then a bar chart of how many
# listings have each bathroom count.
print("Descriptive statistics for 'bathroom_count' column:")
bathroom_stats = df['bathroom_count'].describe()
display(bathroom_stats)

# NOTE(review): recent seaborn releases warn when `palette` is passed without
# `hue` — confirm against the installed seaborn version.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='bathroom_count', palette='viridis', ax=ax)
ax.set_title('Distribution of Properties by Bathroom Count')
ax.set_xlabel('Number of Bathrooms')
ax.set_ylabel('Number of Properties')
fig.tight_layout()
plt.show()

In [None]:
# Per-bathroom-count summary statistics of the price column.
print("Descriptive statistics of price by bathroom count (in kTND):")
price_by_bathrooms = df.groupby('bathroom_count')['price'].describe()
display(price_by_bathrooms)

## Geographic Distribution

In [None]:
# Bar chart of listings per value of the `state` column, most frequent first.
# NOTE(review): the title and x-label say "City" while the plotted column is
# `state` — confirm the intended wording.
state_order = df['state'].value_counts().index
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='state', order=state_order, palette='viridis', ax=ax)
ax.set_title('Number of Properties per City')
ax.set_xlabel('City')
ax.set_ylabel('Number of Properties')
# Slant the tick labels so longer names do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Horizontal bar chart of listings per value of the `city` column, most
# frequent first (horizontal so the category names stay readable).
# NOTE(review): the title and y-label say "Region" while the plotted column is
# `city` — confirm the intended wording.
plt.figure(figsize=(12, 10))
sns.countplot(data=df, y='city', order=df['city'].value_counts().index, palette='viridis')
plt.title('Number of Properties per Region')
plt.xlabel('Number of Properties')
plt.ylabel('Region')
# Added for consistency with every other plotting cell in this notebook.
plt.tight_layout()
plt.show()