# Price Analysis

This notebook analyzes:
- Price distributions
- Price relationships with features
- Geographic price analysis

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Read the preprocessed listings export into the working frame.
# Prerequisite: run 02_Data_Cleaning_and_Preprocessing.ipynb first
# (presumably the notebook that writes this CSV -- confirm).
cleaned_csv_path = '/content/data_prices_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

# Apply the same preprocessing steps
def clean_numeric_column(series):
    """Convert a column of number-like text into a numeric Series.

    Each value is stringified, spaces are stripped out and commas are turned
    into dots (so ``'3,5'`` parses as ``3.5``).  A fixed list of known
    non-numeric tokens is then blanked to ``pd.NA`` and anything that still
    cannot be parsed becomes ``NaN`` through ``errors='coerce'``.

    Note: spaces are removed before the token list is applied, so the
    multi-word tokens (e.g. ``'À Vendre'``) no longer match at that point;
    such values still end up as ``NaN`` through the coercion step.

    Args:
        series: pandas Series holding the raw values.

    Returns:
        A numeric Series with ``NaN`` wherever a value could not be parsed.
    """
    non_numeric_tokens = ['À Vendre', 'À Louer', 'Location', '+', 'Ref924a', 'IFC Marsa', 'sale', 'nan', 'None']
    normalized = (
        series.astype(str)
        .str.replace(' ', '', regex=False)   # drop spaces inside/around the value
        .str.replace(',', '.', regex=False)  # comma -> dot as decimal separator
        .replace(non_numeric_tokens, pd.NA)  # whole-value match on known junk
    )
    return pd.to_numeric(normalized, errors='coerce')

# Coerce the raw text columns to numbers; unparseable entries become NaN.
for raw_column in ['superficie', 'chambres', 'salles_de_bains', 'price']:
    df[raw_column] = clean_numeric_column(df[raw_column])

# Switch the French feature names to the English ones used by the rest of the notebook.
df = df.rename(columns={'superficie': 'size', 'chambres': 'room_count', 'salles_de_bains': 'bathroom_count'})

# Keep only apartments for sale located in one of the four Grand Tunis states.
grand_tunis_states = ['Ben Arous', 'Tunis', 'La Manouba', 'Ariana']
df = df[(df['transaction'] == 'sale') & (df['category'] == 'Appartements') & (df['state'].isin(grand_tunis_states))].copy()

# Rescale prices by 1/1000 (the plots below label the resulting unit as kTND).
df['price'] = df['price'] / 1000

# Plausibility filters.  A comparison against NaN is False, so rows with a
# missing size, price, room_count or bathroom_count are removed here as well.
df = df[(df['size'] < 500) & (df['size'] >= 24)]   # keep sizes in [24, 500)
df = df[df['price'] > 20]                          # drop prices of 20 or less
df = df[~(df['price'] / df['size'] > 6)]           # cap price per unit of size at 6
df = df[~((df['size'] > 70) & (df['price'] < 70))]  # large but implausibly cheap
# Small but implausibly expensive.  The comparison must sit inside the
# parentheses: `(size < 90) & (price) > 1000` compares the result of the `&`
# with 1000, so the intended condition was never evaluated.
df = df[~((df['size'] < 90) & (df['price'] > 1000))]
df = df[(df['room_count'] > 0) & (df['room_count'] < 10)]
df = df[df['bathroom_count'] >= 0]

# Drop the columns that are not used by the analysis below.
df = df.drop(columns=['contact', 'category', 'location', 'descriptions', 'currency' , 'date','transaction','titles','shops','profiles'])

# Final guard against missing values in the core features (reassigned rather
# than mutated in place so the cell reads as a plain sequence of steps).
df = df.dropna(subset=['price', 'size', 'room_count', 'bathroom_count'])

print(f"Data loaded: {df.shape}")

## Basic Price Statistics

In [None]:
# Summary statistics of the filtered prices (already divided by 1000 in the loading cell).
df['price'].describe()

In [None]:
# Sanity check: list any remaining listings with size under 100 and price above 1000.
small_listing = df['size'] < 100
high_priced = df['price'] > 1000
df[small_listing & high_priced]

## Price vs Size Relationship

In [None]:
# Scatter of size against log1p(price) with a fitted regression line in red.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    x='size',
    y=np.log1p(df['price']),
    data=df,
    scatter_kws={'alpha':0.6},
    line_kws={'color':'red'},
    ax=ax,
)
ax.set_title('Price Distribution per Size (Log-transformed Price)')
ax.set_xlabel('Size (square meters)')
ax.set_ylabel('Log(Price in kTND + 1)')
fig.tight_layout()
plt.show()

## Overall Price Distribution

In [None]:
# Single box plot of log1p(price) over the whole filtered dataset.
log_price = np.log1p(df['price'])
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(y=log_price, palette='viridis', ax=ax)
ax.set(
    title='Overall Distribution of Property Prices (Log-transformed)',
    ylabel='Log(Price in kTND + 1)',
)
fig.tight_layout()
plt.show()

## Price Distribution by Region

In [None]:
# Box plot of log1p(price) per region.
# Group by `state`: it is the region-level column (the Grand Tunis filter in
# the loading cell selects on it), which matches this figure's title and label.
# The log price is stored as a column so that x and y are both read from the
# same price-sorted frame instead of relying on index alignment.
region_prices = (
    df.assign(log_price=np.log1p(df['price']))
      .sort_values('price', ascending=False)
)
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='log_price', y='state', data=region_prices, palette='plasma', ax=ax)
ax.set_title('Price Distribution per Region (Log-transformed)')
ax.set_xlabel('Log(Price in kTND + 1)')
ax.set_ylabel('Region')
fig.tight_layout()
plt.show()

## Price Distribution by City

In [None]:
# Box plot of log1p(price) per city.
# Group by `city` so the grouping matches this figure's title and axis label;
# `state` holds the region-level values (the Grand Tunis filter selects on it).
# The log price is stored as a column so that x and y are both read from the
# same price-sorted frame instead of relying on index alignment.
city_prices = (
    df.assign(log_price=np.log1p(df['price']))
      .sort_values('price', ascending=False)
)
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(x='log_price', y='city', data=city_prices, palette='mako', ax=ax)
ax.set_title('Price Distribution per City (Log-transformed)')
ax.set_xlabel('Log(Price in kTND + 1)')
ax.set_ylabel('City')
fig.tight_layout()
plt.show()

## Price Distribution by Bathroom Count

In [None]:
# One box of log1p(price) per distinct bathroom count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    x='bathroom_count',
    y=np.log1p(df['price']),
    data=df,
    palette='cubehelix',
    ax=ax,
)
ax.set(
    title='Price Distribution by Bathroom Count (Log-transformed)',
    xlabel='Number of Bathrooms',
    ylabel='Log(Price in kTND + 1)',
)
fig.tight_layout()
plt.show()

## Price Distribution by Room Count

In [None]:
# One box of log1p(price) per distinct room count.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(
    x='room_count',
    y=np.log1p(df['price']),
    data=df,
    palette='magma',
    ax=ax,
)
ax.set(
    title='Price Distribution by Room Count (Log-transformed)',
    xlabel='Number of Rooms',
    ylabel='Log(Price in kTND + 1)',
)
fig.tight_layout()
plt.show()

## Price per Square Meter Analysis

In [None]:
# Listings whose price-to-size ratio exceeds 6, the cap applied in the loading cell.
price_per_sqm = df['price'] / df['size']
df[price_per_sqm > 6]

In [None]:
# Summary statistics for the numeric columns of the filtered dataset.
df.describe()