In [None]:
# %% [markdown]
"""
# London Housing Market Analysis (1995-2019)

This notebook analyzes London's housing market data including:
- Monthly average house prices
- Yearly number of houses sold
- Monthly crime statistics
"""

# %%
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Configure visualization settings
plt.style.use('ggplot')
sns.set_palette('husl')
%matplotlib inline

# %% [markdown]
"""
## Data Loading and Initial Inspection
"""
# %%
def load_housing_data(filepath):
    """Load the housing CSV and check that the required columns are present.

    This is a best-effort loader for interactive use: every failure is
    reported on stdout and signalled by a ``None`` return value instead of
    an exception.

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        The loaded DataFrame, or ``None`` when the file does not exist,
        cannot be read, or lacks any of the columns ``date``, ``area``,
        ``average_price`` and ``no_of_crimes``.
    """
    required_columns = ['date', 'area', 'average_price', 'no_of_crimes']

    # Keep the try body down to the one call that performs I/O and parsing.
    try:
        df = pd.read_csv(filepath)
    except FileNotFoundError:
        print("Error: File not found. Please check the file path.")
        return None
    except Exception as e:
        # Deliberately broad: any read/parse failure is reported, not raised,
        # so callers only ever have to handle the None return value.
        print(f"An error occurred while loading the data: {str(e)}")
        return None

    # Validate before announcing success, so a file that lacks required
    # columns is never reported as "loaded successfully".
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        print(
            "An error occurred while loading the data: "
            f"Missing required columns: {missing_cols}"
        )
        return None

    print(f"Data loaded successfully with {df.shape[0]} records spanning {df.shape[1]} features")
    return df

# NOTE: load_housing_data returns None when loading or validation fails;
# the cells below call DataFrame methods on housing_df without checking.
housing_df = load_housing_data("Housing_Dataset.csv")

# %% [markdown]
"""
## Data Quality Assessment
"""
# %%
def assess_housing_data_quality(df):
    """Report column completeness and missing values, then plot a null heatmap.

    Prints the non-null count and the null count of every column, draws a
    heatmap of the frame's null mask and returns the null counts.

    Args:
        df: DataFrame to inspect.

    Returns:
        Series holding the number of null entries in each column.
    """
    print("\n=== Data Completeness ===")
    print("Record counts per column:")
    print(df.count())

    print("\n=== Missing Values ===")
    null_mask = df.isnull()
    null_counts = null_mask.sum()
    print(null_counts)

    # Heatmap of the boolean null mask; row labels and colour bar are hidden.
    plt.figure(figsize=(12, 6))
    sns.heatmap(null_mask, yticklabels=False, cbar=False, cmap='viridis')
    plt.title('Missing Values Heatmap', fontsize=14)
    plt.show()

    return null_counts

# Print the completeness report, draw the heatmap and keep the per-column
# null counts for later reference.
missing_values = assess_housing_data_quality(housing_df)

# %% [markdown]
"""
## Data Preparation
"""
# %%
def prepare_housing_data(df):
    """Return a cleaned copy of the housing data ready for analysis.

    The ``date`` column is parsed to datetime, ``year`` and ``month``
    columns are derived from it, and the three are moved to the front of
    the frame. The input frame is left untouched.

    Args:
        df: Raw housing DataFrame containing a ``date`` column.

    Returns:
        A new DataFrame whose columns are ordered ``date``, ``year``,
        ``month`` followed by the remaining columns in their original order.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    prepared = df.copy()

    # Convert date column to datetime
    prepared['date'] = pd.to_datetime(prepared['date'])

    # Extract year and month
    prepared['year'] = prepared['date'].dt.year
    prepared['month'] = prepared['date'].dt.month

    # Reorder columns for better readability
    leading = ['date', 'year', 'month']
    remaining = [col for col in prepared.columns if col not in leading]
    return prepared[leading + remaining]

# Rebind housing_df to the prepared frame (parsed dates plus year/month
# columns, moved to the front) and preview its first rows.
housing_df = prepare_housing_data(housing_df)
display(housing_df.head())

# %% [markdown]
"""
## Housing Market Analysis
"""
# %%
def analyze_housing_data(df):
    """Print and display a series of summary tables for the housing data.

    Sections (C) to (G) each print a heading and display one table. The
    frame is only read; it is returned as received.

    Args:
        df: Housing DataFrame with ``year``, ``month``, ``area``,
            ``average_price`` and ``no_of_crimes`` columns.

    Returns:
        The same DataFrame that was passed in.
    """
    # NOTE(review): `display` is not imported in this file; presumably the
    # IPython/Jupyter builtin is expected to be in scope.

    # (A), (B.1), (B.2): date parsing and the year/month columns are expected
    # to have been handled before this function is called.

    # (C) Show the frame without the derived calendar columns (demo only).
    print("\n=== Removing Year and Month Columns (Demonstration) ===")
    without_calendar_cols = df.drop(columns=['year', 'month'])
    display(without_calendar_cols.head())

    # (D) Records whose crime count is exactly zero.
    print("\n=== Areas with Zero Reported Crimes ===")
    crime_free = df.loc[df['no_of_crimes'].eq(0)]
    print(f"Found {len(crime_free)} records with zero crimes")
    display(crime_free.head())

    # (E) Per-year min / max / mean price for the 'england' rows.
    print("\n=== Price Trends in England ===")
    england_rows = df.loc[df['area'] == 'england']
    yearly_price = england_rows.groupby('year')['average_price']
    england_prices = yearly_price.agg(['min', 'max', 'mean']).rename(
        columns={'min': 'Min Price', 'max': 'Max Price', 'mean': 'Mean Price'}
    )
    styled_prices = england_prices.style.format("{:,.2f}")
    display(styled_prices.background_gradient(cmap='Blues'))

    # (F) Per-area crime summary; show the ten areas with the highest maximum.
    print("\n=== Crime Statistics by Area ===")
    crimes_by_area = df.groupby('area')['no_of_crimes']
    crime_stats = crimes_by_area.agg(['min', 'max', 'mean', 'count']).rename(
        columns={
            'min': 'Min Crimes',
            'max': 'Max Crimes',
            'mean': 'Mean Crimes',
            'count': 'Record Count',
        }
    )
    top_by_max = crime_stats.sort_values('Max Crimes', ascending=False).head(10)
    display(top_by_max.style.format("{:,.2f}"))

    # (G) Number of sub-£100,000 records per area.
    print("\n=== Areas with Average Price Below £100,000 ===")
    below_threshold = df['average_price'] < 100000
    affordable_counts = df.loc[below_threshold, 'area'].value_counts()
    print(f"Found {len(affordable_counts)} areas with affordable pricing")
    display(affordable_counts.head(10))

    return df

# analyze_housing_data returns the frame it was given, so this rebinding
# leaves the contents of housing_df unchanged.
housing_df = analyze_housing_data(housing_df)

# %% [markdown]
"""
## Time Series Visualizations
"""
# %%
def create_housing_visualizations(df, *, sample_size=1000, random_state=None):
    """Draw the England price trend and a crime-versus-price scatter plot.

    Two figures are shown: a line plot of the yearly mean ``average_price``
    for the ``'england'`` rows, and a log-log scatter of ``no_of_crimes``
    against ``average_price`` for a random sample of rows, coloured by year.

    Args:
        df: Housing DataFrame with ``area``, ``year``, ``average_price`` and
            ``no_of_crimes`` columns.
        sample_size: Maximum number of rows drawn for the scatter plot.
        random_state: Seed forwarded to ``DataFrame.sample``; the default
            ``None`` leaves the sample different on every call.

    Returns:
        None. The figures are shown via ``plt.show()``.
    """
    # England price trends over time
    england_data = df[df['area'] == 'england']
    plt.figure(figsize=(14, 7))
    sns.lineplot(data=england_data, x='year', y='average_price',
                 estimator='mean', errorbar=None)
    plt.title('England: Average House Price Trends (1995-2019)', fontsize=14)
    plt.xlabel('Year')
    plt.ylabel('Average Price (£)')
    plt.grid(True)
    plt.show()

    # Crime vs Price relationship
    # Both axes are logarithmic, so rows with a missing or non-positive value
    # cannot be drawn. Remove them before sampling so the sample is made up
    # of points that actually appear on the plot.
    plottable = df[(df['no_of_crimes'] > 0) & (df['average_price'] > 0)]
    # DataFrame.sample raises ValueError when asked for more rows than exist,
    # so cap the request at the number of rows available.
    scatter_sample = plottable.sample(
        n=min(sample_size, len(plottable)), random_state=random_state
    )

    plt.figure(figsize=(12, 6))
    sns.scatterplot(data=scatter_sample, x='no_of_crimes', y='average_price',
                    hue='year', palette='coolwarm', alpha=0.7)
    plt.title('Crime Rate vs House Price Relationship', fontsize=14)
    plt.xlabel('Number of Crimes')
    plt.ylabel('Average Price (£)')
    plt.xscale('log')
    plt.yscale('log')
    plt.show()

# Show both figures; the function returns nothing, so no result is kept.
create_housing_visualizations(housing_df)

# %% [markdown]
"""
## Key Findings Summary
1. **Data Quality**:
   - Dataset contains [X] records from 1995-2019
   - Missing values found primarily in [Y] columns

2. **Price Trends**:
   - England's average prices ranged from £[MIN] to £[MAX]
   - Steady price increase observed from [YEAR1] to [YEAR2]

3. **Crime Statistics**:
   - [AREA1] had the highest maximum crimes ([N] incidents)
   - [COUNT] records reported zero crimes

4. **Affordable Housing**:
   - [Z] areas had at least one record with an average price below £100,000
   - Area with the most sub-£100,000 records: [AREA2] ([P] records)
"""