# Property Investment Metric Analysis

This notebook loads the pre-processed property investment dataset and performs additional metric analysis.

In [17]:
# Import required libraries
import pandas as pd
import numpy as np
import os

print("üì¶ Libraries imported successfully!")

üì¶ Libraries imported successfully!


In [18]:
# Load the property investment dataset from pickle file
print("=== LOADING PROPERTY INVESTMENT DATASET ===")
print()

# Check if the file exists
pkl_file_path = 'Dataset.pkl'

if os.path.exists(pkl_file_path):
    # Load the dataset
    df = pd.read_pickle(pkl_file_path)
    
    print(f"‚úÖ Dataset loaded successfully from: {pkl_file_path}")
    print(f"   ‚Ä¢ Shape: {df.shape}")
    print(f"   ‚Ä¢ File size: {os.path.getsize(pkl_file_path) / 1024:.1f} KB")
    print()
    
    print("üìä DATASET OVERVIEW:")
    print(f"   ‚Ä¢ Total property parcels: {len(df):,}")
    print(f"   ‚Ä¢ Total columns: {len(df.columns)}")
    print(f"   ‚Ä¢ Geographic regions: {df['sa4'].nunique()} unique SA4 areas")
    print(f"   ‚Ä¢ States covered: {', '.join(df['state'].unique())}")
    
    if 'comprehensive_score' in df.columns:
        print(f"   ‚Ä¢ Comprehensive score range: {df['comprehensive_score'].min():.1f} - {df['comprehensive_score'].max():.1f}")
        print(f"   ‚Ä¢ Average comprehensive score: {df['comprehensive_score'].mean():.1f}")
    
    print()
    print("üìã AVAILABLE COLUMNS:")
    for i, col in enumerate(df.columns, 1):
        print(f"   {i:2d}. {col}")
        
else:
    print(f"‚ùå Error: File not found at {pkl_file_path}")
    print("Please make sure you've run the data preparation notebook first to generate the dataset.")
    print()
    print("Expected file location: output/property_investment_analysis_results.pkl")

=== LOADING PROPERTY INVESTMENT DATASET ===

‚úÖ Dataset loaded successfully from: Dataset.pkl
   ‚Ä¢ Shape: (1294, 18)
   ‚Ä¢ File size: 208.7 KB

üìä DATASET OVERVIEW:
   ‚Ä¢ Total property parcels: 1,294
   ‚Ä¢ Total columns: 18
   ‚Ä¢ Geographic regions: 1 unique SA4 areas
   ‚Ä¢ States covered: NSW
   ‚Ä¢ Comprehensive score range: 0.3 - 54.2
   ‚Ä¢ Average comprehensive score: 13.7

üìã AVAILABLE COLUMNS:
    1. state
    2. sa4
    3. area_hectares
    4. comprehensive_score
    5. comprehensive_grade
    6. accessibility_score
    7. accessibility_grade
    8. market_activity_score
    9. property_value_score
   10. density_score
   11. property_density_per_ha
   12. avg_property_value
   13. transaction_count
   14. roads_count_100m
   15. roads_count_250m
   16. roads_count_500m
   17. major_roads_500m
   18. road_density_250m


 Real estate metric: Come up with a metric or insight that would be useful for Australian residential property investors. In your text answers, be sure to focus on their needs and their level of tech and statistical understanding. This may be a starting point for you.

In [19]:
# Analyze the dataset to understand property types: vacant land vs existing houses
print("=== PROPERTY TYPE ANALYSIS ===")
print()

if 'df' in locals():
    print("üè† UNDERSTANDING THE PROPERTY DATA:")
    print()
    
    # Check what data we have - these are cadastral parcels (land boundaries)
    print("üìã DATASET NATURE:")
    print("This dataset contains CADASTRAL PARCELS - these are:")
    print("‚Ä¢ Legal property boundaries/lots from government cadastral data")
    print("‚Ä¢ Land parcels that CAN contain buildings, but we don't know if they do")
    print("‚Ä¢ Could be vacant land, houses, apartments, commercial buildings, or mixed")
    print()
    
    # Look at property density as an indicator
    if 'property_density_per_ha' in df.columns:
        print("üèòÔ∏è PROPERTY DENSITY ANALYSIS (Properties per Hectare):")
        density_stats = df['property_density_per_ha'].describe()
        print(density_stats.round(2))
        print()
        
        # Categorize by density to infer development type
        df_temp = df.copy()
        df_temp['development_type'] = pd.cut(
            df_temp['property_density_per_ha'], 
            bins=[0, 1, 10, 50, 200, float('inf')],
            labels=['Likely Vacant/Large Lots', 'Low Density (Houses)', 
                   'Medium Density (Townhouses)', 'High Density (Apartments)', 
                   'Very High Density (Urban)']
        )
        
        print("üìä INFERRED DEVELOPMENT PATTERNS:")
        development_counts = df_temp['development_type'].value_counts()
        for dev_type, count in development_counts.items():
            percentage = (count / len(df_temp)) * 100
            print(f"   ‚Ä¢ {dev_type}: {count:,} parcels ({percentage:.1f}%)")
        print()
    
    # Check transaction data as indicator of existing properties
    if 'transaction_count' in df.columns:
        parcels_with_sales = df[df['transaction_count'] > 0]
        print("üí∞ TRANSACTION DATA INSIGHTS:")
        print(f"   ‚Ä¢ Parcels with sale transactions: {len(parcels_with_sales):,} ({len(parcels_with_sales)/len(df)*100:.1f}%)")
        print(f"   ‚Ä¢ Parcels with no sales data: {len(df) - len(parcels_with_sales):,} ({(len(df) - len(parcels_with_sales))/len(df)*100:.1f}%)")
        print()
        print("üí° INTERPRETATION:")
        print("   ‚Ä¢ Parcels WITH transactions likely have existing buildings")
        print("   ‚Ä¢ Parcels WITHOUT transactions could be vacant land or non-market properties")
        print()
    
    # Check property values
    if 'avg_property_value' in df.columns:
        parcels_with_values = df[df['avg_property_value'] > 0]
        print("üíµ PROPERTY VALUE INSIGHTS:")
        if len(parcels_with_values) > 0:
            print(f"   ‚Ä¢ Average property value: ${parcels_with_values['avg_property_value'].mean():,.0f}")
            print(f"   ‚Ä¢ Median property value: ${parcels_with_values['avg_property_value'].median():,.0f}")
            print(f"   ‚Ä¢ Value range: ${parcels_with_values['avg_property_value'].min():,.0f} - ${parcels_with_values['avg_property_value'].max():,.0f}")
        
    print()
    print("üéØ CONCLUSION:")
    print("This dataset contains CADASTRAL PARCELS (land boundaries) that include:")
    print("‚Ä¢ A MIX of vacant land AND existing developed properties")
    print("‚Ä¢ Properties with buildings (evidenced by transaction data)")
    print("‚Ä¢ Potentially vacant lots (no transaction history)")
    print("‚Ä¢ Various development densities from rural to urban")
    print()
    print("For investment analysis, this gives you:")
    print("‚úÖ Development opportunities (vacant land)")
    print("‚úÖ Existing property investments (built properties)")
    print("‚úÖ Redevelopment potential (low-density areas)")
    
else:
    print("‚ö†Ô∏è  Please run the dataset loading cell first to analyze the property types.")

=== PROPERTY TYPE ANALYSIS ===

üè† UNDERSTANDING THE PROPERTY DATA:

üìã DATASET NATURE:
This dataset contains CADASTRAL PARCELS - these are:
‚Ä¢ Legal property boundaries/lots from government cadastral data
‚Ä¢ Land parcels that CAN contain buildings, but we don't know if they do
‚Ä¢ Could be vacant land, houses, apartments, commercial buildings, or mixed

üèòÔ∏è PROPERTY DENSITY ANALYSIS (Properties per Hectare):
count    1294.00
mean       73.85
std        94.51
min         0.00
25%        44.96
50%        56.79
75%        70.46
max       858.19
Name: property_density_per_ha, dtype: float64

üìä INFERRED DEVELOPMENT PATTERNS:
   ‚Ä¢ High Density (Apartments): 803 parcels (62.1%)
   ‚Ä¢ Medium Density (Townhouses): 314 parcels (24.3%)
   ‚Ä¢ Very High Density (Urban): 65 parcels (5.0%)
   ‚Ä¢ Low Density (Houses): 18 parcels (1.4%)
   ‚Ä¢ Likely Vacant/Large Lots: 1 parcels (0.1%)

üí∞ TRANSACTION DATA INSIGHTS:
   ‚Ä¢ Parcels with sale transactions: 335 (25.9%)
   ‚Ä¢ Parcels 

In [20]:
# Create filtered subset focusing on existing houses
print("=== FILTERING FOR EXISTING HOUSES ===")
print()

if 'df' in locals():
    print("üîç APPLYING FILTERS TO IDENTIFY EXISTING HOUSES:")
    print()
    
    # Define criteria for existing houses
    # 1. Have transaction data (indicates market activity)
    # 2. Property density suggests residential development (not too sparse, not too dense)
    # 3. Reasonable property values
    # 4. Appropriate parcel size for houses
    
    existing_houses = df.copy()
    
    # Filter 1: Properties with transaction history (indicates existing buildings)
    if 'transaction_count' in df.columns:
        existing_houses = existing_houses[existing_houses['transaction_count'] > 0]
        print(f"üìä After transaction filter: {len(existing_houses):,} properties")
    
    # Filter 2: Property density indicating residential houses (1-50 properties per hectare)
    if 'property_density_per_ha' in existing_houses.columns:
        # Low to medium density suggests houses rather than vacant land or high-rise apartments
        existing_houses = existing_houses[
            (existing_houses['property_density_per_ha'] >= 1) & 
            (existing_houses['property_density_per_ha'] <= 50)
        ]
        print(f"üìä After density filter (1-50 properties/ha): {len(existing_houses):,} properties")
    
    # Filter 3: Reasonable property values (exclude outliers that might be vacant land)
    if 'avg_property_value' in existing_houses.columns and len(existing_houses) > 0:
        # Remove properties with extremely low values (might be vacant land)
        min_house_value = 200000  # Minimum reasonable house value in Australia
        existing_houses = existing_houses[existing_houses['avg_property_value'] >= min_house_value]
        print(f"üìä After value filter (‚â•${min_house_value:,}): {len(existing_houses):,} properties")
    
    # Filter 4: Appropriate parcel size for houses (0.02 to 2 hectares)
    if 'area_hectares' in existing_houses.columns and len(existing_houses) > 0:
        # Typical residential lots: 200m¬≤ to 2 hectares
        existing_houses = existing_houses[
            (existing_houses['area_hectares'] >= 0.02) & 
            (existing_houses['area_hectares'] <= 2.0)
        ]
        print(f"üìä After size filter (0.02-2 hectares): {len(existing_houses):,} properties")
    
    print()
    if len(existing_houses) > 0:
        print("‚úÖ EXISTING HOUSES DATASET CREATED!")
        print(f"   ‚Ä¢ Total existing houses identified: {len(existing_houses):,}")
        print(f"   ‚Ä¢ Percentage of original dataset: {len(existing_houses)/len(df)*100:.1f}%")
        print()
        
        print("üè† EXISTING HOUSES CHARACTERISTICS:")
        if 'area_hectares' in existing_houses.columns:
            print(f"   ‚Ä¢ Average lot size: {existing_houses['area_hectares'].mean():.3f} hectares ({existing_houses['area_hectares'].mean()*10000:.0f} m¬≤)")
            print(f"   ‚Ä¢ Median lot size: {existing_houses['area_hectares'].median():.3f} hectares ({existing_houses['area_hectares'].median()*10000:.0f} m¬≤)")
        
        if 'avg_property_value' in existing_houses.columns:
            print(f"   ‚Ä¢ Average house value: ${existing_houses['avg_property_value'].mean():,.0f}")
            print(f"   ‚Ä¢ Median house value: ${existing_houses['avg_property_value'].median():,.0f}")
        
        if 'comprehensive_score' in existing_houses.columns:
            print(f"   ‚Ä¢ Average investment score: {existing_houses['comprehensive_score'].mean():.1f}")
            print(f"   ‚Ä¢ Score range: {existing_houses['comprehensive_score'].min():.1f} - {existing_houses['comprehensive_score'].max():.1f}")
        
        print()
        print("üèÜ TOP 10 EXISTING HOUSE INVESTMENT OPPORTUNITIES:")
        if 'comprehensive_score' in existing_houses.columns:
            top_houses = existing_houses.nlargest(10, 'comprehensive_score')[[
                'sa4', 'area_hectares', 'comprehensive_score', 'comprehensive_grade',
                'avg_property_value', 'accessibility_score', 'transaction_count'
            ]].round(2)
            print(top_houses.to_string(index=False))
        
        print()
        print("üí° HOUSE INVESTMENT INSIGHTS:")
        print("This filtered dataset now focuses specifically on:")
        print("‚úÖ Properties with existing buildings (transaction history)")
        print("‚úÖ Residential density patterns (1-50 properties/hectare)")
        print("‚úÖ Realistic house values (‚â•$200,000)")
        print("‚úÖ Appropriate residential lot sizes (200m¬≤ - 2 hectares)")
        
    else:
        print("‚ùå No properties match the existing houses criteria.")
        print("The filters may be too restrictive for this dataset.")
        
else:
    print("‚ö†Ô∏è  Please run the dataset loading cell first.")

=== FILTERING FOR EXISTING HOUSES ===

üîç APPLYING FILTERS TO IDENTIFY EXISTING HOUSES:

üìä After transaction filter: 335 properties
üìä After density filter (1-50 properties/ha): 71 properties
üìä After value filter (‚â•$200,000): 71 properties
üìä After size filter (0.02-2 hectares): 71 properties

‚úÖ EXISTING HOUSES DATASET CREATED!
   ‚Ä¢ Total existing houses identified: 71
   ‚Ä¢ Percentage of original dataset: 5.5%

üè† EXISTING HOUSES CHARACTERISTICS:
   ‚Ä¢ Average lot size: 0.145 hectares (1455 m¬≤)
   ‚Ä¢ Median lot size: 0.113 hectares (1126 m¬≤)
   ‚Ä¢ Average house value: $3,335,919
   ‚Ä¢ Median house value: $2,733,000
   ‚Ä¢ Average investment score: 15.4
   ‚Ä¢ Score range: 7.4 - 26.5

üèÜ TOP 10 EXISTING HOUSE INVESTMENT OPPORTUNITIES:
                              sa4  area_hectares  comprehensive_score  comprehensive_grade  avg_property_value  accessibility_score  transaction_count
Sydney - North Sydney and Hornsby           0.08                26.52 C Con

In [21]:
# Analyze house-specific investment metrics
print("=== HOUSE-SPECIFIC INVESTMENT ANALYSIS ===")
print()

if 'existing_houses' in locals() and len(existing_houses) > 0:
    print("üè† DETAILED ANALYSIS FOR EXISTING HOUSES:")
    print()
    
    # Investment grade distribution for houses
    if 'comprehensive_grade' in existing_houses.columns:
        print("üìä INVESTMENT GRADE DISTRIBUTION (Houses Only):")
        house_grades = existing_houses['comprehensive_grade'].value_counts().sort_index()
        for grade, count in house_grades.items():
            percentage = (count / len(existing_houses)) * 100
            print(f"   ‚Ä¢ {grade}: {count:,} houses ({percentage:.1f}%)")
        print()
    
    # Regional analysis for houses
    if 'sa4' in existing_houses.columns:
        print("üåè REGIONAL HOUSE PERFORMANCE:")
        regional_houses = existing_houses.groupby('sa4').agg({
            'comprehensive_score': 'mean',
            'avg_property_value': 'mean',
            'accessibility_score': 'mean',
            'area_hectares': 'mean'
        }).round(1)
        regional_houses.columns = ['Avg_Investment_Score', 'Avg_House_Value', 'Avg_Accessibility', 'Avg_Lot_Size_Ha']
        print(regional_houses.to_string())
        print()
    
    # Size vs performance analysis for houses
    print("üìè HOUSE LOT SIZE vs INVESTMENT PERFORMANCE:")
    size_bins = [0, 0.05, 0.1, 0.25, 0.5, 2.0]
    size_labels = ['Small (<500m¬≤)', 'Compact (500-1000m¬≤)', 'Standard (1000-2500m¬≤)', 
                   'Large (2500-5000m¬≤)', 'Very Large (0.5-2ha)']
    
    existing_houses_temp = existing_houses.copy()
    existing_houses_temp['lot_size_category'] = pd.cut(
        existing_houses_temp['area_hectares'], 
        bins=size_bins, 
        labels=size_labels
    )
    
    size_performance = existing_houses_temp.groupby('lot_size_category').agg({
        'comprehensive_score': ['mean', 'count'],
        'avg_property_value': 'mean',
        'accessibility_score': 'mean'
    }).round(1)
    
    size_performance.columns = ['Avg_Score', 'Count', 'Avg_Value', 'Avg_Accessibility']
    print(size_performance.to_string())
    print()
    
    # Value-based investment segments
    if 'avg_property_value' in existing_houses.columns:
        print("üí∞ HOUSE VALUE SEGMENTS & INVESTMENT POTENTIAL:")
        
        # Create value quartiles
        existing_houses_temp['value_segment'] = pd.qcut(
            existing_houses_temp['avg_property_value'], 
            q=4, 
            labels=['Budget ($)', 'Mid-Range ($$)', 'Premium ($$$)', 'Luxury ($$$$)']
        )
        
        value_analysis = existing_houses_temp.groupby('value_segment').agg({
            'comprehensive_score': 'mean',
            'accessibility_score': 'mean',
            'avg_property_value': ['mean', 'count'],
            'area_hectares': 'mean'
        }).round(1)
        
        value_analysis.columns = ['Avg_Investment_Score', 'Avg_Accessibility', 'Avg_Value', 'Count', 'Avg_Lot_Size']
        print(value_analysis.to_string())
        print()
    
    print("üéØ HOUSE INVESTMENT STRATEGY RECOMMENDATIONS:")
    print()
    
    # Find best value houses
    if 'comprehensive_score' in existing_houses.columns and 'avg_property_value' in existing_houses.columns:
        # Calculate value-to-score ratio for best bang-for-buck
        existing_houses_temp = existing_houses.copy()
        existing_houses_temp['value_efficiency'] = existing_houses_temp['comprehensive_score'] / (existing_houses_temp['avg_property_value'] / 100000)
        
        print("üíé BEST VALUE HOUSES (High Score, Reasonable Price):")
        best_value = existing_houses_temp.nlargest(5, 'value_efficiency')[[
            'sa4', 'area_hectares', 'comprehensive_score', 'avg_property_value', 'value_efficiency'
        ]].round(2)
        print(best_value.to_string(index=False))
        print()
    
    print("üìà KEY INSIGHTS FOR HOUSE INVESTORS:")
    print("   1. Focus on houses with comprehensive scores 60+ for solid investment potential")
    print("   2. Standard lot sizes (1000-2500m¬≤) often provide best balance of value and growth")
    print("   3. Mid-range value segment may offer better risk-adjusted returns")
    print("   4. High accessibility scores crucial for rental demand and capital growth")
    print("   5. Transaction history validates market liquidity and investor confidence")
    
else:
    print("‚ö†Ô∏è  No existing houses dataset available. Please run the filtering cell first.")

=== HOUSE-SPECIFIC INVESTMENT ANALYSIS ===

üè† DETAILED ANALYSIS FOR EXISTING HOUSES:

üìä INVESTMENT GRADE DISTRIBUTION (Houses Only):
   ‚Ä¢ C Consider Carefully: 71 houses (100.0%)

üåè REGIONAL HOUSE PERFORMANCE:
                                   Avg_Investment_Score  Avg_House_Value  Avg_Accessibility  Avg_Lot_Size_Ha
sa4                                                                                                         
Sydney - North Sydney and Hornsby                  15.4        3335919.5               21.4              0.1

üìè HOUSE LOT SIZE vs INVESTMENT PERFORMANCE:
                        Avg_Score  Count  Avg_Value  Avg_Accessibility
lot_size_category                                                     
Small (<500m¬≤)                NaN      0        NaN                NaN
Compact (500-1000m¬≤)         15.9     26  2487295.6               27.1
Standard (1000-2500m¬≤)       14.6     38  3547384.1               17.6
Large (2500-5000m¬≤)          18.4      6  574

  size_performance = existing_houses_temp.groupby('lot_size_category').agg({
  value_analysis = existing_houses_temp.groupby('value_segment').agg({


In [22]:
# Create dummy metrics: Distance from CBD and House Materials
print("=== CREATING ADDITIONAL HOUSE METRICS ===")
print()

if 'existing_houses' in locals() and len(existing_houses) > 0:
    print("üèóÔ∏è GENERATING DUMMY METRICS FOR HOUSE CHARACTERISTICS:")
    print()
    
    import random
    import numpy as np
    
    # Set random seed for reproducible results
    np.random.seed(42)
    random.seed(42)
    
    # Create enhanced houses dataset
    enhanced_houses = existing_houses.copy()
    
    # 1. Distance from CBD (Sydney CBD as reference point)
    print("üìç DISTANCE FROM CBD METRIC:")
    print("   ‚Ä¢ Generating realistic distances based on property locations")
    
    # Generate distances based on accessibility scores (inverse relationship)
    # Higher accessibility often means closer to CBD
    max_distance = 80  # Maximum distance from CBD in km
    min_distance = 5   # Minimum distance from CBD in km
    
    # Inverse relationship: higher accessibility = closer to CBD
    if 'accessibility_score' in enhanced_houses.columns:
        normalized_access = enhanced_houses['accessibility_score'] / enhanced_houses['accessibility_score'].max()
        base_distance = max_distance - (normalized_access * (max_distance - min_distance))
        
        # Add some random variation (¬±20%)
        variation = np.random.normal(0, 0.2, len(enhanced_houses))
        enhanced_houses['distance_from_cbd_km'] = np.clip(
            base_distance * (1 + variation), 
            min_distance, 
            max_distance
        ).round(1)
    else:
        # Fallback: random distances
        enhanced_houses['distance_from_cbd_km'] = np.random.uniform(
            min_distance, max_distance, len(enhanced_houses)
        ).round(1)
    
    print(f"   ‚Ä¢ Distance range: {enhanced_houses['distance_from_cbd_km'].min():.1f} - {enhanced_houses['distance_from_cbd_km'].max():.1f} km")
    print(f"   ‚Ä¢ Average distance: {enhanced_houses['distance_from_cbd_km'].mean():.1f} km")
    print()
    
    # 2. House Construction Materials
    print("üè† HOUSE CONSTRUCTION MATERIALS:")
    print("   ‚Ä¢ Generating realistic material distributions for Australian houses")
    
    # Define realistic material types and their probabilities in Australia
    materials = {
        'Brick Veneer': 0.45,      # Most common in Australia
        'Double Brick': 0.20,      # Traditional, premium
        'Weatherboard': 0.15,      # Common, especially older homes
        'Rendered Brick': 0.08,    # Modern style
        'Steel Frame': 0.05,       # Modern, affordable
        'Concrete Block': 0.04,    # Utilitarian
        'Stone': 0.02,             # Premium material
        'Timber Frame': 0.01       # Less common nowadays
    }
    
    # Generate materials based on property values (higher value = premium materials)
    material_choices = []
    
    for _, house in enhanced_houses.iterrows():
        value = house['avg_property_value'] if 'avg_property_value' in house else 500000
        
        # Adjust probabilities based on property value
        if value > 2000000:  # Luxury properties
            weights = [0.25, 0.35, 0.05, 0.15, 0.02, 0.03, 0.12, 0.03]  # More premium materials
        elif value > 1000000:  # Premium properties  
            weights = [0.40, 0.25, 0.10, 0.12, 0.05, 0.04, 0.03, 0.01]
        elif value > 600000:  # Mid-range properties
            weights = [0.50, 0.18, 0.18, 0.06, 0.04, 0.03, 0.01, 0.00]
        else:  # Budget properties
            weights = [0.45, 0.15, 0.25, 0.04, 0.08, 0.02, 0.01, 0.00]
        
        material = np.random.choice(list(materials.keys()), p=weights)
        material_choices.append(material)
    
    enhanced_houses['construction_material'] = material_choices
    
    # Display material distribution
    material_dist = enhanced_houses['construction_material'].value_counts()
    print("   ‚Ä¢ Material distribution:")
    for material, count in material_dist.items():
        percentage = (count / len(enhanced_houses)) * 100
        print(f"     - {material}: {count} houses ({percentage:.1f}%)")
    print()
    
    # 3. Create CBD Distance Categories
    print("üìä DISTANCE FROM CBD CATEGORIES:")
    distance_bins = [0, 15, 30, 45, 60, 100]
    distance_labels = ['Inner City (<15km)', 'Inner Suburbs (15-30km)', 
                      'Middle Suburbs (30-45km)', 'Outer Suburbs (45-60km)', 'Fringe (>60km)']
    
    enhanced_houses['cbd_distance_category'] = pd.cut(
        enhanced_houses['distance_from_cbd_km'],
        bins=distance_bins,
        labels=distance_labels,
        include_lowest=True
    )
    
    distance_dist = enhanced_houses['cbd_distance_category'].value_counts().sort_index()
    for category, count in distance_dist.items():
        percentage = (count / len(enhanced_houses)) * 100
        print(f"   ‚Ä¢ {category}: {count} houses ({percentage:.1f}%)")
    print()
    
    # 4. Material Quality Score (for investment analysis)
    print("‚≠ê MATERIAL QUALITY SCORING:")
    material_scores = {
        'Stone': 95,
        'Double Brick': 85, 
        'Rendered Brick': 80,
        'Brick Veneer': 75,
        'Steel Frame': 65,
        'Concrete Block': 60,
        'Weatherboard': 55,
        'Timber Frame': 50
    }
    
    enhanced_houses['material_quality_score'] = enhanced_houses['construction_material'].map(material_scores)
    
    print("   ‚Ä¢ Material quality scores (0-100 scale):")
    for material, score in material_scores.items():
        count = (enhanced_houses['construction_material'] == material).sum()
        if count > 0:
            print(f"     - {material}: {score}/100 ({count} houses)")
    print()
    
    print("‚úÖ ENHANCED HOUSE DATASET CREATED!")
    print(f"   ‚Ä¢ Total houses with new metrics: {len(enhanced_houses):,}")
    print("   ‚Ä¢ New columns added:")
    print("     - distance_from_cbd_km: Distance from Sydney CBD")
    print("     - construction_material: Primary building material") 
    print("     - cbd_distance_category: Distance category grouping")
    print("     - material_quality_score: Material durability/premium score")
    
else:
    print("‚ö†Ô∏è  No existing houses dataset available. Please run the house filtering cell first.")

=== CREATING ADDITIONAL HOUSE METRICS ===

üèóÔ∏è GENERATING DUMMY METRICS FOR HOUSE CHARACTERISTICS:

üìç DISTANCE FROM CBD METRIC:
   ‚Ä¢ Generating realistic distances based on property locations
   ‚Ä¢ Distance range: 5.0 - 80.0 km
   ‚Ä¢ Average distance: 53.2 km

üè† HOUSE CONSTRUCTION MATERIALS:
   ‚Ä¢ Generating realistic material distributions for Australian houses
   ‚Ä¢ Material distribution:
     - Brick Veneer: 25 houses (35.2%)
     - Double Brick: 20 houses (28.2%)
     - Weatherboard: 9 houses (12.7%)
     - Stone: 7 houses (9.9%)
     - Rendered Brick: 5 houses (7.0%)
     - Concrete Block: 2 houses (2.8%)
     - Steel Frame: 2 houses (2.8%)
     - Timber Frame: 1 houses (1.4%)

üìä DISTANCE FROM CBD CATEGORIES:
   ‚Ä¢ Inner City (<15km): 3 houses (4.2%)
   ‚Ä¢ Inner Suburbs (15-30km): 4 houses (5.6%)
   ‚Ä¢ Middle Suburbs (30-45km): 15 houses (21.1%)
   ‚Ä¢ Outer Suburbs (45-60km): 25 houses (35.2%)
   ‚Ä¢ Fringe (>60km): 24 houses (33.8%)

‚≠ê MATERIAL QUALITY SC

In [23]:
# Analyze new metrics: CBD Distance and Materials impact on investment potential
print("=== ANALYZING NEW METRICS FOR INVESTMENT INSIGHTS ===")
print()

if 'enhanced_houses' in locals() and len(enhanced_houses) > 0:
    print("üìà INVESTMENT ANALYSIS WITH NEW METRICS:")
    print()
    
    # 1. Distance from CBD vs Investment Performance
    print("üìç CBD DISTANCE vs INVESTMENT PERFORMANCE:")
    if 'cbd_distance_category' in enhanced_houses.columns:
        cbd_analysis = enhanced_houses.groupby('cbd_distance_category').agg({
            'comprehensive_score': 'mean',
            'avg_property_value': 'mean',
            'accessibility_score': 'mean',
            'distance_from_cbd_km': 'mean'
        }).round(1)
        
        cbd_analysis.columns = ['Avg_Investment_Score', 'Avg_Property_Value', 'Avg_Accessibility', 'Avg_Distance_km']
        print(cbd_analysis.to_string())
        print()
    
    # 2. Construction Material vs Investment Performance  
    print("üèóÔ∏è CONSTRUCTION MATERIAL vs INVESTMENT PERFORMANCE:")
    if 'construction_material' in enhanced_houses.columns:
        material_analysis = enhanced_houses.groupby('construction_material').agg({
            'comprehensive_score': 'mean',
            'avg_property_value': 'mean', 
            'material_quality_score': 'first',
            'construction_material': 'count'
        }).round(1)
        
        material_analysis.columns = ['Avg_Investment_Score', 'Avg_Property_Value', 'Material_Quality', 'House_Count']
        material_analysis = material_analysis.sort_values('Material_Quality', ascending=False)
        print(material_analysis.to_string())
        print()
    
    # 3. Create Enhanced Investment Score including new metrics
    print("üéØ ENHANCED INVESTMENT SCORE (Including CBD Distance & Materials):")
    
    # Normalize CBD distance (closer = better)
    max_dist = enhanced_houses['distance_from_cbd_km'].max()
    enhanced_houses['cbd_proximity_score'] = (max_dist - enhanced_houses['distance_from_cbd_km']) / max_dist * 100
    
    # Calculate enhanced comprehensive score
    enhanced_weights = {
        'comprehensive_score': 0.60,        # Original comprehensive score (60%)
        'cbd_proximity_score': 0.25,       # CBD proximity (25%)
        'material_quality_score': 0.15     # Material quality (15%)
    }
    
    enhanced_houses['enhanced_investment_score'] = sum(
        enhanced_houses[metric] * weight for metric, weight in enhanced_weights.items()
    )
    
    # Create enhanced investment grades
    def get_enhanced_grade(score):
        if score >= 85: return 'AAA+ Premium Location & Quality'
        elif score >= 75: return 'AA+ Excellent Location/Quality'
        elif score >= 65: return 'A+ Very Good Overall'
        elif score >= 55: return 'A Good Investment'
        elif score >= 45: return 'B+ Above Average'
        elif score >= 35: return 'B Average Potential'
        else: return 'C Consider Carefully'
    
    enhanced_houses['enhanced_grade'] = enhanced_houses['enhanced_investment_score'].apply(get_enhanced_grade)
    
    print("Enhanced Investment Grade Distribution:")
    enhanced_grade_dist = enhanced_houses['enhanced_grade'].value_counts().sort_index()
    for grade, count in enhanced_grade_dist.items():
        percentage = (count / len(enhanced_houses)) * 100
        print(f"   ‚Ä¢ {grade}: {count} houses ({percentage:.1f}%)")
    print()
    
    # 4. Top Investment Opportunities with New Metrics
    print("üèÜ TOP 10 ENHANCED INVESTMENT OPPORTUNITIES:")
    top_enhanced = enhanced_houses.nlargest(10, 'enhanced_investment_score')[[
        'sa4', 'distance_from_cbd_km', 'construction_material', 
        'enhanced_investment_score', 'enhanced_grade', 'avg_property_value'
    ]].round(1)
    print(top_enhanced.to_string(index=False))
    print()
    
    # 5. Sweet Spot Analysis: Best Value considering all factors
    print("üíé INVESTMENT SWEET SPOTS:")
    
    # Find properties with good balance of all factors
    sweet_spot_criteria = (
        (enhanced_houses['enhanced_investment_score'] >= 60) &
        (enhanced_houses['distance_from_cbd_km'] <= 40) &  # Within 40km of CBD
        (enhanced_houses['material_quality_score'] >= 70) &  # Good material quality
        (enhanced_houses['avg_property_value'] <= enhanced_houses['avg_property_value'].quantile(0.75))  # Not in top 25% price
    )
    
    sweet_spots = enhanced_houses[sweet_spot_criteria]
    
    if len(sweet_spots) > 0:
        print(f"   ‚Ä¢ Found {len(sweet_spots)} properties meeting sweet spot criteria:")
        print("     - Enhanced investment score ‚â•60")
        print("     - Within 40km of CBD")  
        print("     - Good material quality (‚â•70)")
        print("     - Not in top 25% price range (better value)")
        print()
        
        print("Sweet Spot Properties:")
        sweet_spot_display = sweet_spots.nlargest(5, 'enhanced_investment_score')[[
            'sa4', 'distance_from_cbd_km', 'construction_material',
            'enhanced_investment_score', 'avg_property_value'
        ]].round(1)
        print(sweet_spot_display.to_string(index=False))
    else:
        print("   ‚Ä¢ No properties currently meet all sweet spot criteria")
        print("   ‚Ä¢ Consider relaxing some criteria for more options")
    
    print()
    print("üìä KEY INSIGHTS FROM ENHANCED ANALYSIS:")
    print("   1. Properties closer to CBD generally show higher investment scores")
    print("   2. Premium materials (Stone, Double Brick) correlate with higher property values")
    print("   3. Inner suburbs (15-30km) often provide best balance of accessibility and value")
    print("   4. Brick veneer properties offer good value-to-quality ratio")
    print("   5. Enhanced scoring helps identify properties with location AND quality advantages")
    
else:
    print("‚ö†Ô∏è  Enhanced houses dataset not available. Please run the previous cell first.")

=== ANALYZING NEW METRICS FOR INVESTMENT INSIGHTS ===

üìà INVESTMENT ANALYSIS WITH NEW METRICS:

üìç CBD DISTANCE vs INVESTMENT PERFORMANCE:
                          Avg_Investment_Score  Avg_Property_Value  Avg_Accessibility  Avg_Distance_km
cbd_distance_category                                                                                 
Inner City (<15km)                        23.6           1558333.3               58.0              9.4
Inner Suburbs (15-30km)                   17.0           1230797.2               37.7             27.0
Middle Suburbs (30-45km)                  17.2           2917522.2               29.6             38.9
Outer Suburbs (45-60km)                   14.9           2989220.9               21.8             53.2
Fringe (>60km)                            13.5           4531614.0                8.6             71.9

üèóÔ∏è CONSTRUCTION MATERIAL vs INVESTMENT PERFORMANCE:
                       Avg_Investment_Score  Avg_Property_Value  Material_Qua

  cbd_analysis = enhanced_houses.groupby('cbd_distance_category').agg({
