In [1]:
# Goal: find which neighborhoods have a heterogeneous housing stock,
# in order to decide which of them need special treatment

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import entropy
from utils.load_data import load_data
from feature_engineering import engineer_features
from data_cleaning.data_cleaning import clean_data

In [3]:
# Load the raw training data, then run feature engineering and cleaning
# in a single pipeline expression (load -> engineer -> clean).
train_df = clean_data(
    engineer_features(
        load_data("data/train.csv")
    )
)

In [4]:
def calculate_neighborhood_entropy(df, features):
    """
    Calculate the Shannon entropy of each feature within each neighborhood.

    For every distinct, non-missing value of ``df['Neighborhood']`` the
    relative frequency of each feature's values is computed (missing feature
    values are ignored by ``value_counts``) and passed to
    ``scipy.stats.entropy`` (natural logarithm).

    Parameters:
    df: pandas DataFrame containing a 'Neighborhood' column and every
        column named in ``features``
    features: iterable of categorical feature names to analyze

    Returns:
    DataFrame with one row per neighborhood and the columns
    'Neighborhood', '<feature>_entropy' for each feature, 'house_count'
    and 'total_entropy' (sum of the per-feature entropies), sorted by
    'total_entropy' in descending order. The columns are present even when
    ``df`` has no rows.
    """
    # De-duplicate while keeping order so every feature maps to one column.
    features = list(dict.fromkeys(features))
    entropy_columns = [f'{feature}_entropy' for feature in features]

    results = []

    # dropna(): a missing neighborhood label can never match `==` below, so
    # it would otherwise produce a phantom row with zero houses.
    for neighborhood in df['Neighborhood'].dropna().unique():
        neighborhood_data = df[df['Neighborhood'] == neighborhood]

        row = {'Neighborhood': neighborhood}
        for feature, column in zip(features, entropy_columns):
            # Relative frequency of each observed value of the feature
            probabilities = neighborhood_data[feature].value_counts(normalize=True)
            if len(probabilities) == 0:
                # Feature is entirely missing here: the entropy of an empty
                # distribution is undefined, so record "no observed
                # variation" explicitly instead of relying on how scipy
                # treats empty input.
                row[column] = 0.0
            else:
                row[column] = entropy(probabilities.to_numpy())

        # Number of houses in the neighborhood
        row['house_count'] = len(neighborhood_data)
        results.append(row)

    # Explicit column list keeps the schema stable when `results` is empty.
    results_df = pd.DataFrame(
        results,
        columns=['Neighborhood', *entropy_columns, 'house_count'],
    )

    # Total entropy is the sum of all per-feature entropies; most
    # heterogeneous neighborhoods come first.
    results_df['total_entropy'] = results_df[entropy_columns].sum(axis=1)
    results_df = results_df.sort_values('total_entropy', ascending=False)

    return results_df

def analyze_heterogeneity(df, features=None):
    """
    Analyze neighborhood heterogeneity using multiple categorical features.

    Computes per-neighborhood entropies with
    ``calculate_neighborhood_entropy`` and prints three summaries: the
    first five rows (most heterogeneous), the last five rows (least
    heterogeneous) and the per-feature entropy breakdown of the first five
    rows.

    Parameters:
    df: pandas DataFrame containing 'Neighborhood' and the analyzed features
    features: optional list of categorical feature names; if None, the
        default set below (building class/type/style, quality, roof,
        exterior and garage type) is used

    Returns:
    The entropy DataFrame produced by ``calculate_neighborhood_entropy``
    """
    if features is None:
        # Default features to analyze
        features = [
            'MSSubClass',
            'BldgType',
            'HouseStyle',
            'OverallQual',
            'RoofStyle',
            'Exterior1st',
            'GarageType'
        ]

    # Calculate entropy
    entropy_df = calculate_neighborhood_entropy(df, features)

    # Print summary of most and least heterogeneous neighborhoods
    summary_columns = ['Neighborhood', 'total_entropy', 'house_count']
    print("\nMost heterogeneous neighborhoods:")
    print(entropy_df.head()[summary_columns].to_string())

    print("\nLeast heterogeneous neighborhoods:")
    print(entropy_df.tail()[summary_columns].to_string())

    # Detailed breakdown: every column ending in '_entropy', which also
    # matches 'total_entropy', so the total is shown next to its parts.
    breakdown_columns = ['Neighborhood'] + [
        col for col in entropy_df.columns if col.endswith('_entropy')
    ]
    feature_specific = entropy_df[breakdown_columns]
    print("\nFeature-specific entropy breakdown for top 5 most heterogeneous neighborhoods:")
    print(feature_specific.head().to_string())

    return entropy_df

# Function to identify neighborhoods that might need special treatment
def identify_special_treatment_neighborhoods(entropy_df, entropy_threshold=None):
    """
    Identify neighborhoods that might need special treatment based on entropy scores.

    A neighborhood is selected when its 'total_entropy' is strictly greater
    than the threshold. The selection is printed as a numbered list and
    returned.

    Parameters:
    entropy_df: DataFrame with 'Neighborhood' and 'total_entropy' columns
    entropy_threshold: Optional threshold, if None will use mean + 1 std of
        'total_entropy'. When the standard deviation is undefined (for
        example a single neighborhood), it is treated as 0 so the threshold
        stays a real number instead of NaN.

    Returns:
    List of neighborhoods needing special treatment, in the row order of
    ``entropy_df``
    """
    total_entropy = entropy_df['total_entropy']

    if entropy_threshold is None:
        # Sample std is NaN with fewer than two rows; fall back to 0 so the
        # default threshold degrades to the mean rather than to NaN.
        spread = total_entropy.std()
        if pd.isna(spread):
            spread = 0.0
        entropy_threshold = total_entropy.mean() + spread

    special_treatment = entropy_df.loc[
        total_entropy > entropy_threshold, 'Neighborhood'
    ].tolist()

    print(f"\nNeighborhoods needing special treatment (entropy > {entropy_threshold:.2f}):")
    for idx, neighborhood in enumerate(special_treatment, 1):
        print(f"{idx}. {neighborhood}")

    return special_treatment

In [5]:
# features = [
#     "MSSubClass",
#      "BldgType",
#      "HouseStyle",
#      "OverallQual",
#      "RoofStyle",
#      "Exterior1st",
#      "GarageType"
# ]
# Run the heterogeneity analysis (prints the summaries) and keep the result.
entropy_df = analyze_heterogeneity(train_df)
# Show the three highest-entropy neighborhoods as columns for readability.
entropy_df.head(3).T


Most heterogeneous neighborhoods:
   Neighborhood  total_entropy  house_count
15      Edwards       9.610099          100
7       OldTown       8.722382          113
2       Crawfor       8.543410           51
13       IDOTRR       7.910597           37
4       Mitchel       7.648498           49

Least heterogeneous neighborhoods:
   Neighborhood  total_entropy  house_count
3       NoRidge       4.405948           41
20      NPkVill       2.359269            9
22       BrDale       2.162116           16
21      Blmngtn       1.742951           17
24      Blueste       0.693147            2

Feature-specific entropy breakdown for top 5 most heterogeneous neighborhoods:
   Neighborhood  MSSubClass_entropy  BldgType_entropy  HouseStyle_entropy  OverallQual_entropy  RoofStyle_entropy  Exterior1st_entropy  GarageType_entropy  total_entropy
15      Edwards            1.949826          0.735449            1.263441             1.522198           0.733991             2.041174            1.364

Unnamed: 0,15,7,2
Neighborhood,Edwards,OldTown,Crawfor
MSSubClass_entropy,1.949826,1.902442,1.729404
BldgType_entropy,0.735449,0.493582,0.481199
HouseStyle_entropy,1.263441,1.502813,1.272335
OverallQual_entropy,1.522198,1.568816,1.396788
RoofStyle_entropy,0.733991,0.596108,0.72129
Exterior1st_entropy,2.041174,1.771357,1.904138
GarageType_entropy,1.364021,0.887265,1.038256
house_count,100,113,51
total_entropy,9.610099,8.722382,8.54341


In [6]:
# No explicit threshold is passed, so the function derives it from
# entropy_df['total_entropy']; the returned list is the cell output.
identify_special_treatment_neighborhoods(entropy_df)


Neighborhoods needing special treatment (entropy > 8.11):
1. Edwards
2. OldTown
3. Crawfor


['Edwards', 'OldTown', 'Crawfor']

In [7]:
# entropy_df.to_string("nb_entropy.txt")

In [8]:
# 1. Most heterogeneous neighborhoods (highest total entropy):
# - Edwards (9.61): Highest diversity in housing types, very high diversity in exterior materials
# - OldTown (8.72): Very diverse housing types and house styles
# - Crawfor (8.54): Well-balanced diversity across all features

# 2. Least heterogeneous neighborhoods (lowest total entropy):
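# - Blueste (0.69): Almost no diversity, only BldgType shows any variation
#   (caveat: with only 2 houses, each feature's entropy is capped at ln(2) ≈ 0.69,
#   so this score mostly reflects the tiny sample size)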
# - Blmngtn (1.74): Very uniform, only slight variations in quality and roof style
# - BrDale (2.16): Uniform housing types but some variation in building types and garage types

# Key insights about neighborhoods needing special treatment:

# 1. Edwards (100 houses):
# - Highest MSSubClass entropy (1.95) - very diverse housing types
# - Highest Exterior1st entropy (2.04) - many different exterior materials
# - High GarageType entropy (1.36) - diverse garage configurations
# - Good sample size (100 houses)

# 2. OldTown (113 houses):
# - Second highest MSSubClass entropy (1.90)
# - Highest HouseStyle entropy (1.50)
# - Highest OverallQual entropy (1.57)
# - Largest sample size among special treatment group (113 houses)

# 3. Crawfor (51 houses):
# - High entropy across all features
# - Particularly high in Exterior1st (1.90)
# - Smaller sample size (51 houses) might make modeling more challenging

# Recommendations:
# 1. Consider separate models or neighborhood-specific features for these three areas
# 2. Pay special attention to house style and exterior features in these neighborhoods
# 3. Consider using interaction terms between neighborhood and key features
# 4. Might need more robust validation for Crawfor due to smaller sample size


In [9]:
# Approach: run a separate EDA for each heterogeneous neighborhood, so we can understand its unique characteristics and patterns, while still keeping a solid understanding of the more homogeneous areas.

# Let's outline an EDA plan:

# 1. Edwards (100 houses):
# - Focus on diverse housing types (high MSSubClass entropy)
# - Analyze the variety of exterior materials
# - Study relationship between housing type and price
# - Look for any patterns in garage types
# - Check if quality varies significantly by housing type

# 2. OldTown (113 houses):
# - Deep dive into house styles and their price impact
# - Analyze quality variations
# - Study if there are price patterns by housing age/type
# - Check if certain combinations of features are more valuable

# 3. Crawfor (51 houses):
# - Careful analysis due to smaller sample size
# - Look for any outliers that might affect modeling
# - Study the exterior features and their price impact
# - Analyze if certain housing types command premium prices
# - Check for any unique feature combinations

# 4. Other Neighborhoods (Combined):
# - More standardized analysis
# - Focus on common patterns and relationships
# - Look for consistent price factors
# - Identify any outliers
# - Study if certain features have consistent impact across neighborhoods

# Open questions:
# - Start with one particular neighborhood, or define a structured approach to analyze them all systematically?
# - Also create visual comparisons to see how these special neighborhoods differ from the more homogeneous ones?