In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
# No need for crc32 or hashlib unless you specifically use them elsewhere for data integrity checks
# from zlib import crc32
# import hashlib

# Function to load housing data (includes synthetic data fallback)
def load_housing_data(housing_path="housing.csv"):
    """Load the housing dataset from CSV, or build a synthetic stand-in.

    Args:
        housing_path: Path of the CSV file to read.

    Returns:
        A DataFrame. When the file is read, its column names are stripped,
        lower-cased and have spaces replaced by underscores. When the file
        does not exist, a reproducible 2000-row synthetic frame is returned
        instead (see ``_make_synthetic_housing``).
    """
    try:
        # Attempt to load the actual housing.csv file
        housing = pd.read_csv(housing_path)
    except FileNotFoundError:
        print(f"Error: '{housing_path}' not found. Please place it in the same directory or provide the correct path.")
        print("Creating a synthetic dataset for demonstration purposes.")
        return _make_synthetic_housing()

    print(f"Housing dataset loaded successfully from '{housing_path}'.")
    # Ensure column names are clean if they are not already
    housing.columns = housing.columns.str.strip().str.lower().str.replace(' ', '_')
    return housing


def _make_synthetic_housing(n_samples=2000, seed=42):
    """Build a reproducible synthetic housing DataFrame for demonstrations.

    Args:
        n_samples: Number of rows to generate.
        seed: Seed for the private random generator.

    Returns:
        A DataFrame with the ten housing columns, where 2% of the rows of
        ``total_bedrooms`` and of ``median_income`` are set to NaN.
    """
    # A private RandomState seeded with the same value yields the same draws
    # as seeding the global generator would, but does not reseed np.random
    # for the rest of the program.
    rng = np.random.RandomState(seed)

    data = {
        'longitude': rng.uniform(-125.0, -114.0, n_samples),
        'latitude': rng.uniform(32.0, 42.0, n_samples),
        'housing_median_age': rng.randint(1, 55, n_samples),
        'total_rooms': rng.randint(6, 15000, n_samples),
        # Stored as float so NaN can be written below without relying on an
        # implicit int -> float upcast, which pandas has deprecated.
        'total_bedrooms': rng.randint(1, 3000, n_samples).astype(float),
        'population': rng.randint(3, 10000, n_samples),
        'households': rng.randint(1, 2500, n_samples),
        'median_income': rng.uniform(0.5, 10, n_samples),
        'ocean_proximity': rng.choice(
            ['<1H OCEAN', 'INLAND', 'NEAR OCEAN', 'NEAR BAY', 'ISLAND'],
            n_samples,
            p=[0.4, 0.3, 0.15, 0.1, 0.05],
        ),
        'median_house_value': rng.uniform(10000, 600000, n_samples),
    }
    housing = pd.DataFrame(data)

    # Introduce some missing values for demonstration
    n_missing = int(0.02 * n_samples)
    for col in ('total_bedrooms', 'median_income'):
        missing_indices = rng.choice(housing.index, size=n_missing, replace=False)
        housing.loc[missing_indices, col] = np.nan
    return housing

# Read housing.csv; load_housing_data falls back to generated data if it is missing
housing = load_housing_data()

# --- Data Exploration ---
# First rows, to eyeball the columns and typical values
print("\n--- Housing Data Head ---")
head_preview = housing.head()
print(head_preview)

# Column dtypes and non-null counts (info() writes its own report to stdout)
print("\n--- Housing Data Info ---")
housing.info()

# Summary statistics of the numeric columns
print("\n--- Housing Data Description ---")
summary_stats = housing.describe()
print(summary_stats)

# How many rows fall into each ocean_proximity category
print("\n--- Ocean Proximity Value Counts ---")
proximity_counts = housing["ocean_proximity"].value_counts()
print(proximity_counts)

# --- Data Preparation for Stratified Sampling ---
# Create income categories for stratified sampling
# Ensure 'median_income' is treated as numeric; unparseable entries become NaN
housing["median_income"] = pd.to_numeric(housing["median_income"], errors='coerce')
# Fill NaN with the column median so every row receives an income category.
# The result is assigned back rather than using fillna(inplace=True) on the
# column selection: that chained in-place form triggers a pandas warning and,
# with Copy-on-Write enabled, leaves `housing` unmodified.
housing["median_income"] = housing["median_income"].fillna(housing["median_income"].median())

# Bucket incomes into bands of width 1.5 (1.0, 2.0, 3.0, ...)
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Cap categories at 5 to prevent too many small categories: every value that
# is not below 5 becomes 5.0. Assigned back for the same reason as above.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

print("\n--- Income Category Value Counts (after capping) ---")
# Share of rows in each income category
print(housing["income_cat"].value_counts() / len(housing))

# --- Stratified Sampling ---
print("\n--- Performing Stratified Sampling ---")
# Use StratifiedShuffleSplit for splitting, ensuring representative income categories
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) pair
train_index, test_index = next(split.split(housing, housing["income_cat"]))

# The splitter returns positional row numbers, so select with .iloc; .loc is
# label-based and is only equivalent when the index is the default 0..n-1 range.
# income_cat is no longer needed in the final sets; a non-mutating drop returns
# new frames instead of modifying selections of `housing` in place.
strat_train_set = housing.iloc[train_index].drop(columns="income_cat")
strat_test_set = housing.iloc[test_index].drop(columns="income_cat")

# Split each set into the target column (y) and the remaining feature columns (X)
y_train = strat_train_set["median_house_value"].copy()
X_train = strat_train_set.drop(columns="median_house_value")

y_test = strat_test_set["median_house_value"].copy()
X_test = strat_test_set.drop(columns="median_house_value")

# Report the resulting feature-matrix shapes
train_shape, test_shape = X_train.shape, X_test.shape
print(f"\nTraining set shape: {train_shape}, Test set shape: {test_shape}")
print("Data loading, exploration, and stratified splitting complete. X_train, y_train, X_test, y_test are ready.")

Housing dataset loaded successfully from 'housing.csv'.

--- Housing Data Head ---
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEA