In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing

import warnings
import os
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Load the California Housing dataset
# ---------------------------------------------------------------------------
# as_frame=True asks scikit-learn to hand the data back as pandas objects;
# the returned Bunch then exposes a single DataFrame (features + target)
# under the "frame" key/attribute.
print("Loading California Housing dataset...")
housing_data = fetch_california_housing(as_frame=True)

# One table with predictors and target side by side: 20,640 rows x 9 columns
# in the recorded run (see the shape printed below).  Per the dataset
# description (housing_data.DESCR) each row is a census block group rather
# than an individual house.
df = housing_data["frame"]
dataset_shape = df.shape
print(f"Dataset loaded successfully! Shape: {dataset_shape}")
print("\n" + 60 * "=")

# First look at the data: preview the first five rows.
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Basic info about dataset: index range, per-column non-null counts and dtypes.
print("\n" + "="*60)
print("\nDataset Information:")
# DataFrame.info() writes its report directly to stdout and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()

# Each report below starts with a blank line, a 60-character rule, another
# blank line and a heading; the rule and heading are emitted by a single
# print() call joined with a newline.

# --- Missing values: per-column null counts, then the overall total --------
print("\n" + "=" * 60, "\nMissing Values Check:", sep="\n")
missing = df.isna().sum()
print(missing)
print(f"\nTotal missing values: {missing.sum()}")

# --- Descriptive statistics (count, mean, std, min, quartiles, max) --------
print("\n" + "=" * 60, "\nBasic Statistical Summary:", sep="\n")
summary_stats = df.describe()
print(summary_stats)

# --- Dataset description text that ships with the scikit-learn Bunch -------
print("\n" + "=" * 60, "\nFeature Descriptions:", sep="\n")
print(housing_data["DESCR"])

# --- Column dtypes ---------------------------------------------------------
print("\n" + "=" * 60, "\nData Types:", sep="\n")
print(df.dtypes)

# --- Fully duplicated rows -------------------------------------------------
# duplicated() marks every repeat of an earlier row; summing the boolean mask
# counts them.
duplicates = df.duplicated().sum()
print("\n" + "=" * 60)
print(f"\nNumber of duplicate rows: {duplicates}")

# --- Min / max of every column, formatted to two decimals ------------------
print("\n" + "=" * 60, "\nValue Ranges:", sep="\n")
for col in df.columns:
    column_values = df[col]
    low, high = column_values.min(), column_values.max()
    print(f"{col}: [{low:.2f}, {high:.2f}]")

# Save the raw data for next notebook
print("\n" + "="*60)
print("\nSaving data for preprocessing...")

# The path is relative to the kernel's working directory.  DataFrame.to_csv
# does not create missing parent folders and raises OSError when the target
# directory is absent (e.g. on a fresh checkout), so make sure it exists
# first.  exist_ok=True keeps re-runs of this cell idempotent.
raw_data_path = '../data/raw/california_housing_raw.csv'
os.makedirs(os.path.dirname(raw_data_path), exist_ok=True)

# index=False: the RangeIndex carries no information, so keep it out of the
# file.
df.to_csv(raw_data_path, index=False)
print("Data saved successfully!")

print("\n" + "="*60)
print("\nKey Observations:")

# Derive the data-dependent observations from df instead of hard-coding them,
# so this summary cannot contradict the checks printed earlier in the cell if
# the dataset ever changes.
n_rows, n_cols = df.shape
total_missing = int(df.isnull().sum().sum())

if total_missing == 0:
    print("1. No missing values in the dataset - that's good!")
else:
    print(f"1. Dataset has {total_missing:,} missing values - these need handling")

print("2. All features are numerical - no need for encoding")
print("3. Target variable is 'MedHouseVal' (Median House Value)")

# df holds the target column next to the predictors, so the number of
# features is one less than the number of columns (8, not 9, for this data).
print(f"4. Dataset has {n_rows:,} samples and {n_cols - 1} features (plus the target column)")
print("5. Some features might have different scales - will need normalization")

print("\n✓ Exploration complete! Ready for preprocessing.")

Loading California Housing dataset...
Dataset loaded successfully! Shape: (20640, 9)


First 5 rows of the dataset:
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  MedHouseVal  
0    -122.23        4.526  
1    -122.22        3.585  
2    -122.24        3.521  
3    -122.25        3.413  
4    -122.25        3.422  


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   MedInc       20640 non-null  float64
 1