In [2]:
# 📓 ida.ipynb – Initial Data Analysis

# --- Imports ---
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
import os

# --- Load the raw housing dataset ---
housing = pd.read_csv("../data/raw/housing.csv")

# --- Quick overview ---
housing.info()
housing.describe()
housing["ocean_proximity"].value_counts()

# --- Create income category attribute for stratified sampling ---
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
    labels=[1, 2, 3, 4, 5]
)

# --- Stratified Split into Train/Test Sets ---
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_idx, test_idx in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_idx].drop("income_cat", axis=1)
    strat_test_set = housing.loc[test_idx].drop("income_cat", axis=1)

# --- Save raw train/test CSVs ---
os.makedirs("../data/train", exist_ok=True)
os.makedirs("../data/test", exist_ok=True)

strat_train_set.to_csv("../data/train/housing_train.csv", index=False)
strat_test_set.to_csv("../data/test/housing_test.csv", index=False)

print("✅ Saved housing_train.csv and housing_test.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
✅ Saved housing_train.csv and housing_test.csv
