In [1]:
# Import necessary libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset.
# The path is relative, so it is resolved against the current working
# directory of the running kernel/process, not against this file's location.
# Checking up front lets the failure report exactly where the file was looked
# for instead of only echoing the relative path back.
DATA_PATH = Path('../data/boston_housing.csv')  # Adjust path if needed
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found: '{DATA_PATH}' resolves to '{DATA_PATH.resolve()}' "
        f"(current working directory: '{Path.cwd()}'). "
        "Place boston_housing.csv there or update DATA_PATH."
    )
df = pd.read_csv(DATA_PATH)

# 🎯 Exploratory data analysis (EDA): print a quick textual summary of the frame.
# Each heading and its value go through a single print call; sep="\n" keeps
# the value on its own line right below the heading.
print("Dataset Overview:", df.head(), sep="\n")
print("\nDataset Shape:", df.shape)

# DataFrame.info() writes its report itself rather than returning text,
# so only the heading is printed here.
print("\nDataset Info:")
df.info()

print("\nDescriptive Statistics:", df.describe(), sep="\n")

# 🔍 Missing values: report how many null entries each column contains.
# Nothing is imputed or dropped here. The original note says missing values
# are rare in standard versions of the Boston Housing data; if the report
# shows any, the two commented-out lines below are the usual options.
print("\nMissing Values Before Handling:", df.isna().sum(), sep="\n")
# df.fillna(df.mean(), inplace=True)  # fill gaps with the column mean (numerical columns)
# df.dropna(inplace=True)             # drop every row that has a missing value

# 📊 Visualization: distribution of the target column MEDV
# (median value of owner-occupied homes in $1000s), with a KDE curve overlaid.
plt.figure(figsize=(8, 6))
sns.histplot(df['MEDV'], kde=True)
# Apply the title and both axis labels to the current axes in one call.
plt.gca().set(
    title='Distribution of Median House Value (MEDV)',
    xlabel='MEDV ($1000s)',
    ylabel='Frequency',
)
plt.show()

# Correlation heatmap of all pairwise column correlations, annotated with the
# coefficient rounded to two decimals.
# Correlation is only defined for numeric data, and pandas >= 2.0 raises on
# non-numeric columns instead of silently skipping them, so the frame is
# restricted to numeric/boolean columns before corr() is called. For an
# all-numeric dataset this selects every column and changes nothing.
plt.figure(figsize=(12, 10))
sns.heatmap(
    df.select_dtypes(include=['number', 'bool']).corr(),
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
)
plt.title('Correlation Matrix of Boston Housing Features')
plt.show()

# Pairplot for a subset of features (example)
# sns.pairplot(df[['RM', 'LSTAT', 'PTRATIO', 'MEDV']])
# plt.show()

# ⚙️ Apply feature engineering if necessary
# For Boston Housing, common feature engineering might involve creating interaction terms
# or polynomial features if linearity assumptions don't hold well.
# Example: Polynomial features (optional, depending on initial model performance)
# from sklearn.preprocessing import PolynomialFeatures
# poly = PolynomialFeatures(degree=2, include_bias=False)
# df_poly = pd.DataFrame(poly.fit_transform(df.drop('MEDV', axis=1)), columns=poly.get_feature_names_out(df.drop('MEDV', axis=1).columns))
# df_poly['MEDV'] = df['MEDV']
# df = df_poly # Use df_poly for further steps if you apply this

# Separate the target column from the predictors, then hold out a test set.
y = df['MEDV']                  # target variable
X = df.drop(columns=['MEDV'])   # every remaining column is a feature

# 80/20 train/test split; the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale numerical features (important for many ML algorithms).
# The scaler is fitted on the training split only and then reused unchanged
# on the test split, so no test-set statistics leak into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays in DataFrames for easier inspection.
# The original row index is passed through explicitly: without it the new
# frames get a fresh 0..n-1 index, while y_train / y_test keep the shuffled
# index from the split, so any index-aligned operation between features and
# target would pair up the wrong rows.
X_train_scaled_df = pd.DataFrame(
    X_train_scaled, columns=X_train.columns, index=X_train.index
)
X_test_scaled_df = pd.DataFrame(
    X_test_scaled, columns=X_test.columns, index=X_test.index
)

FileNotFoundError: [Errno 2] No such file or directory: '../data/boston_housing.csv'