In [3]:
import kagglehub
import pandas as pd
import numpy as np

# Fetch the Boston Housing dataset through kagglehub. The value returned by
# dataset_download() is kept in `path`; the cells below treat it as the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download("arunjangir245/boston-housing-dataset")

# Echo the location so the reader can see where the files were placed.
print("Path to dataset files:", path)

Path to dataset files: /Users/adityavikrammahendru/.cache/kagglehub/datasets/arunjangir245/boston-housing-dataset/versions/2


In [None]:
import os

# List files in the dataset directory. os.listdir() returns entries in
# arbitrary order, so sort them to make the CSV selection below reproducible.
files = sorted(os.listdir(path))
print("Files in dataset:", files)

# Load the dataset (expects at least one CSV in the download directory).
# The extension check is case-insensitive so a file such as "DATA.CSV" is
# also accepted.
csv_candidates = [f for f in files if f.lower().endswith('.csv')]
if not csv_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No CSV file found in {path!r}; files present: {files}"
    )
csv_file = csv_candidates[0]
df = pd.read_csv(os.path.join(path, csv_file))

# Report the dimensions of the loaded frame.
print(f"\nDataset shape: {df.shape}")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

In [None]:
# Preview the top of the frame. The row count is spelled out so it visibly
# matches the label printed above the table.
print("First 5 rows of the dataset:")
df.head(5)

In [None]:
# Column dtypes first, then the fuller df.info() report, separated by a rule.
print("Data Types:", df.dtypes, sep="\n")
print(f"\n{'=' * 50}")
print("Dataset Info:")
# df.info() writes its report to stdout itself, so it is not wrapped in print().
df.info()

In [None]:
# Statistical summary of the loaded frame.
print("Statistical Summary:")
# Left as the cell's final bare expression so the notebook renders the
# resulting table as the cell output.
df.describe()

In [None]:
# Tally nulls per column, as a count and as a percentage of all rows.
print("Missing Values:")
missing = df.isna().sum()
missing_pct = missing / len(df) * 100
missing_df = pd.DataFrame({'Missing Count': missing, 'Missing %': missing_pct})

# Show only the affected columns, or a short notice when the data is complete.
total_missing = missing.sum()
if total_missing > 0:
    print(missing_df.loc[missing_df['Missing Count'] > 0])
else:
    print("No missing values found!")

print(f"\nTotal missing values: {total_missing}")

In [None]:
# Identify the target variable (likely 'MEDV' for Boston Housing - median value).
# Resolution order:
#   1. exact match against the candidate names, in priority order;
#   2. case-insensitive match (e.g. a CSV whose header is 'medv');
#   3. fall back to the last column of the frame.
candidate_names = ['MEDV', 'Price', 'target', 'SalePrice', 'PRICE']

target_col = next((name for name in candidate_names if name in df.columns), None)

if target_col is None:
    # Map lower-cased label -> real label; setdefault keeps the left-most
    # column if two labels differ only by case.
    columns_by_lower = {}
    for column in df.columns:
        columns_by_lower.setdefault(str(column).lower(), column)
    target_col = next(
        (columns_by_lower[name.lower()]
         for name in candidate_names
         if name.lower() in columns_by_lower),
        None,
    )

if target_col is None:
    if len(df.columns) == 0:
        # Nothing to fall back on; fail clearly rather than with an IndexError.
        raise ValueError("Cannot identify a target variable: the dataset has no columns.")
    target_col = df.columns[-1]

print(f"Target variable identified: {target_col}")
print("\nTarget variable statistics:")
print(df[target_col].describe())

In [None]:
# Correlation analysis.
# Restrict to numeric columns first: DataFrame.corr() raises on non-numeric
# data in recent pandas releases, and this matches the numeric selection used
# by the outlier and summary cells.
print("Correlation Matrix:")
correlation = df.select_dtypes(include=[np.number]).corr()
print(correlation)

print("\n" + "="*50)
print("Top correlations with target variable:")
if target_col in correlation.columns:
    # Drop the target's self-correlation (always 1.0) before ranking.
    target_corr = correlation[target_col].drop(target_col).sort_values(ascending=False)
    print(target_corr)
else:
    # Say why nothing is shown instead of leaving the section silently empty.
    print(f"Target '{target_col}' is not a numeric column; no correlations to report.")

In [None]:
# Flag outliers per numeric column with the 1.5 * IQR fence rule and record
# how many values fall outside the fences.
print("Outlier Analysis (IQR method):")
outlier_info = {}
numeric_columns = df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    values = df[col]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    fence = 1.5 * (q3 - q1)
    # A value is an outlier when it lies strictly outside [q1 - fence, q3 + fence].
    outside = (values < q1 - fence) | (values > q3 + fence)
    n_outside = int(outside.sum())
    if n_outside > 0:
        outlier_info[col] = n_outside

# Report only the columns that had at least one outlier.
print("Columns with outliers:")
total_rows = len(df)
for col, count in outlier_info.items():
    share = count / total_rows * 100
    print(f"  {col}: {count} outliers ({share:.2f}%)")

In [None]:
# Collect object/category columns and show the value distribution of each.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print(f"Categorical columns: {categorical_cols or 'None found'}")

# An empty list simply skips the loop, so no separate emptiness check is needed.
for col in categorical_cols:
    print(f"\n{col} value counts:", df[col].value_counts(), sep="\n")

In [None]:
# Final summary: gather the headline facts and print them between rules.
banner = "=" * 50
n_rows, n_cols = df.shape
summary_lines = [
    banner,
    "DATA ANALYSIS SUMMARY",
    banner,
    "Dataset: Boston Housing",
    f"Shape: {n_rows} rows x {n_cols} columns",
    f"Target variable: {target_col}",
    f"Missing values: {df.isnull().sum().sum()}",
    f"Numerical columns: {df.select_dtypes(include=[np.number]).shape[1]}",
    f"Categorical columns: {len(categorical_cols)}",
    banner,
]
print(*summary_lines, sep="\n")