In [1]:
import kagglehub
import pandas as pd
import numpy as np

# Download the latest version of the dataset. The returned value is kept in
# `path` and is used later as the local directory holding the dataset files.
DATASET_HANDLE = "arunjangir245/boston-housing-dataset"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /Users/adityavikrammahendru/.cache/kagglehub/datasets/arunjangir245/boston-housing-dataset/versions/2


In [2]:
import os

# List files in the dataset directory.
# os.listdir returns entries in arbitrary order, so sort them to make the
# "first CSV" choice below deterministic across machines and runs.
files = sorted(os.listdir(path))
print("Files in dataset:", files)

# Load the dataset from the first CSV file found (extension matched
# case-insensitively). Fail with an explicit message instead of a bare
# IndexError when the directory contains no CSV at all.
csv_files = [f for f in files if f.lower().endswith('.csv')]
if not csv_files:
    raise FileNotFoundError(
        f"No CSV file found in {path!r}; directory contains: {files}"
    )
csv_file = csv_files[0]
df = pd.read_csv(os.path.join(path, csv_file))

print(f"\nDataset shape: {df.shape}")
print(f"Number of rows: {df.shape[0]}")
print(f"Number of columns: {df.shape[1]}")

Files in dataset: ['BostonHousing.csv']

Dataset shape: (506, 14)
Number of rows: 506
Number of columns: 14


In [3]:
# Preview the top of the frame; the bare last expression is what the cell displays
print("First 5 rows of the dataset:")
preview = df.head(5)
preview

First 5 rows of the dataset:


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# Column dtypes first, then the info() report (non-null count and dtype per column)
section_rule = "=" * 50

print("Data Types:")
print(df.dtypes)
print("\n" + section_rule)
print("Dataset Info:")
df.info()

Data Types:
crim       float64
zn         float64
indus      float64
chas         int64
nox        float64
rm         float64
age        float64
dis        float64
rad          int64
tax          int64
ptratio    float64
b          float64
lstat      float64
medv       float64
dtype: object

Dataset Info:
<class 'pandas.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       501 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null   

In [5]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each column
print("Statistical Summary:")
summary_stats = df.describe()
summary_stats

Statistical Summary:


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
count,506.0,506.0,506.0,506.0,506.0,501.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284341,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.705587,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.884,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.208,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.625,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [6]:
# Count missing values per column and express each count as a share of all rows
print("Missing Values:")
missing = df.isna().sum()
total_missing = missing.sum()
missing_pct = missing / len(df) * 100
missing_df = pd.DataFrame({'Missing Count': missing, 'Missing %': missing_pct})

# Only list the columns that actually have gaps
if total_missing > 0:
    print(missing_df[missing_df['Missing Count'] > 0])
else:
    print("No missing values found!")

print(f"\nTotal missing values: {total_missing}")

Missing Values:
    Missing Count  Missing %
rm              5   0.988142

Total missing values: 5


In [7]:
# Identify the target variable (for Boston Housing this is the median value,
# stored in this file in lowercase as 'medv').
# Candidates are tried in priority order: exact name matches first, then
# case-insensitive matches, so 'MEDV' also finds a column named 'medv'
# instead of silently relying on the last-column fallback.
candidate_names = ['MEDV', 'Price', 'target', 'SalePrice', 'PRICE']

# Map lowercased column name -> actual column label (first occurrence wins)
columns_by_lower = {}
for column in df.columns:
    columns_by_lower.setdefault(str(column).lower(), column)

# Pass 1: exact match (same result as before whenever an exact name exists)
target_col = next((name for name in candidate_names if name in df.columns), None)

# Pass 2: case-insensitive match, returning the column's real label
if target_col is None:
    target_col = next(
        (columns_by_lower[name.lower()] for name in candidate_names
         if name.lower() in columns_by_lower),
        None,
    )

# Last resort: assume the target is the final column
if target_col is None:
    target_col = df.columns[-1]

print(f"Target variable identified: {target_col}")
print("\nTarget variable statistics:")
print(df[target_col].describe())

Target variable identified: medv

Target variable statistics:
count    506.000000
mean      22.532806
std        9.197104
min        5.000000
25%       17.025000
50%       21.200000
75%       25.000000
max       50.000000
Name: medv, dtype: float64


In [8]:
# Correlation analysis
# numeric_only=True restricts the matrix to numeric columns. Without it,
# recent pandas versions raise on non-numeric columns before the membership
# check below ever runs, so that check could never handle a non-numeric target.
print("Correlation Matrix:")
correlation = df.corr(numeric_only=True)
print(correlation)

print("\n" + "="*50)
print("Top correlations with target variable:")
if target_col in correlation.columns:
    # Remove the target's correlation with itself, then list strongest positive first
    target_corr = correlation[target_col].drop(target_col).sort_values(ascending=False)
    print(target_corr)
else:
    # Say so explicitly rather than printing a header with nothing under it
    print(f"'{target_col}' is not numeric; no correlations to report.")

Correlation Matrix:
             crim        zn     indus      chas       nox        rm       age  \
crim     1.000000 -0.200469  0.406583 -0.055892  0.420972 -0.219433  0.352734   
zn      -0.200469  1.000000 -0.533828 -0.042697 -0.516604  0.311173 -0.569537   
indus    0.406583 -0.533828  1.000000  0.062938  0.763651 -0.394193  0.644779   
chas    -0.055892 -0.042697  0.062938  1.000000  0.091203  0.091468  0.086518   
nox      0.420972 -0.516604  0.763651  0.091203  1.000000 -0.302751  0.731470   
rm      -0.219433  0.311173 -0.394193  0.091468 -0.302751  1.000000 -0.240286   
age      0.352734 -0.569537  0.644779  0.086518  0.731470 -0.240286  1.000000   
dis     -0.379670  0.664408 -0.708027 -0.099176 -0.769230  0.203507 -0.747881   
rad      0.625505 -0.311948  0.595129 -0.007368  0.611441 -0.210718  0.456022   
tax      0.582764 -0.314563  0.720760 -0.035587  0.668023 -0.292794  0.506456   
ptratio  0.289946 -0.391679  0.383248 -0.121515  0.188933 -0.357612  0.261515   
b       

In [9]:
# Flag outliers per numeric column with the 1.5 * IQR rule:
# a value is an outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
# Note: when a column's IQR is 0 (Q1 == Q3) both fences collapse onto that
# value, so every differing value is counted as an outlier.
print("Outlier Analysis (IQR method):")
numeric_frame = df.select_dtypes(include=[np.number])

outlier_info = {}
for name, values in numeric_frame.items():
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    # Comparisons against NaN are False, so missing values are never flagged
    flagged = (values < low_fence) | (values > high_fence)
    n_flagged = int(flagged.sum())
    if n_flagged > 0:
        outlier_info[name] = n_flagged

# Report only the columns where at least one value was flagged
print("Columns with outliers:")
total_rows = len(df)
for name, count in outlier_info.items():
    print(f"  {name}: {count} outliers ({count/total_rows*100:.2f}%)")

Outlier Analysis (IQR method):
Columns with outliers:
  crim: 66 outliers (13.04%)
  zn: 68 outliers (13.44%)
  chas: 35 outliers (6.92%)
  rm: 30 outliers (5.93%)
  dis: 5 outliers (0.99%)
  ptratio: 15 outliers (2.96%)
  b: 77 outliers (15.22%)
  lstat: 7 outliers (1.38%)
  medv: 40 outliers (7.91%)


In [10]:
# Collect the columns whose dtype is object or category
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)
print(f"Categorical columns: {categorical_cols or 'None found'}")

# Show the value distribution of each categorical column (no-op when there are none)
for cat_col in categorical_cols:
    print(f"\n{cat_col} value counts:")
    print(df[cat_col].value_counts())

Categorical columns: None found


In [11]:
# Final summary: gather the headline numbers first, then print them as a banner
banner = "=" * 50
n_rows, n_cols = df.shape
total_missing = df.isnull().sum().sum()
numeric_count = df.select_dtypes(include=[np.number]).shape[1]

summary_lines = [
    banner,
    "DATA ANALYSIS SUMMARY",
    banner,
    "Dataset: Boston Housing",
    f"Shape: {n_rows} rows x {n_cols} columns",
    f"Target variable: {target_col}",
    f"Missing values: {total_missing}",
    f"Numerical columns: {numeric_count}",
    f"Categorical columns: {len(categorical_cols)}",
    banner,
]
for line in summary_lines:
    print(line)

DATA ANALYSIS SUMMARY
Dataset: Boston Housing
Shape: 506 rows x 14 columns
Target variable: medv
Missing values: 5
Numerical columns: 14
Categorical columns: 0
