# Import libraries

In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load Dataset

In [6]:
# Load the House Prediction dataset (space-separated, no header)
df = pd.read_csv('/kaggle/input/hp-dataset/4) house Prediction Data Set.csv', header=None, sep='\s+')

In [7]:
# Assign Boston Housing column names
columns = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
df.columns = columns

In [8]:
# Step 1: Inspect dataset
print("Dataset Head:\n", df.head())
print("\nMissing Values Before:\n", df.isnull().sum())
print("\nData Types:\n", df.dtypes)
print("\nDataset Shape:\n", df.shape)

Dataset Head:
       CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

Missing Values Before:
 CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

Data Types:
 CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         fl

# Handle Missing Data

In [9]:
# Step 2: Handle missing data
# Simulate missing data in 'CRIM' and 'RM' (~50 samples)
np.random.seed(42)
df.loc[np.random.choice(df.index, int(0.1 * len(df))), 'CRIM'] = np.nan
df.loc[np.random.choice(df.index, int(0.1 * len(df))), 'RM'] = np.nan

# Fill missing values with median for numerical columns
imputer = SimpleImputer(strategy='median')
numerical_cols = ['CRIM', 'ZN', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']
df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

print("\nMissing Values After:\n", df.isnull().sum())


Missing Values After:
 CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64


# Encode Categorical Variables

In [10]:
# Step 3: Encode categorical variables
# CHAS is binary (0/1), ensure it's integer
df['CHAS'] = df['CHAS'].astype(int)

# Standardize Features

In [11]:
# Step 4: Standardize numerical features
scaler = StandardScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

# Split Dataset

In [12]:
# Step 5: Split the dataset into training and testing sets
X = df.drop('MEDV', axis=1)  # Features
y = df['MEDV']               # Target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Verify the split
print("\nTraining set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)


Training set shape: (404, 13) (404,)
Testing set shape: (102, 13) (102,)


# Save Outputs

In [13]:
# Save preprocessed data to Kaggle output directory
X_train.to_csv('/kaggle/working/X_train_preprocessed.csv', index=False)
X_test.to_csv('/kaggle/working/X_test_preprocessed.csv', index=False)
y_train.to_csv('/kaggle/working/y_train.csv', index=False)
y_test.to_csv('/kaggle/working/y_test.csv', index=False)

In [14]:
# Display sample of preprocessed data
print("\nSample of preprocessed training data:\n", X_train.head())


Sample of preprocessed training data:
          CRIM        ZN     INDUS  CHAS       NOX        RM       AGE  \
477  1.374516 -0.487722  1.015999     0  0.512296 -1.444676  1.021481   
15  -0.321123 -0.487722 -0.437258     0 -0.144217 -0.659333 -0.429390   
332 -0.390938  1.014463 -0.740749     0 -1.008914 -0.367423 -1.610001   
423  0.435416 -0.487722  1.015999     0  0.512296 -0.260735  0.587642   
19  -0.309532 -0.487722 -0.437258     0 -0.144217 -0.817884  0.032897   

          DIS       RAD       TAX   PTRATIO         B     LSTAT  
477 -0.805438  1.661245  1.530926  0.806576 -0.078878  1.718101  
15   0.334449 -0.637962 -0.601276  1.176466  0.427018 -0.586356  
332  1.352738 -0.982843 -0.619094 -0.719220  0.061137 -0.676067  
423 -0.842945  1.661245  1.530926  0.806576 -3.883072  1.491020  
19   0.000693 -0.637962 -0.601276  1.176466  0.375814 -0.192467  
