In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [9]:
# Read the raw Melbourne housing CSV into a DataFrame
raw_csv_path = 'melb_data.csv'
data = pd.read_csv(raw_csv_path)

In [10]:
# Restrict the frame to the columns used by the rest of the preprocessing
selected_columns = [
    'Rooms',
    'Distance',
    'Bedroom2',
    'Bathroom',
    'Car',
    'Price',
]
data = data.loc[:, selected_columns]

In [11]:
# Report how many missing entries each column has before any cleaning
initial_null_counts = data.isna().sum()
print("Initial Null Values:", initial_null_counts, sep="\n")

Initial Null Values:
Rooms        0
Distance     0
Bedroom2     0
Bathroom     0
Car         62
Price        0
dtype: int64


In [12]:
# Handle null values
# Rows without a Price or a Distance are discarded outright
data = data.dropna(subset=['Price', 'Distance'])

# Gaps in Bedroom2, Bathroom and Car are filled with that column's own median
for col in ('Bedroom2', 'Bathroom', 'Car'):
    column_median = data[col].median()
    data[col] = data[col].fillna(column_median)


In [13]:
# Re-count missing entries per column to confirm the imputation left none
remaining_null_counts = data.isna().sum()
print("\nNull Values After Imputation:", remaining_null_counts, sep="\n")


Null Values After Imputation:
Rooms       0
Distance    0
Bedroom2    0
Bathroom    0
Car         0
Price       0
dtype: int64


In [14]:
# Drop rows that exactly repeat an earlier row (the first occurrence is kept)
is_repeat = data.duplicated(keep='first')
data = data[~is_repeat]
print(f"\nNumber of rows after removing duplicates: {len(data)}")


Number of rows after removing duplicates: 13179


In [16]:
# Handle inconsistencies
# Keep only rows with a strictly positive Price and non-negative values in
# every other measured column. Rooms and Bedroom2 are validated too: a
# negative Rooms would otherwise be written into Bedroom2 by the cap below,
# and a negative Bedroom2 would inflate any Rooms - Bedroom2 difference.
non_negative_columns = ['Rooms', 'Distance', 'Bedroom2', 'Bathroom', 'Car']
is_valid_row = (data['Price'] > 0) & (data[non_negative_columns] >= 0).all(axis=1)
# .copy() gives an independent frame, so the .loc write below never targets
# a slice of the pre-filter data.
data = data[is_valid_row].copy()

# Fix Bedroom2 > Rooms inconsistency: cap Bedroom2 at the row's Rooms value
# (the right-hand Series is aligned on the index, so only masked rows change)
data.loc[data['Bedroom2'] > data['Rooms'], 'Bedroom2'] = data['Rooms']

In [17]:
# Feature engineering: Bedroom_Discrepancy is Rooms minus Bedroom2, per row
data['Bedroom_Discrepancy'] = data['Rooms'].sub(data['Bedroom2'])

In [18]:
# Remove Price outliers: drop rows lying more than 1.5 * IQR outside the quartiles
price = data['Price']
Q1, Q3 = price.quantile(0.25), price.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
is_price_outlier = (price < lower_fence) | (price > upper_fence)
data = data[~is_price_outlier]

In [19]:
# Keep only rows with at most 10 Rooms, 5 Bathroom and 5 Car
within_limits = (
    data['Rooms'].le(10)
    & data['Bathroom'].le(5)
    & data['Car'].le(5)
)
data = data[within_limits]

In [20]:
# Standardise the feature columns with StandardScaler; Price is not in the
# list and is therefore left in its original units
numerical_features = [
    'Rooms',
    'Distance',
    'Bedroom2',
    'Bathroom',
    'Car',
    'Bedroom_Discrepancy',
]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(data[numerical_features])
data[numerical_features] = scaled_values

In [21]:
# Print descriptive statistics of the cleaned frame as a final sanity check
summary_stats = data.describe()
print("\nSummary Statistics After Cleaning:", summary_stats, sep="\n")


Summary Statistics After Cleaning:
              Rooms      Distance      Bedroom2      Bathroom           Car  \
count  1.251900e+04  1.251900e+04  1.251900e+04  1.251900e+04  1.251900e+04   
mean  -3.632457e-17  3.632457e-17 -3.632457e-17  9.989258e-17  1.271360e-16   
std    1.000040e+00  1.000040e+00  1.000040e+00  1.000040e+00  1.000040e+00   
min   -2.041349e+00 -1.718632e+00 -3.099025e+00 -2.353421e+00 -1.775057e+00   
25%   -9.636427e-01 -6.789317e-01 -9.268185e-01 -7.740023e-01 -6.407058e-01   
50%    1.140635e-01 -1.423124e-01  1.592847e-01 -7.740023e-01  4.936452e-01   
75%    1.140635e-01  4.613842e-01  1.592847e-01  8.054166e-01  4.936452e-01   
max    7.658006e+00  6.347427e+00  7.762007e+00  5.543673e+00  3.896698e+00   

              Price  Bedroom_Discrepancy  
count  1.251900e+04         12519.000000  
mean   9.898881e+05             0.000000  
std    4.577606e+05             1.000040  
min    8.500000e+04            -0.173345  
25%    6.445000e+05            -0.173

In [1]:
# Write the cleaned, scaled frame to disk without the index column
output_csv_path = 'melb_preprocessed.csv'
data.to_csv(output_csv_path, index=False)
