In [1]:
# Perform data preprocessing on the melb_data.csv dataset from a statistical perspective.
# The dataset can be downloaded from
# https://www.kaggle.com/datasets/gunjanpathak/melb-data?resource=download

In [3]:
import pandas as pd

# Read the Melbourne housing CSV (expected in the working directory) into a DataFrame.
df = pd.read_csv("melb_data.csv")

# Preview the first five records to sanity-check the load.
df.head(n=5)


Unnamed: 0.1,Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,5,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,6,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
# Report the dataset dimensions as a (rows, columns) tuple.
n_rows, n_cols = df.shape
print("Shape:", (n_rows, n_cols))

Shape: (18396, 22)


In [5]:
# Column dtypes and non-null counts (a quick view of where values are missing).
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly; wrapping it in print() only appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18396 entries, 0 to 18395
Data columns (total 22 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     18396 non-null  int64  
 1   Suburb         18396 non-null  object 
 2   Address        18396 non-null  object 
 3   Rooms          18396 non-null  int64  
 4   Type           18396 non-null  object 
 5   Price          18396 non-null  float64
 6   Method         18396 non-null  object 
 7   SellerG        18396 non-null  object 
 8   Date           18396 non-null  object 
 9   Distance       18395 non-null  float64
 10  Postcode       18395 non-null  float64
 11  Bedroom2       14927 non-null  float64
 12  Bathroom       14925 non-null  float64
 13  Car            14820 non-null  float64
 14  Landsize       13603 non-null  float64
 15  BuildingArea   7762 non-null   float64
 16  YearBuilt      8958 non-null   float64
 17  CouncilArea    12233 non-null  object 
 18  Lattit

In [6]:
# Descriptive statistics for the numeric columns:
# count, mean, std, min, quartiles (25% / 50% / 75%) and max.
summary_stats = df.describe()
print(summary_stats)

         Unnamed: 0         Rooms         Price      Distance      Postcode  \
count  18396.000000  18396.000000  1.839600e+04  18395.000000  18395.000000   
mean   11826.787073      2.935040  1.056697e+06     10.389986   3107.140147   
std     6800.710448      0.958202  6.419217e+05      6.009050     95.000995   
min        1.000000      1.000000  8.500000e+04      0.000000   3000.000000   
25%     5936.750000      2.000000  6.330000e+05      6.300000   3046.000000   
50%    11820.500000      3.000000  8.800000e+05      9.700000   3085.000000   
75%    17734.250000      3.000000  1.302000e+06     13.300000   3149.000000   
max    23546.000000     12.000000  9.000000e+06     48.100000   3978.000000   

           Bedroom2      Bathroom           Car       Landsize  BuildingArea  \
count  14927.000000  14925.000000  14820.000000   13603.000000   7762.000000   
mean       2.913043      1.538492      1.615520     558.116371    151.220219   
std        0.964641      0.689311      0.955916 

In [7]:
# Count the missing (NaN) entries in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)


Unnamed: 0           0
Suburb               0
Address              0
Rooms                0
Type                 0
Price                0
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom2          3469
Bathroom          3471
Car               3576
Landsize          4793
BuildingArea     10634
YearBuilt         9438
CouncilArea       6163
Lattitude         3332
Longtitude        3332
Regionname           1
Propertycount        1
dtype: int64


In [9]:
# BuildingArea and YearBuilt have the most missing entries, so remove them.
# errors="ignore" lets this cell be re-run after the columns are already gone.
sparse_columns = ["BuildingArea", "YearBuilt"]
df = df.drop(columns=sparse_columns, errors="ignore")

# Impute the remaining numeric gaps with each column's median.
# Only numeric columns receive a fill value here, so non-numeric columns
# (e.g. CouncilArea) keep their NaNs.
numeric_medians = df.median(numeric_only=True)
df = df.fillna(numeric_medians)


In [10]:
# Flag outliers per numeric column with the 1.5 * IQR rule:
# a value is an outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
numeric_cols = df.select_dtypes(include=["int64", "float64"])

Q1, Q3 = numeric_cols.quantile(0.25), numeric_cols.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Boolean frame: True wherever a cell falls outside its column's fences.
outliers = (numeric_cols < lower_fence) | (numeric_cols > upper_fence)
print("Number of outliers per column:\n", outliers.sum())


Number of outliers per column:
 Unnamed: 0         0
Rooms            894
Price            852
Distance         597
Postcode         316
Bedroom2         713
Bathroom         153
Car              699
Landsize         491
Lattitude        758
Longtitude       990
Propertycount    541
dtype: int64


In [11]:
# Pairwise correlation between all numeric columns.
corr_matrix = df.corr(numeric_only=True)

# Rank every numeric feature by its correlation with Price, strongest positive first.
price_corr = corr_matrix.loc[:, "Price"]
print(price_corr.sort_values(ascending=False))


Price            1.000000
Rooms            0.496430
Bedroom2         0.423709
Bathroom         0.404107
Car              0.204104
Longtitude       0.176656
Postcode         0.098796
Landsize         0.032847
Propertycount   -0.051247
Unnamed: 0      -0.071270
Distance        -0.169775
Lattitude       -0.184717
Name: Price, dtype: float64


In [12]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each variable so the indicator columns are not perfectly collinear.
# NOTE(review): no `columns=` argument is given, so every object-dtype column is
# encoded - confirm this is intended for free-text columns such as Address.
df = pd.get_dummies(data=df, drop_first=True)

print(df.head(n=5))


   Unnamed: 0  Rooms      Price  Distance  Postcode  Bedroom2  Bathroom  Car  \
0           1      2  1480000.0       2.5    3067.0       2.0       1.0  1.0   
1           2      2  1035000.0       2.5    3067.0       2.0       1.0  0.0   
2           4      3  1465000.0       2.5    3067.0       3.0       2.0  0.0   
3           5      3   850000.0       2.5    3067.0       3.0       2.0  1.0   
4           6      4  1600000.0       2.5    3067.0       3.0       1.0  2.0   

   Landsize  Lattitude  ...  CouncilArea_Wyndham  CouncilArea_Yarra  \
0     202.0   -37.7996  ...                False               True   
1     156.0   -37.8079  ...                False               True   
2     134.0   -37.8093  ...                False               True   
3      94.0   -37.7969  ...                False               True   
4     120.0   -37.8072  ...                False               True   

   CouncilArea_Yarra Ranges  Regionname_Eastern Victoria  \
0                     False     

In [13]:
# Rich preview of the first five rows of the current frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Rooms,Price,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,Lattitude,...,CouncilArea_Wyndham,CouncilArea_Yarra,CouncilArea_Yarra Ranges,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
0,1,2,1480000.0,2.5,3067.0,2.0,1.0,1.0,202.0,-37.7996,...,False,True,False,False,True,False,False,False,False,False
1,2,2,1035000.0,2.5,3067.0,2.0,1.0,0.0,156.0,-37.8079,...,False,True,False,False,True,False,False,False,False,False
2,4,3,1465000.0,2.5,3067.0,3.0,2.0,0.0,134.0,-37.8093,...,False,True,False,False,True,False,False,False,False,False
3,5,3,850000.0,2.5,3067.0,3.0,2.0,1.0,94.0,-37.7969,...,False,True,False,False,True,False,False,False,False,False
4,6,4,1600000.0,2.5,3067.0,3.0,1.0,2.0,120.0,-37.8072,...,False,True,False,False,True,False,False,False,False,False
