In [1]:
# USE CASE :- House Price Prediction.
# Dataset :- melb_data.csv.
# Perform the following tasks:
        # 1.load the dataset using pandas.
        # 2.handle inappropriate data.
        # 3.handle missing data.
        # 4.handle categorical data.

In [3]:
import pandas as pd

# Load dataset
data = pd.read_csv("melb_data.csv")

# See first 5 rows
print(data.head())


   Unnamed: 0      Suburb           Address  Rooms Type      Price Method  \
0           1  Abbotsford      85 Turner St      2    h  1480000.0      S   
1           2  Abbotsford   25 Bloomburg St      2    h  1035000.0      S   
2           4  Abbotsford      5 Charles St      3    h  1465000.0     SP   
3           5  Abbotsford  40 Federation La      3    h   850000.0     PI   
4           6  Abbotsford       55a Park St      4    h  1600000.0     VB   

  SellerG       Date  Distance  ...  Bathroom  Car  Landsize  BuildingArea  \
0  Biggin  3/12/2016       2.5  ...       1.0  1.0     202.0           NaN   
1  Biggin  4/02/2016       2.5  ...       1.0  0.0     156.0          79.0   
2  Biggin  4/03/2017       2.5  ...       2.0  0.0     134.0         150.0   
3  Biggin  4/03/2017       2.5  ...       2.0  1.0      94.0           NaN   
4  Nelson  4/06/2016       2.5  ...       1.0  2.0     120.0         142.0   

   YearBuilt  CouncilArea  Lattitude Longtitude             Regionna

In [4]:
# Step 2: Handle inappropriate data

# Drop columns that are not useful
data = data.drop(["Address", "SellerG", "Date", "CouncilArea"], axis=1)


In [7]:
# Step 3: Handle missing data

# Numeric columns → fill with median
for col in data.select_dtypes(include=['float64', 'int64']).columns:
    data[col] = data[col].fillna(data[col].median())

# Categorical columns → fill with mode
for col in data.select_dtypes(include=['object']).columns:
    data[col] = data[col].fillna(data[col].mode()[0])


In [8]:
# Step 4: Handle categorical data

# One-Hot Encoding for categorical variables
data = pd.get_dummies(data, drop_first=True)

print(data.head())


   Unnamed: 0  Rooms      Price  Distance  Postcode  Bedroom2  Bathroom  Car  \
0           1      2  1480000.0       2.5    3067.0       2.0       1.0  1.0   
1           2      2  1035000.0       2.5    3067.0       2.0       1.0  0.0   
2           4      3  1465000.0       2.5    3067.0       3.0       2.0  0.0   
3           5      3   850000.0       2.5    3067.0       3.0       2.0  1.0   
4           6      4  1600000.0       2.5    3067.0       3.0       1.0  2.0   

   Landsize  BuildingArea  ...  Method_SA  Method_SP  Method_VB  \
0     202.0         126.0  ...      False      False      False   
1     156.0          79.0  ...      False      False      False   
2     134.0         150.0  ...      False       True      False   
3      94.0         126.0  ...      False      False      False   
4     120.0         142.0  ...      False      False       True   

   Regionname_Eastern Victoria  Regionname_Northern Metropolitan  \
0                        False                  

In [9]:
# Example: Separate features and target
X = data.drop("Price", axis=1)
y = data["Price"]


In [13]:
X.head()

Unnamed: 0.1,Unnamed: 0,Rooms,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,...,Method_SA,Method_SP,Method_VB,Regionname_Eastern Victoria,Regionname_Northern Metropolitan,Regionname_Northern Victoria,Regionname_South-Eastern Metropolitan,Regionname_Southern Metropolitan,Regionname_Western Metropolitan,Regionname_Western Victoria
0,1,2,2.5,3067.0,2.0,1.0,1.0,202.0,126.0,1970.0,...,False,False,False,False,True,False,False,False,False,False
1,2,2,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,...,False,False,False,False,True,False,False,False,False,False
2,4,3,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,...,False,True,False,False,True,False,False,False,False,False
3,5,3,2.5,3067.0,3.0,2.0,1.0,94.0,126.0,1970.0,...,False,False,False,False,True,False,False,False,False,False
4,6,4,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,...,False,False,True,False,True,False,False,False,False,False
