# Data Cleaning
This notebook handles:
1. Dataset Format Unifying
2. Nulls Estimation
3. Duplicates Removal
4. Outlier Handling

In [16]:
import pandas as pd

df = pd.read_csv("data/Melbourne_Housing_Market.csv")
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


## Dataset Format Unifying

In [17]:
df.rename(columns={"Type": "UnitType", "Method": "SaleMethod", "SellerG": "RealEstateAgent", "Date": "SaleDate",
                   "Distance": "DistanceToCBD", "Bedroom2": "Bedrooms", "Bathroom": "Bathrooms", "Car": "CarSpots",
                   "Landsize": "LandSize", "Lattitude": "Latitude", "Longtitude": "Longitude",
                   "Regionname": "RegionName", "Propertycount": "NeighbouringProperties"}, inplace=True)
df.head()

Unnamed: 0,Suburb,Address,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,...,Bathrooms,CarSpots,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [18]:
df["StreetName"] = df["Address"].str.extract(r"^\S+\s(\w+)\s")
df["StreetType"] = df["Address"].str.extract(r".*\s(\w+)$")
df.drop(columns=["Address"], inplace=True)
df.head()

Unnamed: 0,Suburb,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,Bedrooms,...,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties,StreetName,StreetType
0,Abbotsford,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,2.0,...,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0,Studley,St
1,Abbotsford,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,...,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0,Turner,St
2,Abbotsford,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,...,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0,Bloomburg,St
3,Abbotsford,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,3.0,...,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0,Victoria,St
4,Abbotsford,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,...,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0,Charles,St


In [19]:
streetTypeAbbreviations = {"St": "Street", "Rd": "Road", "Av": "Avenue", "Ct": "Court", "Dr": "Drive", "Cr": "Crescent",
                           "Gr": "Grove", "Pl": "Place", "Pde": "Parade", "Cl": "Close", "Wy": "Way", "La": "Lane",
                           "Bvd": "Boulevard", "Tce": "Terrace", "Cct": "Circuit", "Hwy": "Highway", "Avenue": "Avenue",
                           "Ri": "Rise", "Wk": "Walk", "Mw": "Meander Way", "Boulevard": "Boulevard", "Sq": "Square",
                           "Parade": "Parade", "Esplanade": "Esplanade", "N": "North", "Qd": "Quay", "Cir": "Circle",
                           "Vw": "View", "S": "South", "Crescent": "Crescent", "Prm": "Promenade", "Gdns": "Gardens",
                           "W": "West", "Strand": "Strand", "Grove": "Grove", "Ridge": "Ridge", "Vs": "Views",
                           "Ch": "Chase", "Fairway": "Fairway", "Righi": "Right", "E": "East", "Grn": "Green",
                           "Wyn": "Way", "Gln": "Glen", "Esp": "Esplanade", "Bnd": "Bend", "Mews": "Mews",
                           "Rdg": "Ridge", "Lp": "Loop", "Crest": "Crest", "Grange": "Grange", "Crossway": "Crossway",
                           "Pky": "Parkway", "Gra": "Grange", "Rt": "Route", "Res": "Reserve", "Wky": "Way",
                           "East": "East", "Lk": "Lake", "Nk": "Nook", "Gwy": "Gateway", "Mall": "Mall",
                           "Highway": "Highway", "Ambl": "Ambleside", "Terrace": "Terrace", "Pt": "Point",
                           "Parkway": "Parkway", "Street": "Street", "Corso": "Corso", "Outlook": "Outlook",
                           "Media": "Media", "Hub": "Hub", "Crofts": "Crofts", "Victoria": "Victoria", "Nth": "North",
                           "Athol": "Athol", "Nook": "Nook", "Rise": "Rise", "Greenway": "Greenway", "Views": "Views",
                           "street": "Street", "Hl": "Hill", "Glade": "Glade", "Cove": "Cove", "Qy": "Quay",
                           "Lairidge": "Lairidge", "Scala": "Scala", "Broadway": "Broadway", "Road": "Road",
                           "Prst": "Prestwick", "Grand": "Grand", "Loop": "Loop", "Eyrie": "Eyrie", "Dell": "Dell",
                           "Gve": "Grove", "Pkt": "Pocket", "Al": "Alley", "West": "West", "Hts": "Heights",
                           "Aveue": "Avenue", "Summit": "Summit", "Ave": "Avenue", "Woodland": "Woodland",
                           "Edg": "Edge", "Skyline": "Skyline", "Out": "Outlook", "Range": "Range", "Hth": "Heath",
                           "Atrium": "Atrium", "Gables": "Gables", "Mears": "Mears", "App": "Approach", "Brk": "Brook",
                           "Spur": "Spur", "Court": "Court", "Pass": "Pass", "Gld": "Gold", "Crse": "Course",
                           "Ps": "Passage", "Entrance": "Entrance", "Heights": "Heights", "Boulevarde": "Boulevarde",
                           "Circuit": "Circuit", "Parks": "Parks", "Ridgeway": "Ridgeway", "Panorama": "Panorama",
                           "Briars": "Briars"}
df["StreetType"] = df["StreetType"].replace(streetTypeAbbreviations)
df.head()

Unnamed: 0,Suburb,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,Bedrooms,...,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties,StreetName,StreetType
0,Abbotsford,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,2.0,...,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0,Studley,Street
1,Abbotsford,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,...,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0,Turner,Street
2,Abbotsford,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,...,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0,Bloomburg,Street
3,Abbotsford,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,3.0,...,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0,Victoria,Street
4,Abbotsford,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,...,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0,Charles,Street


In [20]:
floatColumns = ["DistanceToCBD", "BuildingArea", "Latitude", "Longitude"]
intColumns = [column for column in df.select_dtypes(exclude=object).columns if column not in floatColumns]
for column in intColumns:
    df[column] = pd.to_numeric(df[column], errors="coerce").astype("Int64")
df.head()

Unnamed: 0,Suburb,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,Bedrooms,...,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties,StreetName,StreetType
0,Abbotsford,2,h,,SS,Jellis,3/09/2016,2.5,3067,2,...,126,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019,Studley,Street
1,Abbotsford,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067,2,...,202,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019,Turner,Street
2,Abbotsford,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067,2,...,156,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019,Bloomburg,Street
3,Abbotsford,3,u,,VB,Rounds,4/02/2016,2.5,3067,3,...,0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019,Victoria,Street
4,Abbotsford,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067,3,...,134,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019,Charles,Street


In [21]:
df["UnitType"] = df["UnitType"].replace({"h": "House", "u": "Duplex", "t": "Town House"})
df.head()

Unnamed: 0,Suburb,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,Bedrooms,...,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties,StreetName,StreetType
0,Abbotsford,2,House,,SS,Jellis,3/09/2016,2.5,3067,2,...,126,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019,Studley,Street
1,Abbotsford,2,House,1480000.0,S,Biggin,3/12/2016,2.5,3067,2,...,202,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019,Turner,Street
2,Abbotsford,2,House,1035000.0,S,Biggin,4/02/2016,2.5,3067,2,...,156,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019,Bloomburg,Street
3,Abbotsford,3,Duplex,,VB,Rounds,4/02/2016,2.5,3067,3,...,0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019,Victoria,Street
4,Abbotsford,3,House,1465000.0,SP,Biggin,4/03/2017,2.5,3067,3,...,134,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019,Charles,Street


In [22]:
saleMethodAbbreviations = {"S": "Sold", "SP": "Sold Prior", "PI": "Passed In", "VB": "Vendor Bid",
                           "SN": "Sold Not Disclosed", "PN": "Sold Prior Not Disclosed", "SA": "Sold After Auction",
                           "W": "Withdrawn Prior to Auction", "SS": "Sold After Auction Not Disclosed"}
df["SaleMethod"] = df["SaleMethod"].replace(saleMethodAbbreviations)
df.head()

Unnamed: 0,Suburb,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,Bedrooms,...,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties,StreetName,StreetType
0,Abbotsford,2,House,,Sold After Auction Not Disclosed,Jellis,3/09/2016,2.5,3067,2,...,126,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019,Studley,Street
1,Abbotsford,2,House,1480000.0,Sold,Biggin,3/12/2016,2.5,3067,2,...,202,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019,Turner,Street
2,Abbotsford,2,House,1035000.0,Sold,Biggin,4/02/2016,2.5,3067,2,...,156,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019,Bloomburg,Street
3,Abbotsford,3,Duplex,,Vendor Bid,Rounds,4/02/2016,2.5,3067,3,...,0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019,Victoria,Street
4,Abbotsford,3,House,1465000.0,Sold Prior,Biggin,4/03/2017,2.5,3067,3,...,134,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019,Charles,Street


In [23]:
df["SaleDate"] = pd.to_datetime(df["SaleDate"], format="mixed")
df.head()

Unnamed: 0,Suburb,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,Bedrooms,...,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties,StreetName,StreetType
0,Abbotsford,2,House,,Sold After Auction Not Disclosed,Jellis,2016-03-09,2.5,3067,2,...,126,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019,Studley,Street
1,Abbotsford,2,House,1480000.0,Sold,Biggin,2016-03-12,2.5,3067,2,...,202,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019,Turner,Street
2,Abbotsford,2,House,1035000.0,Sold,Biggin,2016-04-02,2.5,3067,2,...,156,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019,Bloomburg,Street
3,Abbotsford,3,Duplex,,Vendor Bid,Rounds,2016-04-02,2.5,3067,3,...,0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019,Victoria,Street
4,Abbotsford,3,House,1465000.0,Sold Prior,Biggin,2017-04-03,2.5,3067,3,...,134,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019,Charles,Street


In [24]:
df["CouncilArea"] = df["CouncilArea"].str.replace(" Council", "")
df.head()

Unnamed: 0,Suburb,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,Bedrooms,...,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties,StreetName,StreetType
0,Abbotsford,2,House,,Sold After Auction Not Disclosed,Jellis,2016-03-09,2.5,3067,2,...,126,,,Yarra City,-37.8014,144.9958,Northern Metropolitan,4019,Studley,Street
1,Abbotsford,2,House,1480000.0,Sold,Biggin,2016-03-12,2.5,3067,2,...,202,,,Yarra City,-37.7996,144.9984,Northern Metropolitan,4019,Turner,Street
2,Abbotsford,2,House,1035000.0,Sold,Biggin,2016-04-02,2.5,3067,2,...,156,79.0,1900.0,Yarra City,-37.8079,144.9934,Northern Metropolitan,4019,Bloomburg,Street
3,Abbotsford,3,Duplex,,Vendor Bid,Rounds,2016-04-02,2.5,3067,3,...,0,,,Yarra City,-37.8114,145.0116,Northern Metropolitan,4019,Victoria,Street
4,Abbotsford,3,House,1465000.0,Sold Prior,Biggin,2017-04-03,2.5,3067,3,...,134,150.0,1900.0,Yarra City,-37.8093,144.9944,Northern Metropolitan,4019,Charles,Street


In [25]:
nonAlNumLetters = set()
for column in df.select_dtypes(include=object).columns:
    for cell in df[column]:
        if isinstance(cell, str):
            for letter in cell:
                if not letter.isalnum():
                    nonAlNumLetters.add(letter)
nonAlNumLetters

{' ', '&', "'", '-', '.', '/', '@'}

In [26]:
for column in df.select_dtypes(include=object).columns:
    df[column] = df[column].str.strip().str.title()
    for nonAlNumLetter in nonAlNumLetters:
        df[column] = df[column].str.replace(nonAlNumLetter, "_")
    df[column] = df[column].str.strip("_")
df.head()

Unnamed: 0,Suburb,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,Bedrooms,...,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties,StreetName,StreetType
0,Abbotsford,2,House,,Sold_After_Auction_Not_Disclosed,Jellis,2016-03-09,2.5,3067,2,...,126,,,Yarra_City,-37.8014,144.9958,Northern_Metropolitan,4019,Studley,Street
1,Abbotsford,2,House,1480000.0,Sold,Biggin,2016-03-12,2.5,3067,2,...,202,,,Yarra_City,-37.7996,144.9984,Northern_Metropolitan,4019,Turner,Street
2,Abbotsford,2,House,1035000.0,Sold,Biggin,2016-04-02,2.5,3067,2,...,156,79.0,1900.0,Yarra_City,-37.8079,144.9934,Northern_Metropolitan,4019,Bloomburg,Street
3,Abbotsford,3,Duplex,,Vendor_Bid,Rounds,2016-04-02,2.5,3067,3,...,0,,,Yarra_City,-37.8114,145.0116,Northern_Metropolitan,4019,Victoria,Street
4,Abbotsford,3,House,1465000.0,Sold_Prior,Biggin,2017-04-03,2.5,3067,3,...,134,150.0,1900.0,Yarra_City,-37.8093,144.9944,Northern_Metropolitan,4019,Charles,Street


In [27]:
df = df[df.select_dtypes(include=object).columns.tolist() + df.select_dtypes(exclude=object).columns.tolist()]
columns = ["SaleDate"] + [column for column in df.columns if column not in ["SaleDate", "Price"]] + ["Price"]
df = df[columns]
df.head()

Unnamed: 0,SaleDate,Suburb,UnitType,SaleMethod,RealEstateAgent,CouncilArea,RegionName,StreetName,StreetType,Rooms,...,Bedrooms,Bathrooms,CarSpots,LandSize,BuildingArea,YearBuilt,Latitude,Longitude,NeighbouringProperties,Price
0,2016-03-09,Abbotsford,House,Sold_After_Auction_Not_Disclosed,Jellis,Yarra_City,Northern_Metropolitan,Studley,Street,2,...,2,1,1,126,,,-37.8014,144.9958,4019,
1,2016-03-12,Abbotsford,House,Sold,Biggin,Yarra_City,Northern_Metropolitan,Turner,Street,2,...,2,1,1,202,,,-37.7996,144.9984,4019,1480000.0
2,2016-04-02,Abbotsford,House,Sold,Biggin,Yarra_City,Northern_Metropolitan,Bloomburg,Street,2,...,2,1,0,156,79.0,1900.0,-37.8079,144.9934,4019,1035000.0
3,2016-04-02,Abbotsford,Duplex,Vendor_Bid,Rounds,Yarra_City,Northern_Metropolitan,Victoria,Street,3,...,3,2,1,0,,,-37.8114,145.0116,4019,
4,2017-04-03,Abbotsford,House,Sold_Prior,Biggin,Yarra_City,Northern_Metropolitan,Charles,Street,3,...,3,2,0,134,150.0,1900.0,-37.8093,144.9944,4019,1465000.0


## Nulls Estimation

In [28]:
df.isnull().sum()

SaleDate                      0
Suburb                        0
UnitType                      0
SaleMethod                    0
RealEstateAgent               0
CouncilArea                   3
RegionName                    3
StreetName                    0
StreetType                    0
Rooms                         0
DistanceToCBD                 1
Postcode                      1
Bedrooms                   8217
Bathrooms                  8226
CarSpots                   8728
LandSize                  11810
BuildingArea              21115
YearBuilt                 19306
Latitude                   7976
Longitude                  7976
NeighbouringProperties        3
Price                      7610
dtype: int64

In [29]:
df.dropna(subset=["Price"], inplace=True)
df.isnull().sum()

SaleDate                      0
Suburb                        0
UnitType                      0
SaleMethod                    0
RealEstateAgent               0
CouncilArea                   3
RegionName                    3
StreetName                    0
StreetType                    0
Rooms                         0
DistanceToCBD                 1
Postcode                      1
Bedrooms                   6441
Bathrooms                  6447
CarSpots                   6824
LandSize                   9265
BuildingArea              16591
YearBuilt                 15163
Latitude                   6254
Longitude                  6254
NeighbouringProperties        3
Price                         0
dtype: int64

In [30]:
from sklearn.impute import SimpleImputer


imputer = SimpleImputer(strategy='median')
df["SaleDate"] = df["SaleDate"].astype("int64")
df[df.select_dtypes(exclude=object).columns] = imputer.fit_transform(df[df.select_dtypes(exclude=object).columns])
df["SaleDate"] = pd.to_datetime(df["SaleDate"])
df[df.select_dtypes(exclude=object).columns].isnull().sum()

SaleDate                  0
Rooms                     0
DistanceToCBD             0
Postcode                  0
Bedrooms                  0
Bathrooms                 0
CarSpots                  0
LandSize                  0
BuildingArea              0
YearBuilt                 0
Latitude                  0
Longitude                 0
NeighbouringProperties    0
Price                     0
dtype: int64

In [31]:
imputer = SimpleImputer(strategy='most_frequent')
df[df.select_dtypes(include=object).columns] = imputer.fit_transform(df[df.select_dtypes(include=object).columns])
df[df.select_dtypes(include=object).columns].isnull().sum()

Suburb             0
UnitType           0
SaleMethod         0
RealEstateAgent    0
CouncilArea        0
RegionName         0
StreetName         0
StreetType         0
dtype: int64

In [32]:
df.isnull().sum()

SaleDate                  0
Suburb                    0
UnitType                  0
SaleMethod                0
RealEstateAgent           0
CouncilArea               0
RegionName                0
StreetName                0
StreetType                0
Rooms                     0
DistanceToCBD             0
Postcode                  0
Bedrooms                  0
Bathrooms                 0
CarSpots                  0
LandSize                  0
BuildingArea              0
YearBuilt                 0
Latitude                  0
Longitude                 0
NeighbouringProperties    0
Price                     0
dtype: int64

## Duplicates Removal

In [34]:
df.duplicated().sum()

np.int64(7)

In [35]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

np.int64(0)