# Data Cleaning
This notebook handles:
1. Dataset Format Unifying
2. Nulls Estimation
3. Invalid Records Removal
4. Duplicates Removal

In [1]:
import pandas as pd

df = pd.read_csv("../data/Melbourne_Housing_Market.csv")
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [2]:
from data_manipulation import data_cleaning as dc

## Dataset Format Unifying

In [3]:
df = dc.correct_column_names(df)
df.head()

Unnamed: 0,Suburb,Address,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,...,Bathrooms,CarSpots,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [4]:
df = dc.convert_floats_to_ints(df)
df.head()

Unnamed: 0,Suburb,Address,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,...,Bathrooms,CarSpots,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067,...,1,1,126,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067,...,1,1,202,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067,...,1,0,156,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067,...,2,1,0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067,...,2,0,134,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019


In [5]:
df = dc.format_df_cells(df)
df.head()

Unnamed: 0,Suburb,Address,Rooms,UnitType,Price,SaleMethod,RealEstateAgent,SaleDate,DistanceToCBD,Postcode,...,Bathrooms,CarSpots,LandSize,BuildingArea,YearBuilt,CouncilArea,Latitude,Longitude,RegionName,NeighbouringProperties
0,Abbotsford,68_Studley_St,2,House,,Sold_After_Auction_Not_Disclosed,Jellis,2016-03-09,2.5,3067,...,1,1,126,,,Yarra_City,-37.8014,144.9958,Northern_Metropolitan,4019
1,Abbotsford,85_Turner_St,2,House,1480000.0,Sold,Biggin,2016-03-12,2.5,3067,...,1,1,202,,,Yarra_City,-37.7996,144.9984,Northern_Metropolitan,4019
2,Abbotsford,25_Bloomburg_St,2,House,1035000.0,Sold,Biggin,2016-04-02,2.5,3067,...,1,0,156,79.0,1900.0,Yarra_City,-37.8079,144.9934,Northern_Metropolitan,4019
3,Abbotsford,18_659_Victoria_St,3,Duplex,,Vendor_Bid,Rounds,2016-04-02,2.5,3067,...,2,1,0,,,Yarra_City,-37.8114,145.0116,Northern_Metropolitan,4019
4,Abbotsford,5_Charles_St,3,House,1465000.0,Sold_Prior,Biggin,2017-04-03,2.5,3067,...,2,0,134,150.0,1900.0,Yarra_City,-37.8093,144.9944,Northern_Metropolitan,4019


In [6]:
df = dc.reorder_df_columns(df)
df.head()

Unnamed: 0,SaleDate,Address,CouncilArea,RealEstateAgent,RegionName,SaleMethod,Suburb,UnitType,Bathrooms,Bedrooms,...,CarSpots,DistanceToCBD,LandSize,Latitude,Longitude,NeighbouringProperties,Postcode,Rooms,YearBuilt,Price
0,2016-03-09,68_Studley_St,Yarra_City,Jellis,Northern_Metropolitan,Sold_After_Auction_Not_Disclosed,Abbotsford,House,1,2,...,1,2.5,126,-37.8014,144.9958,4019,3067,2,,
1,2016-03-12,85_Turner_St,Yarra_City,Biggin,Northern_Metropolitan,Sold,Abbotsford,House,1,2,...,1,2.5,202,-37.7996,144.9984,4019,3067,2,,1480000.0
2,2016-04-02,25_Bloomburg_St,Yarra_City,Biggin,Northern_Metropolitan,Sold,Abbotsford,House,1,2,...,0,2.5,156,-37.8079,144.9934,4019,3067,2,1900.0,1035000.0
3,2016-04-02,18_659_Victoria_St,Yarra_City,Rounds,Northern_Metropolitan,Vendor_Bid,Abbotsford,Duplex,2,3,...,1,2.5,0,-37.8114,145.0116,4019,3067,3,,
4,2017-04-03,5_Charles_St,Yarra_City,Biggin,Northern_Metropolitan,Sold_Prior,Abbotsford,House,2,3,...,0,2.5,134,-37.8093,144.9944,4019,3067,3,1900.0,1465000.0


## Nulls Estimation

In [7]:
df.isnull().sum()

SaleDate                      0
Address                       0
CouncilArea                   3
RealEstateAgent               0
RegionName                    3
SaleMethod                    0
Suburb                        0
UnitType                      0
Bathrooms                  8226
Bedrooms                   8217
BuildingArea              21115
CarSpots                   8728
DistanceToCBD                 1
LandSize                  11810
Latitude                   7976
Longitude                  7976
NeighbouringProperties        3
Postcode                      1
Rooms                         0
YearBuilt                 19306
Price                      7610
dtype: int64

In [8]:
df = dc.estimate_nulls(df, remove_price=True)
df.isnull().sum()

SaleDate                  0
Address                   0
CouncilArea               0
RealEstateAgent           0
RegionName                0
SaleMethod                0
Suburb                    0
UnitType                  0
Bathrooms                 0
Bedrooms                  0
BuildingArea              0
CarSpots                  0
DistanceToCBD             0
LandSize                  0
Latitude                  0
Longitude                 0
NeighbouringProperties    0
Postcode                  0
Rooms                     0
YearBuilt                 0
Price                     0
dtype: int64

## Invalid Records Removal

In [9]:
df.describe().T

Unnamed: 0,count,mean,min,25%,50%,75%,max,std
SaleDate,27247.0,2017-05-17 18:07:19.975042816,2016-01-28 00:00:00,2016-10-15 00:00:00,2017-07-10 00:00:00,2017-10-28 00:00:00,2018-10-03 00:00:00,
Bathrooms,27247.0,1.451683,0.0,1.0,1.0,2.0,9.0,0.661993
Bedrooms,27247.0,3.035307,0.0,3.0,3.0,3.0,20.0,0.834856
BuildingArea,27247.0,142.321442,0.0,133.0,133.0,133.0,44515.0,281.163583
CarSpots,27247.0,1.786655,0.0,1.0,2.0,2.0,18.0,0.869543
DistanceToCBD,27247.0,11.280247,0.0,6.4,10.5,14.0,48.1,6.787346
LandSize,27247.0,565.779645,0.0,351.0,512.0,592.0,433014.0,3052.546357
Latitude,27247.0,-37.80547,-38.19043,-37.84283,-37.80046,-37.766,-37.3978,0.080466
Longitude,27247.0,144.9982,144.42379,144.95435,145.0032,145.04835,145.52635,0.105963
NeighbouringProperties,27247.0,7566.67101,83.0,4294.0,6567.0,10412.0,21650.0,4492.147337


No Invalid Records Were Found

## Duplicates Removal

In [10]:
df.duplicated().sum()

np.int64(0)

In [11]:
df = dc.drop_duplicates(df)
df.duplicated().sum()

np.int64(0)

## Clean Data Saving

In [12]:
df.head()

Unnamed: 0,SaleDate,Address,CouncilArea,RealEstateAgent,RegionName,SaleMethod,Suburb,UnitType,Bathrooms,Bedrooms,...,CarSpots,DistanceToCBD,LandSize,Latitude,Longitude,NeighbouringProperties,Postcode,Rooms,YearBuilt,Price
1,2016-03-12,85_Turner_St,Yarra_City,Biggin,Northern_Metropolitan,Sold,Abbotsford,House,1.0,2.0,...,1.0,2.5,202.0,-37.7996,144.9984,4019.0,3067.0,2.0,1970.0,1480000.0
2,2016-04-02,25_Bloomburg_St,Yarra_City,Biggin,Northern_Metropolitan,Sold,Abbotsford,House,1.0,2.0,...,0.0,2.5,156.0,-37.8079,144.9934,4019.0,3067.0,2.0,1900.0,1035000.0
4,2017-04-03,5_Charles_St,Yarra_City,Biggin,Northern_Metropolitan,Sold_Prior,Abbotsford,House,2.0,3.0,...,0.0,2.5,134.0,-37.8093,144.9944,4019.0,3067.0,3.0,1900.0,1465000.0
5,2017-04-03,40_Federation_La,Yarra_City,Biggin,Northern_Metropolitan,Passed_In,Abbotsford,House,2.0,3.0,...,1.0,2.5,94.0,-37.7969,144.9969,4019.0,3067.0,3.0,1970.0,850000.0
6,2016-04-06,55A_Park_St,Yarra_City,Nelson,Northern_Metropolitan,Vendor_Bid,Abbotsford,House,1.0,3.0,...,2.0,2.5,120.0,-37.8072,144.9941,4019.0,3067.0,4.0,2014.0,1600000.0


In [13]:
df.to_csv("../data/CLEANED_Melbourne_Housing_Market.csv", index=False)