# Data preprocessing and exploration

In [21]:
!python --version


Python 3.8.3


In [2]:
import pandas as pd
from datetime import datetime
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib
import time
import numpy as np
import os
%matplotlib inline
%load_ext tensorboard

In [3]:
# Report the TensorFlow version together with the GPU (physical) and
# CPU (logical) devices TensorFlow can see, one item per line.
print(
    tf.__version__,
    tf.config.list_physical_devices("GPU"),
    tf.config.list_logical_devices("CPU"),
    sep="\n",
)

2.3.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[LogicalDevice(name='/device:CPU:0', device_type='CPU')]


In [4]:
# Location of the source CSV, relative to the notebook's working directory.
DATASET_PATH = "./data/melb_data.csv"

# Columns removed from the frame before any feature processing.
# 'Lattitude' and 'Longtitude' are spelled exactly as in the CSV header.
EXCLUDE_COLUMNS = [
    'Address',
    'Method',
    'SellerG',
    'Date',
    'Postcode',
    'CouncilArea',
    'Lattitude',
    'Longtitude',
]

# Name of the column that holds the regression target.
TARGET_COLUMN_NAME = 'Price'

In [5]:
# Load the raw dataset into a DataFrame.
melbourne_data = pd.read_csv(filepath_or_buffer=DATASET_PATH)

In [6]:
# List the column labels of the frame as loaded from the CSV.
melbourne_data.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [7]:
# Remove the columns listed in EXCLUDE_COLUMNS.
# Note: this rebinds melbourne_data, so re-running the cell on the already
# reduced frame raises KeyError (the labels are no longer present).
melbourne_data = melbourne_data.drop(labels=EXCLUDE_COLUMNS, axis="columns")

In [8]:
# Detach the target from the features: pop() removes the column from
# melbourne_data in place and returns it as a Series.
target_column = melbourne_data.pop(TARGET_COLUMN_NAME)
# Preview the first target values.
target_column.head()

0    1480000.0
1    1035000.0
2    1465000.0
3     850000.0
4    1600000.0
Name: Price, dtype: float64

In [9]:
# Preview the feature frame after the excluded columns and the target were removed.
melbourne_data.head()

Unnamed: 0,Suburb,Rooms,Type,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Regionname,Propertycount
0,Abbotsford,2,h,2.5,2.0,1.0,1.0,202.0,,,Northern Metropolitan,4019.0
1,Abbotsford,2,h,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Northern Metropolitan,4019.0
2,Abbotsford,3,h,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Northern Metropolitan,4019.0
3,Abbotsford,3,h,2.5,3.0,2.0,1.0,94.0,,,Northern Metropolitan,4019.0
4,Abbotsford,4,h,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Northern Metropolitan,4019.0


In [10]:
# lower 
melbourne_data['Suburb']=melbourne_data['Suburb'].str.lower()
melbourne_data['Regionname']=melbourne_data['Regionname'].str.lower()

# covenrt hige numbers to small
numeric_features = melbourne_data.dtypes[melbourne_data.dtypes != 'object'].index
melbourne_data[numeric_features] = melbourne_data[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
melbourne_data[numeric_features] = melbourne_data[numeric_features].fillna(-1)

In [11]:
# Preview the frame after lower-casing, standardizing and filling missing numeric values.
melbourne_data.head()

Unnamed: 0,Suburb,Rooms,Type,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Regionname,Propertycount
0,abbotsford,-0.981427,h,-1.301437,-0.947,-0.772347,-0.633757,-0.089312,-1.0,-1.0,northern metropolitan,-0.784596
1,abbotsford,-0.981427,h,-1.301437,-0.947,-0.772347,-1.672574,-0.100839,-0.134872,-1.735382,northern metropolitan,-0.784596
2,abbotsford,0.064874,h,-1.301437,0.088281,0.673342,-1.672574,-0.106352,-0.003637,-1.735382,northern metropolitan,-0.784596
3,abbotsford,0.064874,h,-1.301437,0.088281,0.673342,-0.633757,-0.116375,-1.0,-1.0,northern metropolitan,-0.784596
4,abbotsford,1.111175,h,-1.301437,0.088281,-0.772347,0.40506,-0.10986,-0.018424,1.32307,northern metropolitan,-0.784596


In [12]:
# One-hot encode the text columns: every distinct word becomes its own
# indicator column. dummy_na=True additionally creates a "<column>_nan"
# indicator for missing values of each encoded column.
melbourne_data = pd.get_dummies(data=melbourne_data, dummy_na=True)
melbourne_data.shape

(13580, 337)

In [13]:
# Preview the frame after one-hot encoding.
melbourne_data.head()

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,Suburb_abbotsford,...,Type_nan,Regionname_eastern metropolitan,Regionname_eastern victoria,Regionname_northern metropolitan,Regionname_northern victoria,Regionname_south-eastern metropolitan,Regionname_southern metropolitan,Regionname_western metropolitan,Regionname_western victoria,Regionname_nan
0,-0.981427,-1.301437,-0.947,-0.772347,-0.633757,-0.089312,-1.0,-1.0,-0.784596,1,...,0,0,0,1,0,0,0,0,0,0
1,-0.981427,-1.301437,-0.947,-0.772347,-1.672574,-0.100839,-0.134872,-1.735382,-0.784596,1,...,0,0,0,1,0,0,0,0,0,0
2,0.064874,-1.301437,0.088281,0.673342,-1.672574,-0.106352,-0.003637,-1.735382,-0.784596,1,...,0,0,0,1,0,0,0,0,0,0
3,0.064874,-1.301437,0.088281,0.673342,-0.633757,-0.116375,-1.0,-1.0,-0.784596,1,...,0,0,0,1,0,0,0,0,0,0
4,1.111175,-1.301437,0.088281,-0.772347,0.40506,-0.10986,-0.018424,1.32307,-0.784596,1,...,0,0,0,1,0,0,0,0,0,0


In [14]:
# Re-attach the target as a column (pandas aligns the Series on the row index)
# so that features and target are shuffled and split together below.
melbourne_data[TARGET_COLUMN_NAME]=target_column

In [15]:
# Shuffle the rows, then hold out 20 % of them as the test set.
#
# Both sampling steps are seeded. With an unseeded shuffle, the seeded 80 %
# draw selects by position from a different row order on every run, so the
# train/test membership would change between runs despite random_state.
SPLIT_SEED = 200
melbourne_data = melbourne_data.sample(frac=1, random_state=SPLIT_SEED)
train = melbourne_data.sample(frac=0.8, random_state=SPLIT_SEED)
# The test set is every row whose index label was not drawn for training.
test = melbourne_data.drop(train.index)

# Separate the target from the features in both partitions.
target_train = train.pop(TARGET_COLUMN_NAME)
target_test = test.pop(TARGET_COLUMN_NAME)

# The two partitions must cover the dataset exactly once; this would fail
# if the row index contained duplicate labels.
assert len(train) + len(test) == len(melbourne_data), "train/test split does not partition the data"

In [16]:
# Preview the training features (the target column was popped off above).
train.head()

Unnamed: 0,Rooms,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,Propertycount,Suburb_abbotsford,...,Type_nan,Regionname_eastern metropolitan,Regionname_eastern victoria,Regionname_northern metropolitan,Regionname_northern victoria,Regionname_south-eastern metropolitan,Regionname_southern metropolitan,Regionname_western metropolitan,Regionname_western victoria,Regionname_nan
7334,0.064874,0.658103,-0.947,-0.772347,0.40506,0.033223,-0.022121,-0.393956,-0.966162,0,...,0,0,0,0,0,0,1,0,0,0
9534,0.064874,-0.330187,0.088281,0.673342,-0.633757,-0.099586,-0.005485,1.269413,-1.403746,0,...,0,0,0,0,0,0,0,1,0,0
3205,-0.981427,-0.943608,-0.947,-0.772347,-0.633757,0.075322,-0.149659,-0.125671,0.880098,0,...,0,0,0,0,0,0,1,0,0,0
6233,1.111175,0.624024,1.123562,0.673342,0.40506,0.05302,0.014847,0.008472,-0.464629,0,...,0,1,0,0,0,0,0,0,0,0
8121,-0.981427,0.334353,-0.947,0.673342,-0.633757,-0.13993,-0.142265,1.296241,-0.687989,0,...,0,0,0,0,0,0,1,0,0,0


In [17]:
# Preview the training targets; their index labels are those of the rows in train.
target_train.head()

7334     865000.0
9534     800000.0
3205     612500.0
6233    1209000.0
8121     586000.0
Name: Price, dtype: float64

In [20]:
# Number of feature columns in the training frame.
train.shape[1]

337