In [1]:
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer,PorterStemmer
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout, Bidirectional, Dropout
import seaborn as sns

# Fetch the NLTK data packages used by this notebook. The stop-word corpus
# feeds the STOPWORDS set built for clean_text; punkt and wordnet back the
# tokenizer/lemmatizer imports above (not called in the cells shown here).
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Avin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Avin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Avin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [4]:
# Location of the listings CSV on the mounted Google Drive.
DATA_PATH = '/content/drive/My Drive/databackup.csv'

# Read the raw listings and preview the first rows.
house_data = pd.read_csv(DATA_PATH)
house_data.head()

FileNotFoundError: [Errno 2] File databackup.csv does not exist: 'databackup.csv'

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
house_data.info()

# Observation from the original run: all 11 columns are strings.

In [None]:
# Discard every row that contains a null, then skip the first remaining row,
# which the author identified as carrying no data.
# NOTE(review): dropna() runs first, so a fully empty first row would already
# be gone by the time iloc[1:] executes - confirm this does not drop a valid row.
house_data = house_data.dropna().iloc[1:]

Tidying up


The response variable is house price.
There can be multiple predictors, including the suburb, amenities, description, type and schools.

In [None]:
# Candidate predictor columns for the house price (first pass; this list is
# redefined later once numeric features have been extracted).
features_col = ['Suburb', 'Profile', 'Amenities', 'Type','Address']
# Display the selected columns.
house_data[features_col]

In [None]:
# Text-cleaning helpers for the free-text listing columns.
# The patterns are raw strings so that \[ \] and \| reach the regex engine
# unchanged instead of being parsed as (invalid) Python string escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """Normalise a free-text field for downstream grouping and encoding.

    Steps, in order: lowercase; replace the punctuation matched by
    REPLACE_BY_SPACE_RE with a space; replace every character outside
    ``0-9 a-z space # + _`` with a space; replace every letter 'x' with a
    space; drop English stop words and collapse runs of whitespace.

    Parameters
    ----------
    text : str
        Raw cell value.

    Returns
    -------
    str
        The cleaned text, tokens separated by single spaces.
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    # Remaining disallowed symbols are replaced by a space (not deleted) so
    # neighbouring tokens are never fused together.
    text = BAD_SYMBOLS_RE.sub(' ', text)
    # NOTE(review): this blanks *every* 'x', including inside words
    # (e.g. 'box' -> 'bo '). Kept as-is because later cells depend on the
    # resulting values - confirm whether it is intentional.
    text = text.replace('x', ' ')
    # split()/join also collapses the extra spaces introduced above.
    return ' '.join(word for word in text.split() if word not in STOPWORDS)

# Run clean_text over each free-text column, overwriting it in place.
# 'Schools' is deliberately left out: clean_text would replace the '.' in its
# decimal distance, which is parsed from that column further down.
for text_column in ('Profile', 'Amenities', 'Type', 'Suburb'):
    house_data[text_column] = house_data[text_column].apply(clean_text)

In [None]:
# Further cleaning of the 'Suburb' column: strip the state abbreviation so
# that (once the postcode digits are removed) only the suburb name remains.
def clean_sub(text):
    """Remove the standalone state token 'vic' from a lower-cased suburb string.

    Only occurrences that are not part of a longer alphabetic word are
    removed, so names that merely contain the letters (e.g. 'victoria')
    are left intact. Surrounding whitespace is not touched.

    Parameters
    ----------
    text : str
        Lower-cased suburb string, e.g. 'box hill vic 3128'.

    Returns
    -------
    str
        The input with each standalone 'vic' deleted.
    """
    # Lookarounds instead of \b so that 'vic' glued to digits ('vic3128')
    # is still treated as the state token.
    return re.sub(r'(?<![a-z])vic(?![a-z])', '', text)


# Remove the state abbreviation from every suburb string.
house_data['Suburb'] = house_data['Suburb'].apply(clean_sub)
# Remove the postcode digits. regex=True is passed explicitly because
# Series.str.replace treats the pattern as a literal string by default in
# pandas >= 2.0, which would leave the digits in place.
house_data['Suburb'] = house_data['Suburb'].str.replace(r'\d+', '', regex=True)

In [None]:
# Normalise the raw price strings so they can be parsed as numbers.
def remove(text):
    """Strip listing decorations from a raw price string.

    Applies, in order: delete 'SOLD -', turn 'Price Withheld' into '0',
    delete '$', delete ','. Whitespace is left untouched.

    Parameters
    ----------
    text : str
        Raw price cell.

    Returns
    -------
    str
        The price with the decorations removed.
    """
    substitutions = (
        ('SOLD -', ''),
        ('Price Withheld', '0'),
        ('$', ''),
        (',', ''),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

# Strip the textual decorations from every price.
house_data['Price'] = house_data['Price'].apply(remove)
# Parse to numbers; anything that is still not numeric becomes NaN instead of
# aborting the whole conversion.
house_data['Price'] = pd.to_numeric(house_data['Price'], errors='coerce')
# Keep only real sale prices: withheld prices (0) and unparseable values (NaN)
# both fail the > 0 comparison and are dropped.
house_data = house_data[house_data['Price'] > 0]

In [None]:
# Histogram of the sale prices (the author observed a skewed distribution).
house_data.hist(column='Price')

In [None]:
# Disabled experiment: everything below is a bare string literal, so none of
# it executes. It sketched replacing withheld (0) prices with a fixed median
# figure rather than dropping those rows.
'''
#Median price of the houses
median_price = house_data.median()
print(median_price)
#Replace the null values of house price data with the median figure
house_data=house_data.replace({'Price': {0: 775000.0}})
house_data['Price']
'''

In [None]:
# Display the selected feature columns again after the cleaning steps above.
house_data[features_col]

In [None]:
# Exclude listings whose cleaned property type is exactly 'vacant land'.
house_data = house_data.loc[house_data['Type'].ne('vacant land')]

Exploratory Data Analysis

In [None]:
# Mean sale price per property type, shown as a bar chart.
# The distinct type labels are printed first for reference.
type_labels = house_data['Type'].unique()
print(type_labels)
df = house_data.groupby('Type')['Price'].mean()
df.plot(kind='bar')

In [None]:
# Mean sale price per suburb, shown as a bar chart.
df = house_data.groupby('Suburb')['Price'].mean()
df.plot(kind='bar')

Feature Extraction

Extracting the number of beds/baths and parking from Amenities

In [None]:
# Inspect one cleaned Amenities string to see the text the regex must parse.
# NOTE(review): 2096 is looked up by index label, so this fails if that row
# was removed by the filtering above - confirm it exists in the loaded data.
house_data.Amenities[2096] 

In [None]:
# Extract the bed / bath / parking counts from Amenities into their own
# columns. Every count uses \d+ so multi-digit values (e.g. '10 beds') are
# captured whole rather than truncated to their last digit.
regex = r'(?P<beds>\d+)\sbeds?\s(?P<bath>\d+)\sbaths?\s?(?P<parking>\d+)?'
amenity_counts = house_data['Amenities'].str.extract(regex)
# Drop any columns left over from a previous run of this cell first, so that
# re-running it does not produce duplicate 'beds'/'bath'/'parking' columns.
house_data = pd.concat(
    [house_data.drop(columns=amenity_counts.columns, errors='ignore'), amenity_counts],
    axis=1,
)

In [None]:
# Listings where a count could not be extracted get 0 for that count.
count_columns = ['parking', 'beds', 'bath']
house_data[count_columns] = house_data[count_columns].fillna(0)

In [None]:
# Replace the free-text columns in the feature list with the extracted counts.
features_col = ['Suburb', 'Type', 'beds', 'bath', 'parking']
# Display the updated feature columns.
house_data[features_col]

In [None]:
# List the distinct bedroom counts produced by the extraction.
house_data['beds'].unique()

In [None]:
# Mean sale price per bedroom count, shown as a bar chart.
df = house_data.groupby('beds')['Price'].mean()
df.plot(kind='bar')

In [None]:
# Mean sale price per bathroom count, shown as a bar chart.
df = house_data.groupby('bath')['Price'].mean()
df.plot(kind='bar')

In [None]:
# Mean sale price per parking-space count, shown as a bar chart.
df = house_data.groupby('parking')['Price'].mean()
df.plot(kind='bar')

Extracting the street names from Address

In [None]:
# Split each address on runs of digits and '/', then keep only piece 1 (the
# text between the first and second such run).
# NOTE(review): dropping columns 0, 2 and 3 presumes the split yields exactly
# those columns for this data - confirm against the actual addresses.
df = house_data['Address'].str.split(r'[/0-9]+', expand=True).drop(columns=[0,2,3])
# Split that piece on single spaces and keep only tokens 1 and 2.
# NOTE(review): presumes columns 0-7 all exist after the split - confirm.
df = df[1].str.split(' ', expand=True).drop(columns=[0,3,4,5,6,7])
# Join the two kept tokens with a space to form the street name.
house_data['street'] = df[1].str.cat(df[2], sep =" ") 

In [None]:
# Display the derived street column.
house_data.street

Extracting the distance to the nearest school

In [None]:
def extract(text):
    """Return the first number found in ``text`` as a string.

    A number is a run of digits with an optional decimal part ('3', '0.5'),
    or a bare decimal fraction ('.5'). A lone '.' - such as the one in
    'St.' - is never treated as a number.

    Parameters
    ----------
    text : str
        Free text expected to contain a numeric value.

    Returns
    -------
    str
        The matched number, unparsed.

    Raises
    ------
    ValueError
        If ``text`` contains no number.
    """
    match = re.search(r'\d+(?:\.\d+)?|\.\d+', text)
    if match is None:
        # Fail with a message that names the offending value.
        raise ValueError('no numeric value found in: {!r}'.format(text))
    return match.group()

# Replace each Schools entry with the numeric text that extract() pulls from it
# (still a string at this point; converted to a number in a later cell).
house_data['Schools'] = house_data['Schools'].apply(extract)

In [None]:
# Final feature set: the extracted counts plus the school value.
features_col = ['Suburb', 'Type', 'beds', 'bath', 'parking','Schools']
# Convert the extracted school text to a numeric column.
house_data['Schools'] = pd.to_numeric(house_data['Schools'])
# The preview is the last expression so the notebook actually renders it (a
# bare expression in the middle of a cell is evaluated and discarded).
house_data[features_col].head()

In [None]:
# Property types seen in the data:
# 'house' 'apartment unit flat' 'townhouse' 'new apartments plan' 'villa'
# Mean price for every (property type, bedroom count) pair, drawn as a heatmap.
df_wide = house_data.pivot_table(index='Type', columns='beds',
                                 values='Price', aggfunc='mean')

fig, ax = plt.subplots(figsize=(20, 10))
p2 = sns.heatmap(df_wide, ax=ax)

Insights: 
1. Villas with one bedroom or fewer have the lowest prices.
2. The "apartment unit flat" type has the highest prices among properties with four or more bedrooms.

In [None]:
# Mean price for every (property type, school value) pair, drawn as a heatmap.
df_wide = house_data.pivot_table(index='Type', columns='Schools',
                                 values='Price', aggfunc='mean')

fig, ax = plt.subplots(figsize=(20, 10))
p2 = sns.heatmap(df_wide, ax=ax)

Insights:
1. Houses with a school in close proximity (0.1 - 1.1 km) have high prices. New apartments show a similar pattern.

Preprocessing the features before modelling

Label encode the categorical data

In [None]:
# Check the feature dtypes before encoding.
house_data[features_col].info()

In [None]:
# Label-encode the two categorical columns and convert the extracted counts
# to numbers. The same encoder object is refitted for each column, so after
# this cell it only holds the mapping of the last one ('Type').
le = LabelEncoder()
for categorical_column in ('Suburb', 'Type'):
    house_data[categorical_column] = le.fit_transform(house_data[categorical_column])

for count_column in ('beds', 'bath', 'parking'):
    house_data[count_column] = pd.to_numeric(house_data[count_column])

In [None]:
# Re-check the feature dtypes after encoding and numeric conversion.
house_data[features_col].info()

In [None]:
# List the distinct values of the numeric Schools column.
house_data['Schools'].unique()

In [None]:
# Scatter plot of sale price against the Schools value.
house_data.plot.scatter(x='Schools',
                      y='Price',
                      c='DarkBlue')


In [None]:
#Converting the X and Y to arrays
X = house_data[features_col].values
Y = house_data['Price'].values

In [None]:
#Split the test and train data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42)

In [None]:
#Converting to 2D array
y_train = np.reshape(y_train, (y_train.shape[0], 1))
y_test = np.reshape(y_test, (y_test.shape[0], 1))

y_train.shape

In [None]:
#Scaling the data to 0 - 1
sc = MinMaxScaler(feature_range = (0,1))

X_train = sc.fit_transform(X_train)
y_train = sc.fit_transform(y_train)
X_test = sc.fit_transform(X_test)
y_test = sc.fit_transform(y_test)


In [None]:
# Reshape the 2-D feature/target arrays into the 3-D windows an LSTM expects.
def create_dataset (X, y, time_steps = 1):
    """Build sliding windows over X paired with the target that follows each.

    Window ``i`` is rows ``i .. i + time_steps - 1`` of ``X``; its target is
    ``y[i + time_steps]``. ``len(X) - time_steps`` windows are produced.

    Parameters
    ----------
    X : 2-D array
        Row-wise feature matrix.
    y : array
        Targets aligned with the rows of ``X``.
    time_steps : int, default 1
        Number of consecutive rows per window.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The stacked windows and their targets.
    """
    window_count = len(X) - time_steps
    windows = [X[start:start + time_steps, :] for start in range(window_count)]
    targets = [y[start + time_steps] for start in range(window_count)]
    return np.array(windows), np.array(targets)

# Number of consecutive rows fed to the network per sample.
TIME_STEPS = 30


In [None]:
# Window both splits into (samples, TIME_STEPS, features) arrays.
X_test, y_test = create_dataset(X_test, y_test,
                                TIME_STEPS)
X_train, y_train = create_dataset(X_train,y_train,
                                  TIME_STEPS)
# Report each array under its own label (two of the labels previously printed
# the shape of a different array).
print('X_train.shape: ', X_train.shape)
print('y_train.shape: ', y_train.shape)
print('X_test.shape: ', X_test.shape)
print('y_test.shape: ', y_test.shape)

In [None]:
# Stacked LSTM regressor for the windowed data.
def create_model():
    """Build and compile the recurrent price regressor.

    Architecture: one LSTM(64) layer with dropout 0.3, followed by three
    Bidirectional LSTM(64) layers each with dropout 0.2 (only the last one
    collapses the sequence), and a single-unit Dense output. Compiled with
    the Adam optimizer and mean-squared-error loss.

    The input shape is read from the module-level ``X_train``
    (``X_train.shape[1]`` by ``X_train.shape[2]``).

    Returns
    -------
    The compiled Sequential model.
    """
    window_length, feature_count = X_train.shape[1], X_train.shape[2]

    regressor = Sequential()
    # Input LSTM layer with its own, slightly stronger, dropout.
    regressor.add(LSTM(units=64, return_sequences=True,
                       input_shape=(window_length, feature_count)))
    regressor.add(Dropout(0.3))
    # Three bidirectional layers; the final one returns only its last output.
    for keep_sequence in (True, True, False):
        regressor.add(Bidirectional(LSTM(units=64, return_sequences=keep_sequence)))
        regressor.add(Dropout(0.2))
    # Single regression output.
    regressor.add(Dense(units=1))

    regressor.compile(optimizer='adam', loss='mean_squared_error')
    return regressor

# Build a fresh, compiled model.
model = create_model()

# Train for 50 epochs in batches of 32, holding out 20% of the training
# windows for validation; the per-epoch losses are kept in `history`.
history = model.fit(X_train, y_train, 
                    epochs = 50, batch_size = 32,validation_split = 0.2)

In [None]:
# Training vs. validation loss per epoch.
plt.figure(figsize=(10, 6))
for loss_key in ('loss', 'val_loss'):
    plt.plot(history.history[loss_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'val'], loc='upper left')