In [7]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error

# Load the Melbourne housing CSV into a pandas DataFrame.
# NOTE: this is an absolute, machine-specific Windows path; point it at your
# own copy of the file before running.
csv_path = 'F:\\DOWNLOADS\\Melbourne_housing_FULL.csv'
df = pd.read_csv(csv_path)

# Discard the columns that are not kept as model inputs.
# NOTE: the coordinate columns are referenced by their corrected spellings
# ('Latitude' / 'Longitude') because the headers were fixed in the source CSV.
# A copy of the file that still has the misspelled headers will raise a
# KeyError here.
unused_columns = [
    'Address',
    'Method',
    'SellerG',
    'Date',
    'Postcode',
    'Latitude',
    'Longitude',
    'Regionname',
    'Propertycount',
]
df.drop(columns=unused_columns, inplace=True)

# Remove every row that contains at least one missing value.
# `thresh` is deliberately not passed: pandas >= 1.5 raises
# "TypeError: You cannot set both the how and thresh arguments at the same
# time" whenever both are given explicitly, even as `thresh=None`.
# `subset` is left at its default, so all columns are checked.
df.dropna(axis=0, how='any', inplace=True)

# One-hot encode the non-numeric columns so that every input is numeric:
# each listed column is replaced by one indicator column per distinct value.
categorical_columns = ['Suburb', 'CouncilArea', 'Type']
df = pd.get_dummies(data=df, columns=categorical_columns)

# Dependent variable (y): the sale price the model should predict.
y = df['Price']

# Independent variables (X): every remaining column, i.e. the dataframe
# with 'Price' removed.
X = df.drop(columns=['Price'])

# Split the data 70/30 into training and test sets, shuffling the rows first.
# A fixed random_state makes the shuffle (and therefore the reported errors)
# reproducible from run to run; without it every execution produces a
# different split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Configure the chosen algorithm (gradient boosting regressor) and store it
# in `model`.
# NOTE(review): max_depth=30 is very deep for boosted trees, and the recorded
# output shows a training MAE far below the test MAE, which suggests
# overfitting -- consider trying a shallower depth.
model = ensemble.GradientBoostingRegressor(
    n_estimators=150,      # number of boosting stages (trees)
    learning_rate=0.1,     # shrinks the contribution of each tree
    max_depth=30,          # maximum depth of each individual tree
    min_samples_split=4,   # minimum samples needed to split an internal node
    min_samples_leaf=6,    # minimum samples required at a leaf node
    max_features=0.6,      # fraction of features considered at each split
    loss='huber',
    # max_features < 1.0 makes each split draw a random feature subset, so
    # seed the estimator to keep the fitted model reproducible.
    random_state=42,
)

# Train the model on the training portion of the data.
model.fit(X_train, y_train)

# Evaluate the fitted model with mean absolute error (MAE): first on the data
# it was trained on ...
train_predictions = model.predict(X_train)
mae_train = mean_absolute_error(y_true=y_train, y_pred=train_predictions)
print("Training Set Mean Absolute Error: %.2f" % mae_train)

# ... then on the held-out test data, using exactly the same procedure.
test_predictions = model.predict(X_test)
mae_test = mean_absolute_error(y_true=y_test, y_pred=test_predictions)
print("Test Set Mean Absolute Error: %.2f" % mae_test)

# Use this to check that the dataset loaded correctly; it displays the first 10 rows.
#df.head(10)

#here df.iloc[100] is used to find the row indexed at position 100 in
#the dataframe, which is a property located in Airport West
#df.iloc[100]

# The "columns" attribute is a convenient way to
# print the dataset's column titles.
#df.columns


Training Set Mean Absolute Error: 29090.25
Test Set Mean Absolute Error: 166057.55
