In [7]:
import pandas as pd


In [11]:
# Read the Melbourne housing CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer='/content/melb_data.csv')
df.head(n=5)

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [12]:
# Summarise the frame: row count, per-column non-null counts and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Suburb         13580 non-null  object 
 1   Address        13580 non-null  object 
 2   Rooms          13580 non-null  int64  
 3   Type           13580 non-null  object 
 4   Price          13580 non-null  float64
 5   Method         13580 non-null  object 
 6   SellerG        13580 non-null  object 
 7   Date           13580 non-null  object 
 8   Distance       13580 non-null  float64
 9   Postcode       13580 non-null  float64
 10  Bedroom2       13580 non-null  float64
 11  Bathroom       13580 non-null  float64
 12  Car            13518 non-null  float64
 13  Landsize       13580 non-null  float64
 14  BuildingArea   7130 non-null   float64
 15  YearBuilt      8205 non-null   float64
 16  CouncilArea    12211 non-null  object 
 17  Lattitude      13580 non-null  float64
 18  Longti

In [15]:
# Summary statistics (count, mean, spread, quartiles) for the room count.
df.Rooms.describe()

count    13580.000000
mean         2.937997
std          0.955748
min          1.000000
25%          2.000000
50%          3.000000
75%          3.000000
max         10.000000
Name: Rooms, dtype: float64

In [16]:
# Keep only the rows that have no missing value in any column.
df2 = df.dropna(axis='index', how='any')
df2.shape

(6196, 21)

In [18]:
# Prediction target: the sale price column.
y = df2['Price']
y.head()

1    1035000.0
2    1465000.0
4    1600000.0
6    1876000.0
7    1636000.0
Name: Price, dtype: float64

In [19]:
# Predictor columns fed to the model (spellings match the dataset's column names).
melbourne_features = [
    'Rooms',
    'Bathroom',
    'Landsize',
    'Lattitude',
    'Longtitude',
]
x = df2.loc[:, melbourne_features]

# Building a model

In [20]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with no size limit; the fixed seed keeps the fit reproducible.
pricePred = DecisionTreeRegressor(random_state=1)

# Fit on every cleaned row (no hold-out set at this stage).
pricePred.fit(X=x, y=y)

In [22]:
# In-sample sanity check: predict the first five training rows, then print the
# model's score on the full training data.
first_houses = x.head()
print("Making predictions for the following 5 houses:")
print(first_houses)
print("The predictions are")
first_predictions = pricePred.predict(first_houses)
print(first_predictions)
training_score = pricePred.score(x, y)
print(training_score)

Making predictions for the following 5 houses:
   Rooms  Bathroom  Landsize  Lattitude  Longtitude
1      2       1.0     156.0   -37.8079    144.9934
2      3       2.0     134.0   -37.8093    144.9944
4      4       1.0     120.0   -37.8072    144.9941
6      3       2.0     245.0   -37.8024    144.9993
7      2       1.0     256.0   -37.8060    144.9954
The predictions are
[1035000. 1465000. 1600000. 1876000. 1636000.]
0.9997562169019677


# Checking the model's MAE

In [23]:
from sklearn.metrics import mean_absolute_error

# MAE measured on the same rows the tree was fitted on (in-sample error).
predicted_prices = pricePred.predict(x)
mean_absolute_error(y_true=y, y_pred=predicted_prices)

1115.7467183128902

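**This figure is suspiciously small (and the score above is almost 1) because it was measured on the same data the model was trained on, so it tells us nothing about how the model performs on unseen houses. The model needs re-evaluation, so we will move to training and testing splits.**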

In [25]:
from sklearn.model_selection import train_test_split

# Hold out part of the data so the error is measured on houses the model never
# saw during fitting; the fixed seed makes the split repeatable.
train_X, test_X, train_y, test_y = train_test_split(x, y, random_state=5)

# Seed the tree as well: an unseeded DecisionTreeRegressor may pick differently
# among equally good splits on each run, so the reported MAE would not be
# reproducible under Restart & Run All.
price_model = DecisionTreeRegressor(random_state=1)

price_model.fit(train_X, train_y)

# Out-of-sample error on the held-out rows.
predictions = price_model.predict(test_X)
print(mean_absolute_error(test_y, predictions))

263617.8760490639


# Moving to model fitting

In [27]:
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor

def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a decision tree limited to ``max_leaf_nodes`` leaves.

    Parameters
    ----------
    max_leaf_nodes : int
        Passed to ``DecisionTreeRegressor`` as its ``max_leaf_nodes`` limit.
    train_X, train_y
        Features and target used to fit the tree.
    val_X, val_y
        Held-out features and target the fitted tree is evaluated on.

    Returns
    -------
    float
        Mean absolute error of the predictions for ``val_X`` against ``val_y``.
    """
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    # Evaluate on the arguments, not on notebook-level globals, so the result
    # depends only on what the caller passed in.
    preds_val = model.predict(val_X)
    return mean_absolute_error(val_y, preds_val)

In [28]:
# Coarse sweep: hold-out MAE across orders of magnitude of max_leaf_nodes.
leaf_node_options = (5, 50, 500, 5000)
for max_leaf_nodes in leaf_node_options:
    my_mae = get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 5  		 Mean Absolute Error:  374305
Max leaf nodes: 50  		 Mean Absolute Error:  272049
Max leaf nodes: 500  		 Mean Absolute Error:  250187
Max leaf nodes: 5000  		 Mean Absolute Error:  262640


Around 500 leaves seems the best, so let's focus on that region.

In [29]:
# Second pass: probe leaf counts on either side of 500.
leaf_node_options = (500, 100, 1000, 600, 400)
for max_leaf_nodes in leaf_node_options:
    my_mae = get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 500  		 Mean Absolute Error:  250187
Max leaf nodes: 100  		 Mean Absolute Error:  258286
Max leaf nodes: 1000  		 Mean Absolute Error:  255299
Max leaf nodes: 600  		 Mean Absolute Error:  251652
Max leaf nodes: 400  		 Mean Absolute Error:  250364


In [30]:
# Third pass: finer steps between 500 and 590 leaves.
leaf_node_options = (500, 510, 550, 590)
for max_leaf_nodes in leaf_node_options:
    my_mae = get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 500  		 Mean Absolute Error:  250187
Max leaf nodes: 510  		 Mean Absolute Error:  249116
Max leaf nodes: 550  		 Mean Absolute Error:  249977
Max leaf nodes: 590  		 Mean Absolute Error:  251663


In [31]:
# Final pass: compare the closest neighbours of 510 leaves.
leaf_node_options = (510, 520, 505)
for max_leaf_nodes in leaf_node_options:
    my_mae = get_mae(max_leaf_nodes, train_X, test_X, train_y, test_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error:  %d" %(max_leaf_nodes, my_mae))

Max leaf nodes: 510  		 Mean Absolute Error:  249116
Max leaf nodes: 520  		 Mean Absolute Error:  248939
Max leaf nodes: 505  		 Mean Absolute Error:  249380


Let's stick with 520 max leaves.

Let's implement a random forest and see how it compares.

In [33]:
from sklearn.ensemble import RandomForestRegressor

# Fit a seeded random forest on the training split and report its hold-out MAE.
forest_model = RandomForestRegressor(random_state=1).fit(train_X, train_y)
preds = forest_model.predict(test_X)
forest_mae = mean_absolute_error(test_y, preds)
print(forest_mae)
print('This is alot better')

200337.92150573336
This is alot better
