## Zamboanga City Land Price Prediction (Commercial)

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import math, statistics

### Import dataset

In [26]:
df = pd.read_csv("../datasets/Land - Commercial.csv")
df.head()

Unnamed: 0,Address,Classification,Land Area,Market Value
0,Pasonanca,Commercial,29.0,229100.0
1,Pasonanca,Commercial,361.0,2851900.0
2,Zone 3,Commercial,842.0,6651800.0
3,Zone 3,Commercial,857.0,6770300.0
4,Zone 3,Commercial,20.0,158000.0


### Data Cleaning

In [27]:
df.columns

Index(['Address', 'Classification', 'Land Area', 'Market Value'], dtype='object')

In [28]:
df.isna().sum()

Address           0
Classification    0
Land Area         0
Market Value      0
dtype: int64

##### Renaming column names

In [29]:
df.columns = df.columns.str.lower()

In [30]:
df.columns

Index(['address', 'classification', 'land area', 'market value'], dtype='object')

In [31]:
df = df.rename(columns={'address':'brgy','land area':'land_area','market value':'target_value'
})
df.columns

Index(['brgy', 'classification', 'land_area', 'target_value'], dtype='object')

##### Barangay

In [32]:
df['brgy'] = df['brgy'].str.lower()
df['brgy'].head()

0    pasonanca
1    pasonanca
2       zone 3
3       zone 3
4       zone 3
Name: brgy, dtype: object

In [33]:
pd.set_option("display.max_column", None)
df['brgy'].unique()

array(['pasonanca', 'zone 3', 'zone 1', 'zone 2', 'zone 4', 'santa maria',
       'tetuan', 'san jose gusu', 'san roque', 'tugbungan'], dtype=object)

In [34]:
df['brgy'] = df['brgy'].replace({'pasonanca':0, 'zone 3':1, 'zone 1':2, 'zone 2':3, 'zone 4':4, 'santa maria':5, 'tetuan':6, 'san jose gusu':7, 'san roque':8, 'tugbungan':9
})

In [35]:
df['brgy'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64)

##### Checking for null

In [36]:
df.isna().sum()

brgy              0
classification    0
land_area         0
target_value      0
dtype: int64

### Classification

In [37]:
df['classification'] = df['classification'].str.lower()
df['classification'].unique()

array(['commercial', 'residential'], dtype=object)

In [38]:
df['classification'] = df['classification'].replace({'residential':0,'commercial':1})
df['classification'].unique()

array([1, 0], dtype=int64)

In [39]:
df.head()

Unnamed: 0,brgy,classification,land_area,target_value
0,0,1,29.0,229100.0
1,0,1,361.0,2851900.0
2,1,1,842.0,6651800.0
3,1,1,857.0,6770300.0
4,1,1,20.0,158000.0


##### End of Cleaning

### Pre-Processing

In [40]:
X = df.drop('target_value',axis=1)
       
y = df.target_value

### Metrics Model

### Gradient Boosting

In [41]:
from sklearn import metrics
from xgboost import XGBRegressor

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

xgbr = XGBRegressor(verbosity=0) 

In [42]:
from sklearn.metrics import mean_squared_error, r2_score

xgbr.fit(X_train, y_train)

y_pred = xgbr.predict(X_test)

# Result of MAE
print("Mean Absolute Error:",metrics.mean_absolute_error(y_test, y_pred))

# Result of MSE
print("Mean Squared Error:",metrics.mean_squared_error(y_test, y_pred))

# Result of RMSE
print("Root Mean Squared Error:",np.sqrt(metrics.mean_absolute_error(y_test, y_pred)))

# Result of R^2
print("R Squared:",r2_score(y_test, y_pred))

Mean Absolute Error: 279270.07861328125
Mean Squared Error: 338515511254.35876
Root Mean Squared Error: 528.4601012501145
R Squared: 0.8612833092914087


In [43]:
df.columns

Index(['brgy', 'classification', 'land_area', 'target_value'], dtype='object')

In [44]:
from sklearn import ensemble
from sklearn.inspection import permutation_importance

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

params = {
    "n_estimators": 100,
    "max_depth": 3,
    "min_samples_split": 2,
    "learning_rate": 0.3,
    "loss": "squared_error",
}
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_test)
# Result of MAE
print("Mean Absolute Error:",metrics.mean_absolute_error(y_test, y_pred))

# Result of MSE
print("Mean Squared Error:",metrics.mean_squared_error(y_test, y_pred))

# Result of RMSE
print("Root Mean Squared Error:",np.sqrt(metrics.mean_absolute_error(y_test, y_pred)))

# Result of R^2
print("R Squared:",r2_score(y_test, y_pred))

Mean Absolute Error: 248533.1691727761
Mean Squared Error: 296260649094.3401
Root Mean Squared Error: 498.5310112448132
R Squared: 0.8479323476018018


### Export pickle file

In [45]:
import pickle

pickle.dump(reg, open('model_commercial_new.pkl','wb'))

In [46]:
# model = pickle.load(open('model_commercial.pkl','rb'))

In [47]:
"""import json

columns = {
    'data_columns' : [col.lower() for col in X.columns]
}
with open("columns_commercial.json","w") as f:
    f.write(json.dumps(columns))"""

'import json\n\ncolumns = {\n    \'data_columns\' : [col.lower() for col in X.columns]\n}\nwith open("columns_commercial.json","w") as f:\n    f.write(json.dumps(columns))'

In [48]:
# df.to_csv (r'D:\System\broker\datasets\export_dataframe_commercial.csv', index = False, header=True)

# print (df)