In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
import joblib
import pickle

In [2]:
df_house = pd.read_excel("PH_houses_4.xlsx")
df_house.dropna().drop_duplicates()
df_house.info()
display(df_house)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1358 entries, 0 to 1357
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Description  1358 non-null   object 
 1   Location     1358 non-null   object 
 2   Price        1358 non-null   int64  
 3   Bedrooms     1358 non-null   int64  
 4   Bath         1358 non-null   int64  
 5   Area         1358 non-null   int64  
 6   Latitude     1358 non-null   float64
 7   Longitude    1358 non-null   float64
dtypes: float64(2), int64(4), object(2)
memory usage: 85.0+ KB


Unnamed: 0,Description,Location,Price,Bedrooms,Bath,Area,Latitude,Longitude
0,Penthouse Unit (54E) - Residences at The Galle...,"San Antonio, Pasig",175265000,4,4,409,14.588377,121.059897
1,Penthouse Unit (54B) - Residences at The Galle...,"San Antonio, Pasig",128592000,4,4,317,14.588377,121.059897
2,Penthouse Unit (54C) - Residences at The Galle...,"San Antonio, Pasig",111512000,4,4,268,14.588377,121.059897
3,3-Bedroom Bi-Level (58A) for Sale in Empress a...,"Oranbo, Pasig",89146000,3,3,241,14.575822,121.064324
4,3-Bedroom Unit (59H) for Sale in Empress at Ca...,"Oranbo, Pasig",57210000,3,3,152,14.575822,121.064324
...,...,...,...,...,...,...,...,...
1353,ROSE (Bare 2-Storey Inner Unit) - Marytown Pla...,"Bulac, Santa Maria",1400000,1,1,46,14.834660,121.339190
1354,ROSE (Bare 2-Storey End Unit) - Marytown Place...,"Bulac, Santa Maria",1400000,1,1,46,14.834660,121.339190
1355,1BR Quadruplex House and Lot (Legato Model) fo...,"Majada Labas, Calamba",1366658,1,1,25,14.196755,121.104962
1356,ROSE (Bare 2-Storey Inner Unit) - Marytown Pla...,"Bulac, Santa Maria",1160000,1,1,46,14.834660,121.339190


In [3]:
# Format colums as float
df_house = df_house.astype({'Price':'float','Bedrooms':'float', 'Bath':'float', 'Area':'float'}) 

# Shuffle the rows randomly to distribute adjacent houses that are on the same location
df_house = df_house.sample(frac=1, random_state=42)
df_house.reset_index(drop=True, inplace=True)
display(df_house)
df_house.info()

Unnamed: 0,Description,Location,Price,Bedrooms,Bath,Area,Latitude,Longitude
0,Alice 2 Storey Townhouse For Sale in Lancaster...,"Tapia, General Trias",1935600.0,3.0,1.0,40.0,14.355070,120.876425
1,"2BR Unit for Sale in Suntrust Amadea Tower 1, ...","Paligsahan, Quezon City",4842720.0,2.0,1.0,36.0,14.631613,121.023475
2,5-Bedroom House for Sale at Camella Balanga He...,"Cupang Proper, Balanga",7751000.0,5.0,3.0,142.0,14.666375,120.548220
3,1-Bedroom Unit (S56C2) for Sale in Maven at Ca...,"Ortigas CBD, Pasig",13096600.0,1.0,1.0,52.0,14.583771,121.059675
4,4-Bedroom House and Lot for Sale at Lancaster ...,"San Sebastian, Kawit",7410000.0,4.0,3.0,100.0,14.418511,120.898962
...,...,...,...,...,...,...,...,...
1353,"Suntrust Solana, 2-Bedroom Unit for Sale in Er...","Ermita, Manila",4900000.0,2.0,1.0,39.0,14.590526,120.985482
1354,Studio Unit (31.5 sqm) for Sale in La Vida Tow...,"F.B Harisson, Pasay",4547200.0,1.0,1.0,31.0,14.553230,120.990226
1355,Metrotowne - Studio Unit for Sale in Las Pinas...,"Talon Uno, Las Pinas",2600000.0,1.0,1.0,22.0,14.446596,120.987165
1356,5% Launch Promo | 1BR Unit for Sale at Woodsvi...,"Merville, Paranaque",7500000.0,1.0,1.0,37.0,14.499387,121.025633


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1358 entries, 0 to 1357
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Description  1358 non-null   object 
 1   Location     1358 non-null   object 
 2   Price        1358 non-null   float64
 3   Bedrooms     1358 non-null   float64
 4   Bath         1358 non-null   float64
 5   Area         1358 non-null   float64
 6   Latitude     1358 non-null   float64
 7   Longitude    1358 non-null   float64
dtypes: float64(6), object(2)
memory usage: 85.0+ KB


In [4]:
# Define the column transformer
column_transformer = ColumnTransformer(
    transformers=[
        ('tfidf_description', TfidfVectorizer(), 'Description'),
        ('tfidf_location', TfidfVectorizer(), 'Location')
    ],
    remainder='passthrough'  # This will keep the rest of the columns as they are
)

display(column_transformer)

In [5]:
# Define the pipeline
pipe = Pipeline([
    ('column_transformer', column_transformer),
    ('regressor', RandomForestRegressor(n_estimators=10))
])

display(pipe)

In [6]:
# Train-test split
X = df_house.drop(['Price'], axis=1)
y = df_house['Price'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=8)

# Fit the pipeline
pipe.fit(X_train, y_train)
Y_pred = pipe.predict(X_test)

# Calculate metrics
mse = metrics.mean_squared_error(y_test, Y_pred, squared=False)
mape = metrics.mean_absolute_percentage_error(y_test, Y_pred)
r2 = metrics.r2_score(y_test, Y_pred)

print("MAPE: ", mape)
print("MSE: ", mse)
print("R2 Score: ", r2)

MAPE:  0.1058103775649032
MSE:  1841785.0870894585
R2 Score:  0.9757892520964213


In [7]:
# transformed_data = column_transformer.fit_transform(df_house)

# df_test = pd.DataFrame(transformed_data.toarray() if hasattr(transformed_data, "toarray") else transformed_data)

# display(df_test)

In [8]:
# # input prediction parameters
# ask_desc = str(input('Input House Description'))
# ask_loc = str(input('Input Location'))
# ask_bed = float(input('number of bedrooms'))
# ask_bath = float(input('number of baths'))
# ask_floor = float(input('input floor area'))
# ask_lat = float(input('input latitude'))
# ask_long = float(input('input longitude'))

In [9]:
# TEST INPUTS
ask_desc = str('3 Bedroom Condo Unit and 2 Parking for Sale in Fortune Hill San Juan City')
ask_loc = str('ADDITION HILLS, SAN JUAN')
ask_bed = 3
ask_bath = 3
ask_floor = 160
ask_lat = 14.5939458
ask_long = 121.0389312

arr_input = np.array([ask_desc, ask_loc, ask_bed, ask_bath, ask_floor, ask_lat, ask_long]).reshape(1,7)
df_input = pd.DataFrame(arr_input, columns=['Description', 'Location', 'Bedrooms', 'Bath', 'Area', 'Latitude', 'Longitude'])
display(df_input)
pipe.predict(df_input)

Unnamed: 0,Description,Location,Bedrooms,Bath,Area,Latitude,Longitude
0,3 Bedroom Condo Unit and 2 Parking for Sale in...,"ADDITION HILLS, SAN JUAN",3,3,160,14.5939458,121.0389312


array([30843350.1])

In [14]:
import pickle

with open('prediction_pipe.pkl', 'wb') as f:
    pickle.dump(pipe, f)

In [11]:
# import joblib

# joblib.dump(pipe, 'prediction_pipe.pkl')

In [12]:
# import sklearn
# sklearn.__version__