<a href="https://colab.research.google.com/github/Evansokania/House-Price-Prediction/blob/main/House_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
# import dataset
df = pd.read_csv('/content/realtor-data.zip.csv')
# display 20 random records
df.sample(20)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
1747,for_sale,5.0,2.0,0.49,Arecibo,Puerto Rico,612.0,2058.0,,148000.0
9153,for_sale,3.0,2.0,0.09,Manati,Puerto Rico,674.0,2417.0,,70000.0
3160,for_sale,,,0.12,Florida,Puerto Rico,650.0,,,28000.0
25584,for_sale,4.0,2.0,0.09,Carolina,Puerto Rico,979.0,1600.0,,114000.0
27751,for_sale,3.0,2.0,,San Juan,Puerto Rico,926.0,1500.0,,229000.0
25140,for_sale,3.0,2.0,0.2,Amherst,Massachusetts,1002.0,1349.0,2005-09-02,439900.0
18788,for_sale,3.0,3.0,,San Juan,Puerto Rico,926.0,1550.0,,265000.0
15676,for_sale,3.0,3.0,,Loiza,Puerto Rico,772.0,1249.0,,161000.0
8486,for_sale,4.0,4.0,2500.0,Caguas,Puerto Rico,725.0,4400.0,,795000.0
11089,for_sale,,,12.14,San Juan,Puerto Rico,926.0,,,500000.0


In [3]:
#drop null values in the dataset
df.dropna(inplace=True)

In [4]:
#check the datatypes of the columns
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1829 entries, 829 to 32897
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   status          1829 non-null   object 
 1   bed             1829 non-null   float64
 2   bath            1829 non-null   float64
 3   acre_lot        1829 non-null   float64
 4   city            1829 non-null   object 
 5   state           1829 non-null   object 
 6   zip_code        1829 non-null   float64
 7   house_size      1829 non-null   float64
 8   prev_sold_date  1829 non-null   object 
 9   price           1829 non-null   float64
dtypes: float64(6), object(4)
memory usage: 157.2+ KB


In [5]:
#convert prev_sold_date column to datetime data type
df.prev_sold_date = pd.to_datetime(df.prev_sold_date)
# convert zip_code into an object(string) since it is a categorical variable
df.zip_code = df.zip_code.astype(int)
df.zip_code = df.zip_code.astype(str)

# Extract year from the parse_version
df['prev_sold_date'] = df['prev_sold_date'].dt.year
df = df.rename(columns={'prev_sold_date': 'year_sold'})
df

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,year_sold,price
829,for_sale,7.0,3.0,0.09,Dorado,Puerto Rico,949,1192.0,2019,110000.0
3380,for_sale,7.0,3.0,0.09,Dorado,Puerto Rico,949,1192.0,2019,110000.0
5083,for_sale,7.0,3.0,0.09,Dorado,Puerto Rico,949,1192.0,2019,110000.0
5387,for_sale,7.0,3.0,0.09,Dorado,Puerto Rico,949,1192.0,2019,110000.0
9053,for_sale,7.0,3.0,0.09,Dorado,Puerto Rico,949,1192.0,2019,110000.0
...,...,...,...,...,...,...,...,...,...,...
32892,for_sale,3.0,3.0,0.92,Glastonbury,Connecticut,6033,1430.0,2019,439000.0
32893,for_sale,3.0,1.0,1.30,Monson,Massachusetts,1057,1674.0,2018,290000.0
32894,for_sale,3.0,2.0,0.17,Manchester,Connecticut,6040,1521.0,2005,209900.0
32895,for_sale,3.0,3.0,1.00,Tolland,Connecticut,6084,1234.0,1993,239999.0


#### PREPROCESSING

In [6]:
#seperate dataset into features and target variable
X = df.drop('price', axis = 1)
y = df['price']

In [7]:
# Preprocessing pipeline for numerical and categorical features

numerical_features = X.select_dtypes(include=['int64','int32' ,'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)



# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

In [8]:
# Create a pipeline with preprocessing and regressor
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

# Define hyperparameters for tuning
param_grid = {
    'regressor__n_estimators': [100, 175, 300],
    'regressor__max_depth': [None, 10, 20],
    'regressor__min_samples_split': [2, 5, 10]
}


#### MODELING

In [9]:
# Perform grid search with cross-validation
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=3)
grid_search.fit(X_train, y_train)

# Evaluate the best model from grid search
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV 1/5] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100;, score=-1071465815868.935 total time=   2.5s
[CV 2/5] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100;, score=-11764077966.554 total time=   2.8s
[CV 3/5] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100;, score=-29757027809.293 total time=   2.7s
[CV 4/5] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100;, score=-42806230507.180 total time=   4.7s
[CV 5/5] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=100;, score=-11390992294.278 total time=   1.9s
[CV 1/5] END regressor__max_depth=None, regressor__min_samples_split=2, regressor__n_estimators=175;, score=-1106580422226.424 total time=   2.3s
[CV 2/5] END regressor__max_depth=None, regressor__min_samples_split=2

In [10]:
# Assess performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')

Mean Absolute Error (MAE): 68996.06503939912
Mean Squared Error (MSE): 65186591011.48753
Root Mean Squared Error (RMSE): 255316.64852000453
R-squared (R²): 0.7206936850635046


In [11]:
best_model