In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import matplotlib.pyplot as plt
import joblib

In [23]:
train_data = pd.read_csv('D:\\Codes\\ML\\HPP\\train.csv')
test_data = pd.read_csv('D:\\Codes\\ML\\HPP\\test.csv')

In [24]:
def convert_lotsize(row):
    if row['lot_size_units'] == 'acre':
        return float(row['lot_size'])*43560
    try:
        return float(row['lot_size'])
    except ValueError:
        return np.nan

In [25]:
train_data['lot_size'] = train_data.apply(convert_lotsize, axis = 1)
test_data['lot_size'] = test_data.apply(convert_lotsize, axis = 1)

In [26]:
medlottr = train_data['lot_size'].median()
medlottt = test_data['lot_size'].median()

train_data['lot_size'].fillna(medlottr, inplace=True)
test_data['lot_size'].fillna(medlottt, inplace=True)

In [27]:
train_data.drop(['size_units','lot_size_units'],axis=1, inplace=True)
test_data.drop(['size_units','lot_size_units'],axis=1, inplace=True)

In [28]:
print(train_data.columns)


Index(['beds', 'baths', 'size', 'lot_size', 'zip_code', 'price'], dtype='object')


In [29]:
if 'price' in train_data.columns:
    X_train = train_data.drop(['price'], axis=1)
    y_train = train_data['price']
else:
    print("Column 'price' not found in train_data")

if 'price' in test_data.columns:
    X_test = test_data.drop(['price'], axis=1)
    y_test = test_data['price']
else:
    print("Column 'price' not found in test_data")


In [30]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [31]:
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)

In [32]:
y_pred_linear = linear_model.predict(X_test_scaled)

In [33]:
mse_linear = mean_squared_error(y_test, y_pred_linear)
r2_linear = r2_score(y_test, y_pred_linear)


print(f'Linear Regression - Mean Squared Error: {mse_linear}')
print(f'Linear Regression - R-squared: {r2_linear}')

Linear Regression - Mean Squared Error: 174562295612.6292
Linear Regression - R-squared: 0.5275840736282074


In [34]:
ridge_model = Ridge()
ridge_model.fit(X_train_scaled, y_train)



In [35]:
y_pred_ridge = ridge_model.predict(X_test_scaled)

In [36]:
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f'Ridge Regression - Mean Squared Error: {mse_ridge}')
print(f'Ridge Regression - R-squared: {r2_ridge}')

Ridge Regression - Mean Squared Error: 174550557102.2542
Ridge Regression - R-squared: 0.5276158414233866


In [37]:
model = LinearRegression()
model.fit(X_train_scaled, y_train)

In [38]:
beds = int(input("Enter the number of beds: "))
baths = float(input("Enter the number of baths: "))
size = float(input("Enter the size in sqft: "))
lot_size = float(input("Enter the lot size in sqft: "))
zip_code = int(input("Enter the zip code: "))

user_input = pd.DataFrame({
    'beds': [beds],
    'baths': [baths],
    'size': [size],
    'lot_size': [lot_size],
    'zip_code': [zip_code]
})

user_input_scaled = scaler.transform(user_input)
price_pred = model.predict(user_input_scaled)

print(f"Predicted price: {price_pred[0]}")

Predicted price: 871296.5071718909


In [39]:
joblib.dump(model, 'house_price_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']