In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
# The CSV location can be overridden with the HOUSING_CSV environment variable
# so the notebook is not tied to one machine's folder layout. When the variable
# is unset (or empty) the original absolute path is used, so existing runs
# behave exactly as before.
import os

housing_csv = os.environ.get("HOUSING_CSV") or (
    r"C:\Users\Aanand Jha\Desktop\house-price-prediction\Housing.csv"
)
housing = pd.read_csv(housing_csv)

# Data preprocessing
# Columns that are passed through binary_map below (expected to contain
# 'yes'/'no' strings — verify against the CSV).
varlist = ['mainroad', 'guestroom', 'basement', 'hotwaterheating', 'airconditioning', 'prefarea']

def binary_map(x):
    """Recode a 'yes'/'no' column as 1/0.

    Parameters
    ----------
    x : pandas.Series
        Column of values to recode; it is used column-wise through
        ``DataFrame.apply``.

    Returns
    -------
    pandas.Series
        The result of ``x.map`` with 'yes' -> 1 and 'no' -> 0. Any value that
        is not one of those two keys comes back as NaN, as ``Series.map`` does
        for keys missing from a dict.
    """
    yes_no_codes = {'yes': 1, "no": 0}
    return x.map(yes_no_codes)

# Recode every column listed in `varlist` with binary_map, one column at a time.
housing[varlist] = housing[varlist].apply(binary_map)

# Get dummies for 'furnishingstatus'
# drop_first=True drops the first category level, leaving one indicator column
# per remaining level.
# dtype=int keeps the indicators numeric: pandas >= 2.0 returns boolean dummies
# by default, and a frame mixing bool with numeric columns converts to an
# object-dtype array, which statsmodels' OLS refuses to fit.
status = pd.get_dummies(housing['furnishingstatus'], drop_first=True, dtype=int)
housing = pd.concat([housing, status], axis=1)
# The original text column is no longer needed once the indicators are attached.
housing = housing.drop(columns=['furnishingstatus'])

# Split data into train and test sets
from sklearn.model_selection import train_test_split

np.random.seed(0)
df_train, df_test = train_test_split(housing, train_size=0.7, test_size=0.3, random_state=100)

# The split returns row subsets of `housing`. Columns are overwritten below,
# so take explicit copies first; otherwise pandas flags the assignments with
# SettingWithCopyWarning (currently hidden by the global warnings filter).
df_train = df_train.copy()
df_test = df_test.copy()

# Scaling the numeric variables
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
num_vars = ['area', 'bedrooms', 'bathrooms', 'stories', 'parking', 'price']

# The scaler is fitted on the training rows only and then re-used, unchanged,
# on the test rows, so no test-set statistics enter the fit.
df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
df_test[num_vars] = scaler.transform(df_test[num_vars])

# Train the model
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
import statsmodels.api as sm

# pop() removes 'price' from df_train in place and returns it as the target.
# X_train is the same object as df_train, which now holds only the predictors.
y_train = df_train.pop('price')
X_train = df_train

# Recursive feature elimination around a plain linear regression, keeping
# six predictors. fit() returns the selector itself, so the call is chained.
lm = LinearRegression()
rfe = RFE(estimator=lm, n_features_to_select=6).fit(X_train, y_train)

# Names of the predictors that RFE kept (boolean mask over the columns).
col = X_train.columns[rfe.support_]

# Refit with statsmodels on the selected predictors plus an explicit intercept
# column. From here on `lm` refers to the fitted statsmodels OLS results
# rather than the scikit-learn estimator.
X_train_rfe = sm.add_constant(X_train[col])
lm = sm.OLS(y_train, X_train_rfe).fit()

# Model evaluation on the test data
# has_constant='add' always inserts the intercept column. With the default
# ('skip'), add_constant silently omits it whenever one of the selected test
# columns happens to be constant, leaving the test design matrix with fewer
# columns than the fitted model expects.
X_test_rfe = sm.add_constant(df_test[col], has_constant='add')
# pop() removes 'price' from df_test in place and returns it as the target.
y_test = df_test.pop('price')
y_pred = lm.predict(X_test_rfe)

from sklearn.metrics import r2_score
# R-squared of the test-set predictions; 'price' was min-max scaled above, so
# both y_test and y_pred are on the scaled target.
r2 = r2_score(y_test, y_pred)
print(f'R-squared: {r2}')


R-squared: 0.6171279856258437


In [5]:
import pickle

# Save the model
# `lm` is the fitted statsmodels OLS results object from the training cell.
with open('house_price_model.pkl', 'wb') as file:
    pickle.dump(lm, file)

# Save the dataframe
# NOTE: 'price' was popped out of both frames earlier in the notebook, so the
# CSVs written here contain the predictor columns only, not the target.
for frame, csv_name in ((df_train, 'df_train.csv'), (df_test, 'df_test.csv')):
    frame.to_csv(csv_name, index=False)


In [7]:
# Load the model
# SECURITY: pickle.load can execute arbitrary code while deserialising. It is
# acceptable here only because the file is the one written by the save cell
# above; never point this at a pickle from an untrusted source.
with open('house_price_model.pkl', 'rb') as file:
    loaded_model = pickle.load(file)

# Load the dataframes
df_train_loaded = pd.read_csv('df_train.csv')
df_test_loaded = pd.read_csv('df_test.csv')

# Verify the loaded model by making predictions again
# has_constant='add' always inserts the intercept column; the default ('skip')
# silently leaves it out if any selected column is constant, which would
# misalign the design matrix with the fitted parameters.
X_test_rfe_loaded = sm.add_constant(df_test_loaded[col], has_constant='add')
y_pred_loaded = loaded_model.predict(X_test_rfe_loaded)

# Check the R-squared value
# `col` and `y_test` come from the training/evaluation cell: the saved CSVs do
# not include 'price' (it was popped before saving), so the target cannot be
# read back from disk here.
r2_loaded = r2_score(y_test, y_pred_loaded)
print(f'R-squared from loaded model: {r2_loaded}')


R-squared from loaded model: 0.6171279856258437
