# In [1]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LogisticRegressionCV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error

# Set display options: lift pandas' limits on how many columns and rows are
# shown when a DataFrame is displayed.
for _option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_option, None)

# Load dataset
# CSV file from Google Drive (direct-download link)
path = r"https://drive.google.com/uc?export=download&id=1P49POlAk27uRzWKXoR2WaEfb1lyyfiRJ"
df = pd.read_csv(path)

# Clean the data: discard the 'Unnamed: 0' column
df = df.drop(columns=['Unnamed: 0'])

# Separate the target ('price') from the remaining feature columns
y = df['price']
X = df.drop(columns="price")

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=51,
)

# Feature Scaling
# The scaler learns its statistics from the training split only and is then
# applied unchanged to the test split.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# Initialize models
# Linear family
lr, lr_lasso, lr_ridge = LinearRegression(), Lasso(), Ridge()
# Kernel-based and tree-based regressors
svr, rfr, xgb_reg = SVR(), RandomForestRegressor(), XGBRegressor()
# NOTE(review): GaussianNB and LogisticRegressionCV are classifiers, while the
# target used in this script is the 'price' column — confirm they belong here.
nb = GaussianNB()
lr_cv = LogisticRegressionCV(max_iter=1000, cv=5)

# Function to calculate RMSE
def rmse(y_test, y_pred):
    """Return the root-mean-squared error between true and predicted values.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted target values, aligned with ``y_test``.

    Returns:
        The square root of ``mean_squared_error(y_test, y_pred)``.
    """
    mse = mean_squared_error(y_test, y_pred)
    return np.sqrt(mse)

# Train and evaluate Linear Regression
# For every model: fit on the scaled training split, then record the value of
# the estimator's own .score() and the RMSE on the held-out test split.
lr.fit(X_train, y_train)
lr_score = lr.score(X_test, y_test)
lr_rmse = rmse(y_test, lr.predict(X_test))

# Train and evaluate Lasso Regression
lr_lasso.fit(X_train, y_train)
lr_lasso_score = lr_lasso.score(X_test, y_test)
lr_lasso_rmse = rmse(y_test, lr_lasso.predict(X_test))

# Train and evaluate Ridge Regression
# (lr_ridge was instantiated alongside the other models but never evaluated)
lr_ridge.fit(X_train, y_train)
lr_ridge_score = lr_ridge.score(X_test, y_test)
lr_ridge_rmse = rmse(y_test, lr_ridge.predict(X_test))

# Train and evaluate Support Vector Machine (SVR)
svr.fit(X_train, y_train)
svr_score = svr.score(X_test, y_test)
svr_rmse = rmse(y_test, svr.predict(X_test))

# Train and evaluate Random Forest Regressor
rfr.fit(X_train, y_train)
rfr_score = rfr.score(X_test, y_test)
rfr_rmse = rmse(y_test, rfr.predict(X_test))

# Train and evaluate XGBoost
xgb_reg.fit(X_train, y_train)
xgb_reg_score = xgb_reg.score(X_test, y_test)
xgb_reg_rmse = rmse(y_test, xgb_reg.predict(X_test))

# NOTE: GaussianNB (nb) and LogisticRegressionCV (lr_cv) are deliberately not
# fitted. Both are classifiers; fitting them on the continuous 'price' target
# raises "ValueError: Unknown label type" and aborts the whole script.

# Display the results in a DataFrame
results = pd.DataFrame([
    {'Model': 'Linear Regression', 'Score': lr_score, 'RMSE': lr_rmse},
    {'Model': 'Lasso', 'Score': lr_lasso_score, 'RMSE': lr_lasso_rmse},
    {'Model': 'Ridge', 'Score': lr_ridge_score, 'RMSE': lr_ridge_rmse},
    {'Model': 'Support Vector Machine', 'Score': svr_score, 'RMSE': svr_rmse},
    {'Model': 'Random Forest', 'Score': rfr_score, 'RMSE': rfr_rmse},
    {'Model': 'XGBoost', 'Score': xgb_reg_score, 'RMSE': xgb_reg_rmse},
], columns=['Model', 'Score', 'RMSE'])

# A bare expression is only rendered when it is the last statement of a
# notebook cell (and never in a script), so print the table explicitly.
print(results)

# Hyperparameter Tuning for XGBoost (optional, use for optimization)
# subsample and colsample_bytree are sampling fractions and must lie in (0, 1].
# The former value 1.5 is rejected by XGBoost, so every candidate using it
# failed to fit and only wasted search time; those values are dropped.
parameters = {
    'learning_rate': [0.1, 0.03, 0.05, 0.07],
    'min_child_weight': [1, 3, 5],
    'max_depth': [4, 6, 8],
    'gamma': [0, 0.1, 0.001, 0.2],
    'subsample': [0.7, 1],
    'colsample_bytree': [0.7, 1],
    'n_estimators': [100, 300, 500],
}

# Exhaustive search over the grid with 2-fold cross-validation on all cores.
xgb_grid = GridSearchCV(xgb_reg, parameters, cv=2, n_jobs=-1, verbose=1)
xgb_grid.fit(X_train, y_train)

# Output the best hyperparameters found through grid search
print("Best Score:", xgb_grid.best_score_)
print("Best Params:", xgb_grid.best_params_)

# Saving the final model (XGBoost)
# NOTE(review): this persists xgb_reg (the model fitted before the search),
# not the tuned xgb_grid.best_estimator_ — confirm which one should be shipped.
joblib.dump(xgb_reg, 'bangalore_house_price_prediction_model.pkl')

# Function to predict house price using the trained model
def predict_house_price(model, bath, balcony, total_sqft_int, bhk, price_per_sqft, area_type, availability, location):
    """Predict the price of one house with a fitted model.

    Builds a single feature row laid out like the module-level ``X`` columns,
    scales it with the module-level scaler ``sc`` and returns the model's
    prediction for that row.

    Args:
        model: Fitted estimator exposing ``predict``.
        bath: Value written to feature position 0.
        balcony: Value written to feature position 1.
        total_sqft_int: Value written to feature position 2.
        bhk: Value written to feature position 3.
        price_per_sqft: Value written to feature position 4.
        area_type: Area type; the column ``f'area_type{area_type}'`` is set
            to 1 when it exists in ``X.columns``, otherwise nothing is set.
        availability: Only the exact string ``"Ready To Move"`` has an effect.
        location: Location; the column ``f'location_{location}'`` is set to 1
            when it exists in ``X.columns``, otherwise nothing is set.

    Returns:
        The first (and only) element of ``model.predict`` for the scaled row.
    """
    columns = X.columns
    x = np.zeros(len(columns))  # one slot per training feature, all zero

    # Numeric features occupy the first five positions.
    x[0] = bath
    x[1] = balcony
    x[2] = total_sqft_int
    x[3] = bhk
    x[4] = price_per_sqft

    if availability == "Ready To Move":
        # NOTE(review): position 8 is presumably the ready-to-move indicator
        # column — confirm against X.columns.
        x[8] = 1

    # Switch on the indicator columns for area type and location. A value
    # with no matching column leaves the row untouched.
    for column_name in (f'area_type{area_type}', f'location_{location}'):
        if column_name in columns:
            x[np.where(columns == column_name)[0][0]] = 1

    # Feature scaling. The scaler was fitted on a DataFrame, so the row is
    # passed with the same column names; a bare list makes scikit-learn warn
    # that the input has no valid feature names.
    row = pd.DataFrame([x], columns=columns)
    scaled_row = sc.transform(row)
    return model.predict(scaled_row)[0]

# Test the model prediction with the in-memory XGBoost model
in_memory_listing = dict(
    bath=3,
    balcony=2,
    total_sqft_int=1672,
    bhk=3,
    price_per_sqft=8971.291866,
    area_type="Plot Area",
    availability="Ready To Move",
    location="Devarabeesana Halli",
)
predicted_price = predict_house_price(model=xgb_reg, **in_memory_listing)
print("Predicted House Price:", predicted_price)

# Test the saved model after loading it back from disk
loaded_model = joblib.load("bangalore_house_price_prediction_model.pkl")
reloaded_listing = dict(
    bath=3,
    balcony=2,
    total_sqft_int=1750,
    bhk=3,
    price_per_sqft=8571.428571,
    area_type="Super built-up",
    availability="Ready To Move",
    location="Devarabeesana Halli",
)
predicted_price = predict_house_price(model=loaded_model, **reloaded_listing)
print("Predicted House Price (Loaded Model):", predicted_price)


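# Recorded notebook output (presumably raised when a classifier was fitted on
# the continuous 'price' target):
# ValueError: Unknown label type: (array([  10.  ,   10.25,   11.  , ..., 2000.  , 2200.  , 2912.  ]),)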