# Step 1: Import Libraries

In [1]:
# Step 1: Import necessary libraries
import pandas as pd      # For data manipulation
import numpy as np       # For numerical computations
import matplotlib.pyplot as plt  # For plotting
import seaborn as sns    # For advanced visualizations

# Libraries for machine learning tasks
from sklearn.model_selection import train_test_split  # For splitting data
from sklearn.linear_model import LinearRegression       # Our ML model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score  # For evaluation

# Optional: To suppress warnings during development
import warnings
# NOTE: the "ignore" action is installed with no category or module filter,
# so every warning raised after this line is silenced, not just noisy ones.
warnings.filterwarnings("ignore")

# Step 2: Load the Dataset

In [2]:
# Step 2: Read the housing CSV into a DataFrame and preview the first rows
data = pd.read_csv('Dataset/Bengaluru_House_Data.csv')
print("Dataset loaded successfully!", data.head(), sep="\n")

Dataset loaded successfully!
              area_type   availability                  location       size  \
0  Super built-up  Area         19-Dec  Electronic City Phase II      2 BHK   
1            Plot  Area  Ready To Move          Chikka Tirupathi  4 Bedroom   
2        Built-up  Area  Ready To Move               Uttarahalli      3 BHK   
3  Super built-up  Area  Ready To Move        Lingadheeranahalli      3 BHK   
4  Super built-up  Area  Ready To Move                  Kothanur      2 BHK   

   society total_sqft  bath  balcony   price  
0  Coomee        1056   2.0      1.0   39.07  
1  Theanmp       2600   5.0      3.0  120.00  
2      NaN       1440   2.0      3.0   62.00  
3  Soiewre       1521   3.0      1.0   95.00  
4      NaN       1200   2.0      1.0   51.00  


# Step 3: Exploratory Data Analysis (EDA)

In [3]:
# Step 3: Exploratory Data Analysis (EDA)
# Structural overview: dtypes and non-null counts for every column
print("\nDataset Information:")
data.info()

# Per-column tally of missing entries
print("\nMissing values in each column:", data.isna().sum(), sep="\n")


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB

Missing values in each column:
area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64


# Step 4: Data Cleaning & Preprocessing

# 4.1: Drop Unnecessary Columns

In [4]:
# Step 4.1: Remove columns that are not used by the rest of the notebook
columns_to_drop = ['society', 'availability']
# errors='ignore' keeps this cell from failing when a column is already gone
data = data.drop(columns_to_drop, axis=1, errors='ignore')

# 4.2: Convert total_sqft to Numeric

In [5]:
# Step 4.2: Function to convert total_sqft into a numeric value
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry into a float.

    Parameters:
        x: Usually a string such as "1056", or a range such as
            "1133 - 1384". Values that are already numeric are accepted too.

    Returns:
        float: The parsed number, the midpoint of the first two bounds for a
        range, or ``np.nan`` when the entry cannot be interpreted.
    """
    # Non-string input: pass numbers through, map anything else to NaN.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return np.nan

    try:
        # A hyphen marks a range; use the average of its two bounds.
        if '-' in x:
            tokens = x.split('-')
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except ValueError:
        # Only parse failures are treated as "not convertible"; unrelated
        # errors (e.g. KeyboardInterrupt) are no longer swallowed.
        return np.nan

# Parse every total_sqft entry, then discard the rows that failed to convert
data = (
    data
    .assign(total_sqft=data['total_sqft'].map(convert_sqft_to_num))
    .dropna(subset=['total_sqft'])
)

# 4.3: Extract Number of Bedrooms (BHK)

In [6]:
# Step 4.3: Extract number of BHK from the 'size' column with error handling

def extract_bhk(x):
    """Return the bedroom count encoded in a ``size`` entry.

    Parameters:
        x: A string whose first whitespace-separated token is the count
            (e.g. "2 BHK", "4 Bedroom"), a plain int/float, or a missing value.

    Returns:
        int: The bedroom count when it can be determined.
        float: ``np.nan`` for missing, unparsable, or unsupported values.
    """
    # If the value is missing, return NaN
    if pd.isna(x):
        return np.nan
    # If the value is a string (e.g., "2 BHK")
    if isinstance(x, str):
        try:
            # split() without a separator ignores leading and repeated
            # whitespace, so " 3 BHK" still yields "3" as the first token.
            return int(x.split()[0])
        except (ValueError, IndexError):
            # Non-numeric first token, or a blank string with no tokens.
            return np.nan
    # If the value is already numeric (e.g., a float like 2.0)
    if isinstance(x, (int, float)):
        try:
            return int(x)
        except (OverflowError, ValueError):
            # Infinite values have no integer representation.
            return np.nan
    return np.nan

# Build the integer 'bhk' column from 'size' in a single chained step:
#   1. parse each 'size' entry with extract_bhk
#   2. discard rows whose bedroom count could not be determined
#   3. cast the surviving counts to integers
#   4. remove the original 'size' column, which is no longer needed
data = (
    data
    .assign(bhk=data['size'].map(extract_bhk))
    .dropna(subset=['bhk'])
    .astype({'bhk': int})
    .drop(columns='size')
)

# Step 5: Handle Categorical Variables

# 5.1: Process the location Column

In [7]:
# Step 5.1: Collapse infrequent locations into a single 'other' bucket
location_stats = data['location'].value_counts()
rare_locations = location_stats.loc[location_stats <= 10].index  # Threshold can be adjusted
data['location'] = data['location'].mask(data['location'].isin(rare_locations), 'other')

# Step 5.2: Expand 'location' into indicator columns (first category dropped)
data = pd.get_dummies(data, drop_first=True, columns=['location'])

# Step 6: Feature Selection & Target Variable

In [8]:
# Step 6: Assemble the model inputs and the target
# Indicator columns produced by the one-hot encoding of 'location'
location_columns = [name for name in data.columns if name.startswith('location_')]

# Numeric base features first, followed by every location indicator
feature_columns = ['total_sqft', 'bath', 'balcony', 'bhk', *location_columns]

# Feature matrix and target vector ('price' is the value being predicted)
X = data.loc[:, feature_columns]
y = data.loc[:, 'price']

# Step 7: Split the Data into Training and Testing Sets

In [9]:
# Step 7: Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)
print("\nData split into training and testing sets.")


Data split into training and testing sets.


# Step 7.5: Handle Missing Values in the Feature Matrix

In [10]:
from sklearn.impute import SimpleImputer

# Create an imputer object with a mean filling strategy
imputer = SimpleImputer(strategy='mean')

# Fit the imputer on the training data only, then apply it to both splits.
# The imputer hands back bare NumPy arrays, which would discard the column
# labels; re-wrap the results as DataFrames so the model trained afterwards
# sees the same feature names as the DataFrames later passed to predict().
# Assumes no feature column is entirely missing in the training split
# (such a column could be dropped by the imputer and break the re-labelling).
X_train = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

# Step 8: Train the Machine Learning Model

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression

# Preprocessing + model chain: mean imputation -> standardisation -> linear regression
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])

# Train every stage of the chain on the training split
pipeline.fit(X_train, y_train)
print("\nPipeline training completed.")

# Pull the fitted regression step back out to inspect its learned parameters
regressor = pipeline.named_steps['regressor']
print("Intercept:", regressor.intercept_)
print("Coefficients:", regressor.coef_)


Pipeline training completed.
Intercept: 112.54817980388458
Coefficients: [ 6.24470566e+01  5.18202615e+01 -2.07309222e+00 -1.34160721e+01
  3.16767975e+00  2.84012947e+00 -5.74237367e-02  4.72657452e+00
  2.04748860e+00  1.95945346e-01  1.50342301e+00  2.38172098e+00
  3.46809792e-01  3.12921304e-01  1.39046168e-02 -6.05315308e-01
  4.56259643e-01  1.12237930e+00  2.30828099e+00  1.03922536e+00
  2.78680614e-01 -1.82307665e-01  4.71873576e-01 -8.20801569e-02
  7.19871011e-01  1.29889637e-01  1.05623974e+00  1.02984900e+00
  3.99648196e+00  1.06199201e+00  7.86825777e-03  6.92284697e-01
  1.51014041e+00  2.56522767e+00  6.20826892e+00  8.98059961e-01
 -2.84556393e-01 -1.49200042e-01  8.00209042e-01 -3.41900696e-01
  2.12421684e-01  2.24111750e+00  4.71232268e+00  3.37291507e+00
 -1.23559376e-01 -6.86089379e-01 -6.41313416e-01  1.08981869e+00
  6.10894109e+00  1.90473311e-01  1.45481769e-01  7.15042283e-01
  2.19624224e+00  4.14527020e-01 -1.13712536e+00 -2.04489291e-01
 -1.03136277e+00

# Step 9: Evaluate the Model

In [12]:
# Score the fitted pipeline on the held-out test split
y_pred = pipeline.predict(X_test)

# Error and goodness-of-fit metrics, computed against the true test prices
mae, mse, r2 = (
    mean_absolute_error(y_test, y_pred),
    mean_squared_error(y_test, y_pred),
    r2_score(y_test, y_pred),
)

# Report each metric rounded to two decimals
print("\nModel Evaluation Metrics:")
print("Mean Absolute Error (MAE):", round(mae, 2))
print("Mean Squared Error (MSE):", round(mse, 2))
print("R2 Score:", round(r2, 2))


Model Evaluation Metrics:
Mean Absolute Error (MAE): 41.33
Mean Squared Error (MSE): 9714.88
R2 Score: 0.55


# Step 10: Build a Recommendation Function

In [13]:
def recommend_properties_pipeline(user_budget, data, feature_columns, pipeline):
    """
    List the properties whose model-predicted price fits the given budget.

    Parameters:
        user_budget (float): Upper bound (inclusive) on the predicted price.
        data (DataFrame): Dataset containing at least ``feature_columns``;
            it is not modified.
        feature_columns (list): Columns passed to the model for prediction.
        pipeline (Pipeline): Fitted object exposing ``predict``.

    Returns:
        recommendations (DataFrame): A copy of the qualifying rows with an
        added ``predicted_price`` column, ordered by ascending predicted price.
    """
    # Score every row with the fitted pipeline
    predicted = pipeline.predict(data[feature_columns])

    return (
        data
        # assign() works on a copy, so the caller's frame stays untouched
        .assign(predicted_price=predicted)
        # keep only rows at or below the budget
        .loc[lambda frame: frame['predicted_price'] <= user_budget]
        # cheapest predictions first
        .sort_values(by='predicted_price')
    )

# Step 11: Get User Input and Display Recommendations

In [14]:
try:
    # Budget is entered in lakhs; anything that fails to parse falls back to 0
    user_budget = float(input("Enter your budget in lakhs: "))
except Exception:
    print("Invalid input. Please enter a numeric value for the budget.")
    user_budget = 0

# Written as "not (> 0)" so that zero, negatives and NaN all take the same branch
if not user_budget > 0:
    print("Please provide a valid budget to get recommendations.")
else:
    recommended_houses = recommend_properties_pipeline(user_budget, data, feature_columns, pipeline)

    if recommended_houses.empty:
        print("\nNo houses found within the specified budget.")
    else:
        print("\nRecommended Houses within your budget (showing top 5 recommendations):")
        # Model inputs plus predicted and actual price, first five rows only
        display_columns = feature_columns + ['predicted_price', 'price']
        print(recommended_houses[display_columns].head())

Enter your budget in lakhs:  500000



Recommended Houses within your budget (showing top 5 recommendations):
      total_sqft  bath  balcony  bhk  location_1st Block Jayanagar  \
2841       527.0   1.0      0.0    1                         False   
4698       640.0   1.0      1.0    1                         False   
6329       600.0   1.0      2.0    2                         False   
2702       674.0   1.0      1.0    1                         False   
5516       750.0   1.0      1.0    1                         False   

      location_1st Phase JP Nagar  location_2nd Phase Judicial Layout  \
2841                        False                               False   
4698                        False                               False   
6329                        False                               False   
2702                        False                               False   
5516                        False                               False   

      location_2nd Stage Nagarbhavi  location_5th Block Hbr Layout  