In [116]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this also hides pandas / scikit-learn deprecation and
# copy-related warnings; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [117]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler,RobustScaler,minmax_scale
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder

In [118]:
# Load the raw house-price listings from the CSV next to the notebook.
df = pd.read_csv(filepath_or_buffer='bengaluru_house_prices.csv')

In [119]:
# Preview the first rows to see the raw column formats.
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [120]:
# Summarize column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


# Data Cleaning

In [121]:
# Count rows that are exact repeats of an earlier row
n_duplicates = df.duplicated().sum()
print("Number of duplicates:", n_duplicates)

# Per-column tally of missing entries
print("Missing values:")
print(df.isna().sum())

Number of duplicates: 529
Missing values:
area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64


In [122]:
# Remove repeated rows, keeping the first occurrence of each
df = df.drop_duplicates(keep='first')

In [123]:
# Handle missing values in a single chain:
#   1. rows without a location or size are discarded
#   2. society is removed entirely (too many nulls to be useful)
#   3. bath gaps take the column median (computed after step 1)
#   4. balcony gaps are treated as "no balcony" (0)
df = (
    df
    .dropna(subset=['location', 'size'])
    .drop(columns='society')
    .assign(
        bath=lambda frame: frame['bath'].fillna(frame['bath'].median()),
        balcony=lambda frame: frame['balcony'].fillna(0),
    )
)

print("Shape after handling missing values:", df.shape)
print("Missing values after cleaning:")
print(df.isna().sum())

Shape after handling missing values: (12774, 8)
Missing values after cleaning:
area_type       0
availability    0
location        0
size            0
total_sqft      0
bath            0
balcony         0
price           0
dtype: int64


In [124]:
# import re

# def convert_sqft_to_num(x):
#     tokens = x.split('-')
#     if len(tokens) == 2:
#         return (float(tokens[0]) + float(tokens[1])) / 2
#     try:
#         return float(x)
#     except:
#         return None

# df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

# # Drop rows where total_sqft is None
# df = df.dropna(subset=['total_sqft'])

# print("Shape after converting total_sqft:", df.shape)

def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float number of square feet.

    Supported string forms:
      * plain numbers, e.g. ``'1056'``
      * ranges, e.g. ``'2100 - 2850'`` (the midpoint is returned)
      * a number followed by one of the unit names listed in
        ``unit_to_sqft`` below, e.g. ``'34.46Sq. Meter'``

    Non-string input is passed through ``float()`` so that running the
    conversion a second time on an already-converted column is harmless
    (NaN stays NaN).

    Returns:
        float: the area in square feet, or
        None: when the value cannot be interpreted. This includes malformed
        ranges and strings with a leading minus sign, which are parsed as a
        range and rejected.
    """
    # Multipliers from each recognized unit to square feet. Insertion order
    # is the order in which the unit names are searched for.
    unit_to_sqft = {
        'Sq. Meter': 10.764,
        'Sq. Yards': 9.0,
        'Acres': 43560,
        'Guntha': 1089,
        'Cents': 435.6,
        'Grounds': 2400,
        'Perch': 272.25,
    }

    # Already numeric (or missing): no string parsing to do.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    try:
        # 1. Ranges such as '2100 - 2850' -> midpoint. Parsed inside the try
        #    so a malformed range yields None instead of raising.
        tokens = x.split('-')
        if len(tokens) == 2:
            return (float(tokens[0]) + float(tokens[1])) / 2

        # 2. Values carrying a unit name -> strip the unit and scale to sqft.
        for unit, factor in unit_to_sqft.items():
            if unit in x:
                return float(x.replace(unit, '')) * factor

        # 3. No unit: plain number.
        return float(x)
    except ValueError:
        # Only parsing failures are treated as "unparseable"; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

# Convert every raw total_sqft entry to a float number of square feet
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

# convert_sqft_to_num returns None for entries it cannot parse; drop those
# rows so no missing total_sqft value reaches the model fit further down.
df = df.dropna(subset=['total_sqft'])

In [125]:
# The text before the first space in size (e.g. '2 BHK', '4 Bedroom')
# is the room count
bhk_values = df['size'].map(lambda label: int(label.partition(' ')[0]))

# Append bhk as the last column and retire the raw size column
df = df.assign(bhk=bhk_values).drop(columns='size')

print("Columns after extracting bhk:", df.columns.tolist())

Columns after extracting bhk: ['area_type', 'availability', 'location', 'total_sqft', 'bath', 'balcony', 'price', 'bhk']


In [126]:
# Peek at the raw availability labels
print("Unique availability:", df['availability'].unique()[:10])  # first 10

# Collapse to a binary flag: 1 for 'Ready To Move', 0 for every other value
is_ready = df['availability'] == 'Ready To Move'
df['availability'] = is_ready.astype('int64')

print("Availability after simplification:", df['availability'].value_counts())

Unique availability: ['19-Dec' 'Ready To Move' '18-May' '18-Feb' '18-Nov' '20-Dec' '17-Oct'
 '21-Dec' '19-Sep' '20-Sep']
Availability after simplification: availability
1    10171
0     2603
Name: count, dtype: int64


In [127]:
# How many distinct locations are there?
n_locations = df['location'].nunique()
print("Number of unique locations:", n_locations)

# Too many categories to encode simply, so location is left out of the model
df = df.drop(columns=['location'])

print("Final columns:", df.columns.tolist())

Number of unique locations: 1304
Final columns: ['area_type', 'availability', 'total_sqft', 'bath', 'balcony', 'price', 'bhk']


In [128]:
# Check area_type unique
# Note: the labels printed below contain two consecutive spaces
# (e.g. 'Plot  Area'); match them exactly if filtering on this column.
print("Area type unique:", df['area_type'].unique())

Area type unique: ['Super built-up  Area' 'Plot  Area' 'Built-up  Area' 'Carpet  Area']


In [129]:
# Re-check dtypes and non-null counts after cleaning; area_type is the only
# remaining non-numeric column.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 12774 entries, 0 to 13318
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     12774 non-null  object 
 1   availability  12774 non-null  int64  
 2   total_sqft    12774 non-null  float64
 3   bath          12774 non-null  float64
 4   balcony       12774 non-null  float64
 5   price         12774 non-null  float64
 6   bhk           12774 non-null  int64  
dtypes: float64(4), int64(2), object(1)
memory usage: 798.4+ KB


In [130]:
# Target is price; every remaining column is a feature
target_col = 'price'
y = df[target_col]
x = df.drop(columns=[target_col])

In [131]:
# Replace the categorical area_type column with one-hot indicator columns
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['area_type']

# drop='first' omits the first category so the indicators are not redundant;
# sparse_output=False returns a dense array that can be wrapped in a DataFrame
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_features = encoder.fit_transform(x[categorical_cols])

# Wrap the encoded array, reusing x's index so the rows line up on concat
encoded_df = pd.DataFrame(
    encoded_features,
    index=x.index,
    columns=encoder.get_feature_names_out(categorical_cols),
)

# Swap the raw column for its indicator columns
x = pd.concat([x.drop(columns=categorical_cols), encoded_df], axis=1)

print("X columns after encoding:", x.columns.tolist())
print("X shape after encoding:", x.shape)

X columns after encoding: ['availability', 'total_sqft', 'bath', 'balcony', 'bhk', 'area_type_Carpet  Area', 'area_type_Plot  Area', 'area_type_Super built-up  Area']
X shape after encoding: (12774, 8)


In [132]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [133]:
# Fit an ordinary least squares regression on the training split
# (fit returns the estimator itself, so construction and fitting can be chained)
model = LinearRegression().fit(x_train, y_train)

print("Model trained successfully")

Model trained successfully


In [134]:
# Score the fitted model on the held-out split
from sklearn.metrics import mean_squared_error, r2_score

y_pred = model.predict(x_test)

# Both metrics take (y_true, y_pred), so compute them in one pass
mse, r2 = (metric(y_test, y_pred) for metric in (mean_squared_error, r2_score))

print(f"Mean Squared Error: {mse}")
print(f"R2 Score: {r2}")

Mean Squared Error: 19353.7509441019
R2 Score: 0.21944414359988773
