<a href="https://colab.research.google.com/github/AabidMK/RealtyAI_Infosys_Internship_Aug2025/blob/Sarayu-Narra/Bagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the dataset
df = pd.read_csv('/content/Real Estate Data V21.csv (1).zip')

# Display the first 5 rows and info
display(df.head())
display(df.info())

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

# Handle missing values - Impute numerical columns with the mean and categorical with mode
# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=np.number).columns
categorical_cols = df.select_dtypes(include='object').columns

# Impute numerical columns with mean
for col in numerical_cols:
    if missing_values[col] > 0:
        df[col].fillna(df[col].mean(), inplace=True)

# Impute categorical columns with mode
for col in categorical_cols:
    if missing_values[col] > 0:
        df[col].fillna(df[col].mode()[0], inplace=True)

# Display info again to confirm changes
display(df.info())

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",₹1.99 Cr,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,₹2.25 Cr,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",₹1.0 Cr,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,₹3.33 Cr,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",₹48.0 L,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14528 entries, 0 to 14527
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            14528 non-null  object 
 1   Property Title  14528 non-null  object 
 2   Price           14528 non-null  object 
 3   Location        14528 non-null  object 
 4   Total_Area      14528 non-null  int64  
 5   Price_per_SQFT  14528 non-null  float64
 6   Description     14528 non-null  object 
 7   Baths           14528 non-null  int64  
 8   Balcony         14528 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1021.6+ KB


None

Missing values per column:
Name              0
Property Title    0
Price             0
Location          0
Total_Area        0
Price_per_SQFT    0
Description       0
Baths             0
Balcony           0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14528 entries, 0 to 14527
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            14528 non-null  object 
 1   Property Title  14528 non-null  object 
 2   Price           14528 non-null  object 
 3   Location        14528 non-null  object 
 4   Total_Area      14528 non-null  int64  
 5   Price_per_SQFT  14528 non-null  float64
 6   Description     14528 non-null  object 
 7   Baths           14528 non-null  int64  
 8   Balcony         14528 non-null  object 
dtypes: float64(1), int64(2), object(6)
memory usage: 1021.6+ KB


None

In [None]:
# Convert 'Price' column to numerical format
def convert_price_to_numeric_robust(price_str):
    try:
        # Ensure the input is treated as a string before using string methods
        price_str = str(price_str).replace('₹', '').replace(',', '').strip()
        if 'Cr' in price_str:
            return float(price_str.replace('Cr', '').strip()) * 10000000
        elif 'L' in price_str:
            return float(price_str.replace('L', '').strip()) * 100000
        else:
             # Try converting directly to float if no 'Cr' or 'L'
             return float(price_str)
    except ValueError:
        # If conversion fails, return NaN
        return np.nan

# Only apply the conversion if the 'Price' column is of object type
if df['Price'].dtype == 'object':
    df['Price'] = df['Price'].apply(convert_price_to_numeric_robust)

# Drop rows where price conversion failed (Price is NaN)
df.dropna(subset=['Price'], inplace=True)

# Display the information of the DataFrame again to confirm the data types and NaN counts
display(df.info())

# Display the first few rows to confirm the data structure
display(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 14526 entries, 0 to 14527
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            14526 non-null  object 
 1   Property Title  14526 non-null  object 
 2   Price           14526 non-null  float64
 3   Location        14526 non-null  object 
 4   Total_Area      14526 non-null  int64  
 5   Price_per_SQFT  14526 non-null  float64
 6   Description     14526 non-null  object 
 7   Baths           14526 non-null  int64  
 8   Balcony         14526 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 1.1+ MB


None

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",19900000.0,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,22500000.0,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",10000000.0,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,33300000.0,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",4800000.0,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


In [None]:
# Inspect unique values in the 'Price' column to understand the different formats
unique_price_values = df['Price'].unique()
print("Sample unique values in 'Price' column:")
print(unique_price_values[:50])

Sample unique values in 'Price' column:
[19900000. 22500000. 10000000. 33300000.  4800000.  4000000.  6000000.
  7235000.  4200000.  3000000.  2940000. 13000000. 85000000. 11500000.
  4900000.  5700000.  2500000. 26000000. 29700000. 16200000. 10800000.
  8900000.  6300000.  9000000.  4950000.  2900000.  4500000.  7000000.
  4600000.  8950000.  5400000.  8700000. 25000000.  2600000. 24000000.
  2480000.  3700000.  6500000. 12000000.  5600000.  7500000.  3200000.
  8500000.  3600000.  5000000. 16000000.  5500000.  7900000.  6700000.
  9900000.]


In [None]:
# Drop rows where price conversion failed (Price is NaN)
df.dropna(subset=['Price'], inplace=True)

# Display the information of the DataFrame one last time to confirm no missing values in 'Price'
display(df.info())

# Display the first few rows to confirm the data structure
display(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 14526 entries, 0 to 14527
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            14526 non-null  object 
 1   Property Title  14526 non-null  object 
 2   Price           14526 non-null  float64
 3   Location        14526 non-null  object 
 4   Total_Area      14526 non-null  int64  
 5   Price_per_SQFT  14526 non-null  float64
 6   Description     14526 non-null  object 
 7   Baths           14526 non-null  int64  
 8   Balcony         14526 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 1.1+ MB


None

Unnamed: 0,Name,Property Title,Price,Location,Total_Area,Price_per_SQFT,Description,Baths,Balcony
0,Casagrand ECR 14,"4 BHK Flat for sale in Kanathur Reddikuppam, C...",19900000.0,"Kanathur Reddikuppam, Chennai",2583,7700.0,Best 4 BHK Apartment for modern-day lifestyle ...,4,Yes
1,"Ramanathan Nagar, Pozhichalur,Chennai",10 BHK Independent House for sale in Pozhichal...,22500000.0,"Ramanathan Nagar, Pozhichalur,Chennai",7000,3210.0,Looking for a 10 BHK Independent House for sal...,6,Yes
2,DAC Prapthi,"3 BHK Flat for sale in West Tambaram, Chennai",10000000.0,"Kasthuribai Nagar, West Tambaram,Chennai",1320,7580.0,"Property for sale in Tambaram, Chennai. This 3...",3,No
3,"Naveenilaya,Chepauk, Triplicane,Chennai",7 BHK Independent House for sale in Triplicane...,33300000.0,"Naveenilaya,Chepauk, Triplicane,Chennai",4250,7840.0,Entire Building for sale with 7 units of singl...,5,Yes
4,VGN Spring Field Phase 1,"2 BHK Flat for sale in Avadi, Chennai",4800000.0,"Avadi, Chennai",960,5000.0,"Property for sale in Avadi, Chennai. This 2 BH...",3,Yes


In [None]:
# Extract the city from the 'Location' column
df['City'] = df['Location'].apply(lambda x: x.split(',')[-1].strip())

# Extract the number of bedrooms (BHK) from the 'Property Title' column
# This is a more robust approach to handle variations in the title
df['BHK'] = df['Property Title'].str.extract(r'(\d+)\s*BHK').astype(float)

# Handle cases where BHK information might be missing or in different formats (NaNs after extraction)
# Impute missing BHK values with the mode
bhk_mode = df['BHK'].mode()[0]
df['BHK'].fillna(bhk_mode, inplace=True)
df['BHK'] = df['BHK'].astype(int) # Convert to integer after imputation

# Create a binary feature for 'Balcony' column
df['Balcony_Encoded'] = df['Balcony'].apply(lambda x: 1 if x == 'Yes' else 0)

# Drop the original 'Location', 'Property Title', and 'Balcony' columns
df.drop(['Location', 'Property Title', 'Balcony'], axis=1, inplace=True)

# Inspect the new features
display(df[['City', 'BHK', 'Balcony_Encoded']].head())
display(df[['City', 'BHK', 'Balcony_Encoded']].info())
display(df['City'].value_counts())
display(df['BHK'].value_counts())
display(df['Balcony_Encoded'].value_counts())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['BHK'].fillna(bhk_mode, inplace=True)


Unnamed: 0,City,BHK,Balcony_Encoded
0,Chennai,4,1
1,Chennai,10,1
2,Chennai,3,0
3,Chennai,7,1
4,Chennai,2,1


<class 'pandas.core.frame.DataFrame'>
Index: 14526 entries, 0 to 14527
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   City             14526 non-null  object
 1   BHK              14526 non-null  int64 
 2   Balcony_Encoded  14526 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 453.9+ KB


None

Unnamed: 0_level_0,count
City,Unnamed: 1_level_1
Bangalore,4512
Pune,2964
New Delhi,2164
Chennai,1595
Kolkata,1392
Mumbai,1353
Hyderabad,540
Thane,6


Unnamed: 0_level_0,count
BHK,Unnamed: 1_level_1
2,6429
3,3121
1,2586
4,985
5,591
6,295
10,166
7,143
8,132
9,78


Unnamed: 0_level_0,count
Balcony_Encoded,Unnamed: 1_level_1
1,8578
0,5948


In [None]:
# 1. Define features (X) and target (y)
X = df.drop('Price', axis=1)
y = df['Price']

# 2. Identify numerical and categorical features
numerical_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(include='object').columns

# 3. Create ColumnTransformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep other columns (if any)
)

# 4. Create a BaggingRegressor model
bagging_model = BaggingRegressor(random_state=42)

# 5. Combine preprocessor and model into a Pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', bagging_model)])

# 6. Print the pipeline structure
print(pipeline)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('num', StandardScaler(),
                                                  Index(['Total_Area', 'Price_per_SQFT', 'Baths', 'BHK', 'Balcony_Encoded'], dtype='object')),
                                                 ('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  Index(['Name', 'Description', 'City'], dtype='object'))])),
                ('regressor', BaggingRegressor(random_state=42))])


In [None]:
# Train the pipeline on the data
pipeline.fit(X, y)

In [None]:
# Make predictions on the test data
y_pred = pipeline.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared Score: {r2:.2f}")

NameError: name 'X_test' is not defined

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"Root Mean Squared Error: {rmse:.2f}")
print(f"R-squared Score: {r2:.2f}")

Mean Squared Error: 43008214755949.76
Root Mean Squared Error: 6558064.86
R-squared Score: 0.85


In [None]:
import joblib

# Define the filenames for the pipeline and model
pipeline_filename = 'real_estate_pipeline.pkl'
model_filename = 'bagging_regressor_model.pkl'

# Save the pipeline and the trained model
joblib.dump(pipeline, pipeline_filename)
joblib.dump(bagging_model, model_filename)

print(f"Pipeline saved to {pipeline_filename}")
print(f"Model saved to {model_filename}")

Pipeline saved to real_estate_pipeline.pkl
Model saved to bagging_regressor_model.pkl
