In [1]:
# Colab-only step: open the browser upload widget so the dataset CSV can be
# supplied interactively. The next cell reads "/content/Real Estate Data V21.csv",
# so the uploaded file is expected to carry that name
# (presumably uploads land in /content — confirm).
from google.colab import files
files.upload()

Output hidden; open in https://colab.research.google.com to view.

In [2]:
# basic_dataset_checks.py
# Quick first-look report on the raw real-estate CSV: head/tail, shape, columns,
# dtypes, summary statistics, missing values, duplicates, categorical
# cardinality and numeric correlations. Defines the notebook-wide `df`.
import pandas as pd

# 1. Load dataset
df = pd.read_csv("/content/Real Estate Data V21.csv")

# 2. Show first 5 rows
print("\n🔹 First 5 rows:")
print(df.head())

# 3. Show last 5 rows
print("\n🔹 Last 5 rows:")
print(df.tail())

# 4. Dataset shape (rows, columns)
print("\n🔹 Shape of dataset:", df.shape)

# 5. Column names
print("\n🔹 Columns in dataset:")
print(df.columns.tolist())

# 6. Data types and non-null counts
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
print("\n🔹 Info:")
df.info()

# 7. Statistical summary of numerical columns
print("\n🔹 Statistical Summary:")
print(df.describe())

# 8. Check for missing values
print("\n🔹 Missing Values:")
print(df.isnull().sum())

# 9. Check for duplicate rows
print("\n🔹 Duplicate Rows:", df.duplicated().sum())

# 10. Unique values in categorical columns
print("\n🔹 Unique Values in Categorical Columns:")
for col in df.select_dtypes(include=['object']).columns:
    print(f"{col}: {df[col].nunique()} unique values")

# 11. Correlation matrix (for numeric columns)
# numeric_only=True keeps the string columns out of the correlation.
print("\n🔹 Correlation Matrix:")
print(df.corr(numeric_only=True))



🔹 First 5 rows:
                                      Name  \
0                         Casagrand ECR 14   
1    Ramanathan Nagar, Pozhichalur,Chennai   
2                              DAC Prapthi   
3  Naveenilaya,Chepauk, Triplicane,Chennai   
4                 VGN Spring Field Phase 1   

                                      Property Title     Price  \
0  4 BHK Flat for sale in Kanathur Reddikuppam, C...  ₹1.99 Cr   
1  10 BHK Independent House for sale in Pozhichal...  ₹2.25 Cr   
2      3 BHK Flat for sale in West Tambaram, Chennai   ₹1.0 Cr   
3  7 BHK Independent House for sale in Triplicane...  ₹3.33 Cr   
4              2 BHK Flat for sale in Avadi, Chennai   ₹48.0 L   

                                   Location  Total_Area  Price_per_SQFT  \
0             Kanathur Reddikuppam, Chennai        2583          7700.0   
1     Ramanathan Nagar, Pozhichalur,Chennai        7000          3210.0   
2  Kasthuribai Nagar, West Tambaram,Chennai        1320          7580.0   
3   Navee

In [3]:
# Inspect the column labels of the freshly loaded frame.
df.columns

Index(['Name', 'Property Title', 'Price', 'Location', 'Total_Area',
       'Price_per_SQFT', 'Description', 'Baths', 'Balcony'],
      dtype='object')

In [4]:
import numpy as np
# Function to convert price string to lakhs
def convert_to_lakhs(price):
    """Convert a price string to a float amount in lakhs (1 lakh = 100,000).

    Supported formats, after stripping "₹", commas and surrounding spaces
    (unit matching is case-insensitive):
      * crores: "1.99 Cr", "2.5 cr", "3 Crore(s)"   -> value * 100
      * lakhs:  "48.0 L", "75 Lakh(s)", "60 Lac(s)" -> value
      * plain rupee amount: "1200000", "1250000.0"  -> value / 100000

    Args:
        price: Raw price value. Anything that is not a ``str`` (e.g. NaN or
            None from a missing cell) is treated as missing.

    Returns:
        float: The price in lakhs, or ``np.nan`` when the value is missing or
        its format is not recognized.
    """
    if not isinstance(price, str):
        return np.nan

    # Remove ₹, commas, and spaces; lower-case so the unit checks ignore case
    text = price.replace("₹", "").replace(",", "").strip().lower()

    try:
        if "cr" in text:  # Crores
            # Strip the longest spelling first so no fragment is left behind
            for unit in ("crores", "crore", "cr"):
                text = text.replace(unit, "")
            return float(text.strip()) * 100
        if "l" in text:  # Lakhs
            for unit in ("lakhs", "lakh", "lacs", "lac", "l"):
                text = text.replace(unit, "")
            return float(text.strip())
        # Raw rupee amount (e.g., "1200000"); allow a single decimal point
        if text.replace(".", "", 1).isdigit():
            return float(text) / 100000
    except ValueError:
        # float() rejected the remaining text: unrecognized format
        return np.nan
    return np.nan

# Run the converter over every raw price. The result is a plain Python list
# used only for inspection here; df itself is not modified by this cell.
converted = list(map(convert_to_lakhs, df['Price']))

print("Original:", df['Price'])
# NOTE(review): this prints the whole list, one entry per row of df.
print("In Lakhs:", converted)
# Count the entries that could not be parsed (NaN)
print("Missing values:", sum(1 for value in converted if np.isnan(value)))

Original: 0        ₹1.99 Cr
1        ₹2.25 Cr
2         ₹1.0 Cr
3        ₹3.33 Cr
4         ₹48.0 L
           ...   
14523     ₹40.0 L
14524     ₹14.0 L
14525     ₹30.0 L
14526     ₹60.0 L
14527     ₹55.0 L
Name: Price, Length: 14528, dtype: object
In Lakhs: [199.0, 225.0, 100.0, 333.0, 48.0, 40.0, 60.0, 72.35, 42.0, 30.0, 29.4, 130.0, 850.0, 114.99999999999999, 49.0, 57.0, 25.0, 260.0, 297.0, 162.0, 108.0, 100.0, 40.0, 89.0, 63.0, 90.0, 25.0, 49.5, 29.0, 45.0, 70.0, 46.0, 89.5, 45.0, 54.0, 87.0, 250.0, 70.0, 26.0, 90.0, 54.0, 240.0, 29.0, 90.0, 24.8, 37.0, 65.0, 120.0, 56.0, 45.0, 45.0, 75.0, 32.0, 85.0, 36.0, 50.0, 225.0, 160.0, 65.0, 32.0, 55.0, 54.0, 79.0, 85.0, 67.0, 99.0, 120.0, 75.0, 63.0, 88.0, 26.86, 64.9, 58.0, 275.0, 90.0, 114.99999999999999, 239.0, 30.0, 27.28, 70.0, 28.0, 93.0, 45.0, 65.0, 513.0, 43.75, 69.0, 45.0, 100.0, 29.5, 72.0, 378.0, 15.0, 130.0, 18.0, 78.0, 48.5, 72.0, 36.0, 300.0, 72.0, 650.0, 17.0, 58.5, 58.5, 229.99999999999997, 40.0, 45.0, 40.4, 300.0, 56.0, 3

In [5]:
# Re-check the column labels; the previous cell only built the `converted`
# list, so df is unchanged at this point.
df.columns

Index(['Name', 'Property Title', 'Price', 'Location', 'Total_Area',
       'Price_per_SQFT', 'Description', 'Baths', 'Balcony'],
      dtype='object')

In [6]:
import pandas as pd

# Split "Location" at its FIRST comma into two new columns.
# NOTE(review): for a value such as "Kasthuribai Nagar, West Tambaram,Chennai"
# this puts the locality in "city" and "West Tambaram,Chennai" in "state", so
# the names look misleading. They are kept because later cells select the
# "city" and "state" columns by name.
#
# The guard keeps the cell re-runnable: once "Location" has been dropped, a
# second run would otherwise fail with KeyError on df["Location"].
if "Location" in df.columns:
    # Split location into city and state
    df[["city", "state"]] = df["Location"].str.split(",", n=1, expand=True)

    # Clean spaces
    df["city"] = df["city"].str.strip()
    df["state"] = df["state"].str.strip()

    # Drop the original location column
    df.drop(columns=["Location"], inplace=True)

print(df)

                                                    Name  \
0                                       Casagrand ECR 14   
1                  Ramanathan Nagar, Pozhichalur,Chennai   
2                                            DAC Prapthi   
3                Naveenilaya,Chepauk, Triplicane,Chennai   
4                               VGN Spring Field Phase 1   
...                                                  ...   
14523      Krishna Park Extension, Tilak Nagar,New Delhi   
14524                   Rawta, Jaffarpur Kalan,New Delhi   
14525                Rani Garden, Geeta Colony,New Delhi   
14526  Lig flat rohini,Sector 16E, Sector 16 Rohini,N...   
14527               Sector 3B, Sector 3 Rohini,New Delhi   

                                          Property Title     Price  \
0      4 BHK Flat for sale in Kanathur Reddikuppam, C...  ₹1.99 Cr   
1      10 BHK Independent House for sale in Pozhichal...  ₹2.25 Cr   
2          3 BHK Flat for sale in West Tambaram, Chennai   ₹1.0 Cr   

In [7]:
# Confirm that "Location" was replaced by the new "city" and "state" columns.
df.columns

Index(['Name', 'Property Title', 'Price', 'Total_Area', 'Price_per_SQFT',
       'Description', 'Baths', 'Balcony', 'city', 'state'],
      dtype='object')

In [8]:
import re
import pandas as pd
import numpy as np

# Function to extract number of bedrooms
def extract_bedrooms(description: str):
    """Pull the bedroom count out of a free-text listing description.

    Finds the first integer that is followed (optionally via whitespace and/or
    a single hyphen) by "BHK", "Bedroom(s)" or "Bed(s)", ignoring case.

    Args:
        description: Listing text. Non-string values are accepted and yield
            None.

    Returns:
        The matched count as an ``int``, or ``None`` when the input is not a
        string or nothing matches.
    """
    if isinstance(description, str):
        found = re.search(r'(\d+)\s*[-]?\s*(BHK|Bedrooms?|Beds?)', description, re.IGNORECASE)
        if found is not None:
            # Group 1 is the run of digits in front of the unit word
            return int(found.group(1))
    return None

# Function to add bedroom column to DataFrame
def add_bedroom_column(df: pd.DataFrame):
    """Add a "Bedrooms" column derived from the "Description" column.

    The frame is modified in place and also returned. "Description" is left
    untouched so other steps can still use it; dropping it is a separate step.

    Args:
        df: Frame expected to contain a "Description" column.

    Returns:
        The same DataFrame object. When "Description" is missing, an error
        message is printed and the frame is returned unchanged.
    """
    has_description = 'Description' in df.columns
    if has_description:
        # One extraction per row; rows without a match get None
        df["Bedrooms"] = df["Description"].apply(extract_bedrooms)
    else:
        print("Error: 'Description' column not found in the DataFrame.")
    return df

# Apply the function to the main DataFrame df (adds the "Bedrooms" column)
df = add_bedroom_column(df)

# Preview the first rows to check the new column
print(df.head())

                                      Name  \
0                         Casagrand ECR 14   
1    Ramanathan Nagar, Pozhichalur,Chennai   
2                              DAC Prapthi   
3  Naveenilaya,Chepauk, Triplicane,Chennai   
4                 VGN Spring Field Phase 1   

                                      Property Title     Price  Total_Area  \
0  4 BHK Flat for sale in Kanathur Reddikuppam, C...  ₹1.99 Cr        2583   
1  10 BHK Independent House for sale in Pozhichal...  ₹2.25 Cr        7000   
2      3 BHK Flat for sale in West Tambaram, Chennai   ₹1.0 Cr        1320   
3  7 BHK Independent House for sale in Triplicane...  ₹3.33 Cr        4250   
4              2 BHK Flat for sale in Avadi, Chennai   ₹48.0 L         960   

   Price_per_SQFT                                        Description  Baths  \
0          7700.0  Best 4 BHK Apartment for modern-day lifestyle ...      4   
1          3210.0  Looking for a 10 BHK Independent House for sal...      6   
2          7580.0

In [9]:
# "Description" is not used as a model feature below, so it is dropped here.
# errors="ignore" keeps the cell re-runnable: without it a second run raises
# KeyError because the column is already gone.
df = df.drop(columns=["Description"], errors="ignore")

In [10]:
# Confirm "Description" is gone and "Bedrooms" is present.
df.columns

Index(['Name', 'Property Title', 'Price', 'Total_Area', 'Price_per_SQFT',
       'Baths', 'Balcony', 'city', 'state', 'Bedrooms'],
      dtype='object')

In [16]:
import pandas as pd
import joblib
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer # Import SimpleImputer

# Feature matrix and target.
# NOTE(review): y here is the raw "Price" column (strings such as "₹1.99 Cr");
# a later cell re-assigns X and y with the numeric "Price_in_Lakhs" target
# before fitting, so these two assignments are not what the model trains on.
# NOTE(review): 'Price_per_SQFT' together with 'Total_Area' may effectively
# encode the price — check for target leakage.
X = df[['Name', 'Property Title', 'Total_Area', 'Price_per_SQFT',
        'Baths', 'Balcony', 'Bedrooms', 'state', 'city']]
y = df['Price']
# Column groups: one-hot encoded vs. imputed-and-scaled.
# NOTE(review): 'Name' and 'Property Title' look like free-text fields with many
# distinct values — confirm one-hot encoding them is intended.
categorical_cols = ['Name', 'Property Title', 'Balcony', 'state', 'city']
numeric_cols = ['Total_Area', 'Price_per_SQFT', 'Baths', 'Bedrooms']

# Numeric branch: fill missing values, then standardize
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # Impute missing values with the median
    ('scaler', StandardScaler())
])

# Route each column group to its transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols), # Use the numeric_transformer pipeline
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols) # categories unseen during fit do not raise
    ]
)

# Full model: preprocessing followed by an ordinary linear regression.
# The pipeline is only constructed here; fitting happens in a later cell.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

In [20]:
# Build the numeric target (price in lakhs) and drop rows whose price could
# not be parsed (convert_to_lakhs returned NaN).
df['Price_in_Lakhs'] = df['Price'].apply(convert_to_lakhs)
df.dropna(subset=['Price_in_Lakhs'], inplace=True)
# Select y and X AFTER the drop so both come from the same filtered rows.
y = df['Price_in_Lakhs']
X = df[['Name', 'Property Title', 'Total_Area', 'Price_per_SQFT',
        'Baths', 'Balcony', 'Bedrooms', 'state', 'city']]
# Fit on every remaining row.
# NOTE(review): no train/test split or evaluation is visible in this notebook —
# confirm model quality is checked elsewhere before the pipeline is used.
pipeline.fit(X, y)

In [21]:
# Persist the fitted pipeline (preprocessing + model) under a relative path,
# i.e. in the current working directory.
joblib.dump(pipeline, "real_estate_pipeline.pkl")
print("Pipeline trained and saved successfully!")

Pipeline trained and saved successfully!
