In [1]:
import pandas as pd
import numpy as np
import re

In [None]:
df = pd.read_csv("data/house_prices.csv")

In [3]:
df.shape

(187531, 21)

In [4]:
df.head()

Unnamed: 0,Index,Title,Description,Amount(in rupees),Price (in rupees),location,Carpet Area,Status,Floor,Transaction,...,facing,overlooking,Society,Bathroom,Balcony,Car Parking,Ownership,Super Area,Dimensions,Plot Area
0,0,1 BHK Ready to Occupy Flat for sale in Srushti...,"Bhiwandi, Thane has an attractive 1 BHK Flat f...",42 Lac,6000.0,thane,500 sqft,Ready to Move,10 out of 11,Resale,...,,,Srushti Siddhi Mangal Murti Complex,1,2.0,,,,,
1,1,2 BHK Ready to Occupy Flat for sale in Dosti V...,One can find this stunning 2 BHK flat for sale...,98 Lac,13799.0,thane,473 sqft,Ready to Move,3 out of 22,Resale,...,East,Garden/Park,Dosti Vihar,2,,1 Open,Freehold,,,
2,2,2 BHK Ready to Occupy Flat for sale in Sunrise...,Up for immediate sale is a 2 BHK apartment in ...,1.40 Cr,17500.0,thane,779 sqft,Ready to Move,10 out of 29,Resale,...,East,Garden/Park,Sunrise by Kalpataru,2,,1 Covered,Freehold,,,
3,3,1 BHK Ready to Occupy Flat for sale Kasheli,This beautiful 1 BHK Flat is available for sal...,25 Lac,,thane,530 sqft,Ready to Move,1 out of 3,Resale,...,,,,1,1.0,,,,,
4,4,2 BHK Ready to Occupy Flat for sale in TenX Ha...,"This lovely 2 BHK Flat in Pokhran Road, Thane ...",1.60 Cr,18824.0,thane,635 sqft,Ready to Move,20 out of 42,Resale,...,West,"Garden/Park, Main Road",TenX Habitat Raymond Realty,2,,1 Covered,Co-operative Society,,,


In [5]:
def extract_bhk(title):
    """Tách số BHK từ Title, loại bỏ khỏi chuỗi tiêu đề"""
    bhk_match = re.search(r'(\d+)\s*BHK', title)
    if bhk_match:
        bhk = bhk_match.group(1)
        new_title = re.sub(r'\d+\s*BHK', '', title).strip()
        return bhk, new_title
    else:
        # Nếu không có BHK => giả định 1 (Studio Apartment)
        return '1', title

def process_balcony(value):
    """Chuẩn hóa Balcony hoặc Bathroom, chuyển về int"""
    if pd.isna(value):
        return 0
    value = str(value).strip()
    if value in ['> 10', '>10']:
        return 11
    try:
        return int(value)
    except ValueError:
        return 0

def extract_parking_count(value):
    """Tách số lượng chỗ đỗ xe từ Car Parking"""
    if pd.isna(value):
        return 0
    value = str(value).replace(',', '').strip()
    parts = value.split()
    try:
        count = int(parts[0])
    except (ValueError, IndexError):
        count = 0
    return min(count, 40)

def convert_to_rupees(value):
    """Chuyển đổi từ 'Lac' hoặc 'Cr' sang số rupee"""
    if pd.isna(value):
        return None
    value = str(value).replace(",", "").strip()
    if "Lac" in value:
        return float(value.replace("Lac", "").strip()) * 1_00_000
    elif "Cr" in value:
        return float(value.replace("Cr", "").strip()) * 1_00_00_000
    else:
        try:
            return float(value)
        except:
            return None

def extract_area(area_str):
    """Chuyển diện tích về sqft"""
    if pd.isnull(area_str):
        return None
    area_str = str(area_str).lower().strip()
    match = re.match(r'([\d,\.]+)\s*(sqft|sqyd|sqyrd|sqm)', area_str)
    if not match:
        return None
    value_str, unit = match.groups()
    try:
        value = float(value_str.replace(',', ''))
    except ValueError:
        return None
    if unit == 'sqft':
        sqft = value
    elif unit in ['sqyd', 'sqyrd']:
        sqft = value * 9
    elif unit == 'sqm':
        sqft = value * 10.7639
    else:
        return None
    return int(round(sqft))

In [6]:
# Tách BHK từ Title
df[['BHK', 'Title']] = df['Title'].apply(lambda x: pd.Series(extract_bhk(x)))
df['BHK'] = pd.to_numeric(df['BHK'], errors='coerce')

In [8]:
# Xử lý Balcony & Bathroom
df['Balcony'] = df['Balcony'].apply(process_balcony)
df['Bathroom'] = df['Bathroom'].apply(process_balcony)

In [9]:
# Xử lý Car Parking
df['Car Parking'] = df['Car Parking'].apply(extract_parking_count)

In [10]:
# Điền giá trị thiếu cho các cột dạng category/text
df["facing"] = df["facing"].fillna("Unknown")
df["overlooking"] = df["overlooking"].fillna("Unknown")
df["Ownership"] = df["Ownership"].fillna("Unknown")
df["Furnishing"] = df["Furnishing"].fillna("Unknown")
df["Description"] = df["Description"].fillna("Don't have description")
df["Status"].fillna(df["Status"].mode()[0], inplace=True)
df["Floor"].fillna(df["Floor"].mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Status"].fillna(df["Status"].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Floor"].fillna(df["Floor"].mode()[0], inplace=True)


In [11]:
# Điền NaN cho numeric
df["Balcony"] = df["Balcony"].fillna(0)
df["Bathroom"] = df["Bathroom"].fillna(1)
df["Car Parking"] = df["Car Parking"].fillna(0)

In [12]:
# Loại bỏ cột không dùng hoặc trùng lặp
drop_cols = ["Plot Area", "Dimensions", "Society", "overlooking"]
df.drop(columns=[col for col in drop_cols if col in df.columns], inplace=True)

In [13]:
# Xử lý cột Floor -> tách thành Current Floor & Total Floors
df[["Current Floor", "Total Floors"]] = df["Floor"].str.extract(r"(\d+) out of (\d+)")
df["Current Floor"] = pd.to_numeric(df["Current Floor"], errors="coerce").fillna(0).astype(int)
df["Total Floors"] = pd.to_numeric(df["Total Floors"], errors="coerce").fillna(0).astype(int)
df.drop(columns=["Floor"], inplace=True)

In [14]:
# Chuyển đổi Amount(in rupees)
df["Amount(in rupees)"] = df["Amount(in rupees)"].apply(convert_to_rupees)


In [15]:
# Chuyển đổi diện tích
if "Carpet Area" in df.columns:
    df["Carpet Area"] = df["Carpet Area"].apply(extract_area)
if "Super Area" in df.columns:
    df["Super Area"] = df["Super Area"].apply(extract_area)

In [16]:
# Bỏ bản ghi thiếu giá
df.dropna(subset=["Price (in rupees)"], inplace=True)
df.dropna(subset=["Transaction"], inplace=True)

In [17]:
df['Amount'] = df['Amount(in rupees)']
df['Price'] = df['Price (in rupees)']
df['Location'] = df['location']
df['Facing'] = df['facing']

df.drop(columns=['Amount(in rupees)', 'Price (in rupees)', 'location', 'facing'], inplace=True)

In [18]:
df.head()

Unnamed: 0,Index,Title,Description,Carpet Area,Status,Transaction,Furnishing,Bathroom,Balcony,Car Parking,Ownership,Super Area,BHK,Current Floor,Total Floors,Amount,Price,Location,Facing
0,0,Ready to Occupy Flat for sale in Srushti Siddh...,"Bhiwandi, Thane has an attractive 1 BHK Flat f...",500.0,Ready to Move,Resale,Unfurnished,1,2,0,Unknown,,1,10,11,4200000.0,6000.0,thane,Unknown
1,1,Ready to Occupy Flat for sale in Dosti Vihar P...,One can find this stunning 2 BHK flat for sale...,473.0,Ready to Move,Resale,Semi-Furnished,2,0,1,Freehold,,2,3,22,9800000.0,13799.0,thane,East
2,2,Ready to Occupy Flat for sale in Sunrise by Ka...,Up for immediate sale is a 2 BHK apartment in ...,779.0,Ready to Move,Resale,Unfurnished,2,0,1,Freehold,,2,10,29,14000000.0,17500.0,thane,East
4,4,Ready to Occupy Flat for sale in TenX Habitat ...,"This lovely 2 BHK Flat in Pokhran Road, Thane ...",635.0,Ready to Move,Resale,Unfurnished,2,0,1,Co-operative Society,,2,20,42,16000000.0,18824.0,thane,West
5,5,Ready to Occupy Flat for sale in Virat Aangan ...,Creatively planned and constructed is a 1 BHK ...,,Ready to Move,Resale,Unfurnished,1,1,0,Co-operative Society,680.0,1,2,7,4500000.0,6618.0,thane,East


In [None]:
df.to_csv("data/HousePrice_processed.csv", index=False)
print("✅ Dữ liệu đã tiền xử lý xong và lưu vào data/HousePrice_processed.csv")

✅ Dữ liệu đã tiền xử lý xong và lưu vào HousePrice_processed.csv


In [20]:
import pandas as pd

# Đọc dữ liệu đã xử lý
df = pd.read_csv("HousePrice_processed.csv")

# 1. Xem kích thước & thông tin tổng quát
print("📌 Kích thước dataset:", df.shape)
print("\n📌 Thông tin dataset:")
print(df.info())

# 2. Xem 5 dòng đầu tiên
print("\n📌 5 dòng đầu tiên:")
print(df.head())

# 3. Kiểm tra giá trị thiếu theo cột
print("\n📌 Số lượng giá trị thiếu mỗi cột:")
print(df.isnull().sum())

# 4. Thống kê mô tả cho cột số
print("\n📌 Thống kê mô tả các cột số:")
print(df.describe())

# 5. Thống kê mô tả cho cột dạng category/text
print("\n📌 Thống kê giá trị duy nhất (cột dạng category):")
for col in df.select_dtypes(include='object').columns:
    print(f"{col}: {df[col].nunique()} giá trị duy nhất")

# 6. Tìm bản ghi trùng lặp
duplicates = df[df.duplicated()]
print(f"\n📌 Số bản ghi trùng lặp: {len(duplicates)}")

# 7. Kiểm tra các cột quan trọng có giá trị bất thường
print("\n📌 Giá trị BHK bất thường (<=0 hoặc quá lớn):")
print(df[(df['BHK'] <= 0) | (df['BHK'] > 20)])

print("\n📌 Giá trị diện tích bất thường (Carpet Area <= 0 hoặc quá lớn):")
if 'Carpet Area' in df.columns:
    print(df[(df['Carpet Area'] <= 0) | (df['Carpet Area'] > 20000)])

print("\n📌 Giá trị Price bất thường (<=0 hoặc quá lớn):")
print(df[(df['Price'] <= 0) | (df['Price'] > 1e9)])


📌 Kích thước dataset: (169860, 19)

📌 Thông tin dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 169860 entries, 0 to 169859
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Index          169860 non-null  int64  
 1   Title          169860 non-null  object 
 2   Description    169860 non-null  object 
 3   Carpet Area    93620 non-null   float64
 4   Status         169860 non-null  object 
 5   Transaction    169860 non-null  object 
 6   Furnishing     169860 non-null  object 
 7   Bathroom       169860 non-null  int64  
 8   Balcony        169860 non-null  int64  
 9   Car Parking    169860 non-null  int64  
 10  Ownership      169860 non-null  object 
 11  Super Area     76215 non-null   float64
 12  BHK            169860 non-null  int64  
 13  Current Floor  169860 non-null  int64  
 14  Total Floors   169860 non-null  int64  
 15  Amount         169860 non-null  float64
 16  Price          16