# 0. Import libraries and load data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import re
# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set(style="whitegrid")

# Load the raw, uncleaned dataset (the path is relative to the working directory).
df = pd.read_csv("../data/raw/surat_uncleaned.csv")

# 1. Data Cross-Swap

Because many values in the data are stored in the wrong column, we cross-fill them so that each row ends up with a suitable value in each column. We are going to work with:
- ``transaction``: {``New Property``, ``Resale``}
- ``status``: {``Ready to Move``, ``Freehold``} or any value containing the term "Poss". We will convert the latter to ``Possession`` later
- ``floor``: its format is `<string/int> out of <int>`
- ``furnishing``: {``Unfurnished``, ``Semi-Furnished``, ``Furnished``}
- ``facing``: any value that contains a direction term: "South", "North", "West", "East" (the validator also accepts "Main Road" and "Garden/Park")

In [2]:
# transaction valid value
def is_valid_transaction(v):
    """Tell whether ``v`` is an accepted ``transaction`` label.

    Returns True only for the exact strings "New Property" and "Resale".
    Missing values (as reported by ``pd.isna``) are never valid.
    """
    return (not pd.isna(v)) and v in {"New Property", "Resale"}

In [3]:
# status valid value
def is_valid_status(v):
    """Tell whether ``v`` is an accepted ``status`` value.

    A value is accepted when it is a string that is exactly "Ready to Move"
    or "Freehold", or that contains the substring "Poss" (case-sensitive).
    Missing values and non-string values are rejected.
    """
    if pd.isna(v) or not isinstance(v, str):
        return False

    # Exact labels first, then the substring rule for possession dates.
    if v in ("Ready to Move", "Freehold"):
        return True
    return "Poss" in v

In [4]:
# floor valid value
def is_valid_floor(v):
    """Tell whether ``v`` looks like a ``floor`` value.

    Only strings containing the literal text "out of" qualify; missing
    values and non-string values are rejected.
    """
    unusable = pd.isna(v) or not isinstance(v, str)
    return False if unusable else "out of" in v

In [5]:
# furnishing valid value
def is_valid_furnishing(v):
    """Tell whether ``v`` is an accepted ``furnishing`` label.

    Returns True only for the exact strings "Unfurnished", "Semi-Furnished"
    and "Furnished". Missing values (per ``pd.isna``) are never valid.
    """
    return (not pd.isna(v)) and v in {"Unfurnished", "Semi-Furnished", "Furnished"}

In [6]:
# facing valid value
def is_valid_facing(v):
    """Tell whether ``v`` looks like a ``facing`` value.

    A string qualifies when it contains at least one of the markers listed
    below (case-sensitive substring match). Missing values and non-string
    values are rejected.
    """
    if pd.isna(v) or not isinstance(v, str):
        return False

    markers = ("East", "West", "North", "South", "Main Road", "Garden/Park")
    for marker in markers:
        if marker in v:
            return True
    return False

In [7]:
# Maps each column that takes part in the cross-swap to the predicate deciding
# whether a value belongs in that column. fix_row iterates this dict, so the
# insertion order here is the order in which columns are repaired.
valid_mapping = {
    "transaction": is_valid_transaction,
    "status": is_valid_status,
    "floor": is_valid_floor,
    "furnishing": is_valid_furnishing,
    "facing": is_valid_facing,
}

In [8]:
def fix_row(row, max_iter=20):
    """Move misplaced values of one row into the columns they belong to.

    For every column listed in the module-level ``valid_mapping`` whose current
    value is missing or fails its validator, the first other column (in
    ``valid_mapping`` order) is searched whose value is valid for the broken
    column but not valid for the column it currently sits in. The two values
    are then swapped, so the displaced value (possibly missing) lands in the
    donor column.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame; it is copied, not modified.
    max_iter : int, default 20
        Upper bound on the number of full passes over the columns.

    Returns
    -------
    pandas.Series
        The repaired copy of ``row``.
    """
    fixed = row.copy()

    for _ in range(max_iter):
        swapped = False

        for target, accepts in valid_mapping.items():
            current = fixed[target]

            # Nothing to do when the column already holds a valid value.
            if pd.notna(current) and accepts(current):
                continue

            # First other column holding a value that fits `target` but does
            # not fit the column it is currently stored in.
            donor = next(
                (
                    name
                    for name, donor_accepts in valid_mapping.items()
                    if name != target
                    and pd.notna(fixed[name])
                    and accepts(fixed[name])
                    and not donor_accepts(fixed[name])
                ),
                None,
            )

            if donor is not None:
                # Swap even when `current` is missing.
                fixed[target], fixed[donor] = fixed[donor], current
                swapped = True

        # A pass without any swap means the row is stable.
        if not swapped:
            break

    return fixed

In [9]:
# Repair misplaced values row by row; axis=1 hands each row to fix_row as a Series.
df = df.apply(fix_row, axis=1)

# 2. Handle non-numerical data

After cross-swapping the data, we dive deeper into cleaning every single column of the data.

First, we clean ``areaWithType``, since it is the most stable column. There are four main values in the column, corresponding to the area types ``Super Area``, ``Carpet Area``, ``Plot Area`` and ``Built Area``, and we need to remove the rows with misclassified values.

In [10]:
# The four area types accepted in `areaWithType`.
valid_areaWithType_value = ["Super Area", "Carpet Area", "Plot Area", "Built Area"]

# Keep only the rows whose `areaWithType` is one of the accepted values.
df = df[df['areaWithType'].isin(valid_areaWithType_value)]

Back to ``transaction``: we first clean the column, keeping the valid values ``New Property`` and ``Resale``. Any other value will be converted to NaN.

In [11]:
def analyze_value_distribution(col_name, data=None, top_n=10):
    """Print a short frequency summary for one column.

    Parameters
    ----------
    col_name : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level ``df``
        is looked up at call time, which is the original behaviour.
    top_n : int, default 10
        How many of the most frequent values to print.

    Returns
    -------
    None
        The summary is written to stdout only.
    """
    print(f"\n=== Value Distribution for column: {col_name} ===")

    # Fall back to the global frame only when no frame was passed explicitly.
    frame = df if data is None else data
    column = frame[col_name]

    # dropna=False keeps missing values in the ranking, whereas nunique()
    # ignores them, so the two figures can differ by one.
    value_counts = column.value_counts(dropna=False)
    print("Number of unique values:", column.nunique())
    print(f"Top {top_n} most frequent values:\n", value_counts.head(top_n))

In [12]:
# Accepted transaction labels.
valid_transaction = {"New Property", "Resale"}

# Keep accepted labels as they are; replace every other entry with NaN.
df["transaction"] = df["transaction"].where(
    df["transaction"].isin(valid_transaction),
    np.nan
)

In [13]:
analyze_value_distribution("transaction")


=== Value Distribution for column: transaction ===
Number of unique values: 2
Top 10 most frequent values:
 transaction
Resale          2719
New Property    1775
NaN               25
Name: count, dtype: int64


In the ``status`` column, we keep the values ``Ready to Move`` and ``Freehold``. Any string that contains "Poss" will be converted into ``Possession``; everything else becomes missing.

In [14]:
def normalize_status(v):
    """Collapse a raw ``status`` value onto one canonical label.

    Returns "Ready to Move" or "Freehold" for those exact strings,
    "Possession" for any other string containing "Poss" (case-sensitive),
    and ``pd.NA`` for everything else, including missing and non-string
    values.
    """
    if pd.isna(v) or not isinstance(v, str):
        return pd.NA

    # Labels that are kept verbatim.
    for label in ("Ready to Move", "Freehold"):
        if v == label:
            return label

    return "Possession" if "Poss" in v else pd.NA

In [15]:
# Collapse every status onto "Ready to Move", "Possession", "Freehold" or pd.NA.
df["status"] = df["status"].apply(normalize_status)

In [16]:
analyze_value_distribution("status")


=== Value Distribution for column: status ===
Number of unique values: 3
Top 10 most frequent values:
 status
Ready to Move    3078
Possession       1117
<NA>              210
Freehold          114
Name: count, dtype: int64


For ``floor``, we keep only the values with the format `<string/int> out of <int>` and split them into two columns: ``floor`` (the current floor of the flat; ground -> 0 and basement -> -1) and ``num_floor`` (the total number of floors in the building).

In [18]:
# Work on a copy of `floor` cast to pandas' nullable "string" dtype, so that
# missing entries are pd.NA and every other entry is text a regex can match.
df["floor_raw"] = df["floor"].astype("string")

# Matches "<Ground|Basement|digits> out of <digits>", ignoring case and any
# leading/trailing whitespace; group 1 is the current floor, group 2 the total.
pattern = re.compile(
    r"^\s*(Ground|Basement|\d+)\s+out\s+of\s+(\d+)\s*$",
    re.IGNORECASE
)

def parse_floor(v):
    """Split a raw floor description into ``(floor, num_floor)``.

    "Ground" maps to 0 and "Basement" to -1 (case-insensitive); a numeric
    current floor is converted with ``int``. The total is always an ``int``.
    Missing input, or text that does not match ``pattern``, yields
    ``(pd.NA, pd.NA)``.
    """
    missing = (pd.NA, pd.NA)
    if pd.isna(v):
        return missing

    matched = pattern.match(v)
    if matched is None:
        return missing

    level = matched.group(1).lower()
    total = matched.group(2)

    # Named levels get fixed numbers; anything else is a digit string.
    named_levels = {"ground": 0, "basement": -1}
    floor_val = named_levels[level] if level in named_levels else int(level)

    return floor_val, int(total)

In [19]:
# Parse every raw floor string into (floor, num_floor). Wrapping the tuple in a
# pd.Series makes apply return a two-column frame, which overwrites `floor`
# and creates `num_floor`.
df[["floor", "num_floor"]] = df["floor_raw"].apply(
    lambda x: pd.Series(parse_floor(x))
)

# The temporary raw column is no longer needed.
df.drop(columns="floor_raw", inplace=True)

In [20]:
analyze_value_distribution("floor")


=== Value Distribution for column: floor ===
Number of unique values: 21
Top 10 most frequent values:
 floor
<NA>    848
5       564
0       386
7       376
6       365
1       364
4       304
3       300
2       286
8       221
Name: count, dtype: int64


In [21]:
analyze_value_distribution("num_floor")


=== Value Distribution for column: num_floor ===
Number of unique values: 23
Top 10 most frequent values:
 num_floor
<NA>    848
13      833
12      528
14      399
5       377
4       273
2       178
10      172
1       169
3       158
Name: count, dtype: int64


For ``furnishing``, we keep just the three main values ``Unfurnished``, ``Semi-Furnished`` and ``Furnished``. Everything else becomes missing (NA).

In [22]:
# Accepted furnishing labels.
valid_furnishing = {
    "Unfurnished",
    "Semi-Furnished",
    "Furnished"
}

# Keep accepted labels as they are; replace every other entry with pd.NA.
df["furnishing"] = df["furnishing"].where(
    df["furnishing"].isin(valid_furnishing),
    pd.NA
)

In [23]:
analyze_value_distribution("furnishing")


=== Value Distribution for column: furnishing ===
Number of unique values: 3
Top 10 most frequent values:
 furnishing
Unfurnished       2739
<NA>               666
Semi-Furnished     585
Furnished          529
Name: count, dtype: int64


Now we come back to the dirtiest column, ``facing``, which contains many misclassified values (1675 of the 4519 rows remaining at this point, about 37%), as well as ``description``, which has 30.3% missing values. We can check the ``facing`` figure via code:

In [24]:
# Flag every row whose `facing` value fails the is_valid_facing check.
invalid_mask = ~df["facing"].apply(is_valid_facing)

# True counts as 1, so the sum is the number of invalid rows.
num_invalid = invalid_mask.sum()

print(f"Number of invalid values in 'facing': {num_invalid}")

Number of invalid values in 'facing': 1675


Therefore, we are going to drop ``facing`` as well as ``description``.

In [25]:
# Remove the two columns judged too unreliable to keep.
df = df.drop(columns=['facing', 'description'])

Finally, we remove all rows with NaN in the cleaned columns and take a look at the resulting dataframe before moving on to the next phase: cleaning the numerical data.

In [27]:
# Columns that must all be present for a row to be kept.
cols = ["areaWithType", "transaction", "status", "furnishing", "floor", "num_floor"]

# Drop every row with a missing value in any of those columns.
df = df.dropna(subset=cols)

In [28]:
df.describe()

Unnamed: 0,property_name,areaWithType,square_feet,transaction,status,floor,furnishing,price_per_sqft,price,num_floor
count,3255,3255,3255,3255,3255,3255,3255,3006,3255,3255
unique,1412,2,1073,2,3,21,3,1615,722,23
top,3 BHK Apartment for Sale in Vesu Surat,Super Area,1000 sqft,Resale,Ready to Move,5,Unfurnished,"₹5,000 per sqft",Call for Price,13
freq,70,2102,62,1837,2337,539,2313,59,142,811


# 3. Handle numerical data