In [47]:
import pandas as pd
import matplotlib.pyplot as plt

# Let wide frames render up to 50 columns before pandas truncates the display.
pd.options.display.max_columns = 50

In [48]:
# Load the transaction export. Empty strings, "-" and a three-space cell are
# all treated as missing values while parsing.
raw = pd.read_csv(
    "data/readable_transaction_01_09_2025.csv",
    na_values=["", "-", "   "],
)

# Column names, non-null counts and dtypes of the freshly loaded frame.
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383584 entries, 0 to 383583
Data columns (total 13 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   Property Type                    383584 non-null  object 
 1   District                         383584 non-null  object 
 2   Mukim                            383584 non-null  object 
 3   Scheme Name/Area                 383584 non-null  object 
 4   Road Name                        283469 non-null  object 
 5   Month, Year of Transaction Date  383584 non-null  object 
 6   Tenure                           383584 non-null  object 
 7   Land/Parcel Area                 383584 non-null  float64
 8   Unit                             383584 non-null  object 
 9   Main Floor Area                  283477 non-null  float64
 10  Unit                             283477 non-null  object 
 11  Unit Level                       383584 non-null  object 
 12  Tr

In [49]:
# Spot-check one record by position, shown as a Series of its column values.
raw.iloc[383544]

Property Type                       Town House
District                            Timur Laut
Mukim                                       13
Scheme Name/Area                   THE ADDRESS
Road Name                                  NaN
Month, Year of Transaction Date       1/6/2023
Tenure                                Freehold
Land/Parcel Area                         222.0
Unit                                      sq.m
Main Floor Area                            NaN
Unit                                       NaN
Unit Level                                   1
Transaction Price                      1550000
Name: 383544, dtype: object

In [50]:
# Normalise the column headers, one step per line:
# trim whitespace -> spaces to "_" -> lowercase -> drop commas -> "/" to "_or_".
cols = (
    raw.columns
    .str.strip()
    .str.replace(" ", "_")
    .str.lower()
    .str.replace(",", "")
    .str.replace("/", "_or_")
)
print(cols)

# Rename on a copy so `raw` keeps the headers exactly as they were loaded.
# NOTE: the cleaned index contains the label `unit` twice, so those two
# columns can only be told apart by position from here on.
data = raw.copy()
data.columns = cols

# Peek at one random row to confirm the new headers line up with the values.
data.sample()

Index(['property_type', 'district', 'mukim', 'scheme_name_or_area',
       'road_name', 'month_year_of_transaction_date', 'tenure',
       'land_or_parcel_area', 'unit', 'main_floor_area', 'unit', 'unit_level',
       'transaction_price'],
      dtype='object')


Unnamed: 0,property_type,district,mukim,scheme_name_or_area,road_name,month_year_of_transaction_date,tenure,land_or_parcel_area,unit,main_floor_area,unit.1,unit_level,transaction_price
183161,2 - 2 1/2 Storey Terraced,Kuala Lumpur,Mukim Kuala Lumpur,DAMANSARA HEIGHTS (BKT DAMANSARA),LORONG SETIABISTARI 5,1/1/2022,Freehold,160.0,sq.m,151.0,sq.m,,1600000


In [51]:
# Two columns share the label `unit` (positions 8 and 10), so inspect each one
# by position rather than by name.
for unit_pos in (8, 10):
    print(data.iloc[:, unit_pos].unique())

# Both columns hold the same single unit value; the second additionally has
# NaN where the source carried a missing-value marker such as `-`.
# They add no information, and dropping by label removes both at once.
data = data.drop(columns="unit")
data.info()

['sq.m']
['sq.m' nan]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 383584 entries, 0 to 383583
Data columns (total 11 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   property_type                   383584 non-null  object 
 1   district                        383584 non-null  object 
 2   mukim                           383584 non-null  object 
 3   scheme_name_or_area             383584 non-null  object 
 4   road_name                       283469 non-null  object 
 5   month_year_of_transaction_date  383584 non-null  object 
 6   tenure                          383584 non-null  object 
 7   land_or_parcel_area             383584 non-null  float64
 8   main_floor_area                 283477 non-null  float64
 9   unit_level                      383584 non-null  object 
 10  transaction_price               383584 non-null  int64  
dtypes: float64(2), int64(1), object(8)
memory usage: 32.2+ M

In [56]:
# Missing-data overview: number of NaN cells in each column.
print(data.isna().sum())

# Keep every row with a NaN in at least one column, then show a random
# handful of them.
rows_missing = data.loc[lambda frame: frame.isna().any(axis="columns")]
rows_missing.sample(5)

property_type                          0
district                               0
mukim                                  0
scheme_name_or_area                    0
road_name                         100115
month_year_of_transaction_date         0
tenure                                 0
land_or_parcel_area                    0
main_floor_area                   100107
unit_level                             0
transaction_price                      0
dtype: int64


Unnamed: 0,property_type,district,mukim,scheme_name_or_area,road_name,month_year_of_transaction_date,tenure,land_or_parcel_area,main_floor_area,unit_level,transaction_price
260289,Condominium/Apartment,Kinta,Ipoh Selatan,PRIMA FALIM,,1/5/2024,Freehold,73.0,,21,218000
299521,Condominium/Apartment,Timur Laut,Bdr Tanjung Pinang,QUAYSIDE CONDO (SERI TANJUNG PINANG),,1/3/2023,Freehold,111.39,,1,850000
322011,Flat,Hulu Selangor,Bandar Serendah,BUKIT SENTOSA,,1/7/2022,Freehold,72.65,,3,98000
256081,Condominium/Apartment,Johor Bahru,Plentong,TMN MOLEK,,1/5/2023,Freehold,137.0,,27,680000
340982,Low-Cost Flat,Klang,Kapar,"PANGSAPURI REBANA, BDR BUKIT RAJA",,1/9/2023,Freehold,61.0,,4,130000
