In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Load the raw listings CSV; the relative path is resolved against the
# kernel's current working directory.
df=pd.read_csv('Raw_KL_Housing_Dataset.csv')
# A bare expression on the last line renders the frame as the cell output.
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,"RM1,250,000",3,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,Dutamas,"RM1,030,000",3,4.0,2.0,Condominium,"Built-up : 1,875 sq. ft.",Partly Furnished
2,Bukit Jalil,"RM900,000",5,3.0,2.0,Condominium,"Built-up : 1,513 sq. ft.",Partly Furnished
3,Taman Tun Dr Ismail,"RM5,350,000",6,5.0,4.0,Bungalow,Land area : 7200 sq. ft.,Partly Furnished
4,Taman Tun Dr Ismail,"RM2,600,000",5,4.0,4.0,Semi-detached House,Land area : 3600 sq. ft.,Partly Furnished
...,...,...,...,...,...,...,...,...
32110,Seputeh,"RM750,000",3,2.0,1.0,Condominium,Built-up : 915 sq. ft.,Partly Furnished
32111,KL Sentral,"RM1,400,000",4,3.0,2.0,Condominium,Land area : 1544 sq. ft.,Fully Furnished
32112,KL Eco City,"RM880,000",1,1.0,1.0,Condominium,Built-up : 650 sq. ft.,Partly Furnished
32113,Sri Hartamas,"RM2,700,000",6,6.0,3.0,Condominium,"Built-up : 3,973 sq. ft.",Partly Furnished


In [2]:
def target_preprocess(df, col):
    """Convert a currency-formatted text column to numbers and drop empty rows.

    Text values such as ``"RM1,250,000"`` have the ``RM`` prefix and the
    thousands separators removed before being parsed. Entries that still
    cannot be parsed become NaN and are dropped together with the rows that
    were missing to begin with. A column that is not text (for example one
    that is already numeric) skips the parsing step and only has its NaN rows
    dropped, so calling the function twice is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. When parsing happens, ``df[col]`` is
        overwritten in place with the parsed numbers.
    col : str
        Name of the column to clean (``'Price'`` in this notebook).

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows whose ``col`` is not NaN.
    """
    column = df[col]
    # Accept both classic object columns and the dedicated pandas string
    # dtypes; checking only for 'object' would skip the latter and leave the
    # values as text.
    is_text = (
        pd.api.types.is_object_dtype(column)
        or pd.api.types.is_string_dtype(column)
    )
    if is_text:
        cleaned = (
            column
            .str.replace('RM', '', regex=False)
            .str.replace(',', '', regex=False)
            .str.strip()
        )
        # One vectorised conversion; unparsable text becomes NaN so that the
        # dropna below removes it instead of the whole call raising.
        df[col] = pd.to_numeric(cleaned, errors='coerce')
    return df.dropna(subset=[col])

# Convert the 'Price' text (e.g. "RM1,250,000") to numbers and drop the rows
# that have no price. `df` is rebound to the returned frame.
df = target_preprocess(df, 'Price')
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Size,Furnishing
0,KLCC,1250000,3,3.0,2.0,Serviced Residence,"Built-up : 1,335 sq. ft.",Fully Furnished
1,Dutamas,1030000,3,4.0,2.0,Condominium,"Built-up : 1,875 sq. ft.",Partly Furnished
2,Bukit Jalil,900000,5,3.0,2.0,Condominium,"Built-up : 1,513 sq. ft.",Partly Furnished
3,Taman Tun Dr Ismail,5350000,6,5.0,4.0,Bungalow,Land area : 7200 sq. ft.,Partly Furnished
4,Taman Tun Dr Ismail,2600000,5,4.0,4.0,Semi-detached House,Land area : 3600 sq. ft.,Partly Furnished
...,...,...,...,...,...,...,...,...
32110,Seputeh,750000,3,2.0,1.0,Condominium,Built-up : 915 sq. ft.,Partly Furnished
32111,KL Sentral,1400000,4,3.0,2.0,Condominium,Land area : 1544 sq. ft.,Fully Furnished
32112,KL Eco City,880000,1,1.0,1.0,Condominium,Built-up : 650 sq. ft.,Partly Furnished
32113,Sri Hartamas,2700000,6,6.0,3.0,Condominium,"Built-up : 3,973 sq. ft.",Partly Furnished


In [3]:
import ast

def _area_from_text(text):
    """Return the area described by ``text``, or NaN when it cannot be read.

    ``text`` is either a single number (``"1335"``) or dimensions joined by
    ``*`` (``"22 * 80"``), in which case the factors are multiplied together.
    """
    if not isinstance(text, str):
        # Missing values arrive here as NaN/None rather than as text.
        return float('nan')
    area = 1.0
    try:
        for factor in text.split('*'):
            area *= float(factor)
    except ValueError:
        # Empty or non-numeric factor: treat the whole entry as unknown.
        return float('nan')
    return area


def clean_up_size(df, col):
    """Split a free-text size column into ``'Build Type'`` and ``'Sqft'``.

    Values are expected to look like ``"Built-up : 1,335 sq. ft."`` or
    ``"Land area : 22 x 80 sq. ft."``. The text before ``" : "`` becomes
    ``'Build Type'``. The text between ``" : "`` and ``" sq. ft."`` becomes
    ``'Sqft'``: thousands separators are removed and dimensions written as
    ``W x L`` (either case of ``x``) are multiplied into an area. Anything
    that does not match the pattern, or whose size is not numeric, yields NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is modified in place: the two new
        columns are added to it (``col`` itself is left untouched).
    col : str
        Name of the text column to parse (``'Size'`` in this notebook).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``'Build Type'`` (text) and ``'Sqft'``
        (float) columns added.
    """
    parts = df[col].str.extract(r'^([^:]+) : (.*) sq\. ft\.$')
    df['Build Type'] = parts[0]

    normalised = (
        parts[1]
        .str.replace(',', '', regex=False)
        .str.replace('x', '*', regex=False)
        .str.replace('X', '*', regex=False)
    )
    # The product is computed explicitly: ast.literal_eval only accepts
    # literals, so it rejects "22*80" and would turn every W x L entry
    # into NaN.
    df['Sqft'] = normalised.apply(_area_from_text).astype(float)

    return df
# Derive 'Build Type' and 'Sqft' from the raw 'Size' text, then discard the
# raw column now that it has been parsed.
df = clean_up_size(df, 'Size').drop(columns='Size')
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Furnishing,Build Type,Sqft
0,KLCC,1250000,3,3.0,2.0,Serviced Residence,Fully Furnished,Built-up,1335.0
1,Dutamas,1030000,3,4.0,2.0,Condominium,Partly Furnished,Built-up,1875.0
2,Bukit Jalil,900000,5,3.0,2.0,Condominium,Partly Furnished,Built-up,1513.0
3,Taman Tun Dr Ismail,5350000,6,5.0,4.0,Bungalow,Partly Furnished,Land area,7200.0
4,Taman Tun Dr Ismail,2600000,5,4.0,4.0,Semi-detached House,Partly Furnished,Land area,3600.0
...,...,...,...,...,...,...,...,...,...
32110,Seputeh,750000,3,2.0,1.0,Condominium,Partly Furnished,Built-up,915.0
32111,KL Sentral,1400000,4,3.0,2.0,Condominium,Fully Furnished,Land area,1544.0
32112,KL Eco City,880000,1,1.0,1.0,Condominium,Partly Furnished,Built-up,650.0
32113,Sri Hartamas,2700000,6,6.0,3.0,Condominium,Partly Furnished,Built-up,3973.0


In [5]:
# Fill the gaps that have a default, then drop the listings that are still
# missing a field with no default:
# - 'Car Parks': a missing value is filled with 0.
# - 'Furnishing': a missing value is filled with 'Unfurnished'.
# - +/-inf is turned into NaN first, so infinite 'Sqft', 'Bathrooms' or
#   'Rooms' values are removed by the dropna as well.
# Built as one chained expression (no inplace=True) so the result is a fresh
# frame rather than a mutation of the frame shown by the previous cell.
df = (
    df.assign(**{
        'Car Parks': df['Car Parks'].fillna(0),
        'Furnishing': df['Furnishing'].fillna('Unfurnished'),
    })
    .replace([np.inf, -np.inf], np.nan)
    .dropna(subset=['Sqft', 'Bathrooms', 'Rooms'])
)
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Furnishing,Build Type,Sqft
0,KLCC,1250000,3,3.0,2.0,Serviced Residence,Fully Furnished,Built-up,1335.0
1,Dutamas,1030000,3,4.0,2.0,Condominium,Partly Furnished,Built-up,1875.0
2,Bukit Jalil,900000,5,3.0,2.0,Condominium,Partly Furnished,Built-up,1513.0
3,Taman Tun Dr Ismail,5350000,6,5.0,4.0,Bungalow,Partly Furnished,Land area,7200.0
4,Taman Tun Dr Ismail,2600000,5,4.0,4.0,Semi-detached House,Partly Furnished,Land area,3600.0
...,...,...,...,...,...,...,...,...,...
32110,Seputeh,750000,3,2.0,1.0,Condominium,Partly Furnished,Built-up,915.0
32111,KL Sentral,1400000,4,3.0,2.0,Condominium,Fully Furnished,Land area,1544.0
32112,KL Eco City,880000,1,1.0,1.0,Condominium,Partly Furnished,Built-up,650.0
32113,Sri Hartamas,2700000,6,6.0,3.0,Condominium,Partly Furnished,Built-up,3973.0


In [6]:
# Bounds for the listings kept from here on (lower bound inclusive, upper
# bound exclusive). Prices are expressed in millions after the rescale below.
RM_PER_MILLION = 1_000_000
MIN_PRICE_MILLION, MAX_PRICE_MILLION = 0.5, 10
MIN_SQFT, MAX_SQFT = 500, 5000

# NOTE: this cell rescales 'Price' and rebinds `df`, so it is not idempotent:
# running it a second time divides the price by a million again and the price
# filter then removes every row. Re-run the notebook from the loading cell
# rather than re-running this cell on its own.
df = df.assign(Price=df['Price'] / RM_PER_MILLION)

# A single combined mask; a separate "Price < 10" pass is unnecessary because
# the price range below already enforces that upper bound.
price_in_range = (df['Price'] >= MIN_PRICE_MILLION) & (df['Price'] < MAX_PRICE_MILLION)
size_in_range = (df['Sqft'] >= MIN_SQFT) & (df['Sqft'] < MAX_SQFT)
df = df.loc[price_in_range & size_in_range]
df

Unnamed: 0,Location,Price,Rooms,Bathrooms,Car Parks,Property Type,Furnishing,Build Type,Sqft
0,KLCC,1.25,3,3.0,2.0,Serviced Residence,Fully Furnished,Built-up,1335.0
1,Dutamas,1.03,3,4.0,2.0,Condominium,Partly Furnished,Built-up,1875.0
2,Bukit Jalil,0.90,5,3.0,2.0,Condominium,Partly Furnished,Built-up,1513.0
4,Taman Tun Dr Ismail,2.60,5,4.0,4.0,Semi-detached House,Partly Furnished,Land area,3600.0
9,Mont Kiara,1.78,5,4.0,2.0,Condominium,Partly Furnished,Built-up,1830.0
...,...,...,...,...,...,...,...,...,...
32110,Seputeh,0.75,3,2.0,1.0,Condominium,Partly Furnished,Built-up,915.0
32111,KL Sentral,1.40,4,3.0,2.0,Condominium,Fully Furnished,Land area,1544.0
32112,KL Eco City,0.88,1,1.0,1.0,Condominium,Partly Furnished,Built-up,650.0
32113,Sri Hartamas,2.70,6,6.0,3.0,Condominium,Partly Furnished,Built-up,3973.0
