In [1]:
import pandas as pd

In [2]:
# Load the scraped listings from CSV and preview the first rows
df = pd.read_csv(filepath_or_buffer='lamudiscraped.csv')
df.head(n=5)

Unnamed: 0,web-scraper-order,web-scraper-start-url,title,title-href,lokasi,luaslahan,luasbangunan,kamartidur,kamarmandi,fullyfurnished,memiliki_ac,memiliki_airpanas,memiliki_tv,memiliki_kitchenset,memiliki_balkon,memiliki_keamanan24jam,memiliki_tamanbermainanak,harga
0,1696673387-1707,https://www.lamudi.co.id/jakarta/house/single-...,Lihat,https://www.lamudi.co.id/jual-cepat-rumah-di-j...,Jakarta Garden City\n ...,120 m²,90 m²,4,4,Tidak,,,,,Balkon,Keamanan 24 jam,Taman bermain anak,Rp2.200.000.000
1,1696673391-1708,https://www.lamudi.co.id/jakarta/house/single-...,Lihat,https://www.lamudi.co.id/dijual-rumah-mewah-di...,"Senopati, Jakarta Selatan\n ...",211 m²,400 m²,4,4,Tidak,,,,,,,,Rp22.000.000.000
2,1696673396-1709,https://www.lamudi.co.id/jakarta/house/single-...,Lihat,https://www.lamudi.co.id/rumah-murah-di-pondok...,Jln.Kartika Utama\n ...,413 m²,750 m²,4,4,750,Pendingin ruangan (AC),Air Panas,,Kitchen set,Balkon,,Taman bermain anak,Rp15.000.000.000
3,1696673400-1710,https://www.lamudi.co.id/jakarta/house/single-...,Lihat,https://www.lamudi.co.id/rumah-siap-huni-dilok...,"Tanjung Barat, Jakarta Selatan",65 m²,45 m²,2,2,45,,,,,,Keamanan 24 jam,,Rp365.000.000
4,1696673404-1711,https://www.lamudi.co.id/jakarta/house/single-...,Lihat,https://www.lamudi.co.id/rumah-pondok-indah-lu...,"Pondok Indah, Jakarta Selatan",200 m²,250 m²,4,4,200,,,,,,,,Rp7.000.000.000


In [3]:
# Drop the scraper metadata columns and the amenity columns that are not kept
df = df.drop(columns=[
    'web-scraper-order',
    'web-scraper-start-url',
    'title',
    'title-href',
    'fullyfurnished',
    'memiliki_ac',
    'memiliki_airpanas',
    'memiliki_tv',
    'memiliki_kitchenset',
    'memiliki_balkon',
])
df.head()

Unnamed: 0,lokasi,luaslahan,luasbangunan,kamartidur,kamarmandi,memiliki_keamanan24jam,memiliki_tamanbermainanak,harga
0,Jakarta Garden City\n ...,120 m²,90 m²,4,4,Keamanan 24 jam,Taman bermain anak,Rp2.200.000.000
1,"Senopati, Jakarta Selatan\n ...",211 m²,400 m²,4,4,,,Rp22.000.000.000
2,Jln.Kartika Utama\n ...,413 m²,750 m²,4,4,,Taman bermain anak,Rp15.000.000.000
3,"Tanjung Barat, Jakarta Selatan",65 m²,45 m²,2,2,Keamanan 24 jam,,Rp365.000.000
4,"Pondok Indah, Jakarta Selatan",200 m²,250 m²,4,4,,,Rp7.000.000.000


In [4]:
# Rename the scraped columns to snake_case. Renaming by name (instead of
# assigning a positional list to df.columns) means a change in column order
# cannot silently mislabel a column; errors="raise" makes a missing source
# column fail loudly.
df = df.rename(
    columns={
        "luaslahan": "luas_lahan",
        "luasbangunan": "luas_bangunan",
        "kamartidur": "kamar_tidur",
        "kamarmandi": "kamar_mandi",
        "memiliki_keamanan24jam": "keamanan_24jam",
        "memiliki_tamanbermainanak": "taman_bermain_anak",
    },
    errors="raise",
)

# Handle NaN values: a missing amenity label is recorded as "Tidak"
df = df.fillna({"keamanan_24jam": "Tidak", "taman_bermain_anak": "Tidak"})

# Clean the location column by removing the '\n' characters
df['lokasi'] = df['lokasi'].str.replace('\n', '', regex=False)

# Clean the price column, which contains the "Rp" prefix and dots as
# thousands separators. regex=False is passed explicitly because '.' is a
# regex wildcard and the default of `regex` differs between pandas releases;
# a literal match must not depend on the installed version.
df['harga'] = (
    df['harga']
    .str.replace('Rp', '', regex=False)
    .str.replace('.', '', regex=False)
    .astype(float)
)

df.head()

Unnamed: 0,lokasi,luas_lahan,luas_bangunan,kamar_tidur,kamar_mandi,keamanan_24jam,taman_bermain_anak,harga
0,Jakarta Garden City ...,120 m²,90 m²,4,4,Keamanan 24 jam,Taman bermain anak,2200000000.0
1,"Senopati, Jakarta Selatan ...",211 m²,400 m²,4,4,Tidak,Tidak,22000000000.0
2,Jln.Kartika Utama ...,413 m²,750 m²,4,4,Tidak,Taman bermain anak,15000000000.0
3,"Tanjung Barat, Jakarta Selatan",65 m²,45 m²,2,2,Keamanan 24 jam,Tidak,365000000.0
4,"Pondok Indah, Jakarta Selatan",200 m²,250 m²,4,4,Tidak,Tidak,7000000000.0


In [6]:
# Remove every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [7]:
# Strip the trailing " m²" unit from the land-area and building-area columns
# and store the remaining number as an integer
for area_col in ('luas_lahan', 'luas_bangunan'):
    df[area_col] = df[area_col].str.replace(' m²', '').astype(int)

In [8]:
# Function to extract city names from text
def extract_city(location):
    """Return the city named at the end of a location string.

    The text is split on whitespace and the last two words are taken as the
    city, e.g. ``"Tanjung Barat, Jakarta Selatan"`` -> ``"Jakarta Selatan"``.
    If the first of those two words ends with a comma it closes the preceding
    (district) segment rather than starting the city name, so only the final
    word is returned, e.g. ``"Cakung, Bekasi"`` -> ``"Bekasi"``.

    Args:
        location: Location text (``str``).

    Returns:
        The extracted city name; ``''`` when ``location`` contains no words.
    """
    tail = location.split()[-2:]
    # A word ending in ',' belongs to the segment before the city name
    if len(tail) == 2 and tail[0].endswith(','):
        tail = tail[1:]
    return ' '.join(tail)

# Derive the city for every row and store it in the new 'kota' column.
# This runs before the whitespace removal below because extract_city
# splits its input on whitespace.
city_names = df['lokasi'].apply(extract_city)
df['kota'] = city_names

# Remove all whitespace from the location text.
# NOTE(review): the pattern removes every whitespace run, including the single
# spaces between words, so values end up like "TanjungBarat,JakartaSelatan" —
# confirm this is intended rather than collapsing runs to one space.
df['lokasi'] = df['lokasi'].str.replace(pat=r'\s+', repl='', regex=True)

In [9]:
# Recode the two amenity columns: the scraped amenity label becomes 'Ada',
# while 'Tidak' is kept as 'Tidak'
for amenity_col, scraped_label in (
    ('keamanan_24jam', 'Keamanan 24 jam'),
    ('taman_bermain_anak', 'Taman bermain anak'),
):
    df[amenity_col] = df[amenity_col].map({scraped_label: 'Ada', 'Tidak': 'Tidak'})

In [11]:
# Move 'kota' to the front and keep the other columns in their current order
cols = ['kota']
cols.extend(name for name in df.columns if name != 'kota')
df = df.loc[:, cols]

In [12]:
# Trim outliers in the area columns by keeping only a central quantile band.
# Each pair is (column, share of rows kept around the median); the remainder
# is cut in equal parts from the lower and upper tails.
for column, central_share in (('luas_lahan', 0.98), ('luas_bangunan', 0.99)):
    # Tail cut on each side: 0.01 (1%) for a 0.98 share, 0.005 (0.5%) for 0.99
    lower_percentile = (1 - central_share) / 2
    upper_percentile = 1 - lower_percentile

    # The bounds are computed on the current frame, so the second column's
    # quantiles are taken after the first column has already been trimmed
    lower_bound = df[column].quantile(lower_percentile)
    upper_bound = df[column].quantile(upper_percentile)
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Keep only listings with at most MAX_ROOMS bedrooms and at most MAX_ROOMS bathrooms
MAX_ROOMS = 11
for column in ('kamar_tidur', 'kamar_mandi'):
    df = df[df[column] <= MAX_ROOMS]

In [15]:
# Preview the first rows of the frame.
# NOTE(review): this cell's execution count is higher than the next cell's,
# so the notebook was run out of order — confirm with Restart & Run All.
df.head()

Unnamed: 0,kota,lokasi,luas_lahan,luas_bangunan,kamar_tidur,kamar_mandi,keamanan_24jam,taman_bermain_anak,harga
0,Jakarta Timur,"JakartaGardenCityCakung,JakartaTimur",120,90,4,4,Ada,Ada,2200000000.0
1,Jakarta Selatan,"Senopati,JakartaSelatanSenayan,JakartaSelatan",211,400,4,4,Tidak,Tidak,22000000000.0
2,Jakarta Selatan,"Jln.KartikaUtamaPondokIndah,JakartaSelatan",413,750,4,4,Tidak,Ada,15000000000.0
3,Jakarta Selatan,"TanjungBarat,JakartaSelatan",65,45,2,2,Ada,Tidak,365000000.0
4,Jakarta Selatan,"PondokIndah,JakartaSelatan",200,250,4,4,Tidak,Tidak,7000000000.0


In [14]:
# Print the row count, column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2565 entries, 0 to 2715
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   kota                2565 non-null   object 
 1   lokasi              2565 non-null   object 
 2   luas_lahan          2565 non-null   int64  
 3   luas_bangunan       2565 non-null   int64  
 4   kamar_tidur         2565 non-null   int64  
 5   kamar_mandi         2565 non-null   int64  
 6   keamanan_24jam      2565 non-null   object 
 7   taman_bermain_anak  2565 non-null   object 
 8   harga               2565 non-null   float64
dtypes: float64(1), int64(4), object(4)
memory usage: 200.4+ KB
