# Profify 


## Memuat datasets

In [9]:
# Impor library yang diperlukan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Plot theming: apply the seaborn grid style first, then the matplotlib
# 'fivethirtyeight' stylesheet on top of it (order kept from the original).
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# Let pandas render every column of wide DataFrames instead of truncating.
pd.set_option("display.max_columns", None)

# Load the raw dataset from ../data/raw into `df`.
file_path = '../data/raw/jabodetabek_house_price.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Keep the friendly hint, but re-raise: swallowing the error would leave
    # `df` undefined and every later cell would fail with a confusing NameError
    # far away from the real cause.
    print("File tidak ditemukan! Pastikan nama file dan path-nya sudah benar.")
    raise



In [4]:
# Preview the first five rows of the dataset (head() defaults to n=5).
print("5 baris pertama Data: ")
df.head()

5 baris pertama Data: 


Unnamed: 0,url,price_in_rp,title,address,district,city,lat,long,facilities,property_type,ads_id,bedrooms,bathrooms,land_size_m2,building_size_m2,carports,certificate,electricity,maid_bedrooms,maid_bathrooms,floors,building_age,year_built,property_condition,building_orientation,garages,furnishing
0,https://www.rumah123.com/properti/bekasi/hos11...,2990000000.0,Rumah cantik Sumarecon Bekasi\nLingkungan asri...,"Summarecon Bekasi, Bekasi",Summarecon Bekasi,Bekasi,-6.223945,106.986275,"Tempat Jemuran, Jalur Telepon, Taman, Taman",rumah,hos11360272,4.0,4.0,239.0,272.0,0.0,shm - sertifikat hak milik,4400 mah,0.0,1.0,2.0,5.0,2017.0,bagus,,0.0,unfurnished
1,https://www.rumah123.com/properti/bekasi/hos10...,1270000000.0,"Rumah Kekinian, Magenta Summarecon Bekasi","Summarecon Bekasi, Bekasi",Summarecon Bekasi,Bekasi,-6.223945,106.986275,Taman,rumah,hos10680347,3.0,2.0,55.0,69.0,1.0,hgb - hak guna bangunan,2200 mah,0.0,0.0,2.0,,,bagus,,0.0,
2,https://www.rumah123.com/properti/bekasi/hos10...,1950000000.0,Rumah Cantik 2 Lantai Cluster Bluebell Summare...,"Summarecon Bekasi, Bekasi",Summarecon Bekasi,Bekasi,-6.223945,106.986275,"Jogging Track, Kolam Renang, Masjid, Taman,...",rumah,hos10685867,3.0,3.0,119.0,131.0,1.0,hgb - hak guna bangunan,2200 mah,1.0,1.0,2.0,,,bagus,,1.0,unfurnished
3,https://www.rumah123.com/properti/bekasi/hos10...,3300000000.0,Rumah Mewah 2Lantai L10x18 C di Cluster VERNON...,"Summarecon Bekasi, Bekasi",Summarecon Bekasi,Bekasi,-6.223945,106.986275,"Jalur Telepon, Jogging Track, Track Lari, K...",rumah,hos10927790,3.0,3.0,180.0,174.0,0.0,shm - sertifikat hak milik,3500 mah,1.0,1.0,2.0,6.0,2016.0,bagus sekali,utara,2.0,unfurnished
4,https://www.rumah123.com/properti/bekasi/hos10...,4500000000.0,"Rumah Hoek di Cluster Maple Summarecon Bekasi,...","Summarecon Bekasi, Bekasi",Summarecon Bekasi,Bekasi,-6.223945,106.986275,"Jogging Track, Kolam Renang, Taman, Jalur Te...",rumah,hos10785530,4.0,3.0,328.0,196.0,2.0,shm - sertifikat hak milik,3500 mah,1.0,1.0,2.0,9.0,2013.0,bagus,utara,1.0,unfurnished


In [5]:
# Print a concise summary of the dataset: columns, non-null counts and dtypes.
info_heading = "\nInformasi Dataset:"
print(info_heading)
df.info()


Informasi Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3553 entries, 0 to 3552
Data columns (total 27 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   url                   3553 non-null   object 
 1   price_in_rp           3553 non-null   float64
 2   title                 3553 non-null   object 
 3   address               3553 non-null   object 
 4   district              3553 non-null   object 
 5   city                  3553 non-null   object 
 6   lat                   3553 non-null   float64
 7   long                  3553 non-null   float64
 8   facilities            3553 non-null   object 
 9   property_type         3552 non-null   object 
 10  ads_id                3549 non-null   object 
 11  bedrooms              3519 non-null   float64
 12  bathrooms             3524 non-null   float64
 13  land_size_m2          3551 non-null   float64
 14  building_size_m2      3551 non-null   float64
 15  c

In [6]:

# Show descriptive statistics (count, mean, std, quartiles, ...) for the
# numeric columns.
print("\nStatistik Deskriptif:")
numeric_summary = df.describe()
display(numeric_summary)


Statistik Deskriptif:


Unnamed: 0,price_in_rp,lat,long,bedrooms,bathrooms,land_size_m2,building_size_m2,carports,maid_bedrooms,maid_bathrooms,floors,building_age,year_built,garages
count,3553.0,3553.0,3553.0,3519.0,3524.0,3551.0,3551.0,3553.0,3553.0,3553.0,3547.0,2108.0,2108.0,3553.0
mean,4191685000.0,-6.324721,106.792881,3.326513,2.624858,204.806815,186.58744,1.197861,0.496482,0.370391,1.76459,3.88093,2018.137097,0.708978
std,13750670000.0,0.129245,0.172159,2.672148,2.696497,402.127746,248.443471,1.114996,0.685723,0.536024,0.637349,7.603708,7.641448,1.311879
min,42000000.0,-6.894828,106.402315,1.0,1.0,12.0,1.0,0.0,0.0,0.0,1.0,0.0,1870.0,0.0
25%,800000000.0,-6.397933,106.687295,2.0,2.0,75.0,65.5,1.0,0.0,0.0,1.0,0.0,2016.0,0.0
50%,1500000000.0,-6.300733,106.799954,3.0,2.0,108.0,112.0,1.0,0.0,0.0,2.0,1.0,2021.0,0.0
75%,3590000000.0,-6.231754,106.874766,4.0,3.0,192.0,208.0,2.0,1.0,1.0,2.0,6.0,2022.0,1.0
max,580000000000.0,-6.102478,109.771691,99.0,99.0,8000.0,6000.0,15.0,7.0,5.0,5.0,152.0,2052.0,50.0


## --- LANGKAH 1: MENANGANI DATA YANG HILANG (MISSING VALUES) ---

In [8]:
# Count the missing values in each column before any cleaning is applied.
print("Jumlah data yang hilang sebelum Cleaning : ")
df.isna().sum()

Jumlah data yang hilang sebelum Cleaning : 


url                        0
price_in_rp                0
title                      0
address                    0
district                   0
city                       0
lat                        0
long                       0
facilities                 0
property_type              1
ads_id                     4
bedrooms                  34
bathrooms                 29
land_size_m2               2
building_size_m2           2
carports                   0
certificate              141
electricity                0
maid_bedrooms              0
maid_bathrooms             0
floors                     6
building_age            1445
year_built              1445
property_condition       246
building_orientation    1647
garages                    0
furnishing               387
dtype: int64

In [None]:

# Derive the city ('kota') from the last comma-separated part of the address,
# e.g. "Summarecon Bekasi, Bekasi" -> "Bekasi".
# The dataset has no 'location' column (see the column listing in the
# missing-value output above), so looking it up raised KeyError; the
# "district, city" text lives in 'address'.
# The vectorised .str accessors also propagate NaN instead of failing on
# non-string values the way the per-row lambda would.
df['kota'] = df['address'].str.rsplit(',', n=1).str[-1].str.strip()
