In [136]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import numpy as np
import re
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
import time

from sklearn.feature_selection import mutual_info_regression

# Atur agar semua kolom terlihat
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)
pd.options.display.max_rows = None
pd.options.display.max_columns = None 

# df = pd.read_csv('AB_US_2023.csv')
# dt = pd.read_csv('AB_US_2020.csv')

# Splitting data

split data menjadi 70:30 karena datanya termasuk besar, jadi biar oke di train ataupun test nya

In [137]:
df = pd.read_csv("airbnb-listings-us-cleaned-o (1).csv", low_memory=False)

In [138]:
df = df.reset_index(drop=True)

In [139]:
df.drop('Amenities', axis=1, inplace=True)

In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131856 entries, 0 to 131855
Data columns (total 52 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Last Scraped                    131856 non-null  object 
 1   Host Since                      131856 non-null  object 
 2   Host Response Time              131856 non-null  object 
 3   Host Response Rate              131856 non-null  float64
 4   Host Neighbourhood              131856 non-null  object 
 5   Host Total Listings Count       131856 non-null  int64  
 6   Neighbourhood                   131856 non-null  object 
 7   City                            131856 non-null  object 
 8   State                           131856 non-null  object 
 9   Zipcode                         131856 non-null  int64  
 10  Latitude                        131856 non-null  float64
 11  Longitude                       131856 non-null  float64
 12  Property Type   

In [141]:
# 2️Pisahkan Features (X) dan Target (y)
X = df.drop(columns=['Price'])  # Hapus kolom Price dari fitur
y = df['Price']  # Simpan target harga

# Split Data (70% Train - 30% Test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

# Cek Hasil
print(f"X_train shape: {X_train.shape}")  # Fitur training
print(f"y_train shape: {y_train.shape}")  # Target training
print(f"X_test shape: {X_test.shape}")    # Fitur testing
print(f"y_test shape: {y_test.shape}")    # Target testing


X_train shape: (92299, 51)
y_train shape: (92299,)
X_test shape: (39557, 51)
y_test shape: (39557,)


# Encoding

In [142]:
# df.to_csv("airbnb-nyoba.csv", index=False)

In [143]:
# df = pd.read_csv("airbnb-nyoba.csv", low_memory=False)

In [144]:
df.isnull().sum() / len(df) * 100

Last Scraped                      0.0
Host Since                        0.0
Host Response Time                0.0
Host Response Rate                0.0
Host Neighbourhood                0.0
Host Total Listings Count         0.0
Neighbourhood                     0.0
City                              0.0
State                             0.0
Zipcode                           0.0
Latitude                          0.0
Longitude                         0.0
Property Type                     0.0
Room Type                         0.0
Accommodates                      0.0
Bathrooms                         0.0
Bedrooms                          0.0
Beds                              0.0
Bed Type                          0.0
Price                             0.0
Cleaning Fee                      0.0
Guests Included                   0.0
Extra People                      0.0
Minimum Nights                    0.0
Maximum Nights                    0.0
Calendar Updated                  0.0
Availability

In [145]:
df['Last Scraped'].nunique()

25

In [146]:
df['Last Scraped'].value_counts()

Last Scraped
2017-05-03    45483
2017-05-04    17564
2017-03-07     9234
2017-04-02     8506
2017-05-10     6881
2016-07-07     6156
2017-05-11     5764
2017-06-02     5259
2017-05-05     4904
2016-01-04     3724
2017-04-07     3540
2016-09-07     3485
2017-05-02     3315
2016-09-06     3179
2016-05-04     1642
2016-05-21      717
2016-05-19      494
2016-05-18      464
2016-05-20      398
2016-05-17      353
2015-10-18      336
2015-10-17      302
2015-10-16      130
2015-10-19       24
2017-04-08        2
Name: count, dtype: int64

## Encoding feature ordinal

**Host Response Time**

In [147]:
df['Host Response Time'].unique()

array(['within an hour', 'within a day', 'within a few hours',
       'a few days or more'], dtype=object)

In [148]:
# df.loc[df['Host Response Time'] == "within an hour", "Host Response Time"] = 0
# df.loc[df['Host Response Time'] == "within a few hours", "Host Response Time"] = 1
# df.loc[df['Host Response Time'] == "within a day", "Host Response Time"] = 2
# df.loc[df['Host Response Time'] == "a few days or more", "Host Response Time"] = 3

In [149]:
df['Host Response Time'].unique()

array(['within an hour', 'within a day', 'within a few hours',
       'a few days or more'], dtype=object)

**Using splitted data**

In [150]:
ordinal_mapping = {
    "within an hour": 0,
    "within a few hours": 1,
    "within a day": 2,
    "a few days or more": 3
}

# Lakukan encoding di Train Set (X_train)
X_train['Host Response Time'] = X_train['Host Response Time'].map(ordinal_mapping)

# Terapkan encoding yang sama ke Test Set (X_test)
X_test['Host Response Time'] = X_test['Host Response Time'].map(ordinal_mapping)

print(X_train['Host Response Time'].unique()) 
print(X_test['Host Response Time'].unique()) 

[0 2 1 3]
[0 2 1 3]


## Encoding features One-Hot

**Room Type**

In [151]:
#df = pd.get_dummies(df, columns=['Room Type'], drop_first=True)

**Bed Type**

In [152]:
#df = pd.get_dummies(df, columns=['Bed Type'], drop_first=True)

**Cancellation Policy**

In [153]:
#df = pd.get_dummies(df, columns=['Cancellation Policy'], drop_first=True)

**Using splitted data**

In [154]:
# One-Hot Encoding (Hanya di Train)
X_train = pd.get_dummies(X_train, columns=['Room Type', 'Bed Type', 'Cancellation Policy'], drop_first=True)

train_columns = X_train.columns

# One-Hot Encoding untuk Test (Pastikan hanya gunakan kolom yang ada di Train)
X_test = pd.get_dummies(X_test, columns=['Room Type', 'Bed Type', 'Cancellation Policy'], drop_first=True)

# astikan Test memiliki kolom yang sama dengan Train (Tambahkan kolom yang hilang)
X_test = X_test.reindex(columns=train_columns)

print("Train Columns:", X_train.shape)
print("Test Columns:", X_test.shape)

Train Columns: (92299, 60)
Test Columns: (39557, 60)


In [155]:
X_train.isnull().sum()

Last Scraped                           0
Host Since                             0
Host Response Time                     0
Host Response Rate                     0
Host Neighbourhood                     0
Host Total Listings Count              0
Neighbourhood                          0
City                                   0
State                                  0
Zipcode                                0
Latitude                               0
Longitude                              0
Property Type                          0
Accommodates                           0
Bathrooms                              0
Bedrooms                               0
Beds                                   0
Cleaning Fee                           0
Guests Included                        0
Extra People                           0
Minimum Nights                         0
Maximum Nights                         0
Calendar Updated                       0
Availability 30                        0
Availability 60 

In [156]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92299 entries, 0 to 92298
Data columns (total 60 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Last Scraped                         92299 non-null  object 
 1   Host Since                           92299 non-null  object 
 2   Host Response Time                   92299 non-null  int64  
 3   Host Response Rate                   92299 non-null  float64
 4   Host Neighbourhood                   92299 non-null  object 
 5   Host Total Listings Count            92299 non-null  int64  
 6   Neighbourhood                        92299 non-null  object 
 7   City                                 92299 non-null  object 
 8   State                                92299 non-null  object 
 9   Zipcode                              92299 non-null  int64  
 10  Latitude                             92299 non-null  float64
 11  Longitude                   

In [157]:
# df.head()

nge convert kolom boolean menjadi 1/0

In [158]:
# bool_cols = ['Wireless Internet', 'Heating', 'Kitchen', 'Essentials', 'TV', 
#              'Smoke detector', 'Shampoo', 'Internet', 'Washer', 'Dryer',
#              'Room Type_Private room', 'Room Type_Shared room', 'Bed Type_Couch',
#              'Bed Type_Futon', 'Bed Type_Pull-out Sofa', 'Bed Type_Real Bed',
#              'Cancellation Policy_long_term', 'Cancellation Policy_moderate',
#              'Cancellation Policy_no_refunds', 'Cancellation Policy_strict',
#              'Cancellation Policy_super_strict_30', 'Cancellation Policy_super_strict_60']
# df[bool_cols] = df[bool_cols].astype(int)

In [159]:
for col in X_train.select_dtypes(include=['bool']).columns:
    X_train[col] = X_train[col].astype(int)

for col in X_test.select_dtypes(include=['bool']).columns:
    X_test[col] = X_test[col].astype(int)


print(X_train.dtypes)  # Harusnya semua kolom numerik (int64 atau float64)
print(X_train.head())  # Lihat hasil perubahan (0 dan 1)


Last Scraped                            object
Host Since                              object
Host Response Time                       int64
Host Response Rate                     float64
Host Neighbourhood                      object
Host Total Listings Count                int64
Neighbourhood                           object
City                                    object
State                                   object
Zipcode                                  int64
Latitude                               float64
Longitude                              float64
Property Type                           object
Accommodates                           float64
Bathrooms                                int64
Bedrooms                                 int64
Beds                                     int64
Cleaning Fee                           float64
Guests Included                        float64
Extra People                           float64
Minimum Nights                         float64
Maximum Night

In [160]:
# df.head()

In [161]:
# df.isnull().sum() / len(df) * 100

## Convert datetime

In [162]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131856 entries, 0 to 131855
Data columns (total 52 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Last Scraped                    131856 non-null  object 
 1   Host Since                      131856 non-null  object 
 2   Host Response Time              131856 non-null  object 
 3   Host Response Rate              131856 non-null  float64
 4   Host Neighbourhood              131856 non-null  object 
 5   Host Total Listings Count       131856 non-null  int64  
 6   Neighbourhood                   131856 non-null  object 
 7   City                            131856 non-null  object 
 8   State                           131856 non-null  object 
 9   Zipcode                         131856 non-null  int64  
 10  Latitude                        131856 non-null  float64
 11  Longitude                       131856 non-null  float64
 12  Property Type   

In [163]:
# date_columns = ['Last Scraped', 'Host Since', 'Calendar Updated', 'Calendar last Scraped']

# # Ubah ke datetime format
# for col in date_columns:
#     df[col] = pd.to_datetime(df[col], errors='coerce')  # errors='coerce' untuk menangani nilai yang tidak valid

# # Cek apakah konversi berhasil
# print(df[date_columns].dtypes)

**Using train test data**

In [164]:
date_columns = ['Last Scraped', 'Host Since', 'Calendar Updated', 'Calendar last Scraped']

# Ubah ke datetime format (di Train)
for col in date_columns:
    X_train[col] = pd.to_datetime(X_train[col], errors='coerce')

# Ubah ke datetime format (di Test)
for col in date_columns:
    X_test[col] = pd.to_datetime(X_test[col], errors='coerce')

print(X_train[date_columns].dtypes)
print(X_test[date_columns].dtypes)


Last Scraped             datetime64[ns]
Host Since               datetime64[ns]
Calendar Updated         datetime64[ns]
Calendar last Scraped    datetime64[ns]
dtype: object
Last Scraped             datetime64[ns]
Host Since               datetime64[ns]
Calendar Updated         datetime64[ns]
Calendar last Scraped    datetime64[ns]
dtype: object


  X_train[col] = pd.to_datetime(X_train[col], errors='coerce')
  X_test[col] = pd.to_datetime(X_test[col], errors='coerce')


## Encoding Lokasi

Kolom: State, City, Neighbourhood

In [165]:
def kfold_target_encoding(train, test, col_name, target, n_splits=5, shuffle=True, random_state=42):
    """
    Melakukan K-Fold Target Encoding pada kolom `col_name`,
    tanpa mengakses kolom target di dalam DataFrame train.
    
    train: DataFrame berisi fitur (TIDAK ada 'Price')
    test:  DataFrame berisi fitur (TIDAK ada 'Price')
    col_name: nama kolom kategori yang akan di-encode
    target: Series (y_train) yang hanya berisi nilai target (Price)
    """
    train_encoded = train.copy()
    test_encoded = test.copy()
    
    # Nama kolom baru untuk menyimpan hasil encoding
    col_encoded_name = col_name + "_encoded"
    train_encoded[col_encoded_name] = np.nan
    
    kf = KFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

    for train_idx, val_idx in kf.split(train):
        # X_fold, y_fold adalah data latih
        X_fold = train.iloc[train_idx]
        y_fold = target.iloc[train_idx]

        # Gabungkan hanya kolom kategori dan target
        fold_df = pd.DataFrame({
            col_name: X_fold[col_name],
            "target_temp": y_fold.values
        })

        # Hitung rata-rata target per kategori di fold
        mean_map = fold_df.groupby(col_name)["target_temp"].mean().to_dict()

        # Apply ke validation fold
        train_encoded.loc[train_encoded.index[val_idx], col_encoded_name] = train.loc[train_encoded.index[val_idx], col_name].map(mean_map)

    # Isi NaN (kalau ada kategori di val fold yang tidak muncul di train fold)
    global_mean = target.mean()
    train_encoded[col_encoded_name] = train_encoded[col_encoded_name].fillna(global_mean)

    full_df = pd.DataFrame({col_name: train[col_name], "target_temp": target.values})
    full_mean_map = full_df.groupby(col_name)["target_temp"].mean().to_dict()

    # Apply ke Test
    test_encoded[col_encoded_name] = test_encoded[col_name].map(full_mean_map)
    test_encoded[col_encoded_name] = test_encoded[col_encoded_name].fillna(global_mean)

    return train_encoded, test_encoded

City

In [166]:
X_train_encoded, X_test_encoded = kfold_target_encoding(
    train=X_train,
    test=X_test,
    col_name='City',
    target=y_train,
    n_splits=5,
    shuffle=True,
    random_state=42
)

print("Sebelum encoding:\n", X_train[['City']].head())
print("Setelah encoding:\n", X_train_encoded[['City', 'City_encoded']].head())

Sebelum encoding:
           City
0       Austin
1  New Orleans
2     Brooklyn
3  Los Angeles
4       Boston
Setelah encoding:
           City  City_encoded
0       Austin    229.007544
1  New Orleans    190.132304
2     Brooklyn    112.409443
3  Los Angeles    135.513605
4       Boston    170.721303


State

In [167]:
X_train_encoded, X_test_encoded = kfold_target_encoding(
    train=X_train,
    test=X_test,
    col_name='State',
    target=y_train,
    n_splits=5,
    shuffle=True,
    random_state=42
)

print("Sebelum encoding:\n", X_train[['State']].head())
print("Setelah encoding:\n", X_train_encoded[['State', 'State_encoded']].head())

Sebelum encoding:
   State
0    TX
1    LA
2    NY
3    CA
4    MA
Setelah encoding:
   State  State_encoded
0    TX     229.603296
1    LA     190.039879
2    NY     135.833968
3    CA     159.077333
4    MA     167.798682


Neighbourhood

In [168]:
X_train_encoded, X_test_encoded = kfold_target_encoding(
    train=X_train,
    test=X_test,
    col_name='Neighbourhood',
    target=y_train, 
    n_splits=5,
    shuffle=True,
    random_state=42
)

print("Sebelum encoding:\n", X_train[['Neighbourhood']].head())
print("Setelah encoding:\n", X_train_encoded[['Neighbourhood', 'Neighbourhood_encoded']].head())

Sebelum encoding:
         Neighbourhood
0              Zilker
1      Bayou St. John
2  Bedford-Stuyvesant
3     University Park
4            Mattapan
Setelah encoding:
         Neighbourhood  Neighbourhood_encoded
0              Zilker             253.172949
1      Bayou St. John             167.043956
2  Bedford-Stuyvesant              97.386064
3     University Park              93.513889
4            Mattapan              78.700000


In [169]:
# Gunakan variabel sementara agar tidak menimpa hasil sebelumnya
X_train_encoded, X_test_encoded = X_train.copy(), X_test.copy()

for col in ['City', 'State', 'Neighbourhood', 'Property Type', 'Host Neighbourhood']:
    X_train_encoded, X_test_encoded = kfold_target_encoding(
        train=X_train_encoded,  # Gunakan hasil encoding sebelumnya
        test=X_test_encoded,
        col_name=col,
        target=y_train,
        n_splits=5,
        shuffle=True,
        random_state=42
    )

print("Sebelum encoding:\n", X_train[['City', 'State', 'Neighbourhood', 'Property Type', 'Host Neighbourhood']].head())
print("Setelah encoding:\n", X_train_encoded.head())


Sebelum encoding:
           City State       Neighbourhood Property Type Host Neighbourhood
0       Austin    TX              Zilker     Apartment     Travis Heights
1  New Orleans    LA      Bayou St. John     Apartment     Bayou St. John
2     Brooklyn    NY  Bedford-Stuyvesant          Loft       Clinton Hill
3  Los Angeles    CA     University Park     Apartment         West Adams
4       Boston    MA            Mattapan     Apartment         Dorchester
Setelah encoding:
   Last Scraped Host Since  Host Response Time  Host Response Rate  \
0   2017-03-07 2014-09-14                   0               100.0   
1   2017-06-02 2010-12-28                   0               100.0   
2   2017-05-03 2015-09-12                   2               100.0   
3   2017-05-03 2015-12-24                   0               100.0   
4   2016-09-07 2015-12-05                   2                90.0   

  Host Neighbourhood  Host Total Listings Count       Neighbourhood  \
0     Travis Heights            

In [170]:
X_train_encoded.drop(["Neighbourhood", "City", "State", "Property Type", "Host Neighbourhood"], axis=1, inplace=True)
X_test_encoded.drop(["Neighbourhood", "City", "State", "Property Type", "Host Neighbourhood"], axis=1, inplace=True)

# Feature Selection

In [171]:
X_train_encoded.head()

Unnamed: 0,Last Scraped,Host Since,Host Response Time,Host Response Rate,Host Total Listings Count,Zipcode,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Cleaning Fee,Guests Included,Extra People,Minimum Nights,Maximum Nights,Calendar Updated,Availability 30,Availability 60,Availability 90,Availability 365,Calendar last Scraped,Number of Reviews,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Calculated host listings count,Reviews per Month,Wireless Internet,Heating,Kitchen,Essentials,TV,Smoke detector,Shampoo,Internet,Washer,Dryer,Room Type_Private room,Room Type_Shared room,Bed Type_Couch,Bed Type_Futon,Bed Type_Pull-out Sofa,Bed Type_Real Bed,Cancellation Policy_long_term,Cancellation Policy_moderate,Cancellation Policy_no_refunds,Cancellation Policy_strict,Cancellation Policy_super_strict_30,Cancellation Policy_super_strict_60,City_encoded,State_encoded,Neighbourhood_encoded,Property Type_encoded,Host Neighbourhood_encoded
0,2017-03-07,2014-09-14,0,100.0,2,78704,30.254806,-97.74709,6.0,2,2,2,20.0,6.0,50.0,1.0,1125.0,2025-03-06 10:39:28.775941,26.0,56.0,86.0,361.0,2017-03-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,0,1,0,0,0,0,229.007544,229.603296,253.172949,143.682414,260.275986
1,2017-06-02,2010-12-28,0,100.0,2,70119,29.975902,-90.081435,2.0,1,1,1,52.5,2.0,25.0,2.0,365.0,NaT,23.0,51.0,75.0,350.0,2017-06-02,159.0,97.0,10.0,10.0,10.0,10.0,9.0,9.0,2.0,2.04,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,190.132304,190.039879,167.043956,143.682414,157.061856
2,2017-05-03,2015-09-12,2,100.0,3,11205,40.692402,-73.960417,1.0,1,1,1,10.0,1.0,0.0,2.0,1125.0,NaT,2.0,2.0,29.0,29.0,2017-05-03,28.0,88.0,8.0,8.0,10.0,9.0,9.0,9.0,3.0,1.44,1,1,1,0,1,1,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,0,112.409443,135.833968,97.386064,184.228794,136.231144
3,2017-05-03,2015-12-24,0,100.0,1,90007,34.032978,-118.288024,1.0,1,1,1,16.0,1.0,0.0,1.0,1125.0,NaT,0.0,0.0,0.0,0.0,2017-05-03,1.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0,0.09,1,1,1,0,0,1,0,1,1,1,0,1,0,0,0,1,0,0,0,1,0,0,135.513605,159.077333,93.513889,143.682414,91.296296
4,2016-09-07,2015-12-05,2,90.0,1,2124,42.272581,-71.068548,2.0,1,0,1,52.5,1.0,0.0,1.0,1125.0,NaT,0.0,0.0,0.0,0.0,2016-09-06,3.0,100.0,9.0,9.0,9.0,9.0,9.0,10.0,1.0,0.36,1,1,1,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,170.721303,167.798682,78.7,143.204722,96.518248


In [172]:
X_train_encoded.drop(["Last Scraped", "Calendar Updated", "Calendar last Scraped"], axis=1, inplace=True)
X_test_encoded.drop(["Last Scraped", "Calendar Updated", "Calendar last Scraped"], axis=1, inplace=True)

In [173]:
X_train_encoded.head()

Unnamed: 0,Host Since,Host Response Time,Host Response Rate,Host Total Listings Count,Zipcode,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Cleaning Fee,Guests Included,Extra People,Minimum Nights,Maximum Nights,Availability 30,Availability 60,Availability 90,Availability 365,Number of Reviews,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Calculated host listings count,Reviews per Month,Wireless Internet,Heating,Kitchen,Essentials,TV,Smoke detector,Shampoo,Internet,Washer,Dryer,Room Type_Private room,Room Type_Shared room,Bed Type_Couch,Bed Type_Futon,Bed Type_Pull-out Sofa,Bed Type_Real Bed,Cancellation Policy_long_term,Cancellation Policy_moderate,Cancellation Policy_no_refunds,Cancellation Policy_strict,Cancellation Policy_super_strict_30,Cancellation Policy_super_strict_60,City_encoded,State_encoded,Neighbourhood_encoded,Property Type_encoded,Host Neighbourhood_encoded
0,2014-09-14,0,100.0,2,78704,30.254806,-97.74709,6.0,2,2,2,20.0,6.0,50.0,1.0,1125.0,26.0,56.0,86.0,361.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,0,1,0,0,0,0,229.007544,229.603296,253.172949,143.682414,260.275986
1,2010-12-28,0,100.0,2,70119,29.975902,-90.081435,2.0,1,1,1,52.5,2.0,25.0,2.0,365.0,23.0,51.0,75.0,350.0,159.0,97.0,10.0,10.0,10.0,10.0,9.0,9.0,2.0,2.04,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,190.132304,190.039879,167.043956,143.682414,157.061856
2,2015-09-12,2,100.0,3,11205,40.692402,-73.960417,1.0,1,1,1,10.0,1.0,0.0,2.0,1125.0,2.0,2.0,29.0,29.0,28.0,88.0,8.0,8.0,10.0,9.0,9.0,9.0,3.0,1.44,1,1,1,0,1,1,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,0,112.409443,135.833968,97.386064,184.228794,136.231144
3,2015-12-24,0,100.0,1,90007,34.032978,-118.288024,1.0,1,1,1,16.0,1.0,0.0,1.0,1125.0,0.0,0.0,0.0,0.0,1.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0,0.09,1,1,1,0,0,1,0,1,1,1,0,1,0,0,0,1,0,0,0,1,0,0,135.513605,159.077333,93.513889,143.682414,91.296296
4,2015-12-05,2,90.0,1,2124,42.272581,-71.068548,2.0,1,0,1,52.5,1.0,0.0,1.0,1125.0,0.0,0.0,0.0,0.0,3.0,100.0,9.0,9.0,9.0,9.0,9.0,10.0,1.0,0.36,1,1,1,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,170.721303,167.798682,78.7,143.204722,96.518248


In [174]:
X_train_encoded["Year"] = X_train_encoded["Host Since"].dt.year
X_train_encoded["Month"] = X_train_encoded["Host Since"].dt.month
X_train_encoded["Day"] = X_train_encoded["Host Since"].dt.day

X_test_encoded["Year"] = X_test_encoded["Host Since"].dt.year
X_test_encoded["Month"] = X_test_encoded["Host Since"].dt.month
X_test_encoded["Day"] = X_test_encoded["Host Since"].dt.day

X_test_encoded.drop(["Host Since"], axis=1, inplace=True)
X_train_encoded.drop(["Host Since"], axis=1, inplace=True)

In [175]:
X_train_encoded.head()

Unnamed: 0,Host Response Time,Host Response Rate,Host Total Listings Count,Zipcode,Latitude,Longitude,Accommodates,Bathrooms,Bedrooms,Beds,Cleaning Fee,Guests Included,Extra People,Minimum Nights,Maximum Nights,Availability 30,Availability 60,Availability 90,Availability 365,Number of Reviews,Review Scores Rating,Review Scores Accuracy,Review Scores Cleanliness,Review Scores Checkin,Review Scores Communication,Review Scores Location,Review Scores Value,Calculated host listings count,Reviews per Month,Wireless Internet,Heating,Kitchen,Essentials,TV,Smoke detector,Shampoo,Internet,Washer,Dryer,Room Type_Private room,Room Type_Shared room,Bed Type_Couch,Bed Type_Futon,Bed Type_Pull-out Sofa,Bed Type_Real Bed,Cancellation Policy_long_term,Cancellation Policy_moderate,Cancellation Policy_no_refunds,Cancellation Policy_strict,Cancellation Policy_super_strict_30,Cancellation Policy_super_strict_60,City_encoded,State_encoded,Neighbourhood_encoded,Property Type_encoded,Host Neighbourhood_encoded,Year,Month,Day
0,0,100.0,2,78704,30.254806,-97.74709,6.0,2,2,2,20.0,6.0,50.0,1.0,1125.0,26.0,56.0,86.0,361.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,0,1,0,0,0,0,229.007544,229.603296,253.172949,143.682414,260.275986,2014.0,9.0,14.0
1,0,100.0,2,70119,29.975902,-90.081435,2.0,1,1,1,52.5,2.0,25.0,2.0,365.0,23.0,51.0,75.0,350.0,159.0,97.0,10.0,10.0,10.0,10.0,9.0,9.0,2.0,2.04,1,1,1,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,190.132304,190.039879,167.043956,143.682414,157.061856,2010.0,12.0,28.0
2,2,100.0,3,11205,40.692402,-73.960417,1.0,1,1,1,10.0,1.0,0.0,2.0,1125.0,2.0,2.0,29.0,29.0,28.0,88.0,8.0,8.0,10.0,9.0,9.0,9.0,3.0,1.44,1,1,1,0,1,1,0,1,1,1,0,1,0,0,0,1,0,1,0,0,0,0,112.409443,135.833968,97.386064,184.228794,136.231144,2015.0,9.0,12.0
3,0,100.0,1,90007,34.032978,-118.288024,1.0,1,1,1,16.0,1.0,0.0,1.0,1125.0,0.0,0.0,0.0,0.0,1.0,100.0,10.0,10.0,10.0,10.0,10.0,10.0,1.0,0.09,1,1,1,0,0,1,0,1,1,1,0,1,0,0,0,1,0,0,0,1,0,0,135.513605,159.077333,93.513889,143.682414,91.296296,2015.0,12.0,24.0
4,2,90.0,1,2124,42.272581,-71.068548,2.0,1,0,1,52.5,1.0,0.0,1.0,1125.0,0.0,0.0,0.0,0.0,3.0,100.0,9.0,9.0,9.0,9.0,9.0,10.0,1.0,0.36,1,1,1,1,1,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,170.721303,167.798682,78.7,143.204722,96.518248,2015.0,12.0,5.0


In [176]:
X_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92299 entries, 0 to 92298
Data columns (total 59 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Host Response Time                   92299 non-null  int64  
 1   Host Response Rate                   92299 non-null  float64
 2   Host Total Listings Count            92299 non-null  int64  
 3   Zipcode                              92299 non-null  int64  
 4   Latitude                             92299 non-null  float64
 5   Longitude                            92299 non-null  float64
 6   Accommodates                         92299 non-null  float64
 7   Bathrooms                            92299 non-null  int64  
 8   Bedrooms                             92299 non-null  int64  
 9   Beds                                 92299 non-null  int64  
 10  Cleaning Fee                         92299 non-null  float64
 11  Guests Included             

Menggunakan PCC

In [177]:
import pandas as pd

# Pastikan host_response_rate bertipe numerik
X_train['Host Response Time'] = pd.to_numeric(X_train['Host Response Time'], errors='coerce')

# Hitung Pearson Correlation
correlation = X_train['Host Response Time'].corr(y_train, method='spearman')

print(f"Korelasi antara Host Response Time dan Price: {correlation:.2f}")


Korelasi antara Host Response Time dan Price: 0.03


In [178]:
import pandas as pd

# Pastikan host_response_rate bertipe numerik
X_train['Host Response Rate'] = pd.to_numeric(X_train['Host Response Rate'], errors='coerce')

# Hitung Pearson Correlation
correlation = X_train['Host Response Rate'].corr(y_train, method='spearman')

print(f"Korelasi antara Host Response Rate dan Price: {correlation:.2f}")

Korelasi antara Host Response Rate dan Price: 0.01


In [179]:
X_train_with_target = X_train.copy()
X_train_with_target['price'] = y_train  # Tambahkan target ke dalam DataFrame

# Pilih hanya kolom numerik
X_train_numeric = X_train_with_target.select_dtypes(include=['number'])

correlation_matrix = X_train_numeric.corr()['price'].sort_values(ascending=False)
print(correlation_matrix)


price                                  1.000000
Cleaning Fee                           0.612730
Accommodates                           0.573138
Bedrooms                               0.546882
Beds                                   0.482625
Bathrooms                              0.448947
Guests Included                        0.318078
TV                                     0.217873
Washer                                 0.164803
Dryer                                  0.162974
Cancellation Policy_strict             0.133481
Extra People                           0.101025
Kitchen                                0.091357
Host Total Listings Count              0.083522
Cancellation Policy_super_strict_60    0.079156
Bed Type_Real Bed                      0.078611
Calculated host listings count         0.072055
Heating                                0.062990
Availability 30                        0.056698
Zipcode                                0.047873
Availability 60                        0

Menggunakan MI

In [180]:


# Ensure categorical features are encoded (e.g., one-hot or target encoding)
X_train_encoded_MI = X_train_encoded.select_dtypes(include=[np.number])  # Keep only numeric columns

# Compute MI scores
mi_scores = mutual_info_regression(X_train_encoded_MI, y_train, random_state=42)

# Create a DataFrame of MI scores
mi_results = pd.DataFrame({'Feature': X_train_encoded_MI.columns, 'MI_Score': mi_scores})

# Sort by MI score (higher = more relevant)
mi_results = mi_results.sort_values(by="MI_Score", ascending=False)

# Display the results
print(mi_results)


ValueError: Input X contains NaN.