In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Load

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import sys
sys.path.append("/content/drive/MyDrive/Colab Notebooks")

train_path = '/content/drive/MyDrive/Colab Notebooks/data/AB_NYC_2019.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# haversine(대원거리) fuction
def haversine(lat1, lon1, lat2, lon2):
    R = 6371
    φ1, φ2 = np.radians(lat1), np.radians(lat2)
    Δφ = φ2 - φ1
    Δλ = np.radians(lon2 - lon1)
    a = np.sin(Δφ/2)**2 + np.cos(φ1)*np.cos(φ2)*np.sin(Δλ/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c

In [4]:
df = pd.read_csv(train_path)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [5]:
df.drop(columns=['id', 'name', 'host_id', 'host_name', 'neighbourhood'], inplace=True)

In [6]:
# df=pd.get_dummies(df,columns=['neighbourhood_group'])
# df.head()

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['neighbourhood_group'] = le.fit_transform(df['neighbourhood_group'])
df

Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,1,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,2,40.80902,-73.94190,Private room,150,3,0,,,1,365
3,1,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,2,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...,...
48890,1,40.67853,-73.94995,Private room,70,2,0,,,2,9
48891,1,40.70184,-73.93317,Private room,40,4,0,,,2,36
48892,2,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48893,2,40.75751,-73.99112,Shared room,55,1,0,,,6,2


In [7]:
label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_map)

{'Bronx': np.int64(0), 'Brooklyn': np.int64(1), 'Manhattan': np.int64(2), 'Queens': np.int64(3), 'Staten Island': np.int64(4)}


In [8]:
# 라벨 번호에 따른 지역 중심 좌표
centers = {
    0: (40.8448, -73.8648),       # Bronx
    1: (40.6782, -73.9442),       # Brooklyn
    2: (40.7685, -73.9822),       # Manhattan
    3: (40.7282, -73.7949),       # Queens
    4: (40.5795, -74.1502),       # Staten Island
}

# Haversine 거리 계산 함수
def haversine(lat1, lon1, lat2, lon2):
    R = 6371  # 지구 반지름 (km)
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(lat2 - lat1)
    dlambda = np.radians(lon2 - lon1)

    a = np.sin(dphi / 2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    return R * c

# 거리 계산 함수
def compute_distance(row):
    label = row['neighbourhood_group']
    if label in centers:
        clat, clon = centers[label]
        return haversine(row['latitude'], row['longitude'], clat, clon)
    return np.nan

# 적용
df['distance_to_center'] = df.apply(compute_distance, axis=1)


In [9]:
# 리뷰 날짜 차이 계산 (최근일수록 작은 값)
reference_date = pd.to_datetime("2019-12-01")
df['last_review'] = pd.to_datetime(df['last_review'], errors='coerce')
df['days_since_last_review'] = (reference_date - df['last_review']).dt.days

# 결측치 처리: 리뷰 없음 → 가장 오래된 값보다 더 오래된 것으로 간주 max + 30
temp_days = df['days_since_last_review'].copy()
df['days_since_last_review'] = temp_days.fillna(temp_days.max() + 30)

# 최근일수록 큰 값이 되도록 변환
max_days = df['days_since_last_review'].max()
df['days_since_last_review'] = max_days - df['days_since_last_review']
df.drop(columns=['last_review'], inplace=True)

df[['days_since_last_review']].head()

Unnamed: 0,days_since_last_review
0,2792.0
1,3006.0
2,0.0
3,3051.0
4,2823.0


In [10]:
df['reviews_per_month'].fillna(0, inplace=True)
df['room_type'] = le.fit_transform(df['room_type'])
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['reviews_per_month'].fillna(0, inplace=True)


Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,distance_to_center,days_since_last_review
0,1,40.64749,-73.97237,1,149,1,9,0.21,6,365,4.160116,2792.0
1,2,40.75362,-73.98377,0,225,1,45,0.38,2,355,1.659856,3006.0
2,2,40.80902,-73.94190,1,150,3,0,0.00,1,365,5.640177,0.0
3,1,40.68514,-73.95976,0,89,1,270,4.64,1,194,1.522190,3051.0
4,2,40.79851,-73.94399,0,80,10,9,0.10,1,0,4.635187,2823.0
...,...,...,...,...,...,...,...,...,...,...,...,...
48890,1,40.67853,-73.94995,1,70,2,0,0.00,2,9,0.486273,0.0
48891,1,40.70184,-73.93317,1,40,4,0,0.00,2,36,2.788305,0.0
48892,2,40.81475,-73.94867,0,115,10,0,0.00,1,27,5.866492,0.0
48893,2,40.75751,-73.99112,2,55,1,0,0.00,6,2,1.434483,0.0


In [11]:
label_map = dict(zip(le.classes_, le.transform(le.classes_)))
print(label_map)

{'Entire home/apt': np.int64(0), 'Private room': np.int64(1), 'Shared room': np.int64(2)}


In [12]:
df.drop(columns=['latitude', 'longitude'], inplace=True)
df

Unnamed: 0,neighbourhood_group,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,distance_to_center,days_since_last_review
0,1,1,149,1,9,0.21,6,365,4.160116,2792.0
1,2,0,225,1,45,0.38,2,355,1.659856,3006.0
2,2,1,150,3,0,0.00,1,365,5.640177,0.0
3,1,0,89,1,270,4.64,1,194,1.522190,3051.0
4,2,0,80,10,9,0.10,1,0,4.635187,2823.0
...,...,...,...,...,...,...,...,...,...,...
48890,1,1,70,2,0,0.00,2,9,0.486273,0.0
48891,1,1,40,4,0,0.00,2,36,2.788305,0.0
48892,2,0,115,10,0,0.00,1,27,5.866492,0.0
48893,2,2,55,1,0,0.00,6,2,1.434483,0.0


In [13]:
df.to_csv('label_data.csv', index=False)