In [10]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# 데이터 로드
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# 결측값 처리 (수치형 변수: 평균 대체)
num_cols = ['해당층', '총층', '전용면적', '방수', '욕실수', '총주차대수', '관리비']
imputer = SimpleImputer(strategy='mean')
train[num_cols] = imputer.fit_transform(train[num_cols])
test[num_cols] = imputer.transform(test[num_cols])

# 범주형 변수 처리 (최빈값 대체)
cat_cols = ['매물확인방식', '방향', '주차가능여부', '중개사무소', '제공플랫폼']
for col in cat_cols:
    mode_value = train[col].mode()[0]
    train[col].fillna(mode_value, inplace=True)
    test[col].fillna(mode_value, inplace=True)

# 날짜 데이터 변환 ('게재일'을 '등록 후 경과 일 수'로 변환)
train['게재일'] = pd.to_datetime(train['게재일'])
test['게재일'] = pd.to_datetime(test['게재일'])
reference_date = max(train['게재일'].max(), test['게재일'].max())
train['등록경과일'] = (reference_date - train['게재일']).dt.days
test['등록경과일'] = (reference_date - test['게재일']).dt.days
train.drop(columns=['게재일'], inplace=True)
test.drop(columns=['게재일'], inplace=True)

# 새로운 Feature 추가
train['보증금월세비율'] = train['보증금'] / (train['월세'] + 1)
test['보증금월세비율'] = test['보증금'] / (test['월세'] + 1)
train['층비율'] = train['해당층'] / train['총층']
test['층비율'] = test['해당층'] / test['총층']
train['저층여부'] = (train['해당층'] <= 3).astype(int)
test['저층여부'] = (test['해당층'] <= 3).astype(int)
train['고층여부'] = (train['해당층'] >= (train['총층'] - 3)).astype(int)
test['고층여부'] = (test['해당층'] >= (test['총층'] - 3)).astype(int)

# 로그 변환 적용 (보증금, 월세, 전용면적, 총주차대수, 관리비)
for col in ['보증금', '월세', '전용면적', '총주차대수', '관리비']:
    train[col] = np.log1p(train[col])
    test[col] = np.log1p(test[col])

# 이상치 처리 (총주차대수, 관리비 상한 설정)
train['총주차대수'] = np.clip(train['총주차대수'], a_min=0, a_max=300)
test['총주차대수'] = np.clip(test['총주차대수'], a_min=0, a_max=300)
train['관리비'] = np.clip(train['관리비'], a_min=0, a_max=50)
test['관리비'] = np.clip(test['관리비'], a_min=0, a_max=50)

# Label Encoding 적용 (중개사무소, 제공플랫폼 별도 처리)
le_broker = LabelEncoder()
le_broker.fit(pd.concat([train['중개사무소'], test['중개사무소']], axis=0))
train['중개사무소'] = le_broker.transform(train['중개사무소'])
test['중개사무소'] = le_broker.transform(test['중개사무소'])

le_platform = LabelEncoder()
le_platform.fit(pd.concat([train['제공플랫폼'], test['제공플랫폼']], axis=0))
train['제공플랫폼'] = le_platform.transform(train['제공플랫폼'])
test['제공플랫폼'] = le_platform.transform(test['제공플랫폼'])

# One-Hot Encoding 적용 (매물확인방식, 주차가능여부, 방향)
one_hot_cols = ['매물확인방식', '주차가능여부', '방향']
one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
train_encoded = one_hot_encoder.fit_transform(train[one_hot_cols])
test_encoded = one_hot_encoder.transform(test[one_hot_cols])

train_encoded_df = pd.DataFrame(train_encoded, columns=one_hot_encoder.get_feature_names_out(one_hot_cols), index=train.index)
test_encoded_df = pd.DataFrame(test_encoded, columns=one_hot_encoder.get_feature_names_out(one_hot_cols), index=test.index)

train = pd.concat([train.drop(columns=one_hot_cols), train_encoded_df], axis=1)
test = pd.concat([test.drop(columns=one_hot_cols), test_encoded_df], axis=1)

# 최종 데이터 저장
train.to_csv("processed_train.csv", index=False)
test.to_csv("processed_test.csv", index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(mode_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test[col].fillna(mode_value, inplace=True)
