In [11]:
import os
import sys

# Make the project root importable so the `src` package resolves.
# Guarded so that re-running this cell does not keep appending the same
# entry to sys.path.
if '/app' not in sys.path:
    sys.path.append('/app')

import pandas as pd
from src.data.s3_pull_processed import get_processed_data

In [12]:
import os

# Override the S3 settings for this kernel session.
# NOTE(review): presumably these are read by get_processed_data() - confirm.
os.environ.update({
    'S3_BUCKET': 'weather-mlops-team-data',
    'AWS_REGION': 'ap-northeast-2',  # or us-east-1
})

# Echo the values back to confirm they took effect.
print("수정된 환경변수:")
for env_name in ('S3_BUCKET', 'AWS_REGION'):
    print(f"{env_name}:", os.getenv(env_name))

수정된 환경변수:
S3_BUCKET: weather-mlops-team-data
AWS_REGION: ap-northeast-2


In [17]:
# Fetch the processed dataset through the project loader.
df = get_processed_data()

# Quick exploration: overall shape, column count, and the column names.
column_names = df.columns.tolist()
print(f"데이터 크기: {df.shape}")
print(f"컬럼 개수: {len(column_names)}")
print("\n=== 컬럼 목록 ===")
print(column_names)

데이터 크기: (342500, 34)
컬럼 개수: 34

=== 컬럼 목록 ===
['station_id', 'datetime', 'temperature', 'pm10', 'wind_speed', 'humidity', 'pressure', 'rainfall', 'wind_direction', 'dew_point', 'cloud_amount', 'visibility', 'sunshine', 'hour', 'day_of_week', 'month', 'is_morning_rush', 'is_evening_rush', 'is_rush_hour', 'is_weekday', 'is_weekend', 'season', 'temp_category', 'temp_comfort', 'temp_extreme', 'heating_needed', 'cooling_needed', 'pm10_grade', 'mask_needed', 'outdoor_activity_ok', 'is_metro_area', 'is_coastal', 'region', 'comfort_score']


In [18]:
# Column dtypes and non-null counts.
df.info()

# Per-column count of NaN entries. Only true NaN/None values are counted;
# values such as -9 / -99 are ordinary numbers to isna().
null_counts = df.isna().sum()
print("\n=== 결측치 개수 ===")
print(null_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 342500 entries, 0 to 342499
Data columns (total 34 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   station_id           342500 non-null  int64  
 1   datetime             342500 non-null  object 
 2   temperature          342500 non-null  float64
 3   pm10                 342500 non-null  float64
 4   wind_speed           342500 non-null  float64
 5   humidity             342500 non-null  float64
 6   pressure             342500 non-null  float64
 7   rainfall             342500 non-null  float64
 8   wind_direction       342500 non-null  int64  
 9   dew_point            342500 non-null  float64
 10  cloud_amount         342500 non-null  int64  
 11  visibility           342500 non-null  int64  
 12  sunshine             342500 non-null  float64
 13  hour                 342500 non-null  int64  
 14  day_of_week          342500 non-null  int64  
 15  month            

In [19]:
# First five rows. A notebook cell only auto-renders its final expression, so
# a bare `df.head()` placed before another expression is evaluated and then
# silently discarded. `display` (provided in the namespace by the IPython
# kernel) renders it explicitly.
display(df.head())

# Descriptive statistics; as the cell's last expression this renders on its own.
df.describe()

Unnamed: 0,station_id,temperature,pm10,wind_speed,humidity,pressure,rainfall,wind_direction,dew_point,cloud_amount,...,is_weekend,temp_comfort,temp_extreme,heating_needed,cooling_needed,mask_needed,outdoor_activity_ok,is_metro_area,is_coastal,comfort_score
count,342500.0,342500.0,342500.0,342500.0,342500.0,342500.0,342500.0,342500.0,342500.0,342500.0,...,342500.0,342500.0,342500.0,342500.0,342500.0,342500.0,342500.0,342500.0,342500.0,342500.0
mean,152.351536,14.655284,25.487335,2.437343,70.912865,1015.147511,-7.939265,18.018596,8.429353,5.013004,...,0.284274,10.388075,0.145609,0.351696,0.1992,0.093407,0.975483,0.174365,0.131124,64.528349
std,50.645471,10.671349,34.761237,2.322801,20.689165,13.97145,3.389899,11.520531,13.817167,4.013193,...,0.451069,7.074913,0.352714,0.4775,0.399399,0.291003,0.154647,0.379424,0.337537,16.524255
min,90.0,-99.0,0.0,-9.0,-9.0,-9.0,-9.0,-9.0,-99.0,-9.0,...,0.0,-99.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
25%,115.0,5.7,11.076923,0.9,57.0,1008.9,-9.0,7.0,-0.8,0.0,...,0.0,5.6,0.0,0.0,0.0,0.0,1.0,0.0,0.0,56.5
50%,140.0,15.7,20.0,1.9,74.0,1014.7,-9.0,20.0,10.1,6.0,...,0.0,12.1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,67.5
75%,185.0,23.8,32.363636,3.2,88.0,1021.7,-9.0,27.0,19.9,9.0,...,1.0,16.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,76.5
max,273.0,37.7,4624.0,28.7,100.0,1039.5,151.0,36.0,29.5,10.0,...,1.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,91.5


In [20]:
# Trial split of the target variable from the candidate features.
target_col = "comfort_score"  # comfort index is the prediction target
# The target itself, pm10, the timestamp and the station id are excluded.
# NOTE(review): pm10_grade / mask_needed look derived from pm10 and are still
# kept as features - confirm that is intended.
exclude_cols = ["comfort_score", "pm10", "datetime", "station_id"]
feature_cols = df.columns[~df.columns.isin(exclude_cols)].tolist()

# Summarise the target, then list the features with 1-based numbering.
target = df[target_col]
print(f"타겟: {target_col}")
print(f"타겟 범위: {target.min():.1f} ~ {target.max():.1f}")
print(f"타겟 평균: {target.mean():.1f}")
print(f"피처 개수: {len(feature_cols)}")
print("\n피처 목록:")
for position, feature in enumerate(feature_cols, start=1):
    print(f"{position:2d}. {feature}")

타겟: comfort_score
타겟 범위: 4.5 ~ 91.5
타겟 평균: 64.5
피처 개수: 30

피처 목록:
 1. temperature
 2. wind_speed
 3. humidity
 4. pressure
 5. rainfall
 6. wind_direction
 7. dew_point
 8. cloud_amount
 9. visibility
10. sunshine
11. hour
12. day_of_week
13. month
14. is_morning_rush
15. is_evening_rush
16. is_rush_hour
17. is_weekday
18. is_weekend
19. season
20. temp_category
21. temp_comfort
22. temp_extreme
23. heating_needed
24. cooling_needed
25. pm10_grade
26. mask_needed
27. outdoor_activity_ok
28. is_metro_area
29. is_coastal
30. region


In [None]:
# Inspect the scale (min / max / mean / std) of each feature.
X = df[feature_cols]

# mean()/std(), the ":.2f" format spec and `max - min` are only defined for
# numeric data. Non-numeric features are therefore left out of the report
# instead of raising partway through the loop.
numeric_cols = X.select_dtypes(include="number").columns.tolist()
non_numeric_cols = [col for col in feature_cols if col not in numeric_cols]

# A min-max span above this value is reported as needing scaling.
SCALE_RANGE_THRESHOLD = 100

print("=== 피처별 스케일 확인 ===")
print(f"{'피처명':<20} {'최소값':<10} {'최대값':<10} {'평균':<10} {'표준편차':<10}")
print("-" * 60)

for col in numeric_cols:
    series = X[col]
    print(
        f"{col:<20} {series.min():<10.2f} {series.max():<10.2f} "
        f"{series.mean():<10.2f} {series.std():<10.2f}"
    )

if non_numeric_cols:
    # Make the omission visible so the reader knows these were not checked.
    print(f"\n(비수치형 피처는 제외됨: {non_numeric_cols})")

# Flag features that likely need attention before modelling.
print("\n=== 스케일링 필요한 피처들 ===")
for col in numeric_cols:
    # Compute min/max once per column and reuse them for both checks.
    col_min, col_max = X[col].min(), X[col].max()
    scale_range = col_max - col_min
    if scale_range > SCALE_RANGE_THRESHOLD:  # wide value range
        print(f"{col}: 범위 {scale_range:.1f} (스케일링 필요!)")
    elif col_min < 0 and col_max > 0:  # mix of negative and positive values
        print(f"{col}: 음수 포함 (확인 필요)")

### 피처별 스케일 확인 (주피터에서 실행해 결과는 나왔지만, 노트북을 내려받는 과정에서 오류가 발생해 출력 화면을 캡처한 뒤 마크다운 표로 옮겨 적음)

| 피처명          | 최소값   | 최대값    | 평균      | 표준편차   |
|-----------------|---------|----------|----------|-----------|
| temperature     | -99.00  | 37.70    | 14.66    | 10.67     |
| wind_speed      | -9.00   | 28.70    | 2.44     | 2.32      |
| humidity        | -9.00   | 100.00   | 70.91    | 20.69     |
| pressure        | -9.00   | 1039.50  | 1015.15  | 13.97     |
| rainfall        | -9.00   | 151.00   | -7.94    | 3.39      |
| wind_direction  | -9.00   | 36.00    | 18.02    | 11.52     |
| dew_point       | -99.00  | 29.50    | 8.43     | 13.82     |
| cloud_amount    | -9.00   | 10.00    | 5.01     | 4.01      |
| visibility      | -9.00   | 28020.12 | 2802.12  | 1619.49   |
| sunshine        | -99.00  | 13.00    | -3.70    | 4.72      |
| hour            | 0.00    | 23.00    | 11.59    | 6.91      |
| day_of_week     | 0.00    | 6.00     | 2.99     | 2.00      |
| month           | 1.00    | 12.00    | 6.54     | 3.45      |
| is_morning_rush | 0.00    | 1.00     | 0.12     | 0.33      |
| is_evening_rush | 0.00    | 1.00     | 0.13     | 0.34      |
| is_rush_hour    | 0.00    | 1.00     | 0.25     | 0.43      |
| is_weekday      | 0.00    | 1.00     | 0.72     | 0.45      |
| is_weekend      | 0.00    | 1.00     | 0.28     | 0.45      |


In [32]:
# Count occurrences of values that look like missing-data placeholders.
print("=== 의심스러운 값들 (결측치일 가능성) ===")
suspicious_values = [-99, -9]
total_rows = len(X)

for feature in feature_cols:
    column = X[feature]
    for sentinel in suspicious_values:
        hits = column.eq(sentinel).sum()
        if hits <= 0:
            continue  # only features that actually contain the value are reported
        print(f"{feature}: {sentinel} 값이 {hits:,}개 ({hits/total_rows*100:.1f}%)")

=== 의심스러운 값들 (결측치일 가능성) ===
temperature: -99 값이 23개 (0.0%)
temperature: -9 값이 108개 (0.0%)
wind_speed: -9 값이 325개 (0.1%)
humidity: -9 값이 1,577개 (0.5%)
pressure: -9 값이 42개 (0.0%)
rainfall: -9 값이 307,928개 (89.9%)
wind_direction: -9 값이 325개 (0.1%)
dew_point: -99 값이 1,585개 (0.5%)
dew_point: -9 값이 437개 (0.1%)
cloud_amount: -9 값이 371개 (0.1%)
visibility: -9 값이 6,795개 (2.0%)
sunshine: -9 값이 150,933개 (44.1%)
temp_comfort: -99 값이 23개 (0.0%)
temp_comfort: -9 값이 108개 (0.0%)


In [None]:
확인 완료된 항목들:
✅ 데이터 로드: 342,500개 행, 34개 컬럼
✅ 타겟 변수: comfort_score (4.5~91.5, 평균 64.5)
✅ 피처 개수: 30개 (전체 34개 컬럼에서 타겟 comfort_score와 pm10, datetime, station_id 제외)
✅ 결측치 패턴: -99, -9가 결측치 표시
✅ 고결측 컬럼: rainfall(89.9%), sunshine(44.1%)
✅ 스케일 차이: visibility(28,020), pressure(1,039) vs 이진변수(0-1)
✅ 범주형 변수: season, temp_category, pm10_grade, region
split.py에 반영된 전처리:
✅ 결측치 처리: -99, -9 → NaN → 평균값 대체
✅ 고결측 컬럼 자동 제거: 50% 이상 결측치면 제거
✅ 범주형 원핫인코딩: drop_first=True로 다중공선성 방지
✅ 표준화: StandardScaler로 모든 피처 스케일 통일