In [7]:
import pandas as pd

# 파일 경로
train_path = "/Users/eunzinri/Downloads/next-day-air-temperature-forecast-challenge/train_dataset.csv"
station_path = "/Users/eunzinri/Downloads/next-day-air-temperature-forecast-challenge/station_info.csv"

# CSV 파일 불러오기
train_df = pd.read_csv(train_path)
station_df = pd.read_csv(station_path)

# 데이터프레임 크기와 간단한 정보 출력
train_df.info(), station_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13132 entries, 0 to 13131
Columns: 342 entries, id to target
dtypes: float64(338), int64(2), object(2)
memory usage: 34.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   지점                9 non-null      int64  
 1   시작일               9 non-null      object 
 2   종료일               1 non-null      object 
 3   지점명               9 non-null      object 
 4   위도                9 non-null      float64
 5   경도                9 non-null      float64
 6   노장해발고도(m)         9 non-null      float64
 7   기압계(관측장비지상높이(m))  9 non-null      float64
 8   기온계(관측장비지상높이(m))  9 non-null      float64
 9   풍속계(관측장비지상높이(m))  9 non-null      float64
 10  강우계(관측장비지상높이(m))  9 non-null      float64
dtypes: float64(7), int64(1), object(3)
memory usage: 924.0+ bytes


(None, None)

In [11]:
import numpy as np

def preprocess_data(train_df, station_df):
    df = train_df.copy()
    
    # 1. -9999를 NaN으로 변환
    df.replace(-9999, np.nan, inplace=True)

    # 2. 날짜 분리 (month, day)
    df[['month', 'day']] = df['date'].str.split('-', expand=True).astype(int)
    df.drop(columns=['date'], inplace=True)

    # 3. 결측치 처리
    # - 자연적인 결측치로 보이는 건 0으로
    for col in df.columns:
        if 'sunshine_duration' in col or 'snow_depth' in col or 'precipitation' in col:
            df[col].fillna(0, inplace=True)
        if "min_cloud_height" in col:
            df[col].fillna(90, inplace=True)
    
    # - 나머지는 평균값으로 (또는 향후 보간 가능)
    df.fillna(df.mean(numeric_only=True), inplace=True)

    # 4. 관측소 정보 병합
    station_df_renamed = station_df.rename(columns={
        '지점': 'station',
        '위도': 'latitude',
        '경도': 'longitude',
        '노장해발고도(m)': 'altitude'
    })[['station', 'latitude', 'longitude', 'altitude']]
    
    station_df_renamed = station_df_renamed.drop_duplicates(subset='station')
    
    
    df = df.merge(station_df_renamed, on='station', how='left')

    # 6. 불필요 컬럼 제거
    df.drop(columns=['station_name'], inplace=True)
    df.drop(columns=['station'], inplace=True)
    df.drop(columns=['id'], inplace=True)

    return df

# 전처리 적용
processed_df = preprocess_data(train_df, station_df)
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13132 entries, 0 to 13131
Columns: 343 entries, cloud_cover_0 to altitude
dtypes: float64(341), int64(2)
memory usage: 34.4 MB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(90, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)


In [13]:
processed_df.head

<bound method NDFrame.head of        cloud_cover_0  cloud_cover_1  cloud_cover_10  cloud_cover_11  \
0                0.0            0.0             9.0             0.0   
1                0.0            0.0             0.0             0.0   
2                0.0            0.0             0.0             0.0   
3                0.0            0.0             2.0             0.0   
4                0.0            0.0             0.0             0.0   
...              ...            ...             ...             ...   
13127            2.0            4.0             0.0             0.0   
13128            0.0            0.0             0.0             0.0   
13129            4.0            1.0             0.0             0.0   
13130            9.0            9.0             0.0             0.0   
13131            6.0            3.0             1.0             2.0   

       cloud_cover_12  cloud_cover_13  cloud_cover_14  cloud_cover_15  \
0                 3.0             3.0       

In [15]:
processed_df.dtypes

cloud_cover_0     float64
cloud_cover_1     float64
cloud_cover_10    float64
cloud_cover_11    float64
cloud_cover_12    float64
                   ...   
month               int64
day                 int64
latitude          float64
longitude         float64
altitude          float64
Length: 343, dtype: object

In [186]:
!pip install xgboost



In [17]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

# 전처리된 데이터 불러온 후 float32로 변환
df = processed_df

X = df.drop(columns=['target'])
y = df['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=6,
    random_state=42,
    tree_method='hist'
)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f'✅ 검증 RMSE: {rmse:.4f} °C')

✅ 검증 RMSE: 1.4934 °C




In [47]:
model = XGBRegressor(
    n_estimators=856,
    learning_rate=0.06811492360557936,
    max_depth=8,
    random_state=42,
    min_child_weight=10,
    tree_method='exact'
)

model.fit(X_train, y_train)
y_pred = model.predict(X_val)

rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f'✅ 검증 RMSE: {rmse:.4f} °C')

✅ 검증 RMSE: 1.3772 °C




In [39]:
import optuna
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold

def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 10),               # 트리 깊이
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),  # 학습률
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),      # 부스팅 반복 횟수
         "min_child_weight": trial.suggest_int('min_child_weight',1, 10),
        'early_stopping_rounds'=50,
        'verbose'=100
    }


    model = xgb.XGBRegressor(
        **params,
        random_state=42,
        tree_method='exact',
        eval_metric='rmse',
        verbosity=1
    )

    # 교차검증으로 성능 평가 (평가지표: 음의 RMSE)
    score = cross_val_score(model, X, y, cv=KFold(n_splits=5), scoring='neg_root_mean_squared_error')
    return score.mean()  # Optuna는 값을 'maximize' 하므로 음수 그대로 반환

# 🔍 최적화 수행
study = optuna.create_study(direction='maximize')  # RMSE가 작을수록 좋지만, neg_rmse라 maximize!
study.optimize(objective, n_trials=50)

# 🎉 결과 출력
print("Best trial:")
print(f"  RMSE (neg): {study.best_trial.value:.4f}")
print("  Params:")
for key, val in study.best_trial.params.items():
    print(f"    {key}: {val}")


[I 2025-04-11 09:56:13,091] A new study created in memory with name: no-name-f8b4cd20-7c97-4db0-b4c3-22c254c79cde
[I 2025-04-11 09:56:23,470] Trial 0 finished with value: -1.9041259262285188 and parameters: {'max_depth': 5, 'learning_rate': 0.039334428117740364, 'n_estimators': 118, 'min_child_weight': 1}. Best is trial 0 with value: -1.9041259262285188.
[I 2025-04-11 09:58:05,738] Trial 1 finished with value: -1.6015661249571989 and parameters: {'max_depth': 9, 'learning_rate': 0.011696947857174456, 'n_estimators': 449, 'min_child_weight': 8}. Best is trial 1 with value: -1.6015661249571989.
[I 2025-04-11 10:01:05,041] Trial 2 finished with value: -1.5216797236985475 and parameters: {'max_depth': 10, 'learning_rate': 0.029802202082262978, 'n_estimators': 737, 'min_child_weight': 10}. Best is trial 2 with value: -1.5216797236985475.
[I 2025-04-11 10:04:59,005] Trial 3 finished with value: -1.5293921826999033 and parameters: {'max_depth': 10, 'learning_rate': 0.023100054588237492, 'n_es

KeyboardInterrupt: 

In [109]:
# TRial 4s' hyperparams

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
from xgboost import XGBRegressor

def run_kfold_xgb(X, y, n_splits=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmses = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"🌱 Fold {fold + 1}")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBRegressor(
            n_estimators=856,
            learning_rate=0.06811492360557936,
            max_depth=8,
            random_state=42,
            min_child_weight=10,
            tree_method='hist'
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        rmse = mean_squared_error(y_val, preds, squared=False)
        print(f"  📏 RMSE: {rmse:.4f}")
        rmses.append(rmse)

    print(f"\n✅ 평균 RMSE: {np.mean(rmses):.4f} ± {np.std(rmses):.4f}")
    return rmses

run_kfold_xgb(X, y)

🌱 Fold 1




  📏 RMSE: 1.3543
🌱 Fold 2




  📏 RMSE: 1.3865
🌱 Fold 3




  📏 RMSE: 1.4076
🌱 Fold 4




  📏 RMSE: 1.3402
🌱 Fold 5




  📏 RMSE: 1.3523
🌱 Fold 6




  📏 RMSE: 1.3434
🌱 Fold 7




  📏 RMSE: 1.3650
🌱 Fold 8




  📏 RMSE: 1.3328
🌱 Fold 9




  📏 RMSE: 1.2793
🌱 Fold 10
  📏 RMSE: 1.3273

✅ 평균 RMSE: 1.3489 ± 0.0329




[1.3543448958525113,
 1.3864887450876495,
 1.4076286874363728,
 1.3401912381953658,
 1.352251041513833,
 1.3433796041102624,
 1.364997771712845,
 1.3328261137624113,
 1.2793494071358675,
 1.3273405365335706]

In [111]:
# TRial 13s' hyperparams

from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
from xgboost import XGBRegressor

def run_kfold_xgb(X, y, n_splits=10):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    rmses = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
        print(f"🌱 Fold {fold + 1}")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = XGBRegressor(
            n_estimators=665,
            learning_rate=0.07468890154873875,
            max_depth=7,
            random_state=42,
            min_child_weight=10,
            tree_method='hist'
        )
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        rmse = mean_squared_error(y_val, preds, squared=False)
        print(f"  📏 RMSE: {rmse:.4f}")
        rmses.append(rmse)

    print(f"\n✅ 평균 RMSE: {np.mean(rmses):.4f} ± {np.std(rmses):.4f}")
    return rmses

run_kfold_xgb(X, y)

🌱 Fold 1




  📏 RMSE: 1.3330
🌱 Fold 2




  📏 RMSE: 1.3640
🌱 Fold 3




  📏 RMSE: 1.3896
🌱 Fold 4




  📏 RMSE: 1.3393
🌱 Fold 5




  📏 RMSE: 1.3638
🌱 Fold 6




  📏 RMSE: 1.3141
🌱 Fold 7




  📏 RMSE: 1.3407
🌱 Fold 8




  📏 RMSE: 1.3379
🌱 Fold 9




  📏 RMSE: 1.2789
🌱 Fold 10
  📏 RMSE: 1.3310

✅ 평균 RMSE: 1.3392 ± 0.0285




[1.3329740688324943,
 1.364035712081613,
 1.3896027249270326,
 1.3392956273879135,
 1.3637667562705165,
 1.3140602248903255,
 1.3406542746239436,
 1.3379212256529613,
 1.2788749402486332,
 1.3310395111058202]

In [113]:
final_model = XGBRegressor(
    n_estimators=665,
    learning_rate=0.07468890154873875,
    max_depth=7,
    random_state=42,
    min_child_weight=10,
    tree_method='hist'
)

# 💥 전부 다 학습에 씀
final_model.fit(X, y)

In [115]:
y_pred = final_model.predict(X)

rmse = mean_squared_error(y, y_pred, squared=False)
print(f'✅ 검증 RMSE: {rmse:.4f} °C')

✅ 검증 RMSE: 0.1784 °C




In [117]:
test_path = "/Users/eunzinri/Downloads/next-day-air-temperature-forecast-challenge/test_dataset.csv"
test_df = pd.read_csv(test_path)

In [119]:
test_df.count

<bound method DataFrame.count of         id  station station_name   date  cloud_cover_0  cloud_cover_1  \
0        0       99           파주  06-26            0.0            0.0   
1        1      119           수원  01-11            0.0            0.0   
2        2      119           수원  02-23            5.0            6.0   
3        3      119           수원  05-29            0.0            0.0   
4        4      119           수원  06-15            0.0            4.0   
...    ...      ...          ...    ...            ...            ...   
4374  4374      119           수원  06-10            1.0            5.0   
4375  4375       99           파주  04-12            0.0            0.0   
4376  4376      119           수원  06-23            4.0            5.0   
4377  4377      119           수원  05-04            4.0            5.0   
4378  4378       99           파주  05-11            0.0            0.0   

      cloud_cover_10  cloud_cover_11  cloud_cover_12  cloud_cover_13  ...  \
0            

In [121]:
test = preprocess_data(test_df, station_df)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4379 entries, 0 to 4378
Columns: 342 entries, cloud_cover_0 to altitude
dtypes: float64(340), int64(2)
memory usage: 11.4 MB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(90, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)


In [123]:
test.count

<bound method DataFrame.count of       cloud_cover_0  cloud_cover_1  cloud_cover_10  cloud_cover_11  \
0               0.0            0.0             0.0             1.0   
1               0.0            0.0             0.0             0.0   
2               5.0            6.0             0.0             0.0   
3               0.0            0.0             0.0             0.0   
4               0.0            4.0             2.0             4.0   
...             ...            ...             ...             ...   
4374            1.0            5.0             1.0             2.0   
4375            0.0            0.0             0.0             1.0   
4376            4.0            5.0             8.0             2.0   
4377            4.0            5.0             9.0             7.0   
4378            0.0            0.0             0.0             0.0   

      cloud_cover_12  cloud_cover_13  cloud_cover_14  cloud_cover_15  \
0                3.0             6.0             9.0  

In [125]:
prediction = final_model.predict(test)

In [127]:
submission_df = pd.DataFrame({'target':prediction})

In [129]:
submission_df.to_csv('asdfsadf.csv',index=True)