In [1]:
import pandas as pd

train_path = "/Users/eunzinri/Downloads/next-day-air-temperature-forecast-challenge-2/train_dataset.csv"
station_path = "/Users/eunzinri/Downloads/next-day-air-temperature-forecast-challenge-2/station_info.csv"

train_df = pd.read_csv(train_path)
station_df = pd.read_csv(station_path)

In [3]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from datetime import datetime

# ===== 데이터 전처리 함수들 =====

def dew_to_temp(dew, rh):
    a = 17.62
    b = 243.12
    gamma_inv = (a * dew) / (b + dew) - np.log(rh / 100.0)
    return (b * gamma_inv) / (a - gamma_inv)

def preprocess_data(df, station_df):
    df = df.copy()
    df.replace(-9999, np.nan, inplace=True)
    df['date_obj'] = pd.to_datetime('2024-' + df['date'], format='%Y-%m-%d', errors='coerce')
    df = df[df['date_obj'].notna()]  # 이상한 날짜 제거
    
    df['day_of_year'] = df['date_obj'].dt.dayofyear
    df['doy_sin'] = np.sin(2 * np.pi * df['day_of_year'] / 365)
    df['doy_cos'] = np.cos(2 * np.pi * df['day_of_year'] / 365)
    df.drop(columns=['date', 'date_obj'], inplace=True)

    zero_key = ['sunshine_duration', 'snow_depth', 'precipitation', 'visibility', 'wind_speed', 'cloud_cover']
    for col in df.columns:
        if any(key in col for key in zero_key):
            df[col].fillna(0, inplace=True)
        if "min_cloud_height" in col:
            df[col].fillna(100, inplace=True)

    sum_col = ['snow_depth','precipitation','visibility','sunshine_duration']
    mean_col = ['cloud_cover','wind_speed']
    
    for col in sum_col:
        df[f'{col}_sum'] = df[[f'{col}_{i}' for i in range(21)]].sum(axis=1)

    for col in mean_col:
        df[f'{col}_mean'] = df[[f'{col}_{i}' for i in range(21)]].mean(axis=1)

    # 원래 시간대별 열들 삭제
    drop_cols = [f'{col}_{i}' for col in (sum_col + mean_col) for i in range(21)]
    df.drop(columns=drop_cols, inplace=True)
    
    for i in range(24):
        df[f'mag_temp_{i}'] = dew_to_temp(df[f'dew_point_{i}'], df[f'humidity_{i}'])

    df.drop(columns=['station_name', 'station', 'id'], inplace=True)
    return df
    
processed_df = preprocess_data(train_df, station_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(100, inplace=True)


In [9]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np

df = processed_df

X = df.drop(columns=['target'])
y = df['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42,stratify=df['day_of_year'])

xgb_params =  {'max_depth': 7, 'learning_rate': 0.05409400236764267, 'n_estimators': 670, 
               'min_child_weight': 8, 'subsample': 0.817198601322624, 'colsample_bytree': 0.6388670310566312,
               'gamma': 0.36186946516047686, 'lambda': 6.072065371409787, 'alpha': 0.07418212565365712}

model = xgb.XGBRegressor(random_state=42, **xgb_params)
model.fit(X_train, y_train)

In [11]:
train_pred = model.predict(X_train)
val_pred = model.predict(X_val)

train_rmse = mean_squared_error(y_train, train_pred, squared=False)
val_rmse = mean_squared_error(y_val, val_pred, squared=False)
print(f'✅ 트레인 RMSE: {train_rmse:.4f} °C')
print(f'✅ 검증 RMSE: {val_rmse:.4f} °C')

✅ 트레인 RMSE: 0.3146 °C
✅ 검증 RMSE: 1.2716 °C




In [13]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# 여기에 너가 쓰는 하이퍼파라미터 넣기
xgb_params =  {'max_depth': 7, 'learning_rate': 0.05409400236764267, 'n_estimators': 670, 
               'min_child_weight': 8, 'subsample': 0.817198601322624, 'colsample_bytree': 0.6388670310566312,
               'gamma': 0.36186946516047686, 'lambda': 6.072065371409787, 'alpha': 0.07418212565365712}

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=df['day_of_year'])

seeds = [12,29,53,77,111,123,404,333,222,923]
val_preds = []
test_preds = []
for seed in seeds:
    print(f"Training model with seed {seed}")
    model = xgb.XGBRegressor(random_state=seed, **xgb_params)
    model.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
    val_pred = model.predict(X_val)
    val_preds.append(val_pred)

    # 테스트 데이터셋 예측도 할 경우 여기서 추가
    # test_pred = model.predict(X_test)
    # test_preds.append(test_pred)

# 앙상블 예측 평균
val_preds_mean = np.mean(val_preds, axis=0)

# 평가
rmse = mean_squared_error(y_val, val_preds_mean, squared=False)
print(f"[Soft-Ensemble] Validation RMSE: {rmse:.4f}")

Training model with seed 12
Training model with seed 29
Training model with seed 53
Training model with seed 77
Training model with seed 111
Training model with seed 123
Training model with seed 404
Training model with seed 333
Training model with seed 222
Training model with seed 923
[Soft-Ensemble] Validation RMSE: 1.2458




In [15]:
models = []
for seed in seeds:
    print(f"Training model with seed {seed}")
    model = xgb.XGBRegressor(random_state=seed, **xgb_params)
    model.fit(X,y)
    models.append(model)

Training model with seed 12
Training model with seed 29
Training model with seed 53
Training model with seed 77
Training model with seed 111
Training model with seed 123
Training model with seed 404
Training model with seed 333
Training model with seed 222
Training model with seed 923


In [31]:
test_path = '/Users/eunzinri/Downloads/test_dataset(1).csv'
test_df = pd.read_csv(test_path)
test = preprocess_data(test_df, station_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(100, inplace=True)


In [33]:
final_preds = []
for m in models :
    pred = m.predict(test)
    final_preds.append(pred)
final = np.mean(final_preds, axis=0)

In [35]:
submission_df = pd.DataFrame({'target':final})
submission_df.to_csv('soft_2458.csv',index=True)

In [41]:
final_model = XGBRegressor(
    n_estimators=983,
    learning_rate=0.03894921380040612,
    max_depth=8,
    random_state=42,
    min_child_weight=8,
    tree_method='hist'
)

# 💥 전부 다 학습에 씀
final_model.fit(X, y)

In [21]:
y_pred = final_model.predict(X)
rmse = mean_squared_error(y, y_pred, squared=False)
print(f'✅ RMSE: {rmse:.4f} °C')

✅ RMSE: 0.1350 °C




In [23]:
test_path = '/Users/eunzinri/Downloads/test_dataset(1).csv'
test_df = pd.read_csv(test_path)
test = preprocess_data(test_df, station_df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(100, inplace=True)


In [25]:
prediction = final_model.predict(test)
submission_df = pd.DataFrame({'target':prediction})
submission_df.to_csv('sun_mean.csv',index=True)