### Import & Data Load

In [129]:
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder
import wandb
wandb.login()

True

In [130]:
csv_path = "../datasets/DACON_Stress/"  # 예: "data/train.csv" 또는 "hand_data.csv"

train = pd.read_csv(csv_path+'train.csv')
test = pd.read_csv(csv_path+'test.csv')

In [131]:
test.head(5)

Unnamed: 0,ID,gender,age,height,weight,cholesterol,systolic_blood_pressure,diastolic_blood_pressure,glucose,bone_density,activity,smoke_status,medical_history,family_medical_history,sleep_pattern,edu_level,mean_working
0,TEST_0000,F,72,161.95,77.66,312.36,137,112,131.16,0.51,moderate,current-smoker,high blood pressure,heart disease,oversleeping,graduate degree,
1,TEST_0001,F,84,152.35,58.7,269.54,162,95,129.36,0.51,intense,current-smoker,,,normal,graduate degree,
2,TEST_0002,F,45,164.09,76.71,209.08,146,91,169.21,1.13,light,ex-smoker,diabetes,heart disease,sleep difficulty,bachelors degree,6.0
3,TEST_0003,M,58,165.06,76.34,246.52,146,98,133.78,0.52,light,current-smoker,heart disease,heart disease,sleep difficulty,high school diploma,10.0
4,TEST_0004,F,35,158.17,65.6,181.24,142,80,73.93,1.42,moderate,ex-smoker,diabetes,heart disease,oversleeping,,7.0


In [132]:
from sklearn.preprocessing import StandardScaler

def standardize_range_01_columns(df):
    df = df.copy()
    scaler = StandardScaler()
    
    # 숫자형 컬럼만 추출
    numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
    
    # 0~1 범위로 구성된 컬럼만 필터링
    target_cols = [
        col for col in numeric_cols
        if df[col].min() >= 0 and df[col].max() <= 1
    ]
    
    # 표준화 수행 (예외 처리 포함)
    if target_cols:
        df[target_cols] = scaler.fit_transform(df[target_cols])
    
    return df
stress_mean = train['stress_score'].mean()
stress_std = train['stress_score'].std()
# train = standardize_range_01_columns(train)
# test = standardize_range_01_columns(test)


### Check Data

In [133]:
train.isnull().sum()

ID                             0
gender                         0
age                            0
height                         0
weight                         0
cholesterol                    0
systolic_blood_pressure        0
diastolic_blood_pressure       0
glucose                        0
bone_density                   0
activity                       0
smoke_status                   0
medical_history             1289
family_medical_history      1486
sleep_pattern                  0
edu_level                    607
mean_working                1032
stress_score                   0
dtype: int64

In [134]:
# 결측값 있는 칼럼(column) 확인
missing_columns_train = train.columns[train.isnull().sum() > 0]
missing_columns_train

Index(['medical_history', 'family_medical_history', 'edu_level',
       'mean_working'],
      dtype='object')

In [135]:
train[missing_columns_train].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   medical_history         1711 non-null   object 
 1   family_medical_history  1514 non-null   object 
 2   edu_level               2393 non-null   object 
 3   mean_working            1968 non-null   float64
dtypes: float64(1), object(3)
memory usage: 93.9+ KB


In [136]:
categorical_na_cols = []
numerical_na_cols = []

# 결측값이 있는 각 칼럼에 대해 데이터 타입 확인
for col in missing_columns_train:
    # 해당 칼럼이 범주형(object 또는 category)인지 확인
    if col in train.select_dtypes(include=['object', 'category']).columns:
        categorical_na_cols.append(col)
    # 해당 칼럼이 수치형(int 또는 float)인지 확인
    elif col in train.select_dtypes(include=['int', 'float']).columns:
        numerical_na_cols.append(col)

print("결측값이 있는 범주형 변수:", categorical_na_cols)
print("결측값이 있는 수치형 변수:", numerical_na_cols)

결측값이 있는 범주형 변수: ['medical_history', 'family_medical_history', 'edu_level']
결측값이 있는 수치형 변수: ['mean_working']


### Data Preprocessing

In [137]:
# 범주형 변수의 결측값을 최빈값으로 대체
for col in categorical_na_cols:
    # 학습 데이터에서 최빈값 계산
    most_frequent = train[col].mode()[0]
    
    # 학습 데이터와 테스트 데이터 모두 해당 칼럼의 최빈값으로 대체
    train[col] = train[col].fillna(most_frequent)
    test[col] = test[col].fillna(most_frequent)

In [138]:
train = train.drop(columns=['weight', 'height','bone_density'])

In [139]:
# mean_working에 대해 중앙값 대체
median_value = train['mean_working'].median()

train['mean_working'] = train['mean_working'].fillna(median_value)
test['mean_working'] = test['mean_working'].fillna(median_value)

In [140]:
# Label Encoding 적용 열 - 범주형 데이터
categorical_cols = train.select_dtypes(include='object').columns.drop('ID')

for feature in categorical_cols:
    le = LabelEncoder()
    le = le.fit(train[feature])
    train[feature] = le.transform(train[feature])
    
    for label in np.unique(test[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    
    test[feature] = le.transform(test[feature])

In [141]:
x_train = train.drop(['ID', 'stress_score'], axis = 1)
y_train = train['stress_score']

test = test.drop(['ID','height', 'weight','bone_density'], axis = 1)

In [142]:
test

Unnamed: 0,gender,age,cholesterol,systolic_blood_pressure,diastolic_blood_pressure,glucose,activity,smoke_status,medical_history,family_medical_history,sleep_pattern,edu_level,mean_working
0,0,72,312.36,137,112,131.16,2,0,2,1,1,1,9.0
1,0,84,269.54,162,95,129.36,0,0,2,0,0,1,9.0
2,0,45,209.08,146,91,169.21,1,1,0,1,2,0,6.0
3,1,58,246.52,146,98,133.78,1,0,1,1,2,2,10.0
4,0,35,181.24,142,80,73.93,2,1,0,1,1,2,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1,75,239.68,152,103,114.46,2,0,0,0,1,1,9.0
2996,0,82,237.88,168,107,121.83,2,0,2,0,2,2,9.0
2997,1,23,194.43,117,87,117.15,1,2,2,0,2,2,7.0
2998,0,58,233.43,151,108,133.83,2,0,1,2,2,0,9.0


### Train / Predict

In [143]:
import wandb
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
wandb.init(project="lgbm-tracking",name = 'without-WHB-5000')

# 학습/검증 데이터 분리
# x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = LGBMRegressor(n_estimators=5000)

# 훈련
model.fit(
    x_train, y_train,
    eval_set=[(test, None)],
    eval_metric=['rmse', 'l1', 'mape'],
    callbacks=[wandb.lightgbm.wandb_callback()]
)
pred = model.predict(test)

wandb.finish()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 760
[LightGBM] [Info] Number of data points in the train set: 3000, number of used features: 13
[LightGBM] [Info] Start training from score 0.482130


0,1
iteration,▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇▇▇█
valid_0_l1,▁▁▃▄▆▇██████████████████████████████████
valid_0_l2,▁▁▃▃▄▅▇▇▇▇██████████████████████████████
valid_0_mape,▁▆▇▇▇███████████████████████████████████
valid_0_rmse,▁▆▆▇▇▇▇▇▇███████████████████████████████

0,1
iteration,4999


### Submission

In [144]:
submission = pd.read_csv(csv_path+'sample_submission.csv')

In [145]:
submission['stress_score'] = pred
# submission['stress_score'] = submission['stress_score']*stress_std + stress_std
submission.head()

Unnamed: 0,ID,stress_score
0,TEST_0000,0.504909
1,TEST_0001,0.876108
2,TEST_0002,0.342448
3,TEST_0003,0.424806
4,TEST_0004,0.509979


In [100]:
submission['stress_score'] = pred
submission.head()

Unnamed: 0,ID,stress_score
0,TEST_0000,0.470718
1,TEST_0001,0.544434
2,TEST_0002,0.212051
3,TEST_0003,0.442751
4,TEST_0004,0.59384


In [17]:
submission['stress_score'] = pred
submission.head()

Unnamed: 0,ID,stress_score
0,TEST_0000,0.412394
1,TEST_0001,0.689009
2,TEST_0002,0.183309
3,TEST_0003,0.522015
4,TEST_0004,0.572112


In [146]:
submission.to_csv('submit.csv', index=False)