In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
import numpy as np
import re
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [71]:
train = pd.read_csv('~/Aiffel/DATAThon/playground-series-s4e11/train.csv')
test = pd.read_csv('~/Aiffel/DATAThon/playground-series-s4e11/test.csv')

In [72]:
# 필요없는 컬럼 삭제
train.drop(columns=['id'], inplace=True)
train.drop(columns=['Name'], inplace=True)

# 결측치 처리
train.drop(columns=['CGPA'], inplace=True)
train['Academic_Pressure_missing'] = train['Academic Pressure'].isnull().astype(int)
train['Study_Satisfaction_missing'] = train['Study Satisfaction'].isnull().astype(int)
imputer_ap = SimpleImputer(strategy='mean')
imputer_ss = SimpleImputer(strategy='mean')
train['Academic Pressure'] = imputer_ap.fit_transform(train[['Academic Pressure']])
train['Study Satisfaction'] = imputer_ss.fit_transform(train[['Study Satisfaction']])
train['Work_Pressure_missing'] = train['Work Pressure'].isnull().astype(int)
train['Job_Satisfaction_missing'] = train['Job Satisfaction'].isnull().astype(int)
# train['Work Pressure'] = imputer.fit_transform(train[['Work Pressure']])
# train['Job Satisfaction'] = imputer.fit_transform(train[['Job Satisfaction']])
train = train[train['Financial Stress'].notnull()].copy()

# 범주형 변수 처리
delete_values = [
    'Indore', 'Pune', 'Moderate', 'Unhealthy', 'Sleep_Duration',
    'Work_Study_Hours', 'No', '45', '49 hours', '55-66 hours', '40-45 hours', 
    '9-5 hours', '10-6 hours', '9-6 hours', '9-5', '45-48 hours', '35-36 hours'
]
train = train[~train['Sleep Duration'].isin(delete_values)].copy()
def convert_sleep_to_hours(val):
    try:
        val = str(val).strip().lower()
        if 'than' in val and 'less' not in val and 'more' not in val:
            match = re.search(r'\d+', val)
            if match:
                return float(match.group()) - 0.5
        if 'less than' in val:
            match = re.search(r'\d+', val)
            if match:
                return float(match.group()) - 0.5
        elif 'more than' in val:
            match = re.search(r'\d+', val)
            if match:
                return float(match.group()) + 0.5
        elif re.match(r'^\d+\s*hours$', val):
            return float(re.findall(r'\d+', val)[0])
        elif re.search(r'\d+\s*[-–~]\s*\d+', val):
            nums = [int(n) for n in re.findall(r'\d+', val)]
            if len(nums) == 2:
                return sum(nums) / 2
        elif re.match(r'^\d+(\.\d+)?$', val):
            return float(val)
        else:
            return np.nan
    except:
        return np.nan
train['Sleep Duration'] = train['Sleep Duration'].apply(convert_sleep_to_hours)
degree_group_map = {
    'MD': 'Medical',
    'MBBS': 'Medical',
    'B.Pharm': 'Pharmacy',
    'M.Pharm': 'Pharmacy',
    'MPharm': 'Pharmacy',
    'P.Pharm': 'Pharmacy',
    'S.Pharm': 'Pharmacy',
    'N.Pharm': 'Pharmacy',
    'B.Tech': 'Engineering',
    'M.Tech': 'Engineering',
    'ME': 'Engineering',
    'MTech': 'Engineering',
    'M_Tech': 'Engineering',
    'BE': 'Engineering',
    'BCA': 'Engineering',
    'MCA': 'Engineering',
    'E.Tech': 'Engineering',
    'S.Tech': 'Engineering',
    'LLTech': 'Engineering',
    'LLCom': 'Engineering',
    'BBA': 'Business',
    'MBA': 'Business',
    'M. Business Analyst': 'Business',
    'B.Com': 'Commerce',
    'M.Com': 'Commerce',
    'P.Com': 'Commerce',
    'LLB': 'Law',
    'LLM': 'Law',
    'LLBA': 'Law',
    'LL.Com': 'Law',
    'LL B.Ed': 'Education',
    'B.Ed': 'Education',
    'M.Ed': 'Education',
    'L.Ed': 'Education',
    'K.Ed': 'Education',
    'LLEd': 'Education',
    'BEd': 'Education',
    'BSc': 'Science',
    'MSc': 'Science',
    'B.Sc': 'Science',
    'BHM': 'Hospitality',
    'MHM': 'Hospitality',
    'B.Arch': 'Architecture',
    'M.Arch': 'Architecture',
    'BArch': 'Architecture',
    'B.B.Arch': 'Architecture',
    'PhD': 'PhD',
    'Class 12': 'School',
    'Class 11': 'School',
}
train['degree_group'] =train['Degree'].apply(lambda x: degree_group_map.get(x, 'Other'))
train=train.drop('Degree',axis=1)

top_cities = train['City'].value_counts().nlargest(15).index
train['City'] = train['City'].where(train['City'].isin(top_cities), other='Other')

valid_dietary = ['Moderate', 'Unhealthy', 'Healthy']
train['Dietary Habits'] = train['Dietary Habits'].where(train['Dietary Habits'].isin(valid_dietary))
train['Dietary Habits'] = train['Dietary Habits'].fillna(train['Dietary Habits'].mode()[0])

binary_cols = ['Have you ever had suicidal thoughts ?', 'Family History of Mental Illness']
for col in binary_cols:
    train[col] = train[col].map({'Yes': 1, 'No': 0})
train['Gender'] = train['Gender'].map({'Male': 1, 'Female': 0})
train['Working Professional or Student'] = train['Working Professional or Student'].map({'Working Professional': 1, 'Student': 0})

# 수치형 변수 스케일링
scale_cols = ['Age', 'Sleep Duration', 'Work/Study Hours', 'Financial Stress']
scaler = StandardScaler()
train[scale_cols] = scaler.fit_transform(train[scale_cols])

# 원핫인코딩 + 타겟인코딩
train = pd.get_dummies(train, columns=['degree_group', 'Dietary Habits'], drop_first=True) # 다중공선성 방지를 위해 첫 번째 범주는 제거
for col in ['Profession', 'City']:
    target_mean = train.groupby(col)['Depression'].mean()
    train[col + '_target'] = train[col].map(target_mean)
    

train.drop(columns=['Profession', 'City'], inplace=True)

In [73]:
from sklearn.model_selection import train_test_split

def get_train_test_dataset(df=None):
    train_copy = train

    y_target = train_copy['Depression']
    X_features = train_copy.drop('Depression', axis=1)

    X_train, X_test, y_train, y_test = train_test_split(X_features, y_target,
                                                        test_size=0.3, random_state=0, stratify=y_target)

    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_train_test_dataset(train)

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

model = XGBClassifier(
    n_estimators=100,       # 트리 개수 (적당함)
    max_depth=4,            # 과적합 방지용 적당한 깊이
    learning_rate=0.1,      # 기본 학습률
    subsample=0.8,          # 전체 샘플 중 일부만 사용 → 일반화 성능↑
    colsample_bytree=0.8,   # 피처 중 일부만 사용 → 과적합 방지
    use_label_encoder=False,
    eval_metric='logloss',  # 분류 문제의 표준 평가지표
    random_state=42,
    n_jobs=-1               # 병렬 처리
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9389
