In [39]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
import re
%matplotlib inline

In [40]:
train = pd.read_csv('mental_train.csv')

In [41]:
train = train.drop(['Name','Profession','CGPA','id'], axis=1)

In [42]:
def age_to_decade(age):
    return (age // 10) * 10

train['Age_Group'] = train['Age'].apply(age_to_decade)

train = train.drop('Age', axis=1)

1. AGE -> age를 decade로 변환

In [43]:
valid_cities_map = {
    'Visakhapatnam', 'Bangalore', 'Varanasi', 'Jaipur', 'Pune',
    'Thane', 'Chennai', 'Nagpur', 'Nashik', 'Vadodara', 'Kalyan',
    'Rajkot', 'Srinagar', 'Ahmedabad', 'Kolkata', 'Mumbai', 'Lucknow',
    'Indore', 'Surat', 'Ludhiana', 'Bhopal', 'Meerut', 'Agra',
    'Ghaziabad', 'Hyderabad', 'Vasai-Virar', 'Patna', 'Faridabad',
    'Delhi', 'Kanpur', 'Morena', 'Gurgaon'
}

train['City'] =train['City'].apply(lambda x: x if x in valid_cities_map else 'Other')

2. City -> valid_cities_map 이외 other 처리

In [44]:
train['Academic_Pressure_missing'] = train['Academic Pressure'].isnull().astype(int)
train['Work_Pressure_missing'] = train['Work Pressure'].isnull().astype(int)
train['Study_Satisfaction_missing'] = train['Study Satisfaction'].isnull().astype(int)
train['Job_Satisfaction_missing'] = train['Job Satisfaction'].isnull().astype(int)

cols_to_impute = ['Academic Pressure', 'Work Pressure', 'Study Satisfaction', 'Job Satisfaction']
imputer = SimpleImputer(strategy='mean')  # 또는 'median', 'most_frequent'
train[cols_to_impute] = imputer.fit_transform(train[cols_to_impute])

In [45]:
train

Unnamed: 0,Gender,City,Working Professional or Student,Academic Pressure,Work Pressure,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Age_Group,Academic_Pressure_missing,Work_Pressure_missing,Study_Satisfaction_missing,Job_Satisfaction_missing
0,Female,Ludhiana,Working Professional,3.142273,5.000000,2.94494,2.000000,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0,40.0,1,0,1,0
1,Male,Varanasi,Working Professional,3.142273,4.000000,2.94494,3.000000,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1,20.0,1,0,1,0
2,Male,Visakhapatnam,Student,5.000000,2.998998,2.00000,2.974404,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1,30.0,0,1,0,1
3,Male,Mumbai,Working Professional,3.142273,5.000000,2.94494,1.000000,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1,20.0,1,0,1,0
4,Female,Kanpur,Working Professional,3.142273,1.000000,2.94494,1.000000,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0,30.0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,Female,Ahmedabad,Working Professional,3.142273,5.000000,2.94494,4.000000,5-6 hours,Unhealthy,Class 12,No,2.0,4.0,Yes,1,10.0,1,0,1,0
140696,Female,Hyderabad,Working Professional,3.142273,5.000000,2.94494,4.000000,7-8 hours,Moderate,B.Tech,Yes,6.0,5.0,Yes,0,40.0,1,0,1,0
140697,Female,Kolkata,Working Professional,3.142273,3.000000,2.94494,1.000000,More than 8 hours,Moderate,B.Com,No,4.0,4.0,No,0,20.0,1,0,1,0
140698,Female,Srinagar,Working Professional,3.142273,5.000000,2.94494,2.000000,5-6 hours,Moderate,ME,Yes,10.0,1.0,No,0,40.0,1,0,1,0


In [46]:
def convert_sleep_to_hours(val):
    try:
        val = str(val).strip().lower()

        # 특별 처리: 'than n hours' → 'less than n hours' 간주
        if 'than' in val and 'less' not in val and 'more' not in val:
            match = re.search(r'\d+', val)
            if match:
                return float(match.group()) - 0.5

        # Less than n hours → n - 0.5
        if 'less than' in val:
            match = re.search(r'\d+', val)
            if match:
                return float(match.group()) - 0.5

        # More than n hours → n + 0.5
        elif 'more than' in val:
            match = re.search(r'\d+', val)
            if match:
                return float(match.group()) + 0.5

        # 정확히 n hours → 숫자만 추출
        elif re.match(r'^\d+\s*hours$', val):
            return float(re.findall(r'\d+', val)[0])

        # n-m hours 또는 n–n → 평균값
        elif re.search(r'\d+\s*[-–~]\s*\d+', val):
            nums = [int(n) for n in re.findall(r'\d+', val)]
            if len(nums) == 2:
                return sum(nums) / 2

        # 숫자만 → 그대로
        elif re.match(r'^\d+(\.\d+)?$', val):
            return float(val)

        # 나머지는 이상값으로 간주
        else:
            return np.nan

    except:
        return np.nan


train['Sleep Duration'] = train['Sleep Duration'].apply(convert_sleep_to_hours)

In [47]:
train

Unnamed: 0,Gender,City,Working Professional or Student,Academic Pressure,Work Pressure,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Age_Group,Academic_Pressure_missing,Work_Pressure_missing,Study_Satisfaction_missing,Job_Satisfaction_missing
0,Female,Ludhiana,Working Professional,3.142273,5.000000,2.94494,2.000000,8.5,Healthy,BHM,No,1.0,2.0,No,0,40.0,1,0,1,0
1,Male,Varanasi,Working Professional,3.142273,4.000000,2.94494,3.000000,4.5,Unhealthy,LLB,Yes,7.0,3.0,No,1,20.0,1,0,1,0
2,Male,Visakhapatnam,Student,5.000000,2.998998,2.00000,2.974404,5.5,Healthy,B.Pharm,Yes,3.0,1.0,No,1,30.0,0,1,0,1
3,Male,Mumbai,Working Professional,3.142273,5.000000,2.94494,1.000000,4.5,Moderate,BBA,Yes,10.0,1.0,Yes,1,20.0,1,0,1,0
4,Female,Kanpur,Working Professional,3.142273,1.000000,2.94494,1.000000,5.5,Unhealthy,BBA,Yes,9.0,4.0,Yes,0,30.0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,Female,Ahmedabad,Working Professional,3.142273,5.000000,2.94494,4.000000,5.5,Unhealthy,Class 12,No,2.0,4.0,Yes,1,10.0,1,0,1,0
140696,Female,Hyderabad,Working Professional,3.142273,5.000000,2.94494,4.000000,7.5,Moderate,B.Tech,Yes,6.0,5.0,Yes,0,40.0,1,0,1,0
140697,Female,Kolkata,Working Professional,3.142273,3.000000,2.94494,1.000000,8.5,Moderate,B.Com,No,4.0,4.0,No,0,20.0,1,0,1,0
140698,Female,Srinagar,Working Professional,3.142273,5.000000,2.94494,2.000000,5.5,Moderate,ME,Yes,10.0,1.0,No,0,40.0,1,0,1,0


In [48]:
habits = ['Moderate','Healthy','Unhealthy']

train_filtered = train[train['Dietary Habits'].isin(habits)]
train_filtered

train = train_filtered

In [49]:
degree_group_map = {
    # 의학/보건
    'MD': 'Medical',
    'MBBS': 'Medical',
    'B.Pharm': 'Pharmacy',
    'M.Pharm': 'Pharmacy',
    'MPharm': 'Pharmacy',
    'P.Pharm': 'Pharmacy',
    'S.Pharm': 'Pharmacy',
    'N.Pharm': 'Pharmacy',

    # 공학/기술
    'B.Tech': 'Engineering',
    'M.Tech': 'Engineering',
    'ME': 'Engineering',
    'MTech': 'Engineering',
    'M_Tech': 'Engineering',
    'BE': 'Engineering',
    'BCA': 'Engineering',
    'MCA': 'Engineering',
    'E.Tech': 'Engineering',
    'S.Tech': 'Engineering',
    'LLTech': 'Engineering',
    'LLCom': 'Engineering',

    # 인문/사회/비즈니스
    'BBA': 'Business',
    'MBA': 'Business',
    'M. Business Analyst': 'Business',
    'B.Com': 'Commerce',
    'M.Com': 'Commerce',
    'P.Com': 'Commerce',
    'LLB': 'Law',
    'LLM': 'Law',
    'LLBA': 'Law',
    'LL.Com': 'Law',
    'LL B.Ed': 'Education',
    'B.Ed': 'Education',
    'M.Ed': 'Education',
    'L.Ed': 'Education',
    'K.Ed': 'Education',
    'LLEd': 'Education',
    'BEd': 'Education',

    # 과학
    'BSc': 'Science',
    'MSc': 'Science',
    'B.Sc': 'Science',

    # 기타, 건축, 호텔경영 등
    'BHM': 'Hospitality',
    'MHM': 'Hospitality',
    'B.Arch': 'Architecture',
    'M.Arch': 'Architecture',
    'BArch': 'Architecture',
    'B.B.Arch': 'Architecture',

    # 학위 및 학교 졸업
    'PhD': 'PhD',
    'Class 12': 'School',
    'Class 11': 'School',
}

train['degree_group'] =train['Degree'].apply(lambda x: degree_group_map.get(x, 'Other'))

train=train.drop('Degree',axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['degree_group'] =train['Degree'].apply(lambda x: degree_group_map.get(x, 'Other'))


In [50]:
train['Have you ever had suicidal thoughts ?'] = train['Have you ever had suicidal thoughts ?'].map({'Yes': 1, 'No': 0})
train['Family History of Mental Illness'] = train['Family History of Mental Illness'].map({'Yes': 1, 'No': 0})
train['Gender'] = train['Gender'].map({'Male': 1, 'Female': 0})

In [51]:
train

Unnamed: 0,Gender,City,Working Professional or Student,Academic Pressure,Work Pressure,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression,Age_Group,Academic_Pressure_missing,Work_Pressure_missing,Study_Satisfaction_missing,Job_Satisfaction_missing,degree_group
0,0,Ludhiana,Working Professional,3.142273,5.000000,2.94494,2.000000,8.5,Healthy,0,1.0,2.0,0,0,40.0,1,0,1,0,Hospitality
1,1,Varanasi,Working Professional,3.142273,4.000000,2.94494,3.000000,4.5,Unhealthy,1,7.0,3.0,0,1,20.0,1,0,1,0,Law
2,1,Visakhapatnam,Student,5.000000,2.998998,2.00000,2.974404,5.5,Healthy,1,3.0,1.0,0,1,30.0,0,1,0,1,Pharmacy
3,1,Mumbai,Working Professional,3.142273,5.000000,2.94494,1.000000,4.5,Moderate,1,10.0,1.0,1,1,20.0,1,0,1,0,Business
4,0,Kanpur,Working Professional,3.142273,1.000000,2.94494,1.000000,5.5,Unhealthy,1,9.0,4.0,1,0,30.0,1,0,1,0,Business
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140695,0,Ahmedabad,Working Professional,3.142273,5.000000,2.94494,4.000000,5.5,Unhealthy,0,2.0,4.0,1,1,10.0,1,0,1,0,School
140696,0,Hyderabad,Working Professional,3.142273,5.000000,2.94494,4.000000,7.5,Moderate,1,6.0,5.0,1,0,40.0,1,0,1,0,Engineering
140697,0,Kolkata,Working Professional,3.142273,3.000000,2.94494,1.000000,8.5,Moderate,0,4.0,4.0,0,0,20.0,1,0,1,0,Commerce
140698,0,Srinagar,Working Professional,3.142273,5.000000,2.94494,2.000000,5.5,Moderate,1,10.0,1.0,0,0,40.0,1,0,1,0,Engineering


In [52]:
train.isnull().sum()

Gender                                    0
City                                      0
Working Professional or Student           0
Academic Pressure                         0
Work Pressure                             0
Study Satisfaction                        0
Job Satisfaction                          0
Sleep Duration                           12
Dietary Habits                            0
Have you ever had suicidal thoughts ?     0
Work/Study Hours                          0
Financial Stress                          4
Family History of Mental Illness          0
Depression                                0
Age_Group                                 0
Academic_Pressure_missing                 0
Work_Pressure_missing                     0
Study_Satisfaction_missing                0
Job_Satisfaction_missing                  0
degree_group                              0
dtype: int64

In [53]:
train['Financial Stress'] = train['Financial Stress'].fillna(0)
train['Sleep Duration'] = train['Sleep Duration'].fillna(0)

In [54]:
train.isnull().sum()

Gender                                   0
City                                     0
Working Professional or Student          0
Academic Pressure                        0
Work Pressure                            0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
Age_Group                                0
Academic_Pressure_missing                0
Work_Pressure_missing                    0
Study_Satisfaction_missing               0
Job_Satisfaction_missing                 0
degree_group                             0
dtype: int64

In [61]:
train_copy = train.copy()

X = train_copy.drop('Depression', axis=1)
y = train_copy['Depression']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)

num_columns=['Gender', 'Age_Group', 'Academic_Pressure_missing', 'Work_Pressure_missing', 'Study_Satisfaction_missing',
             'Job_Satisfaction_missing','Sleep Duration', 'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
             'Financial Stress','Family History of Mental Illness']
cat_columns=['City', 'Working Professional or Student', 'Dietary Habits', 'degree_group']

numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_columns), 
        ('cat', categorical_transformer, cat_columns)
    ],
    remainder='passthrough' 
)

preprocessor.fit(X_train)

X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [62]:
model = XGBClassifier(
    n_estimators=100,       # 트리 개수 (적당함)
    max_depth=4,            # 과적합 방지용 적당한 깊이
    learning_rate=0.1,      # 기본 학습률
    subsample=0.8,          # 전체 샘플 중 일부만 사용 → 일반화 성능↑
    colsample_bytree=0.8,   # 피처 중 일부만 사용 → 과적합 방지
    use_label_encoder=False,
    eval_metric='logloss',  # 분류 문제의 표준 평가지표
    random_state=42,
    n_jobs=-1               # 병렬 처리
)

# 모델 학습
model.fit(X_train_processed, y_train)

# 예측
y_pred = model.predict(X_test_processed)

# 정확도 측정
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.9374
