In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
import json
import warnings
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import TargetEncoder, StandardScaler, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.feature_selection import mutual_info_classif

warnings.filterwarnings('ignore')

# ==============================================================================
# 1. PROCESSING CLASSES (NEW GROUPING LOGIC)
# ==============================================================================

class LogicalCleaner(BaseEstimator, TransformerMixin):
    """Stateless cleaner that buckets free-text answers and merges paired columns.

    * ``Sleep Duration`` is mapped onto five fixed buckets, or ``'Unknown'``
      when no rule matches.
    * ``Dietary Habits`` is mapped onto ``Healthy`` / ``Moderate`` /
      ``Unhealthy``, or ``'Unknown'`` when the answer is not recognised.
    * ``Occupation``, ``Pressure`` and ``Satisfaction`` are added by taking the
      professional column and filling its gaps from the student counterpart.

    Any of these steps is skipped when its source column(s) are absent.
    """

    # Ordered (bucket, tokens) rules for 'Sleep Duration'; the first rule with
    # a matching token wins. 'More than 8 hours' must be tested before
    # '7-8 hours', otherwise the '8 hours' token swallows "more than 8 hours".
    _SLEEP_RULES = (
        ('Less than 4 hours', ('1-2', '2-3', '3-4', '1-3')),
        ('4-5 hours', ('less than 5', '4-5', 'than 5')),
        ('5-6 hours', ('5-6', '4-6', '3-6')),
        ('More than 8 hours', ('more than 8', '8-9', '9-11', '10-11')),
        # Standard range; also absorbs working-hour ranges that were entered
        # as sleep hours by mistake (e.g. "9-5").
        ('7-8 hours', ('7-8', '6-8', '6-7', '8 hours', '9-5', '10-6')),
    )

    # Exact (lower-cased, stripped) answers for 'Dietary Habits'.
    _DIET_BUCKETS = {
        'healthy': 'Healthy',
        'more healthy': 'Healthy',
        'moderate': 'Moderate',
        'unhealthy': 'Unhealthy',
        'less than healthy': 'Unhealthy',
        'no healthy': 'Unhealthy',
        'less healthy': 'Unhealthy',
    }

    # (new column, primary column, fallback column used where primary is NaN).
    _COALESCED_COLUMNS = (
        ('Occupation', 'Profession', 'Degree'),
        ('Pressure', 'Work Pressure', 'Academic Pressure'),
        ('Satisfaction', 'Job Satisfaction', 'Study Satisfaction'),
    )

    @staticmethod
    def _contains_token(text, token):
        """Return True if ``token`` occurs in ``text`` with no digit touching it.

        The digit guard stops "55-66 hours" from matching "5-6" and
        "45-48 hours" from matching "8 hours".
        """
        start = text.find(token)
        while start != -1:
            end = start + len(token)
            digit_before = start > 0 and text[start - 1].isdigit()
            digit_after = end < len(text) and text[end].isdigit()
            if not (digit_before or digit_after):
                return True
            start = text.find(token, start + 1)
        return False

    @classmethod
    def _clean_sleep(cls, val):
        """Map one raw 'Sleep Duration' value to its bucket, else 'Unknown'."""
        text = str(val).lower().strip()
        for bucket, tokens in cls._SLEEP_RULES:
            if any(cls._contains_token(text, token) for token in tokens):
                return bucket
        # Anything unrecognised (including missing values) is treated as noise.
        return 'Unknown'

    @classmethod
    def _clean_diet(cls, val):
        """Map one raw 'Dietary Habits' value to its bucket, else 'Unknown'."""
        return cls._DIET_BUCKETS.get(str(val).lower().strip(), 'Unknown')

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``; the input frame is not modified."""
        X_out = X.copy()

        # --- 1. Sleep duration: group by meaning ---
        if 'Sleep Duration' in X_out.columns:
            X_out['Sleep Duration'] = X_out['Sleep Duration'].apply(self._clean_sleep)

        # --- 2. Dietary habits: group by meaning ---
        if 'Dietary Habits' in X_out.columns:
            X_out['Dietary Habits'] = X_out['Dietary Habits'].apply(self._clean_diet)

        # --- 3. Merge the professional/student column pairs for the model ---
        for merged, primary, fallback in self._COALESCED_COLUMNS:
            if primary in X_out.columns and fallback in X_out.columns:
                X_out[merged] = X_out[primary].fillna(X_out[fallback])

        return X_out

class RareLabelEncoder(BaseEstimator, TransformerMixin):
    """Replace infrequent categories with the single label ``'Other'``.

    Parameters
    ----------
    variables : list of str, optional
        Columns to process. Columns missing from the data are skipped.
    threshold : int, default=5
        A label is kept only if it occurs strictly more than ``threshold``
        times in the data passed to ``fit``.

    Attributes
    ----------
    valid_labels_ : dict
        Maps each fitted column to the list of labels that are kept.
    """

    def __init__(self, variables=None, threshold=5):
        self.variables = variables or []
        self.threshold = threshold
        # Kept here so transform() before fit() still works: with no learned
        # labels, every value in the configured columns becomes 'Other'.
        self.valid_labels_ = {}

    def fit(self, X, y=None):
        """Learn the frequent labels of each configured column present in ``X``."""
        # Build a fresh mapping on every fit so a refit never keeps labels
        # learned from an earlier dataset (e.g. for a column no longer present).
        learned = {}
        for col in self.variables:
            if col in X.columns:
                counts = X[col].value_counts()
                learned[col] = counts[counts > self.threshold].index.tolist()
        self.valid_labels_ = learned
        return self

    def transform(self, X):
        """Return a copy of ``X`` with non-kept labels replaced by ``'Other'``."""
        X_out = X.copy()
        for col in self.variables:
            if col in X_out.columns:
                valid_list = self.valid_labels_.get(col, [])
                # Group rare (and missing) values under 'Other'.
                X_out[col] = np.where(X_out[col].isin(valid_list), X_out[col], 'Other')
        return X_out

class ScoreBasedSelector(BaseEstimator, TransformerMixin):
    """Keep the features whose mutual information with the target is high enough.

    Parameters
    ----------
    threshold : float, default=0.005
        A feature is kept only if its mutual-information score is strictly
        greater than this value.

    Attributes
    ----------
    selected_features_ : list of str
        Names of the kept columns, ordered by decreasing score.
    """

    def __init__(self, threshold=0.005):
        self.threshold = threshold
        self.selected_features_ = []

    def fit(self, X, y):
        """Score every column of ``X`` against ``y`` and record the kept ones."""
        print("\n--- [SELECTOR] Calculating Mutual Information... ---")
        X_temp = X.copy()
        cat_cols = X_temp.select_dtypes(include=['object', 'category']).columns
        num_cols = X_temp.select_dtypes(exclude=['object', 'category']).columns
        X_temp[num_cols] = X_temp[num_cols].fillna(0)

        # OrdinalEncoder rejects a frame with zero columns, so only encode
        # when there is at least one categorical column.
        if len(cat_cols) > 0:
            ord_enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
            # Cast to object first: fillna('Missing') fails on a 'category'
            # column that does not already contain that category.
            cat_values = X_temp[cat_cols].astype(object).fillna('Missing')
            X_temp[cat_cols] = ord_enc.fit_transform(cat_values)

        # Ordinal codes are arbitrary integers, so they must be scored as
        # discrete features. 'auto' would treat every column of a dense input
        # as continuous.
        discrete_mask = X_temp.columns.isin(cat_cols)

        mi_scores = mutual_info_classif(
            X_temp, y, discrete_features=discrete_mask, random_state=42
        )
        score_df = pd.DataFrame(
            {'feature': X_temp.columns, 'score': mi_scores}
        ).sort_values(by='score', ascending=False)

        print(score_df.head(10))
        self.selected_features_ = score_df[score_df['score'] > self.threshold]['feature'].tolist()
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected during ``fit``."""
        return X[self.selected_features_]

# ==============================================================================
# 2. PROCESS
# ==============================================================================
print(">>> 1. LOADING DATA...")
try:
    df_train = pd.read_csv('train.csv').drop_duplicates()
except FileNotFoundError:
    print("‚ùå L·ªói: Kh√¥ng t√¨m th·∫•y file 'train.csv'")
    # `exit()` is an interactive helper injected by the `site` module and
    # reports success (status 0); raise SystemExit with a failure status.
    raise SystemExit(1)

target_col = 'Depression'
# Drop identifier-like columns if they are present.
df_train = df_train.drop(columns=['id', 'Name', 'PassengerId'], errors='ignore')

X = df_train.drop(target_col, axis=1)
y = df_train[target_col]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
# Ratio of negatives to positives, passed to XGBoost to counter class imbalance.
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

print("\n>>> 2. PRE-PROCESSING...")
# Only the variables that need an 'Other' bucket go through the rare encoder
# (Degree, Profession, Occupation, City). Sleep and Diet have already been
# mapped to fixed groups or 'Unknown' by LogicalCleaner.
vars_to_rare = ['Occupation', 'Degree', 'Profession', 'City']

pre_cleaner = Pipeline([
    ('cleaner', LogicalCleaner()),
    ('rare_encoder', RareLabelEncoder(variables=vars_to_rare, threshold=5))
])

# Fit the cleaning steps on the training split only, then reuse them on validation.
X_train_pre = pre_cleaner.fit_transform(X_train, y_train)
X_val_pre = pre_cleaner.transform(X_val)

selector = ScoreBasedSelector(threshold=0.005)
selector.fit(X_train_pre, y_train)
final_features = selector.selected_features_

X_train_selected = X_train_pre[final_features]
X_val_selected = X_val_pre[final_features]

print(f"\n>>> 3. TRAINING MODEL ({len(final_features)} features)...")
cat_cols = X_train_selected.select_dtypes(include=['object', 'category']).columns.tolist()
num_cols = X_train_selected.select_dtypes(exclude=['object', 'category']).columns.tolist()

# Numeric columns: median imputation followed by standardisation.
num_pipe = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
# Categorical columns: fill gaps with 'Other', target-encode, then standardise.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Other')),
    ('target_enc', TargetEncoder(smooth='auto', random_state=42)),
    ('scaler', StandardScaler())
])

final_preprocessor = ColumnTransformer(
    transformers=[('num', num_pipe, num_cols), ('cat', cat_pipe, cat_cols)],
    verbose_feature_names_out=False
)

X_train_processed = final_preprocessor.fit_transform(X_train_selected, y_train)
X_val_processed = final_preprocessor.transform(X_val_selected)

xgb_model = xgb.XGBClassifier(
    n_estimators=3000, learning_rate=0.01, max_depth=5,
    scale_pos_weight=scale_pos_weight, eval_metric='aucpr',
    early_stopping_rounds=50, random_state=42, n_jobs=-1
)

xgb_model.fit(
    X_train_processed, y_train,
    eval_set=[(X_train_processed, y_train), (X_val_processed, y_val)],
    verbose=100
)

# Choose the probability cut-off that maximises F1 on the validation split.
y_val_prob = xgb_model.predict_proba(X_val_processed)[:, 1]
precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_prob)
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
# precision_recall_curve returns one more precision/recall point than
# thresholds (the final point has no threshold), so only rank the points
# that map to a real threshold.
best_threshold = thresholds[np.argmax(f1_scores[:-1])]
print(f"\nOptimal Threshold: {best_threshold:.4f}")

# ==============================================================================
# 5. EXPORT UI CONFIG
# ==============================================================================
print("\n>>> 4. EXPORTING CONFIG...")
valid_labels_dict = pre_cleaner.named_steps['rare_encoder'].valid_labels_

ui_config = {
    "model_threshold": float(best_threshold),
    "input_fields": []
}

# Engineered model features and the raw columns the user has to supply so
# that they can be rebuilt at inference time.
ENGINEERED_TO_RAW = {
    'Occupation': ('Degree', 'Profession'),
    'Pressure': ('Academic Pressure', 'Work Pressure'),
    'Satisfaction': ('Study Satisfaction', 'Job Satisfaction'),
}
BLACKLIST_UI = list(ENGINEERED_TO_RAW)

ui_features_set = {"Working Professional or Student"}
for feat in final_features:
    # Engineered features expand to their raw inputs; everything else is
    # requested as-is.
    ui_features_set.update(ENGINEERED_TO_RAW.get(feat, (feat,)))

# A single cleaner is enough: LogicalCleaner.transform needs no fitting.
ui_cleaner = LogicalCleaner()

# Iterate in sorted order so the exported field order is the same on every
# run; plain set iteration order for strings can differ between processes.
for col in sorted(ui_features_set):
    if col == "Working Professional or Student": continue
    if col not in X_train.columns: continue

    field_info = {"name": col, "label": col.replace("_", " ").title()}
    is_numeric = pd.api.types.is_numeric_dtype(X_train[col].dtype)
    # Rating-style columns are offered as a select even though they are numeric.
    is_scale_var = any(x in col for x in ['Pressure', 'Satisfaction', 'Stress'])

    if is_numeric and not is_scale_var:
        field_info["type"] = "number"
        field_info["min"] = float(X_train[col].min())
        field_info["max"] = float(X_train[col].max())
    else:
        field_info["type"] = "select"

        # Work out the list of options.
        if col in ['Sleep Duration', 'Dietary Habits']:
            # Offer the normalised buckets: run the cleaner over the raw
            # training column and take its unique outputs.
            temp_df = ui_cleaner.transform(X_train[[col]])
            raw = temp_df[col].unique().tolist()
        elif col in valid_labels_dict:
            raw = valid_labels_dict[col]
        else:
            raw = X_train[col].value_counts().head(30).index.tolist()

        # Missing values and the 'Unknown' bucket are not valid user choices.
        options = [str(x) for x in raw if str(x) != 'nan' and str(x) != 'Unknown']
        options = sorted(options)

        if col in ['Degree', 'Profession']:
            if 'Other' not in options: options.append('Other')

        field_info["options"] = options

    ui_config["input_fields"].append(field_info)

with open('model_ui_config.json', 'w', encoding='utf-8') as f:
    json.dump(ui_config, f, indent=4, ensure_ascii=False)

# ==============================================================================
# 6. SAVE
# ==============================================================================
# Re-wrap the already-fitted cleaning steps in a pipeline for inference.
pipeline_inference = Pipeline(
    [
        (step_name, pre_cleaner.named_steps[step_name])
        for step_name in ('cleaner', 'rare_encoder')
    ]
)

# Bundle everything needed to score new rows into a single artefact.
full_system = dict(
    selector_pipeline=pipeline_inference,
    preprocessor=final_preprocessor,
    model=xgb_model,
    threshold=best_threshold,
    required_features=final_features,
)
joblib.dump(full_system, 'depression_prediction_system.pkl')
print("\n‚úÖ System Saved Successfully!")

>>> 1. LOADING DATA...

>>> 2. PRE-PROCESSING...

--- [SELECTOR] Calculating Mutual Information... ---
                                  feature     score
1                                     Age  0.199941
17                             Occupation  0.137938
6                           Work Pressure  0.137533
4                              Profession  0.137291
5                       Academic Pressure  0.136853
3         Working Professional or Student  0.132424
9                        Job Satisfaction  0.127905
8                      Study Satisfaction  0.115910
7                                    CGPA  0.114302
13  Have you ever had suicidal thoughts ?  0.074834

>>> 3. TRAINING MODEL (19 features)...
[0]	validation_0-aucpr:0.82554	validation_1-aucpr:0.82520
[100]	validation_0-aucpr:0.86568	validation_1-aucpr:0.86405
[200]	validation_0-aucpr:0.88590	validation_1-aucpr:0.88268
[300]	validation_0-aucpr:0.89807	validation_1-aucpr:0.89352
[400]	validation_0-aucpr:0.90512	validation_1-a

In [None]:
import pandas as pd

# 1. Load the data
try:
    df = pd.read_csv('train.csv')  # Change the file name if yours differs
except FileNotFoundError:
    print("Kh√¥ng t√¨m th·∫•y file d·ªØ li·ªáu.")
    # `exit()` is an interactive helper injected by the `site` module and
    # reports success (status 0); raise SystemExit with a failure status.
    raise SystemExit(1)

# 2. Keep only the categorical columns (object or category dtype)
cat_cols = df.select_dtypes(include=['object', 'category']).columns

# 3. Print the unique values of each categorical column
print(f"\n{'='*20} DANH S√ÅCH GI√Å TR·ªä UNIQUE {'='*20}\n")

for col in cat_cols:
    unique_values = df[col].unique()
    num_unique = len(unique_values)

    # Skip identifier-like columns with too many values (ID, Name, ...) to
    # keep the output readable. Remove this check to print every column.
    if num_unique > 50:
        print(f"‚ö†Ô∏è C·ªôt [{col}] c√≥ qu√° nhi·ªÅu gi√° tr·ªã ({num_unique}). B·ªè qua hi·ªÉn th·ªã.")
        print("-" * 60)
        continue

    print(f"üîµ C·ªôt: [{col}] - C√≥ {num_unique} gi√° tr·ªã kh√°c nhau:")
    print(unique_values)

    # To also see how often each value occurs, print df[col].value_counts() here.

    print("-" * 60)



‚ö†Ô∏è C·ªôt [Name] c√≥ qu√° nhi·ªÅu gi√° tr·ªã (422). B·ªè qua hi·ªÉn th·ªã.
------------------------------------------------------------
üîµ C·ªôt: [Gender] - C√≥ 2 gi√° tr·ªã kh√°c nhau:
['Female' 'Male']
------------------------------------------------------------
‚ö†Ô∏è C·ªôt [City] c√≥ qu√° nhi·ªÅu gi√° tr·ªã (98). B·ªè qua hi·ªÉn th·ªã.
------------------------------------------------------------
üîµ C·ªôt: [Working Professional or Student] - C√≥ 2 gi√° tr·ªã kh√°c nhau:
['Working Professional' 'Student']
------------------------------------------------------------
‚ö†Ô∏è C·ªôt [Profession] c√≥ qu√° nhi·ªÅu gi√° tr·ªã (65). B·ªè qua hi·ªÉn th·ªã.
------------------------------------------------------------
üîµ C·ªôt: [Sleep Duration] - C√≥ 36 gi√° tr·ªã kh√°c nhau:
['More than 8 hours' 'Less than 5 hours' '5-6 hours' '7-8 hours'
 'Sleep_Duration' '1-2 hours' '6-8 hours' '4-6 hours' '6-7 hours'
 '10-11 hours' '8-9 hours' '40-45 hours' '9-11 hours' '2-3 hours'
 '3-4 hours' 'M