In [1]:
#Import Dependencies

import pandas as pd

import numpy as np

from catboost import CatBoostClassifier

from sklearn.model_selection import StratifiedKFold,  cross_val_score

from sklearn.metrics import accuracy_score, log_loss

from scipy.stats import mode

from sklearn.base import BaseEstimator, ClassifierMixin

from sklearn.calibration import CalibratedClassifierCV

import optuna

import warnings

warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#Read In our Data

train = pd.read_csv(r'PATH', index_col='id')

test = pd.read_csv(r'PATH', index_col='id')

ogdata = pd.read_csv(r'PATH')

In [3]:
#add our ogdata to our training data encoding the Yes and No depression values to 1 and 0 respectively
ogdata['Depression'] = ogdata['Depression'].map({'Yes': 1, 'No': 0})
train = pd.concat([train, ogdata], ignore_index=True)

In [None]:
#Assign our x and y

y = train["Depression"]

y = y.astype(int) 

x = train.drop(["Depression"], axis=1)

In [5]:
#Clean Degree data
def clean_degree(value):
    # Degree mapping for common variations
    degree_mapping = {
        'B.Pharm': ['BPharm', 'P.Pharm', 'B. Pharm', 'S.Pharm', "H_Pharm", 'N.Pharm'],
        'B.Sc': ['BSc', 'B.Sc', 'B.Student'],
        'B.Tech': ['BTech', 'B.Tech', 'B B.Tech', 'S.Tech', 'E.Tech', 'LLTech'],
        'M.Tech': ['MTech', 'M.Tech', 'M_Tech'],
        'MBA': ['M. Business Analyst'],
        'B.Arch': ['B.B.Arch', 'B.Arch', 'B. Architecture', 'BArch', 'S.Arch'],
        'M.Arch': ['M.Arch'],
        'MBBS': ['MBBS'],
        'M.Pharm': ['MPharm', 'M.Pharm'],
        'PhD': ['PhD'],
        'Class 12': ['Class 12'],
        'Class 11': ['Class 11'],
        'M.A.': ['MA'],
        'M.Sc': ['M.S', 'MSc'],
        'B.Com': ['B.Com', "B B.Com", 'LLCom', 'LL.Com', 'B_Com'],
        'M.Com': ['M.Com', 'B.M.Com', 'P.Com'],
        'LLB': ['LLB', 'LLBA'],
        'BCA': ['B.CA', 'BCA', 'B BCA'],
        'BBA': ['BBA', 'B.BA', "BBA", 'B BA', 'BA'],
        'BHM': ['BHM', 'B.H', "BH"],
        'MHM': ['MHM', 'LHM'],
        'B.Ed': ['BEd', 'B.Ed.', 'B. Ed'],
        'M.Ed': ['M.Ed', 'M.M.Ed', 'MEd'],
        'LL B.Ed': ['LL B.Ed', 'LLEd', "L.Ed"],
        'M.B.Ed': ['M.B.Ed'],
        'MCA': ['MCA'],
        'MD' : ['MD'],
        'ME' : ['ME'],
        'BE' : ['BE'],
        'MBA' : ['MBA'],
        'LLM' : ['LLM'],
        'MPA' : ['MPA'],
        'BHCA' : ['BHCA'],
        'BPA' : ['BPA'],
        'Doctor' : ['Doctor'],
        'B. Gender' : ['B. Gender'],
        'Mechanical Engineer' : ['Mechanical Engineer'],
        'M.UI' : ['M.UI'],
        'B.Press' : ['B.Press'],
        'General Ed' : ['E.Ed', 'G.Ed', 'J.Ed', 'K.Ed', 'I.Ed', 'A.Ed'],
        'xCA' : ['RCA', 'GCA', 'PCA', 'LCA', 'ACA'],
        'LLS' : ['LLS'],
        'BB' : ['BB']
    }
    
            
    for key, values in degree_mapping.items():
        if value in values:
            return key
    return np.nan  # Set to NaN if not in clean categories

In [6]:
#Add Identifier Column to group bachelors together, masters together, etc after degree cleaning has been done
def map_degree_level(degree):
    degree_level_mapping = {
        'Bachelors': ['B.Pharm', 'B.Sc', 'B.Tech', 'B.Arch', 'B.Com', 'BBA', 'BCA', 'BHM', 'B.Ed', 'BPA', 'BH', 'B. Gender', 'BB', 'BE', 'BHCA', 'B.Press', 'Mechanical Engineer'],
        'Masters': ['M.Tech', 'MBA', 'M.Arch', 'M.Sc', 'M.Com', 'M.Pharm', 'M.Ed', 'MCA', 'MHM', 'M.B.Ed', 'MPA', 'MD', 'ME', 'MBA', 'M.UI', 'MBBS', 'M.A.'],
        'Doctorate': ['PhD', 'Doctor'],
        'High School' : ['Class 12', 'Class 11'],
        'Random CA' : ['xCA'],
        'General Ed' : ['General Ed'],
        'Law' : ['LLS', 'LLB', 'LLM', 'LL B.Ed'],
    }
    if degree in degree_level_mapping['Bachelors']:
        return 'Bachelors'
    elif degree in degree_level_mapping['Masters']:
        return 'Masters'
    elif degree in degree_level_mapping['Doctorate']:
        return 'Doctorate'
    elif degree in degree_level_mapping['High School']:
        return 'High School'
    elif degree in degree_level_mapping['Random CA']:
        return 'Random CA'
    elif degree in degree_level_mapping['General Ed']:
        return 'General Ed'
    elif degree in degree_level_mapping['Law']:
        return 'Law'
    else:
        return "No Higher ED"

In [7]:
#Transform Sleep data to a continuous numerical
def transform_sleep_duration(value):
    if value in ["2-3 hours", "3-4 hours", "4-5 hours", "1-2 hours", "1-3 hours", "1-6 hours", "3-6 Hours", '3-6 hours', '5-6 hours', "4-6 hours", "6-7 hours", "7-8 hours", "6-8 hours", '9-5', '9-5 Hours', '9-5 hours', "10-11 hours", "9-11 hours", "8-9 hours", "9-10 hours", "9-6 hours", "10-6 hours"]:
        return abs(int(value.split("-")[0]) + int(value.split("-")[1].replace(" hours", "")) / 2)
    elif value in ['6 hours']:
        return 6
    elif value in ["Less than 5 hours"]: 
        return 4
    elif value in ["7 hours"]:
        return 7
    elif value in ["8 hours"]:
        return 8
    elif value in ["More than 8 hours"]:
        return 9
    elif value in ['8-89 hours']:
        return 8.5
    elif value in ['More than 8 hours']: 
        return 9
    else:
        return np.nan  # Non-relevant entries set to NaN


In [8]:
#Clean Profession Column
def clean_profession(value):
    cols_profession_to_delete = ["Simran", "Name", "No", "24th", "Unhealthy", "Yuvraj", "Yogesh", "Patna", "Nagpur", "Pranav", "Visakhapatnam", "Moderate", "Manvi", "Samar", "Surat", 'nan']
    remap_dict = {
        'Analyst': ['Finanancial Analyst', 'Analyst', 'Financial Analyst', 'Business Analyst', 'Research Analyst', 'Data Scientist'],
    }
    for key, values in remap_dict.items():
        if value in values:
            return key
    return np.nan if value in cols_profession_to_delete else value

In [9]:
#Function For preprocessing a dataframe (Right Now Testing If Name Should be Removed or Not, uncomment name part and remove from cat to remove)

def preprocessing(df):

    if "Depression" in df.columns:
        df['Depression'] = df['Depression'].map({"Yes": 1, "No": 0, "1": 1, "0": 0})
    df['Sleep Duration'] = df['Sleep Duration'].apply(transform_sleep_duration)
    df['Degree'] = df['Degree'].apply(clean_degree)
    df['Degree Level'] = df['Degree'].apply(map_degree_level)
    df['Profession'] = df['Profession'].apply(clean_profession)

    df = df.fillna("Unknown")



    binary_cols = [

    'Gender',

    'Have you ever had suicidal thoughts ?',

    'Working Professional or Student',

    'Family History of Mental Illness'

    ]

    for col in binary_cols:

        df[col] = df[col].astype('category').cat.codes


    numeric_cols = [

    'CGPA',

    'Study Satisfaction',

    'Job Satisfaction',

    'Work Pressure',

    'Academic Pressure',

    'Financial Stress',

    'Age',

    'Work/Study Hours',

    'Sleep Duration'

    ]
    for col in numeric_cols:

        df[col] = df[col].astype('category').cat.codes 


    nominal_cols = [

    'City',

    'Profession',

    'Dietary Habits',

    'Degree',

    'Name', 

    'Degree Level'

    ]

    for col in nominal_cols:

        df[col] = df[col].astype('category')


    return df

In [10]:
#Run preprocessing on our data

x = preprocessing(x)

test = preprocessing(test)

In [11]:
#Section for holding the best parameters

bestparams = {

    'learning_rate':   0.06941528142896314,  

    'depth': 7,            

    'l2_leaf_reg': 2.3069172853265654,

    'iterations': 1005

}

In [12]:
cv = StratifiedKFold(n_splits=8, shuffle=True, random_state=11)  
cv_splits = cv.split(x , y)

scores = []
test_preds = []


for i, (train_idx, val_idx) in enumerate(cv_splits):

    X_train_fold, X_val_fold = x.loc[train_idx], x.loc[val_idx]
    y_train_fold, y_val_fold = y.loc[train_idx], y.loc[val_idx]
    
    
    model = CatBoostClassifier(
        iterations=1005,  
        depth=7,  
        learning_rate=0.06941528142896314,  
        loss_function='Logloss',  
        cat_features=[0,1,3,5,12,13,4,14,17, 18],  
        l2_leaf_reg=2.3069172853265654,
        eval_metric='Accuracy',
        random_state=11
    )
    
    model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), verbose=False)
    val_pred = model.predict(X_val_fold)
    

    score = accuracy_score(y_val_fold, val_pred)
    scores.append(score)
    

    test_pred = model.predict_proba(test)[:, 1]  
    test_preds.append(test_pred)
print(sum(scores) / len(scores))  
predictions = np.mean(test_preds, axis=0)
predictions = np.where(predictions > 0.5, 1.0, 0.0)



0.9410495895459875


In [15]:
#Declare Our Final Model

model = CatBoostClassifier(

        **bestparams,

        eval_metric='Accuracy',

        #cat_features=[0,2,4,10,11,12,3,13,16],
        cat_features=[0,1,3,5,12,13,4,14,17, 18],

        verbose=0,

        random_state=11,

        loss_function="Logloss"

    )

In [16]:
#Fit Model

model.fit(x, y)

<catboost.core.CatBoostClassifier at 0x198ced3f670>

In [17]:
#Get Predictions

predictions_prob = model.predict_proba(test)[:, 1]

predictions = np.where(predictions_prob > 0.5, 1.0, 0.0)

In [18]:
#Generate Submission


submission = pd.DataFrame({

    'id': test.index,  

    'Depression': predictions

})



submission.to_csv('submission.csv', index=False)