## Objective :Predict whether an employee needs treatment using demographic and mental health data.

### Step 1: Import Dependencies

In [26]:
pip install --upgrade numpy scikit-learn catboost


Collecting numpy
  Using cached numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)


In [36]:
# Data manipulation libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning libraries and utilities
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Advanced models (optional)
from xgboost import XGBClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool
from sklearn.neural_network import MLPClassifier

# For handling imbalanced data (optional)
from imblearn.over_sampling import SMOTE

# Miscellaneous utilities
import warnings
warnings.filterwarnings('ignore')
import joblib  # for model persistence


In [37]:
# Mount the googl collb
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [38]:
# Change the Directory
import os
os.chdir('/content/drive/MyDrive/My Practice/Hackehon')

### Step 2: Load Dataset

In [39]:
sheet_test_url = "https://docs.google.com/spreadsheets/d/1iIcbW8nM0CjTTCfvDA2E0UgGJoq_LvEFt0r4ICA4GEg/export?format=csv"
test_df = pd.read_csv(sheet_test_url)

sheet_train_url = "https://docs.google.com/spreadsheets/d/1EVO5I_WMQZp6c41ZBSmGIDzsBr0HZ_-jmYOEwYCZrG8/export?format=csv"
train_df = pd.read_csv(sheet_train_url)

# Quick look at the data
print("Train Data:")
print(train_df.head())
print("\nTest Data:")
print(test_df.head())


Train Data:
   S.No        Timestamp  Age  Gender         Country state self_employed  \
0     1  8/27/2014 11:29   37  Female   United States    IL           NaN   
1     2  8/27/2014 11:29   44       M   United States    IN           NaN   
2     3  8/27/2014 11:29   32    Male          Canada   NaN           NaN   
3     4  8/27/2014 11:29   31    Male  United Kingdom   NaN           NaN   
4     5  8/27/2014 11:30   31    Male   United States    TX           NaN   

  family_history treatment work_interfere  ...               leave  \
0             No       Yes          Often  ...       Somewhat easy   
1             No        No         Rarely  ...          Don't know   
2             No        No         Rarely  ...  Somewhat difficult   
3            Yes       Yes          Often  ...  Somewhat difficult   
4             No        No          Never  ...          Don't know   

  mental_health_consequence phys_health_consequence     coworkers supervisor  \
0                       

In [40]:
# Save test identifiers for submission later
test_df = test_df.reset_index(drop=True)
if 'S.No' in test_df.columns:
    test_ids = test_df['S.No']
else:
    test_ids = pd.Series(np.arange(1, len(test_df) + 1))


### Step 3: Data Preprocessing

In [41]:
# Define columns to drop
cols_to_drop = ['Timestamp', 'comments']

# Drop columns from train dataset if they exist
dropped_train_cols = [col for col in cols_to_drop if col in train_df.columns]
if dropped_train_cols:
    train_df.drop(columns=dropped_train_cols, inplace=True)
    print("Dropped from train:", dropped_train_cols)

# Drop columns from test dataset if they exist
dropped_test_cols = [col for col in cols_to_drop if col in test_df.columns]
if dropped_test_cols:
    test_df.drop(columns=dropped_test_cols, inplace=True)
    print("Dropped from test:", dropped_test_cols)


Dropped from train: ['Timestamp', 'comments']
Dropped from test: ['Timestamp', 'comments']


In [42]:
train_df.head()

Unnamed: 0,S.No,Age,Gender,Country,state,self_employed,family_history,treatment,work_interfere,no_employees,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,1,37,Female,United States,IL,,No,Yes,Often,25-Jun,...,Yes,Somewhat easy,No,No,Some of them,Yes,No,Maybe,Yes,No
1,2,44,M,United States,IN,,No,No,Rarely,More than 1000,...,Don't know,Don't know,Maybe,No,No,No,No,No,Don't know,No
2,3,32,Male,Canada,,,No,No,Rarely,25-Jun,...,Don't know,Somewhat difficult,No,No,Yes,Yes,Yes,Yes,No,No
3,4,31,Male,United Kingdom,,,Yes,Yes,Often,26-100,...,No,Somewhat difficult,Yes,Yes,Some of them,No,Maybe,Maybe,No,Yes
4,5,31,Male,United States,TX,,No,No,Never,100-500,...,Don't know,Don't know,No,No,Some of them,Yes,Yes,Yes,Don't know,No


In [43]:
test_df.head()

Unnamed: 0,S.No,Age,Gender,Country,state,self_employed,family_history,work_interfere,no_employees,remote_work,...,anonymity,leave,mental_health_consequence,phys_health_consequence,coworkers,supervisor,mental_health_interview,phys_health_interview,mental_vs_physical,obs_consequence
0,1,39,Male,United Kingdom,,Yes,Yes,Sometimes,5-Jan,Yes,...,Yes,Somewhat difficult,No,No,Yes,Yes,No,Maybe,Yes,Yes
1,2,26,female,United States,WA,No,Yes,Sometimes,More than 1000,No,...,Don't know,Don't know,No,No,Some of them,Yes,No,Maybe,No,Yes
2,3,23,Female,United States,IL,No,Yes,Sometimes,26-100,No,...,Don't know,Somewhat difficult,Yes,No,No,Some of them,No,Maybe,No,No
3,4,35,Male,Switzerland,,No,Yes,Often,More than 1000,No,...,Yes,Very easy,No,No,Some of them,Some of them,No,Maybe,No,No
4,5,36,Male,United States,FL,No,No,Never,5-Jan,Yes,...,Don't know,Very easy,No,No,Some of them,Some of them,No,No,Don't know,No


In [44]:
# Function to check null values
def check_null_values(df, df_name):
    print(f"\n------------------ Checking Null Values in {df_name} ------------------")
    null_counts = df.isnull().sum()
    print(null_counts[null_counts > 0] if null_counts.sum() > 0 else "No missing values found.")

# Function to display dataset information
def display_info(df, df_name):
    print(f"\n------------------ Dataset Information: {df_name} ------------------")
    print(df.info())

# Function to display summary statistics
def display_statistics(df, df_name):
    print(f"\n------------------ Summary Statistics: {df_name} ------------------")
    print(df.describe())

# Perform checks for train and test datasets
check_null_values(train_df, "Train Dataset")
check_null_values(test_df, "Test Dataset")

display_info(train_df, "Train Dataset")
display_info(test_df, "Test Dataset")

display_statistics(train_df, "Train Dataset")
display_statistics(test_df, "Test Dataset")



------------------ Checking Null Values in Train Dataset ------------------
state               412
self_employed        18
work_interfere      236
benefits             13
wellness_program      4
leave                 4
dtype: int64

------------------ Checking Null Values in Test Dataset ------------------
state               103
work_interfere       28
benefits              3
wellness_program      2
leave                 5
dtype: int64

------------------ Dataset Information: Train Dataset ------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048 entries, 0 to 1047
Data columns (total 26 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   S.No                       1048 non-null   int64 
 1   Age                        1048 non-null   int64 
 2   Gender                     1048 non-null   object
 3   Country                    1048 non-null   object
 4   state                      636 non-null  

In [45]:
# check category level of all columns
def check_category_levels(df, df_name="Dataset"):
    print(f"\n{'='*20} {df_name} Category Levels {'='*20}\n")
    for col in df.columns:
        print(f"------ {col} ------")
        print(df[col].value_counts(dropna=False))  # Includes NaN values
        print("-" * 50)

# Checking category levels for both train and test datasets
check_category_levels(train_df, "Train Data")
check_category_levels(test_df, "Test Data")




------ S.No ------
S.No
1048    1
1       1
2       1
3       1
4       1
       ..
28      1
29      1
30      1
31      1
32      1
Name: count, Length: 1048, dtype: int64
--------------------------------------------------
------ Age ------
Age
 29             73
 32             64
 33             63
 26             63
 27             61
 31             57
 28             56
 30             56
 34             52
 35             49
 25             48
 23             41
 24             38
 37             36
 36             31
 38             30
 40             26
 39             26
 43             23
 22             20
 42             17
 41             15
 21             15
 19              9
 45              9
 44              8
 46              7
 18              7
 50              5
 48              5
 51              4
 49              4
 20              4
 56              3
 55              3
 57              3
 54              3
 47              2
 329             1
 60       

In [46]:
# Step 1: Find the median of valid ages
valid_age_median = train_df[(train_df['Age'] >= 18) & (train_df['Age'] <= 65)]['Age'].median()

# Step 2: Replace invalid ages with the median
train_df['Age'] = train_df['Age'].apply(lambda x: valid_age_median if (x < 18 or x > 65) else x)

# Step 3: Check if the dataset is clean
print(train_df['Age'].describe())


count    1048.000000
mean       31.916031
std         7.195364
min        18.000000
25%        27.000000
50%        31.000000
75%        36.000000
max        65.000000
Name: Age, dtype: float64


In [47]:
# Map target variable 'treatment': Yes -> 1, No -> 0
if 'treatment' in train_df.columns:
    train_df['treatment'] = train_df['treatment'].map({'Yes': 1, 'No': 0})
    print("Target variable mapped successfully.")

Target variable mapped successfully.


In [48]:
# Drop identifier column ('S.No') if present
identifier_col = 'S.No'
if identifier_col in train_df.columns:
    train_df.drop(columns=[identifier_col], inplace=True)
    print(f"Dropped identifier column: {identifier_col}")

Dropped identifier column: S.No


In [49]:
# Fill missing values in categorical columns with the mode
def fill_missing_with_mode(df):
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        df[col].fillna(df[col].mode()[0], inplace=True)
    return categorical_cols

train_categorical_cols = fill_missing_with_mode(train_df)
test_categorical_cols = fill_missing_with_mode(test_df)
print(f"Filled missing values in categorical columns: {set(train_categorical_cols) | set(test_categorical_cols)}")

Filled missing values in categorical columns: {'phys_health_consequence', 'state', 'care_options', 'self_employed', 'coworkers', 'tech_company', 'benefits', 'phys_health_interview', 'no_employees', 'mental_health_interview', 'mental_health_consequence', 'family_history', 'seek_help', 'supervisor', 'wellness_program', 'Gender', 'anonymity', 'Country', 'work_interfere', 'leave', 'mental_vs_physical', 'remote_work', 'obs_consequence'}


In [50]:
# Convert categorical columns to string type
def convert_to_string(df):
    categorical_cols = df.select_dtypes(include=['object']).columns
    df[categorical_cols] = df[categorical_cols].astype(str)

convert_to_string(train_df)
convert_to_string(test_df)
print("Converted all categorical columns to string type.")

Converted all categorical columns to string type.


### Step 4: Encoding Categorical Variables

In [51]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical_columns(train_df, test_df, target='treatment'):
    """
    Encode categorical columns in both training and test DataFrames using LabelEncoder.
    Unseen categories in the test set are replaced with the most frequent category from training.

    Parameters:
        train_df (pd.DataFrame): Training dataset.
        test_df (pd.DataFrame): Test dataset.
        target (str): Target column to exclude from encoding (default is 'treatment').

    Returns:
        tuple: Updated train_df, test_df, and a dictionary of fitted LabelEncoders for each column.
    """
    label_encoders = {}
    # Identify categorical columns, excluding the target variable
    categorical_cols = [col for col in train_df.select_dtypes(include='object').columns if col != target]

    for col in categorical_cols:
        le = LabelEncoder()
        # Get the most frequent (mode) category from the training data before encoding
        common_cat = train_df[col].mode()[0]
        # Fit and transform the training column
        train_df[col] = le.fit_transform(train_df[col])
        label_encoders[col] = le

        # For the test set, replace unseen categories with the mode from training
        test_df[col] = test_df[col].apply(lambda x: x if x in le.classes_ else common_cat)
        test_df[col] = le.transform(test_df[col])

    print("Encoded categorical columns:", categorical_cols)
    return train_df, test_df, label_encoders

# # Example usage:
# train_df, test_df, label_encoders = encode_categorical_columns(train_df, test_df)


In [52]:
train_df, test_df, label_encoders = encode_categorical_columns(train_df, test_df)

Encoded categorical columns: ['Gender', 'Country', 'state', 'self_employed', 'family_history', 'work_interfere', 'no_employees', 'remote_work', 'tech_company', 'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave', 'mental_health_consequence', 'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview', 'mental_vs_physical', 'obs_consequence']


### Step 5: Spliting Data

In [53]:
from sklearn.model_selection import train_test_split

# Separate features (X) and target (y)
X = train_df.drop(columns=['treatment']) # Assign train_df without 'treatment' column to X
y = train_df['treatment']  # Assign 'treatment' column to y


# For test dataset: Drop identifier 'S.No' if it exists and align columns with training features
identifier = 'S.No'
if identifier in test_df.columns:
    X_test = test_df.drop(columns=[identifier])
else:
    X_test = test_df.copy()

# Ensure X_test has the same columns and order as X
X_test = X_test[X.columns]

print(f"Training features shape: {X.shape} | Test features shape: {X_test.shape}")

# Split training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Train/Validation split completed.")
print(f"X_train shape: {X_train.shape} | X_val shape: {X_val.shape}")


Training features shape: (1048, 24) | Test features shape: (210, 24)
Train/Validation split completed.
X_train shape: (838, 24) | X_val shape: (210, 24)


In [54]:
# Identify categorical features (all object-type columns)
cat_features = X.select_dtypes(include=['object']).columns.tolist() # Now X is defined
print("Categorical features:", cat_features)

Categorical features: []


### Step 6: Feature Scalling

In [55]:
# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data and transform it
X_train_scaled = scaler.fit_transform(X_train)
print("Training data scaled using StandardScaler.")
print(f"Training data mean: {scaler.mean_}")
print(f"Training data standard deviation: {scaler.scale_}")

# Transform the validation and test data using the same scaler parameters
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
print("Validation and test data scaled using the training data parameters.")


Training data scaled using StandardScaler.
Training data mean: [32.11575179 20.4176611  32.97732697 13.95942721  0.12649165  0.38186158
  2.150358    2.36038186  0.31145585  0.82458234  1.075179    0.96420048
  1.03579952  0.90214797  0.66109785  1.44033413  0.849642    0.83651551
  0.98806683  1.12887828  0.87231504  0.72195704  0.82458234  0.14081146]
Training data standard deviation: [ 7.29024687  8.73187305 11.50347925 14.65272709  0.33240263  0.48584289
  1.16176913  1.71792622  0.46308866  0.38032395  0.84493699  0.8607914
  0.58718008  0.69603045  0.91640563  1.52407752  0.75766166  0.47941221
  0.60613477  0.83771563  0.4333057   0.72313953  0.84136326  0.34782695]
Validation and test data scaled using the training data parameters.


In [56]:

print(y_train.isna().sum())  # Should print 0
print(np.isnan(X_train_scaled).sum().sum())  # Should print 0

0
0


In [57]:
print(y_train.unique())  # Should show only valid classes (e.g., [0,1] for binary classification)


[1 0]


### Step 7: Train and Evaluate Models

In [58]:
# Note: CatBoost can directly handle categorical features by passing their column names.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
val_pool = Pool(data=X_val, label=y_val, cat_features=cat_features)

In [59]:
cat_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=100,
    early_stopping_rounds=20
)
cat_model.fit(train_pool, eval_set=val_pool)
print("CatBoost training completed.")

0:	learn: 0.6736128	test: 0.6783110	best: 0.6783110 (0)	total: 1.08ms	remaining: 325ms
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.5657773403
bestIteration = 44

Shrink model to first 45 iterations.
CatBoost training completed.


In [60]:
# # Check for NaNs in y_val and fill them with the mode
# if y_val.isna().any():
#     # If y_val.mode() is empty, fill with a default value (e.g., 0)
#     fill_value = y_val.mode().iloc[0] if not y_val.mode().empty else 0
#     y_val = y_val.fillna(fill_value)
#     print(f"Filled {y_val.isna().sum()} NaN values in y_val with {fill_value}")
y_val_pred = cat_model.predict(val_pool)
acc = accuracy_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
print("---------Validation Accuracy------------")
print(acc)
print("---------Validation F1 Score------------")
print(f1)
print("---------Classification Report------------")
print(classification_report(y_val, y_val_pred))
print("---------Confusion Matrix------------")
print(confusion_matrix(y_val, y_val_pred))


---------Validation Accuracy------------
0.7333333333333333
---------Validation F1 Score------------
0.7333333333333333
---------Classification Report------------
              precision    recall  f1-score   support

           0       0.76      0.71      0.73       109
           1       0.71      0.76      0.73       101

    accuracy                           0.73       210
   macro avg       0.73      0.73      0.73       210
weighted avg       0.74      0.73      0.73       210

---------Confusion Matrix------------
[[77 32]
 [24 77]]


### Step 9: Generating Submission File

In [61]:
test_pool = Pool(data=X_test, cat_features=cat_features)
test_preds = cat_model.predict(test_pool)

submission = pd.DataFrame({'S.No': test_ids, 'treatment': test_preds})
# Map numeric predictions back to "Yes" and "No"
submission['treatment'] = submission['treatment'].apply(lambda x: 'Yes' if x == 1 else 'No')
submission_file_path = "/content/drive/MyDrive/My Practice/Hackehon/submission.csv"
submission.to_csv(submission_file_path, index=False)
print("Submission file saved to", submission_file_path)

Submission file saved to /content/drive/MyDrive/My Practice/Hackehon/submission.csv


### Step 10: Save Model and Scaler

In [None]:
joblib.dump(cat_model, "catboost_model.pkl")
print("CatBoost model saved.")

MLP model saved to employee_treatment_mlp_model.pkl
Scaler saved to scaler_mlp.pkl
Both MLP model and scaler have been saved successfully.


In [32]:
print("Length of test_ids:", len(test_ids))
print("Length of test_preds:", len(test_preds))


Length of test_ids: 210
Length of test_preds: 210
