The Remote Work Challenge

As Crescent Corp shifted to remote work, HR noticed rising reports of stress, burnout, and social isolation. To address this, they collected data from 5,000 employees worldwide, capturing information on job roles, stress levels, and work setups (remote, hybrid, onsite). With this data, Crescent Corp partnered with a data science team to build a predictive model to identify employees at risk of mental health challenges. This model will allow HR to proactively provide targeted support, ensuring that employees thrive in the remote work environment and maintain productivity.



The Problem: Rising mental health challenges among employees working remotely.
The Goal: Build a predictive model that identifies potential mental health issues to help HR provide targeted support.
The Impact: Using insights from the model to improve well-being and productivity among remote and hybrid teams.

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import pickle


In [3]:
# Read the employee survey export into a DataFrame. The path is relative,
# so the CSV has to sit in the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='Remote.csv')

In [None]:
# Preview the first rows to sanity-check that the CSV parsed as expected.
data.head()

In [None]:
# Print the column dtypes and non-null counts to spot missing values and
# columns that still need encoding.
data.info()

In [6]:
# Integer-code the listed feature columns. Each fitted encoder is kept,
# keyed by column name, so the integer codes can be mapped back later.
# NOTE(review): if 'Age' is numeric in the CSV, label encoding replaces the
# values with their rank among the unique ages - confirm this is intended.
label_encoded_columns = ('Age', 'access_to_Mental_Health_Resource')
label_encoders = {name: LabelEncoder() for name in label_encoded_columns}
for name, fitted in label_encoders.items():
    data[name] = fitted.fit_transform(data[name])

# The target gets a dedicated encoder so predictions can be translated back
# to the original condition labels through target_encoder.classes_.
target_encoder = LabelEncoder()
target_codes = target_encoder.fit_transform(data['Mental_Health_Condition'])
data['Mental_Health_Condition'] = target_codes

In [8]:
from sklearn.preprocessing import OrdinalEncoder

# Ordered levels for each ordinal feature, lowest first. The position of a
# level in its list is the value the encoder assigns to it.
ordinal_levels = {
    'sleep_Quality': ['Poor', 'Average', 'Good'],
    'stress_Level': ['Low', 'Medium', 'High'],
    'physical_Activity': ['Not At All', 'Weekly', 'Daily'],
}
ordinal_columns = list(ordinal_levels)
categories = list(ordinal_levels.values())

# Replace the three text columns in place with their ordinal codes.
encoder = OrdinalEncoder(categories=categories)
data[ordinal_columns] = encoder.fit_transform(data[ordinal_columns])

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Nominal (unordered) columns are expanded into indicator columns.
# drop='first' removes one level per column to avoid perfectly collinear
# indicators.
nominal_columns = ['gender', 'industry', 'Location','region']
onehot_encoder = OneHotEncoder(sparse_output=False, drop='first')

onehot_encoded = onehot_encoder.fit_transform(data[nominal_columns])

# Reuse data's index for the indicator frame: pd.concat(axis=1) aligns rows
# by index label, so a fresh RangeIndex would misalign rows (and introduce
# NaNs) whenever data does not carry the default 0..n-1 index.
onehot_encoded_df = pd.DataFrame(
    onehot_encoded,
    columns=onehot_encoder.get_feature_names_out(nominal_columns),
    index=data.index,
)

# Swap the original nominal columns for their one-hot expansion.
data = data.drop(nominal_columns, axis=1)
data = pd.concat([data, onehot_encoded_df], axis=1)

In [12]:
# Separate the features from the target and hold out 20% of rows for testing.
from sklearn.model_selection import train_test_split

target_column = 'Mental_Health_Condition'
y = data[target_column]
X = data.drop(columns=[target_column])

# stratify=y keeps the class proportions the same in both splits; the fixed
# random_state makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## LOGISTIC REGRESSION

In [14]:
# Fit a logistic regression baseline. max_iter is raised well above the
# default so the lbfgs solver has room to converge.
model = LogisticRegression(solver='lbfgs', max_iter=2000)
model.fit(X_train, y_train)

# Predict on the held-out test set (once - a second identical call adds
# nothing).
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))

# Human-readable class names, in encoded-label order, taken from the encoder
# that was actually fitted on the target column (target_encoder). The
# previous lookup checked for a name that is never defined and therefore
# always fell back to placeholder names.
target_names = [str(name) for name in target_encoder.classes_]

Accuracy: 0.761


In [None]:
# Score the fitted logistic regression on both splits so that a gap between
# the train and test numbers (overfitting) is easy to spot.
# NOTE(review): f1_score is called with its default averaging, which
# presumably assumes a two-class target - confirm against the data.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

f1_train, f1_test = (
    f1_score(y_train, y_train_pred),
    f1_score(y_test, y_test_pred),
)
train_accuracy, test_accuracy = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

for label, score in (
    ("Train Accuracy Score:", train_accuracy),
    ("Test Accuracy Score:", test_accuracy),
    ("Train F1 Score:", f1_train),
    ("Test F1 Score:", f1_test),
):
    print(label, score)

## RANDOM FOREST

In [None]:
from sklearn.ensemble import RandomForestClassifier

# class_weight='balanced' reweights classes inversely to their frequency;
# the fixed random_state makes the forest reproducible.
rf_settings = {'class_weight': 'balanced', 'random_state': 42}
rf_model = RandomForestClassifier(**rf_settings).fit(X_train, y_train)

# Test-set predictions for the forest.
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Score the random forest on both splits; compare train against test to
# judge how much the forest overfits.
# NOTE(review): f1_score is called with its default averaging, which
# presumably assumes a two-class target - confirm against the data.
y_train_pred, y_test_pred = rf_model.predict(X_train), rf_model.predict(X_test)

f1_train, f1_test = (
    f1_score(y_train, y_train_pred),
    f1_score(y_test, y_test_pred),
)
train_accuracy, test_accuracy = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

for label, score in (
    ("Train Accuracy Score:", train_accuracy),
    ("Test Accuracy Score:", test_accuracy),
    ("Train F1 Score:", f1_train),
    ("Test F1 Score:", f1_test),
):
    print(label, score)

## KNN

In [77]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score

# KNN classifies by distance between rows, so features on larger numeric
# scales would dominate the neighbour search. Standardize the features first.
# Wrapping both steps in a pipeline means the scaler is fitted on the
# training rows only and is re-applied automatically on every predict(), so
# cells that call knn.predict(...) keep working unchanged.
# n_neighbors can be tuned.
knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=4))

# Train the model
knn.fit(X_train, y_train)

# Predict on test data
y_pred = knn.predict(X_test)

In [None]:
# Score the KNN model on both the training and the test split.
# NOTE(review): f1_score is called with its default averaging, which
# presumably assumes a two-class target - confirm against the data.
y_train_pred, y_test_pred = knn.predict(X_train), knn.predict(X_test)

f1_train, f1_test = (
    f1_score(y_train, y_train_pred),
    f1_score(y_test, y_test_pred),
)
train_accuracy, test_accuracy = (
    accuracy_score(y_train, y_train_pred),
    accuracy_score(y_test, y_test_pred),
)

for label, score in (
    ("Train Accuracy Score:", train_accuracy),
    ("Test Accuracy Score:", test_accuracy),
    ("Train F1 Score:", f1_train),
    ("Test F1 Score:", f1_test),
):
    print(label, score)

## DECISION TREE

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Train the decision tree here: no cell in the visible notebook defines
# `dtree`, so evaluating it on a clean top-to-bottom run raised NameError.
# NOTE(review): the original training cell is missing, so these settings
# (library defaults plus a fixed seed, matching the other models) are an
# assumption - adjust if the original tree used different hyperparameters.
dtree = DecisionTreeClassifier(random_state=42)
dtree.fit(X_train, y_train)

# Score the tree on both splits.
# NOTE(review): f1_score uses its default averaging, which presumably
# assumes a two-class target - confirm against the data.
y_train_pred = dtree.predict(X_train)
y_test_pred = dtree.predict(X_test)

f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Train Accuracy Score:", train_accuracy)
print("Test Accuracy Score:", test_accuracy)
print("Train F1 Score:", f1_train)
print("Test F1 Score:", f1_test)

## SVC

In [85]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [None]:
# Support vector classifier with an RBF kernel, fitted on the training split.
# NOTE(review): the features are passed in unscaled; RBF kernels are usually
# sensitive to feature scale - consider standardizing first.
svc_settings = dict(kernel='rbf', C=1.0, gamma='scale', random_state=42)
svc_model = SVC(**svc_settings)
svc_model.fit(X_train, y_train)

In [None]:
# Score the SVC on both splits.
# Every metric below is computed from this model's own predictions.
# Previously the test predictions were stored under a different name, so the
# test F1 was computed from the preceding model's predictions, and the
# accuracies were never recomputed, so stale values were printed.
# NOTE(review): f1_score uses its default averaging, which presumably
# assumes a two-class target - confirm against the data.
y_train_pred = svc_model.predict(X_train)
y_test_pred = svc_model.predict(X_test)

f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Train Accuracy Score:", train_accuracy)
print("Test Accuracy Score:", test_accuracy)
print("Train F1 Score:", f1_train)
print("Test F1 Score:", f1_test)

## Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gb_model.fit(X_train, y_train)

In [None]:
# Score the gradient boosting model on both splits.
# The accuracies are recomputed here from this model's predictions.
# Previously they were not, so the printed accuracy values were left over
# from whichever model had been evaluated before.
# NOTE(review): f1_score uses its default averaging, which presumably
# assumes a two-class target - confirm against the data.
y_train_pred = gb_model.predict(X_train)
y_test_pred = gb_model.predict(X_test)

f1_train = f1_score(y_train, y_train_pred)
f1_test = f1_score(y_test, y_test_pred)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

print("Train Accuracy Score:", train_accuracy)
print("Test Accuracy Score:", test_accuracy)
print("Train F1 Score:", f1_train)
print("Test F1 Score:", f1_test)