# Load Libraries

In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency

# Load dataset

In [6]:
# Load the dataset 
file_path = '../data/raw/Impact_of_Remote_Work_on_Mental_Health.csv'
df = pd.read_csv(file_path)

# Check the shape and head
print("Data Shape:", df.shape)
df.head()

Data Shape: (5000, 20)


Unnamed: 0,Employee_ID,Age,Gender,Job_Role,Industry,Years_of_Experience,Work_Location,Hours_Worked_Per_Week,Number_of_Virtual_Meetings,Work_Life_Balance_Rating,Stress_Level,Mental_Health_Condition,Access_to_Mental_Health_Resources,Productivity_Change,Social_Isolation_Rating,Satisfaction_with_Remote_Work,Company_Support_for_Remote_Work,Physical_Activity,Sleep_Quality,Region
0,EMP0001,32,Non-binary,HR,Healthcare,13,Hybrid,47,7,2,Medium,Depression,No,Decrease,1,Unsatisfied,1,Weekly,Good,Europe
1,EMP0002,40,Female,Data Scientist,IT,3,Remote,52,4,1,Medium,Anxiety,No,Increase,3,Satisfied,2,Weekly,Good,Asia
2,EMP0003,59,Non-binary,Software Engineer,Education,22,Hybrid,46,11,5,Medium,Anxiety,No,No Change,4,Unsatisfied,5,,Poor,North America
3,EMP0004,27,Male,Software Engineer,Finance,20,Onsite,32,8,4,High,Depression,Yes,Increase,3,Unsatisfied,3,,Poor,Europe
4,EMP0005,49,Male,Sales,Consulting,32,Onsite,35,12,2,High,,Yes,Decrease,3,Unsatisfied,3,Weekly,Average,North America


# Cleaning Data

## Missing value

In [9]:
# Check for missing values
print("Missing Values in each column:")
print(df.isnull().sum())

# Check the data types of each column
print("Data Types:")
print(df.dtypes)


Missing Values in each column:
Employee_ID                             0
Age                                     0
Gender                                  0
Job_Role                                0
Industry                                0
Years_of_Experience                     0
Work_Location                           0
Hours_Worked_Per_Week                   0
Number_of_Virtual_Meetings              0
Work_Life_Balance_Rating                0
Stress_Level                            0
Mental_Health_Condition              1196
Access_to_Mental_Health_Resources       0
Productivity_Change                     0
Social_Isolation_Rating                 0
Satisfaction_with_Remote_Work           0
Company_Support_for_Remote_Work         0
Physical_Activity                    1629
Sleep_Quality                           0
Region                                  0
dtype: int64
Data Types:
Employee_ID                          object
Age                                   int64
Gender          

In [10]:
# Filling missing values in 'Mental_Health_Condition' with the most frequent value ('Burnout')
df['Mental_Health_Condition'] = df['Mental_Health_Condition'].fillna(df['Mental_Health_Condition'].mode()[0])

# Filling missing values in 'Physical_Activity' with the most frequent value ('Weekly')
df['Physical_Activity'] = df['Physical_Activity'].fillna(df['Physical_Activity'].mode()[0])

# Verifying that there are no more missing values
print("Missing Values after Imputation:")
print(df.isnull().sum())


Missing Values after Imputation:
Employee_ID                          0
Age                                  0
Gender                               0
Job_Role                             0
Industry                             0
Years_of_Experience                  0
Work_Location                        0
Hours_Worked_Per_Week                0
Number_of_Virtual_Meetings           0
Work_Life_Balance_Rating             0
Stress_Level                         0
Mental_Health_Condition              0
Access_to_Mental_Health_Resources    0
Productivity_Change                  0
Social_Isolation_Rating              0
Satisfaction_with_Remote_Work        0
Company_Support_for_Remote_Work      0
Physical_Activity                    0
Sleep_Quality                        0
Region                               0
dtype: int64


## Encoder for ML

In [12]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# List of categorical columns to encode
categorical_columns = ['Gender', 'Job_Role', 'Industry', 
                       'Stress_Level', 
                       'Access_to_Mental_Health_Resources', 'Satisfaction_with_Remote_Work', 
                       'Physical_Activity', 'Sleep_Quality', 'Region']

# Apply LabelEncoder to each categorical column
for col in categorical_columns:
    df[col] = label_encoder.fit_transform(df[col])

# Check the updated dataset after encoding
print("Dataset after Encoding:")
df.head()


Dataset after Encoding:


Unnamed: 0,Employee_ID,Age,Gender,Job_Role,Industry,Years_of_Experience,Work_Location,Hours_Worked_Per_Week,Number_of_Virtual_Meetings,Work_Life_Balance_Rating,Stress_Level,Mental_Health_Condition,Access_to_Mental_Health_Resources,Productivity_Change,Social_Isolation_Rating,Satisfaction_with_Remote_Work,Company_Support_for_Remote_Work,Physical_Activity,Sleep_Quality,Region
0,EMP0001,32,2,2,3,13,Hybrid,47,7,2,2,Depression,0,Decrease,1,2,1,1,1,2
1,EMP0002,40,0,0,4,3,Remote,52,4,1,2,Anxiety,0,Increase,3,1,2,1,1,1
2,EMP0003,59,2,6,1,22,Hybrid,46,11,5,2,Anxiety,0,No Change,4,2,5,1,2,3
3,EMP0004,27,1,6,2,20,Onsite,32,8,4,0,Depression,1,Increase,3,2,3,1,2,2
4,EMP0005,49,1,5,0,32,Onsite,35,12,2,0,Burnout,1,Decrease,3,2,3,1,0,3


In [13]:
# Check the unique values to verify encoding
print("Work Location Unique Values:", df['Work_Location'].unique())
print("Mental Health Condition Unique Values:", df['Mental_Health_Condition'].unique())

# Verify counts for Mental_Health_Condition and Stress_Level after encoding
print("Mental Health Condition Counts:")
print(df['Mental_Health_Condition'].value_counts())

print("Stress Level Counts:")
print(df['Stress_Level'].value_counts())


Work Location Unique Values: ['Hybrid' 'Remote' 'Onsite']
Mental Health Condition Unique Values: ['Depression' 'Anxiety' 'Burnout']
Mental Health Condition Counts:
Mental_Health_Condition
Burnout       2476
Anxiety       1278
Depression    1246
Name: count, dtype: int64
Stress Level Counts:
Stress_Level
0    1686
2    1669
1    1645
Name: count, dtype: int64


In [14]:
df.to_csv('../data/clean/cleaned_data2.csv', index=False)

In [15]:
# Create a contingency table of Work Location and Mental Health Condition
contingency_table = pd.crosstab(df['Work_Location'], df['Mental_Health_Condition'])

# Display the table to check data
print(contingency_table)


Mental_Health_Condition  Anxiety  Burnout  Depression
Work_Location                                        
Hybrid                       428      800         421
Onsite                       407      818         412
Remote                       443      858         413


# Hypothesis

## General Hypothesis: Work Location has a significant impact on Mental Health

We hypothesize that work location (Remote, Hybrid, Onsite) is significantly associated with mental health conditions like anxiety, burnout, and depression.

### Chi-Square Test of Independence

We'll perform the Chi-Square test to check if the observed mental health conditions significantly differ between work locations.

In [18]:

# Perform the Chi-Square Test of Independence
chi2, p, dof, expected = chi2_contingency(contingency_table)

# Results
print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")

# Interpretation
if p < 0.05:
    print("Reject the null hypothesis: There is a significant association between Work Location and Mental Health Conditions.")
else:
    print("Fail to reject the null hypothesis: There is no significant association between Work Location and Mental Health Conditions.")

Chi-Square Statistic: 1.725516230047377
P-value: 0.7860775179211476
Fail to reject the null hypothesis: There is no significant association between Work Location and Mental Health Conditions.


## Hypothesis 1: Relationship Between Stress Levels and Work-Life Balance

### Hypothesis:

Null Hypothesis (H0): There is no significant relationship between Stress Level and Work-Life Balance.

Alternative Hypothesis (H1): There is a significant relationship between Stress Level and Work-Life Balance.

Test: Use a Chi-Square test to check for independence between stress levels and work-life balance.

In [29]:
# Creating a contingency table
contingency_table_stress_balance = pd.crosstab(df['Stress_Level'], df['Work_Life_Balance_Rating'])

# Performing the Chi-Square test
chi2_stress_balance, p_stress_balance, dof_stress_balance, expected_stress_balance = chi2_contingency(contingency_table_stress_balance)

print(f"Chi-Square Statistic: {chi2_stress_balance}")
print(f"P-value: {p_stress_balance}")

# Interpretation
if p_stress_balance < 0.05:
    print("Reject the null hypothesis: There is a significant relationship between Stress Level and Work-Life Balance.")
else:
    print("Fail to reject the null hypothesis: There is no significant relationship between Stress Level and Work-Life Balance.")


Chi-Square Statistic: 8.35298303598177
P-value: 0.3997709095022564
Fail to reject the null hypothesis: There is no significant relationship between Stress Level and Work-Life Balance.


Based on the Chi-Square test results, we fail to reject the null hypothesis. This means there is no significant relationship between Work Location (Remote, Onsite, Hybrid) and Mental Health Conditions (Anxiety, Burnout, Depression). Employees' mental health does not appear to be significantly affected by where they work, at least within the limitations of this dataset.


## Hypothesis 2: Impact of Remote Work on Anxiety and Burnout


### Hypothesis:


Null Hypothesis (H0): There is no significant difference in the occurrence of Anxiety and Burnout across different Work Locations (Remote, Hybrid, Onsite).

Alternative Hypothesis (H1): There is a significant difference in the occurrence of Anxiety and Burnout across different Work Locations.

Test: Chi-Square test for independence between Work Location and Mental Health Condition (Anxiety, Burnout).

In [31]:
# Creating a contingency table
contingency_table_work_location = pd.crosstab(df['Work_Location'], df['Mental_Health_Condition'])

# Performing the Chi-Square test
chi2_work_location, p_work_location, dof_work_location, expected_work_location = chi2_contingency(contingency_table_work_location)

print(f"Chi-Square Statistic: {chi2_work_location}")
print(f"P-value: {p_work_location}")

# Interpretation
if p_work_location < 0.05:
    print("Reject the null hypothesis: There is a significant relationship between Work Location and Mental Health Conditions (Anxiety, Burnout).")
else:
    print("Fail to reject the null hypothesis: There is no significant relationship between Work Location and Mental Health Conditions.")


Chi-Square Statistic: 1.725516230047377
P-value: 0.7860775179211476
Fail to reject the null hypothesis: There is no significant relationship between Work Location and Mental Health Conditions.


The Chi-Square test for Stress Level and Work-Life Balance shows a P-value that is greater than 0.05. Therefore, we fail to reject the null hypothesis. This indicates that there is no significant relationship between Stress Level and Work-Life Balance in this dataset. This suggests that work-life balance is not directly associated with reported stress levels among employees.


## Hypothesis 3: Productivity and Work-Life Balance

### Hypothesis: 

Null Hypothesis (H0): There is no significant relationship between Productivity Change and Work-Life Balance.

Alternative Hypothesis (H1): There is a significant relationship between Productivity Change and Work-Life Balance.

Test: Chi-Square test to assess if changes in productivity are related to work-life balance.

In [34]:
# Creating a contingency table
contingency_table_productivity_balance = pd.crosstab(df['Productivity_Change'], df['Work_Life_Balance_Rating'])

# Performing the Chi-Square test
chi2_productivity_balance, p_productivity_balance, dof_productivity_balance, expected_productivity_balance = chi2_contingency(contingency_table_productivity_balance)

print(f"Chi-Square Statistic: {chi2_productivity_balance}")
print(f"P-value: {p_productivity_balance}")

# Interpretation
if p_productivity_balance < 0.05:
    print("Reject the null hypothesis: There is a significant relationship between Productivity Change and Work-Life Balance.")
else:
    print("Fail to reject the null hypothesis: There is no significant relationship between Productivity Change and Work-Life Balance.")


Chi-Square Statistic: 3.5894546446202544
P-value: 0.892137278325681
Fail to reject the null hypothesis: There is no significant relationship between Productivity Change and Work-Life Balance.


The Chi-Square test result for the relationship between Productivity Change and Work-Life Balance shows a P-value greater than 0.05. We fail to reject the null hypothesis, meaning that there is no significant relationship between Productivity Change and Work-Life Balance in this dataset. Employees’ productivity changes do not appear to be directly tied to their work-life balance.
