In [2]:
import pandas as pd
from pymongo import MongoClient
import seaborn as sns
import matplotlib.pyplot as plt



# Load the dataset
data = pd.read_csv('Resources/Impact_of_Remote_Work_on_Mental_Health.csv')  # Update with your dataset path

# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['remote_work_db']  # Create a new database
collection = db['mental_health']  # Create a new collection

# Insert data into MongoDB
collection.insert_many(data.to_dict('records'))

print("Data loaded successfully!")


Data loaded successfully!


In [3]:
# Retrieve data from MongoDB
data = pd.DataFrame(list(collection.find()))

# Display initial data info
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   _id                                25000 non-null  object
 1   Employee_ID                        25000 non-null  object
 2   Age                                25000 non-null  int64 
 3   Gender                             25000 non-null  object
 4   Job_Role                           25000 non-null  object
 5   Industry                           25000 non-null  object
 6   Years_of_Experience                25000 non-null  int64 
 7   Work_Location                      25000 non-null  object
 8   Hours_Worked_Per_Week              25000 non-null  int64 
 9   Number_of_Virtual_Meetings         25000 non-null  int64 
 10  Work_Life_Balance_Rating           25000 non-null  int64 
 11  Stress_Level                       25000 non-null  object
 12  Ment

In [4]:
missing_count = data.isnull().sum()
print(missing_count[missing_count > 0]) 

Mental_Health_Condition    5980
Physical_Activity          8145
dtype: int64


In [5]:
# Display unique values in the Mental_Health_Condition column
unique_conditions = data['Mental_Health_Condition'].unique()
print(unique_conditions)


['Depression' 'Anxiety' nan 'Burnout']


In [6]:
# Replace nan values with 'None'
data['Mental_Health_Condition'] = data['Mental_Health_Condition'].fillna('None')



In [7]:
# Display unique values in the Physical_Activity column
unique_conditions = data['Physical_Activity'].unique()
print(unique_conditions)

['Weekly' nan 'Daily']


In [8]:
# Replace nan values with 'None'
data['Physical_Activity'] = data['Physical_Activity'].fillna('None')

In [9]:
# Count unique occurrences for each mental health condition
mental_health_unique_counts = data.groupby('Mental_Health_Condition')['Employee_ID'].nunique().reset_index()
mental_health_unique_counts.columns = ['Mental_Health_Condition', 'Unique_Employee_Count']

# Count unique occurrences for each physical activity status
physical_activity_unique_counts = data.groupby('Physical_Activity')['Employee_ID'].nunique().reset_index()
physical_activity_unique_counts.columns = ['Physical_Activity', 'Unique_Employee_Count']

# Display results
print("Unique counts for Mental Health Conditions:")
print(mental_health_unique_counts)

print("\nUnique counts for Physical Activity:")
print(physical_activity_unique_counts)


Unique counts for Mental Health Conditions:
  Mental_Health_Condition  Unique_Employee_Count
0                 Anxiety                   1278
1                 Burnout                   1280
2              Depression                   1246
3                    None                   1196

Unique counts for Physical Activity:
  Physical_Activity  Unique_Employee_Count
0             Daily                   1616
1              None                   1629
2            Weekly                   1755


In [10]:
# Check for missing values
print(data.isnull().sum())

_id                                  0
Employee_ID                          0
Age                                  0
Gender                               0
Job_Role                             0
Industry                             0
Years_of_Experience                  0
Work_Location                        0
Hours_Worked_Per_Week                0
Number_of_Virtual_Meetings           0
Work_Life_Balance_Rating             0
Stress_Level                         0
Mental_Health_Condition              0
Access_to_Mental_Health_Resources    0
Productivity_Change                  0
Social_Isolation_Rating              0
Satisfaction_with_Remote_Work        0
Company_Support_for_Remote_Work      0
Physical_Activity                    0
Sleep_Quality                        0
Region                               0
dtype: int64


In [11]:
# Check for duplicates
duplicates = data.duplicated().sum()
print(f"Number of duplicate records: {duplicates}")

Number of duplicate records: 0


In [12]:
# Create a cleaned DataFrame
data_cleaned = data.copy()

# Reset index
data_cleaned.reset_index(drop=True, inplace=True)

In [15]:
# Summary statistics for numerical columns
summary_stats = data_cleaned.describe()
print(summary_stats)

# Summary statistics for categorical columns
categorical_summary = data_cleaned.describe(include='object')
print(categorical_summary)


                Age  Years_of_Experience  Hours_Worked_Per_Week  \
count  25000.000000          25000.00000           25000.000000   
mean      40.995000             17.81020              39.614600   
std       11.295117             10.01961              11.859245   
min       22.000000              1.00000              20.000000   
25%       31.000000              9.00000              29.000000   
50%       41.000000             18.00000              40.000000   
75%       51.000000             26.00000              50.000000   
max       60.000000             35.00000              60.000000   

       Number_of_Virtual_Meetings  Work_Life_Balance_Rating  \
count                 25000.00000              25000.000000   
mean                      7.55900                  2.984200   
std                       4.63575                  1.410401   
min                       0.00000                  1.000000   
25%                       4.00000                  2.000000   
50%               