## Data Loading

In [None]:
import warnings
# Silence every warning category from here on. Note that this also hides
# deprecation notices emitted by the libraries used later in the notebook.
warnings.filterwarnings('ignore')

In [None]:
# Import Libraries
import pandas as pd
import numpy as np

In [None]:
# Load the survey CSV directly from the project's GitHub repository.
# NOTE(review): recent pandas versions parse the literal string "None" as NaN by
# default. If this CSV uses "None" as a real category, part of the "missing"
# values counted later are valid answers -- confirm against the raw file.
data = pd.read_csv("https://raw.githubusercontent.com/Dilum-Alahakoon/AIML-Project/refs/heads/main/data/post_pandemic_remote_work_health_impact_2025.csv")

# pd.read_csv already returns a DataFrame, so no conversion is needed.
# Work on an explicit copy: `data` stays as the untouched raw load while
# `data_df` is the frame that later cells clean and modify.
data_df = data.copy()



In [None]:
# Preview the first five rows (head() defaults to n=5) to sanity-check the load
data_df.head()

In [None]:
# Preview the last five rows (tail() defaults to n=5) to check the end of the file
data_df.tail()

In [None]:
# Shape of the dataset: DataFrame.shape is a (rows, columns) tuple,
# so index 0 is the row count and index 1 is the column count.

print(f"Shape of the dataset: {data_df.shape}")
print(f"Number of rows : {data_df.shape[0]}")
print(f"Number of columns : {data_df.shape[1]}")

In [None]:
# Dataset information: column names, non-null counts and dtypes
data_df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numerical columns in the dataset
data_df.describe()

In [None]:
# List the column labels of the dataset
data_df.columns

In [None]:
# Frequency of each Mental_Health_Status value (value_counts skips NaN by default)
data_df['Mental_Health_Status'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Draw the histogram grid with pandas; the built-in grid is switched off here
# because a lighter dashed grid is applied to every panel below.
ax = data_df.hist(bins=50, figsize=(20, 10), color='skyblue',
                  edgecolor='black', grid=False)

# Apply the same cosmetic styling to each panel of the grid.
for a in ax.flat:
    a.set_facecolor("#f9f9f9")
    a.grid(True, linestyle='--', alpha=0.5)
    a.tick_params(axis='both', labelsize=10)
    a.set_title(a.get_title(), fontsize=12, fontweight='bold')

# Figure-level title placed slightly above the panels, then tidy the layout.
plt.suptitle("Feature Distributions", fontsize=20, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Split the column names by dtype: object columns are treated as categorical,
# anything numeric as numerical.
categorical_features = list(data_df.select_dtypes(include='object').columns)
numerical_features = list(data_df.select_dtypes(include='number').columns)

print(f"Categorical Features: {categorical_features}\n")
print(f"Numerical Features: {numerical_features}")

In [None]:
# Print the frequency table of every categorical column, followed by a blank
# line so consecutive tables are visually separated.
for col in categorical_features:
    print(data_df[col].value_counts(), end="\n\n")

In [None]:
import seaborn as sns

# One horizontal count plot per categorical column, bars ordered from the most
# to the least frequent category.
# NOTE(review): newer seaborn releases deprecate `palette` without `hue`; the
# notebook suppresses warnings, so this would not surface -- confirm against the
# installed seaborn version.
for col in categorical_features:
    ordered_levels = data_df[col].value_counts().index
    plt.figure(figsize=(8, 4))
    count_ax = sns.countplot(data=data_df, y=col, order=ordered_levels, palette='viridis')
    count_ax.set_title(f"Distribution of {col}")
    plt.tight_layout()
    plt.show()

## Handling Missing Data

In [None]:
# Identify the missing data: number of null (NaN) entries in each column

data_df.isnull().sum()

In [None]:
# Heatmap of the null mask: each cell is one (row, column) entry, so gaps show
# up as contrasting streaks in the affected columns.

import seaborn as sns
plt.figure(figsize=(12, 6))
heat_ax = sns.heatmap(data_df.isna(), cbar=False)
heat_ax.set_title("Missing Values Heatmap")
plt.show()

# Text summary: only columns with at least one null, largest count first.
missing_counts = data_df.isna().sum().sort_values(ascending=False)
print(missing_counts.loc[missing_counts.gt(0)])

In [None]:
# Null count per column, reduced to the columns that actually contain gaps
# and sorted from most to least affected.
missing_counts = data_df.isna().sum()
missing_cols = missing_counts.loc[missing_counts > 0].sort_values(ascending=False)

if not missing_cols.empty:
    # Bar chart of the affected columns; labels are rotated so long names fit.
    plt.figure(figsize=(10, 6))
    sns.barplot(x=missing_cols.index, y=missing_cols.values, palette='viridis')
    plt.xticks(rotation=45, ha='right')
    plt.ylabel('Number of Missing Values')
    plt.title('Missing Values Count by Feature')
    plt.tight_layout()
    plt.show()
else:
    print("No missing values in the dataset!")

In [None]:
# Drop the rows whose target variable (Mental_Health_Status) is missing.
# The frame is modified in place, so data_df shrinks for every cell that follows.
# NOTE(review): if the raw CSV stores "None" as a real category, pandas may have
# parsed it as NaN and these rows are a valid class -- confirm before dropping.

data_df.dropna(subset=['Mental_Health_Status'],inplace=True)

In [None]:
# Re-count nulls per column after dropping rows with a missing target
data_df.isnull().sum()

In [None]:
# Data shape (rows, columns) after removing the rows with a missing target variable

print(f"\nNumber of rows & columns after removing the missing values of the target variable: {data_df.shape}")

In [None]:
# Heatmap of the null mask after removing the rows with a missing target variable

import seaborn as sns
plt.figure(figsize=(12, 6))
heat_ax = sns.heatmap(data_df.isna(), cbar=False)
heat_ax.set_title('Missing Values Heatmap')
plt.show()


# Text summary: only columns that still contain nulls, largest count first.
missing_counts = data_df.isna().sum().sort_values(ascending=False)
print(missing_counts.loc[missing_counts.gt(0)])

In [None]:
# Null count per column, reduced to the columns that still contain gaps and
# sorted from most to least affected.
missing_counts = data_df.isna().sum()
missing_cols = missing_counts.loc[missing_counts > 0].sort_values(ascending=False)

if not missing_cols.empty:
    # Bar chart of the affected columns; labels are rotated so long names fit.
    plt.figure(figsize=(10, 6))
    sns.barplot(x=missing_cols.index, y=missing_cols.values, palette='viridis')
    plt.xticks(rotation=45, ha='right')
    plt.ylabel('Number of Missing Values')
    plt.title('Missing Values Count by Feature')
    plt.tight_layout()
    plt.show()
else:
    print("No missing values in the dataset!")

In [None]:
# Frequency of each Physical_Health_Issues value before imputation
# (value_counts skips NaN by default)
data_df['Physical_Health_Issues'].value_counts()

In [None]:
# X: every column except Physical_Health_Issues; y: the Mental_Health_Status column.
# NOTE(review): X still contains Mental_Health_Status, the same column assigned
# to y -- confirm X is not later used as model input for predicting y.
X = data_df.drop(columns=['Physical_Health_Issues'])
y = data_df['Mental_Health_Status']

In [None]:
# Names of the numeric columns of X
numerical_features = list(X.select_dtypes(include='number').columns)

In [None]:
# Working frame for the imputation step: only the numeric columns of X
test_dataframe = X.loc[:, numerical_features].copy()

In [None]:
# Attach the column to impute as `target` (pandas aligns it on the row index)
test_dataframe = test_dataframe.assign(target=data_df['Physical_Health_Issues'])

In [None]:
# Preview the working frame: numeric columns plus the `target` column
test_dataframe.head()

In [None]:
# Null count per column of the working frame
test_dataframe.isnull().sum()

In [None]:
# Split the working frame by whether `target` is present:
# `known` keeps rows with a value, `missing` keeps rows where it is NaN.
known = test_dataframe.dropna(subset=['target'])
missing = test_dataframe.loc[test_dataframe['target'].isnull()]

In [None]:
# Preview the rows whose `target` value is present
known.head()

In [None]:
# Training data for the imputer: labels are the known `target` values,
# predictors are all remaining columns of `known`.
mis_y = known['target']
mis_X = known.drop(columns=['target'])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the rows with a known target (mis_X -> mis_y).
# random_state is fixed so the forest is built the same way on every run.
model=RandomForestClassifier(random_state=42)
model.fit(mis_X,mis_y)

In [None]:
# Predict a `target` value for every row where it is missing, using the same
# predictor columns the model was fitted on.
X_unknown = missing.drop(columns=['target'])
predicted = model.predict(X_unknown)

In [None]:
# Write the predictions into the rows where `target` is still NaN. The mask
# selects the same rows, in the same order, as `missing`, so the `predicted`
# array lines up with them positionally.
test_dataframe.loc[test_dataframe['target'].isna(),'target']=predicted

In [None]:
# Remove the original (partly missing) column in place; the imputed version is
# re-attached in the next cell.
data_df.drop(columns=['Physical_Health_Issues'], inplace=True)

In [None]:
# Re-attach the imputed values as Physical_Health_Issues. Assignment aligns on
# the row index, which test_dataframe inherited from data_df.
data_df['Physical_Health_Issues']=test_dataframe['target']

In [None]:
# Null count per column after re-attaching the imputed Physical_Health_Issues
data_df.isnull().sum()

In [None]:
# Frequency of each Physical_Health_Issues value after imputation
data_df['Physical_Health_Issues'].value_counts()

In [None]:
# Final check: null count per column after dropping rows with a missing target
# and imputing Physical_Health_Issues

data_df.isnull().sum()

