# Group Pipeline

## Data Loading

In [None]:
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this also hides Deprecation/FutureWarnings raised by the libraries
# used below, so API deprecations go unnoticed; consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Import Libraries
import pandas as pd
import numpy as np

In [None]:
# Load the raw survey CSV directly from the project's GitHub repository.
# NOTE(review): pandas.read_csv parses the literal string "None" as NaN by default.
# A later cell compares Physical_Health_Issues against 'None', so confirm whether the
# "missing" values reported below are really a "None" category in the raw file.
DATA_URL = (
    "https://raw.githubusercontent.com/Dilum-Alahakoon/AIML-Project/"
    "refs/heads/main/data/raw/post_pandemic_remote_work_health_impact_2025.csv"
)
data = pd.read_csv(filepath_or_buffer=DATA_URL)

# read_csv already returns a DataFrame; wrapping it makes `data_df` a separate object from `data`.
data_df = pd.DataFrame(data=data)



In [None]:
# Display the first five rows
data_df.head()

In [None]:
# Display the last five rows
data_df.tail()

In [None]:
# Shape of the dataset: (rows, columns)

n_rows, n_cols = data_df.shape
print(f"Shape of the dataset: {data_df.shape}")
print(f"Number of rows : {n_rows}")
# The second entry of .shape is the column count; it was previously labelled as rows.
print(f"Number of columns : {n_cols}")

In [None]:
# Dataset Information
data_df.info()

In [None]:
# Descriptive statistics of numerical data in dataset
data_df.describe()

In [None]:
data_df['Mental_Health_Status'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# One histogram per numeric column; DataFrame.hist hands back the grid of Axes it drew on.
ax = data_df.hist(bins=50, figsize=(20, 10), color='skyblue', edgecolor='black', grid=False)

# Give every subplot the same background, dashed grid, tick size and bold title.
for subplot in ax.ravel():
    subplot.set_facecolor("#f9f9f9")
    subplot.grid(True, linestyle='--', alpha=0.5)
    for axis_name in ('x', 'y'):
        subplot.tick_params(axis=axis_name, labelsize=10)
    subplot.set_title(subplot.get_title(), fontsize=12, fontweight='bold')

# Figure-level title placed slightly above the grid, then render.
plt.suptitle("Feature Distributions", fontsize=20, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
categorical_features=data_df.select_dtypes(include=['object']).columns.tolist()
numerical_features=data_df.select_dtypes(include=[np.number]).columns.tolist()
print(f"Categorical Features: {categorical_features}\n")
print(f"Numerical Features: {numerical_features}")

In [None]:
for col in categorical_features:
  print(data_df[col].value_counts())
  print()

In [None]:
import seaborn as sns

# One horizontal count plot per categorical column, bars ordered from most to least frequent.
for col in categorical_features:
  category_order = data_df[col].value_counts().index
  plt.figure(figsize=(8,4))
  # Passing `palette` without `hue` is deprecated in seaborn >= 0.13. Map hue to the
  # same column (same ordering, legend hidden) so each bar keeps its own colour.
  sns.countplot(
      y=col,
      data=data_df,
      order=category_order,
      hue=col,
      hue_order=category_order,
      palette='viridis',
      legend=False,
  )
  plt.title(f"Distribution of {col}")
  plt.tight_layout()
  plt.show()

## Handling Missing Data

In [None]:
# Identifying the missing data

data_df.isnull().sum()

In [None]:
# Heatmap for missing values

import seaborn as sns
plt.figure(figsize=(12,6))
sns.heatmap(data_df.isnull(),cbar=False)
plt.title("Missing Values Heatmap")
plt.show()

missing_counts=data_df.isnull().sum().sort_values(ascending=False)
print(missing_counts[missing_counts>0])

In [None]:
# Bar graph of the number of missing values per feature (only features that have any)

missing_counts=data_df.isnull().sum()
missing_cols=missing_counts[missing_counts>0].sort_values(ascending=False)

if len(missing_cols)==0:
  print("No missing values in the dataset!")
else:
  plt.figure(figsize=(10,6))
  # Passing `palette` without `hue` is deprecated in seaborn >= 0.13, so colour the
  # bars by feature name explicitly and hide the redundant legend.
  sns.barplot(
      x=missing_cols.index,
      y=missing_cols.values,
      hue=missing_cols.index,
      palette='viridis',
      legend=False,
  )
  plt.xticks(rotation=45,ha='right')
  plt.ylabel('Number of Missing Values')
  plt.title('Missing Values Count by Feature')
  plt.tight_layout()
  plt.show()

In [None]:
# Drop the missing values of the target variable(Mental Helth Status)

data_df.dropna(subset=['Mental_Health_Status'],inplace=True)

In [None]:
data_df.isnull().sum()

In [None]:
# data shape after removing the missing values of the target variable

print(f"\nNumber of rows & columns after removing the missing values of the target variable: {data_df.shape}")

In [None]:
# Heatmap of the remaining missing cells after dropping rows without a target label

import seaborn as sns
plt.figure(figsize=(12,6))
sns.heatmap(data=data_df.isna(), cbar=False)
plt.title('Missing Values Heatmap')
plt.show()


# Columns that still contain gaps, largest count first
missing_counts = data_df.isna().sum().sort_values(ascending=False)
print(missing_counts.loc[missing_counts > 0])

In [None]:
# Bar graph of the remaining missing values after dropping rows without a target label

missing_counts=data_df.isnull().sum()
missing_cols=missing_counts[missing_counts>0].sort_values(ascending=False)

if len(missing_cols)==0:
  print("No missing values in the dataset!")
else:
  plt.figure(figsize=(10,6))
  # Passing `palette` without `hue` is deprecated in seaborn >= 0.13, so colour the
  # bars by feature name explicitly and hide the redundant legend.
  sns.barplot(
      x=missing_cols.index,
      y=missing_cols.values,
      hue=missing_cols.index,
      palette='viridis',
      legend=False,
  )
  plt.xticks(rotation=45,ha='right')
  plt.ylabel('Number of Missing Values')
  plt.title('Missing Values Count by Feature')
  plt.tight_layout()
  plt.show()

After removing the rows with a missing target value, the Physical_Health_Issues column still contains 203 missing values, so those missing values need to be handled next.

In [None]:
# getting the count of the physical health issues count

data_df['Physical_Health_Issues'].value_counts()

In [None]:
X=data_df.drop(labels=['Physical_Health_Issues'],axis=1)
y=data_df['Mental_Health_Status']

In [None]:
numerical_features=X.select_dtypes(include=[np.number]).columns.tolist()

In [None]:
# creating a new dataframe only using numerical features
test_dataframe=pd.DataFrame(X[numerical_features])

In [None]:
# to this dataframe attaching the target variable as Physical_Health_Issues
test_dataframe['target']=data_df['Physical_Health_Issues']

In [None]:
test_dataframe.head()

In [None]:
# calculating the missing values in the new dataframe
# here in the data frame in the target variable there are 208 missing values

test_dataframe.isnull().sum()

In [None]:
# here i split the newly formed dataset into two parts known part contain the rows with not missing values in the target variable and missing part it contains rows with missing values in the target variable

known=test_dataframe[test_dataframe['target'].notna()]
missing=test_dataframe[test_dataframe['target'].isna()]

In [None]:
known.head()

In [None]:
# here in the known column we are split into input and and predicting variable

mis_X=known.drop('target',axis=1)
mis_y=known['target']

In [None]:
# using randomforest classifier to train a model using abobe splitted data

from sklearn.ensemble import RandomForestClassifier

model=RandomForestClassifier(random_state=42)
model.fit(mis_X,mis_y)

In [None]:
# seperating the unknown part and drop the target variable
X_unknown=missing.drop('target',axis=1)

# predicting the values for X_unknown
predicted=model.predict(X_unknown)

In [None]:
# attaching the predicted part to the dataframe

test_dataframe.loc[test_dataframe['target'].isna(),'target']=predicted

In [None]:
data_df.drop('Physical_Health_Issues',axis=1,inplace=True)

In [None]:
data_df['Physical_Health_Issues']=test_dataframe['target']

In [None]:
data_df['Physical_Health_Issues'].value_counts()

In [None]:
# After removing the missing values

data_df.isnull().sum()



## Feature Creation

In [None]:
print(f"Shape of the dataset: {data_df.shape}")
print(f"Number of rows : {data_df.shape[0]}")
print(f"Number of columns : {data_df.shape[1]}")

In [None]:
# Viewing the basic details of the dataset
if 'data_df' in locals():
    print("\n--- First 5 rows of the dataset: ---")
    print(data_df.head())

    print("\n--- Dataset Info (Columns, Data Types, Non-null counts): ---")
    data_df.info()

In [None]:
data_df['Salary_Range'].value_counts()

In [None]:
data_df['Salary_Range_Clean'] = data_df['Salary_Range'].str.replace('$', '', regex=False).str.replace('K', '', regex=False).str.replace('+', '', regex=False)


In [None]:
data_df.info()


In [None]:
salary_split = data_df['Salary_Range_Clean'].str.split('-', expand=True)


In [None]:
min_salary = pd.to_numeric(salary_split[0])


In [None]:
max_salary = pd.to_numeric(salary_split[1]).fillna(min_salary)


In [None]:
data_df['Average_Salary'] = (min_salary + max_salary) / 2


In [None]:
data_df.info()


In [None]:
# Drop the original and temporary salary columns as they are no longer needed
data_df = data_df.drop(columns=['Salary_Range', 'Salary_Range_Clean'])

print("Successfully created 'Average_Salary' feature (with error fixed).")
print(data_df.head())

In [None]:
data_df['Physical_Issue_Count'] = data_df['Physical_Health_Issues'].apply(lambda x: 0 if x == 'None' else len(x.split(';')))


In [None]:
print("\nSuccessfully created 'Physical_Issue_Count' feature.")


In [None]:
print(data_df[['Physical_Health_Issues', 'Physical_Issue_Count']].head())


In [None]:
print(data_df.head(5))


In [None]:
data_df.columns


In [None]:
data_df.info()


## Encoding_Categorical_Variables

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import joblib

In [None]:
display(data_df.head())


In [None]:
categorical_features=data_df.select_dtypes(include=['object']).columns.tolist()


In [None]:
# information about categorical labels
for col in categorical_features:
  print(data_df[col].value_counts())
  print()

Next, we choose which categorical columns to encode with one-hot encoding and which to encode with label encoding.



In [None]:
one_hot_columns=['Physical_Health_Issues','Work_Arrangement','Job_Role','Region','Gender']
label_columns=['Burnout_Level']

In [None]:
# importing onehot encoder from sklearn
from sklearn.preprocessing import OneHotEncoder
onehot=OneHotEncoder(sparse_output=False)


# fitting the data to onehot encoder
onehot.fit(data_df[one_hot_columns])

In [None]:
# transform the data
transformed_onehot_data=onehot.transform(data_df[one_hot_columns])

In [None]:
# dropping the onehot columns from the origianl dataset
data_df.drop(one_hot_columns,axis=1,inplace=True)

In [None]:
# Column names for the encoded matrix, in the encoder's own output order ("<column>_<category>").
# get_feature_names_out is the encoder's API for this and replaces the hand-written loop
# over onehot.categories_; .tolist() keeps `onehot_feature_names` a plain list as before.
onehot_feature_names = onehot.get_feature_names_out(one_hot_columns).tolist()

# Wrap the encoded matrix in a DataFrame that shares data_df's index so rows line up on concat
transformed_one_hot_df = pd.DataFrame(transformed_onehot_data, columns=onehot_feature_names, index=data_df.index)

# Append the encoded columns to the right of the remaining original columns
data_df = pd.concat([data_df, transformed_one_hot_df], axis=1)

In [None]:
# doing the label encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

In [None]:
data_df.head()

In [None]:
#One Hot Encoded Histograms
for col in onehot_feature_names:
    plt.figure(figsize=(6,4))
    sns.histplot(data_df[col], bins=2, kde=False)
    plt.title(f'Histogram of {col} (One-Hot Encoded)')
    plt.xlabel("Value (0 = Absent, 1 = Present)")
    plt.ylabel("Count")
    plt.show()

In [None]:
# Sum each one-hot column
category_counts = data_df[onehot_feature_names].sum().sort_values(ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x=category_counts.index, y=category_counts.values)
plt.xticks(rotation=45, ha="right")
plt.title("Frequency of Categories (One-Hot Encoded)")
plt.xlabel("Category")
plt.ylabel("Count")
plt.show()

In [None]:
#Label Encoded Histogram
for col in label_columns:
    plt.figure(figsize=(6,4))
    sns.histplot(data_df[col], bins=len(data_df[col].unique()), kde=False)
    plt.title(f'Histogram of {col} (Label Encoded)')
    plt.xlabel(col)
    plt.ylabel('Count')
    plt.show()

##  Feature Selection

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

In [None]:
# Text preview of the first rows
print("First few rows of the dataset:", data_df.head(), sep="\n")

# Column names, dtypes and non-null counts
print("\nDataset Info:")
data_df.info()

In [None]:
# Dropping the 'Survey_Date' column
data_df = data_df.drop('Survey_Date', axis=1)

In [None]:
# Converting the 'Industry' column using one-hot encoding
# This creates multiple columns based on the industry type instead of a single column
data_df = pd.get_dummies(data_df, columns=['Industry'], drop_first=True)

In [None]:
# Encoding our target variable, the 'Mental_Health_Status' column
# Here, text values like 'Anxiety', 'Depression' are converted to numbers like 0, 1, 2
label_encoder = LabelEncoder()
data_df['Mental_Health_Status'] = label_encoder.fit_transform(data_df['Mental_Health_Status'])

In [None]:
# Viewing the dataset after preprocessing
print("\nDataset after preprocessing:")
print(data_df.head())
data_df.info()

In [None]:
# Separating the features (X) and the target variable (y)
X = data_df.drop('Mental_Health_Status', axis=1)
y = data_df['Mental_Health_Status']

In [None]:
# Separating the features (X) and the target variable (y)
X = data_df.drop('Mental_Health_Status', axis=1)
y = data_df['Mental_Health_Status']

# Select only the numerical columns for correlation analysis, excluding the 'Burnout_Level'
numerical_cols_for_corr = X.select_dtypes(include=np.number).columns

# Creating the correlation matrix of the numerical features
# Exclude 'Burnout_Level' for now as it will be label encoded later
corr_matrix = X[numerical_cols_for_corr].corr().abs()

# Selecting the upper triangle of the correlation matrix
# Because the matrix is symmetric, we only need to check one half
upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Identifying features with a correlation greater than 0.90 to add to the 'to_drop' list
to_drop = [column for column in upper_triangle.columns if any(upper_triangle[column] > 0.90)]

print(f"\nFeatures to drop due to high correlation ({len(to_drop)}):")
print(to_drop)

# Dropping these features from the DataFrame X
X_filtered = X.drop(columns=to_drop)

# Label encode the 'Burnout_Level' column in X_filtered
label_encoder = LabelEncoder()
X_filtered['Burnout_Level'] = label_encoder.fit_transform(X_filtered['Burnout_Level'])

print(f"\nNumber of remaining features after dropping: {X_filtered.shape[1]}")

# Visualizing the correlation matrix with a heatmap (Optional)
# Include 'Burnout_Level' now that it's encoded
plt.figure(figsize=(12, 10))
sns.heatmap(X_filtered.corr(), cmap='coolwarm')
plt.title("Correlation Heatmap of Features (After Dropping Highly Correlated)")
plt.show()

In [None]:
X_filtered.info()

In [None]:
# Uses `X_filtered` and `y` from the preceding cells

# Random forest with 100 trees, a fixed random_state and n_jobs=-1
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Fit on the correlation-filtered features against the target
model.fit(X_filtered, y)

# Per-feature importance scores reported by the fitted forest
importances = model.feature_importances_

# Pair each feature name with its score and rank from highest to lowest
unsorted_importance_df = pd.DataFrame({
    'feature': X_filtered.columns,
    'importance': importances
})
feature_importance_df = unsorted_importance_df.sort_values('importance', ascending=False)

print("\nFeature Importances (Ranked):")
print(feature_importance_df)

# The 15 highest-ranked rows feed both the chart and the selected-feature list
top_15_rows = feature_importance_df.head(15)

plt.figure(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=top_15_rows, palette='viridis', hue='feature', legend=False)
plt.title('Top 15 Most Important Features (from Random Forest)')
plt.xlabel('Importance Score')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

# Names of the 15 highest-ranked features
top_15_features = list(top_15_rows['feature'])

print("\nFinal list of the selected top 15 features:")
print(top_15_features)

In [None]:
# Uses `X_filtered` and `top_15_features` from the preceding cells.

# Restrict the correlation-filtered feature matrix to the 15 features ranked highest
# by the random forest. The result, X_final, is the feature set for the later steps;
# the target variable y is unchanged.
X_final = X_filtered.loc[:, top_15_features]

#--------------------------------------------------------------
# Subsequent steps should work with X_final and y
#--------------------------------------------------------------

print("Final Features to be used in the Model (X_final):")
print(f"Number of features: {X_final.shape[1]}")
print(X_final.head())

print("\nTarget Variable (y):")
print(y.head())

## Zscore_Scaling

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Continuous numeric columns to standardise
# NOTE(review): this rescales data_df after X_final was extracted in the previous section,
# so any of these columns present in X_final keep their unscaled values — confirm the
# intended order of the steps.
cols_to_scale = ["Age", "Hours_Per_Week", "Average_Salary"]

# Learn each column's mean and standard deviation ...
scaler = StandardScaler()
scaler.fit(data_df[cols_to_scale])

# ... then overwrite the columns with their standardised values
data_df[cols_to_scale] = scaler.transform(data_df[cols_to_scale])

print(data_df.head())

## Outlier__Removal

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Names of all numeric-dtype columns currently in the dataset
numerical_features = list(data_df.select_dtypes(include=[np.number]).columns)
print(f"Numerical features: \n{numerical_features}")

In [None]:
import math
sns.set(style="whitegrid", palette="pastel")

# Lay the box plots out on a 3-column grid with just enough rows for every numeric feature
n = len(numerical_features)
cols = 3
rows = math.ceil(n / cols)

fig, axes = plt.subplots(rows, cols, figsize=(cols * 5, rows * 5))

# Flatten the 2-D grid so each subplot can be addressed by a single running index
axes = axes.flatten()

box_colors = sns.color_palette("Set2")

for i, col in enumerate(numerical_features):
    # Fixed: the dataset is named `data_df`; the `df` used before is not defined in any
    # cell shown, so this line raised a NameError.
    sns.boxplot(y=data_df[col], ax=axes[i], color=box_colors[i % len(box_colors)])
    axes[i].set_title(f"{col} Distribution", fontsize=12, fontweight="bold")
    axes[i].set_xlabel("")
    axes[i].tick_params(axis='y', labelsize=10)

# Remove the unused trailing subplots. Starting from `n` rather than the loop variable
# gives the same range and does not depend on the loop having run.
for j in range(n, len(axes)):
    fig.delaxes(axes[j])

plt.tight_layout()
plt.show()

