In [1584]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchaudio
import numpy as np
import scipy
import sklearn
import pandas as pd
import PIL
import matplotlib
import tokenizers
import datasets
import transformers
import matplotlib.pyplot as plt
from collections import Counter
import datetime

# Data exploration

Here we explore each feature column and comment on how we are going to preprocess each one. We have a total of 140700 samples. The last feature is the label depression which we will train on.

## Id column



In [1585]:
# Load the dataset
data = pd.read_csv("./data/train.csv")

# Print examples of the 'id' column
print("Examples of 'id' column:")
print(data["id"].head())

print("")

print("We have " ,data["id"].count()," amount of samples")

Examples of 'id' column:
0    0
1    1
2    2
3    3
4    4
Name: id, dtype: int64

We have  140700  amount of samples


We will drop the id column since there is no correlation between this and the label.

## Name column

In [1586]:
# Print examples of the 'Name' column
print("Examples of 'Name' column:")
print(data["Name"].head())

print("")

print("Number of unique names:", data["Name"].nunique())

Examples of 'Name' column:
0    Aaradhya
1       Vivan
2      Yuvraj
3      Yuvraj
4        Rhea
Name: Name, dtype: object

Number of unique names: 422


We thought about dropping the name column since we thought that this may not have any correlation to the label. But we realised that there might be a correlation, so we decided to keep it. The correlation being that for example having a "unattractive" name can affect your life in a bad way. 


For pre-processing this we want to give the name a value of how unique it is.

## Gender

In [1587]:
# Print examples of the 'Name' column
print("Examples of 'Name' column:")
print(data["Gender"].head())

print("")

print("Number of unique genders:", data["Gender"].nunique())

Examples of 'Name' column:
0    Female
1      Male
2      Male
3      Male
4    Female
Name: Gender, dtype: object

Number of unique genders: 2


We will just encode male to 1 and female to 0.

## Age

In [1588]:
print("Examples of 'Age' column:")
print(data["Age"].head())

print("")

print("Number of unique ages:", data["Age"].nunique())

Examples of 'Age' column:
0    49.0
1    26.0
2    33.0
3    22.0
4    30.0
Name: Age, dtype: float64

Number of unique ages: 43


We will only change the number from float to integer.

# Academic pressure

In [1589]:
print("Examples of 'Academic pressure' column:")
print(data["Academic Pressure"].head())

print("")

print("Number of unique Academic pressures:", data["Academic Pressure"].nunique())

Examples of 'Academic pressure' column:
0    NaN
1    NaN
2    5.0
3    NaN
4    NaN
Name: Academic Pressure, dtype: float64

Number of unique Academic pressures: 5


For the "Academic Pressure" column we want to replace NaN with zeros.

# Family History of Mental Illnes

In [1590]:
print("Examples of 'Family History of Mental Illness' column:")
print(data["Family History of Mental Illness"].head())

print("")

print("Number of unique Family History of Mental Illnesses:", data["Family History of Mental Illness"].nunique())

Examples of 'Family History of Mental Illness' column:
0     No
1     No
2     No
3    Yes
4    Yes
Name: Family History of Mental Illness, dtype: object

Number of unique Family History of Mental Illnesses: 2


For this column we will change yes and no to 1 and 0

# Have you ever had suicidal thoughts ?

In [1591]:
print("Examples of 'Have you ever had suicidal thoughts ?' column:")
print(data["Have you ever had suicidal thoughts ?"].head())

print("")

print("Number of unique Have you ever had suicidal thoughts ?:", data["Have you ever had suicidal thoughts ?"].nunique())

Examples of 'Have you ever had suicidal thoughts ?' column:
0     No
1    Yes
2    Yes
3    Yes
4    Yes
Name: Have you ever had suicidal thoughts ?, dtype: object

Number of unique Have you ever had suicidal thoughts ?: 2


We will also change this column to binary no / yes to 0 / 1

# Working Professional or Student

In [1592]:
print("Examples of 'Working Professional or Student' column:")
print(data["Working Professional or Student"].head())

print("")

print("Number of unique Working Professional or Student:", data["Working Professional or Student"].nunique())

Examples of 'Working Professional or Student' column:
0    Working Professional
1    Working Professional
2                 Student
3    Working Professional
4    Working Professional
Name: Working Professional or Student, dtype: object

Number of unique Working Professional or Student: 2


In this column we will change "working professional" to 0 and "student" to 1.

# Profession

In [1593]:
print("Examples of 'Profession' column:")
print(data["Profession"].head())
print("")
print("NaN is student:")
print(data["Profession"][2])
print(data["Working Professional or Student"][2])

print("")

print("Number of unique Profession:", data["Profession"].nunique())



Examples of 'Profession' column:
0                Chef
1             Teacher
2                 NaN
3             Teacher
4    Business Analyst
Name: Profession, dtype: object

NaN is student:
nan
Student

Number of unique Profession: 64


There are occurences of NaN in this column, this happens when the sample is a student. We find it reasonable to insert "Student" in those slots. there are also occurences of NaN on samples that are not students, here we will insert "Missing Profession"

# Work pressure

In [1594]:
print("Examples of 'Work Pressure' column:")
print(data["Work Pressure"].head())

print("")

print("Number of unique Work Pressure:", data["Work Pressure"].unique())

Examples of 'Work Pressure' column:
0    5.0
1    4.0
2    NaN
3    5.0
4    1.0
Name: Work Pressure, dtype: float64

Number of unique Work Pressure: [ 5.  4. nan  1.  2.  3.]


We assume that a lower number means less work pressure, therefore We will change Nan to 0 because it is the students that has NaN on Work Pressure

# CGPA

In [1595]:
print("Examples of 'CGPA' column:")
print(data["CGPA"].head())
print(data["CGPA"].mean())

Examples of 'CGPA' column:
0     NaN
1     NaN
2    8.97
3     NaN
4     NaN
Name: CGPA, dtype: float64
7.658636192558608


We consider changing the NaN slots to the average of the dataset, which is 7.66

# Study Satisfaction and Job Satisfaction

In [1596]:
print("Examples of 'Study Satisfaction' and 'Job Satisfaction' column:")
print(data["Study Satisfaction"].head())
print(data["Job Satisfaction"].head())

Examples of 'Study Satisfaction' and 'Job Satisfaction' column:
0    NaN
1    NaN
2    2.0
3    NaN
4    NaN
Name: Study Satisfaction, dtype: float64
0    2.0
1    3.0
2    NaN
3    1.0
4    1.0
Name: Job Satisfaction, dtype: float64


You can tell that when there is missing a value in the study satisfaction column, there is a value in the same sample but on the job satisfaction problem. these two columns complete eachother, so we will combine these two columns into one "satisfaction" column.

# Sleep Duration

In [1597]:
sleep_counts = data["Sleep Duration"].value_counts()

print("Unique sleep sorted by count (most to least):")
for e, (duration, count) in enumerate(sleep_counts.items()):
    print(duration, count)
    if e == 6:
        break

Unique sleep sorted by count (most to least):
Less than 5 hours 38784
7-8 hours 36969
More than 8 hours 32726
5-6 hours 32142
3-4 hours 12
6-7 hours 8
4-5 hours 7


This one looks a bit tricky. since there are so few occurnces of other than the four most common inputs in this feature, we will change the numbers to a scale from 1 to 4, where 1 is "less than 5 hours" all the way to 4 which is "more than 8 hours". all the others will be set to 1.

# Dietary Habits

In [1598]:
diet_counts = data["Dietary Habits"].value_counts()

print("Unique sleep sorted by count (most to least):")
for e, (diet, count) in enumerate(diet_counts.items()):
    print(diet, count)
    if e == 6:
        break

Unique sleep sorted by count (most to least):
Moderate 49705
Unhealthy 46227
Healthy 44741
Yes 2
No 2
More Healthy 2
No Healthy 1


* Healthy -> 2
* Moderate -> 1
* Unhealthy -> 0
* The rest -> 1

# Degree

In [1599]:
degree_counts = data["Degree"].value_counts()
print(data["Degree"].nunique())

print("Unique sleep sorted by count (most to least):")
for e, (degree, count) in enumerate(degree_counts.items()):
    print(degree, count)
    if e == 10:
        break

115
Unique sleep sorted by count (most to least):
Class 12 14729
B.Ed 11691
B.Arch 8742
B.Com 8113
B.Pharm 5856
BCA 5739
M.Ed 5668
MCA 5234
BBA 5030
BSc 5027
MSc 4879


This one is particularly difficult because there are 115 unique degrees and there are not just a few degrees that covers the majority of the dataset either as the case is in the dietary habit feature. Our approach here is to somehow categorize the different degrees into bachelor, master, doctrine etc. and then give each of them a number from -1 to 4 based on the rank of the degree, going from "other" (which is the case where we can't define what degree it is) to professional.

# Pre-processing

Dropping the Id column

In [1600]:
def dropId(data):

    # Drop the 'id' column
    data = data.drop(columns=["id"])
    return data


Encoding the name column

In [1601]:
def name_freq(data):
    # Replace each name with its frequency in the dataset
    data["Name"] = data["Name"].map(data["Name"].value_counts())
    return data

Encoding gender column

In [1602]:
def gender_encode(data):
    data["Gender"] = data["Gender"].map({"Female": 0, "Male": 1 })
    return data

Age

In [1603]:
def age_integer(data):
    data["Age"] = data["Age"].astype(int)
    return data

Academic Preessure

In [1604]:
def academic_pressure(data):
    data["Academic Pressure"] = np.where(
        data["Academic Pressure"].isnull(), 0,
        data["Academic Pressure"]
    )
    return data

Family History of Mental Illnes

In [1605]:
def family_mental_illness(data):
    data["Family History of Mental Illness"] = np.where(
        data["Family History of Mental Illness"] == "No", 0,
        np.where(
            data["Family History of Mental Illness"] == "Yes", 1,
            data["Family History of Mental Illness"]
        )
    )
    return data

Have you ever had suicidal thoughts ?

In [1606]:
def suicidal_thoughts(data):
    data["Have you ever had suicidal thoughts ?"] = np.where(
        data["Have you ever had suicidal thoughts ?"] == "No", 0,
        np.where(
            data["Have you ever had suicidal thoughts ?"] == "Yes", 1,
            data["Have you ever had suicidal thoughts ?"]
        )
    )
    return data

Working Professional or Student

In [1607]:
def working_or_student(data):
    data["Working Professional or Student"] = np.where(
        data["Working Professional or Student"] == "Working Professional", 0,
        np.where(
            data["Working Professional or Student"] == "Student", 1,
            data["Working Professional or Student"]
        )
    )       
    return data

Profession

In [1608]:
def profession(data):
    # Count occurrences of each profession
    profession_counts = data["Profession"].value_counts()
    
    # Update the "Profession" column
    data["Profession"] = np.where(
        data["Profession"].isnull() & (data["Working Professional or Student"] == 1),
        "Student",
        np.where(
            data["Profession"].isnull() & (data["Working Professional or Student"] == 0), 
            "Missing Profession", 
            np.where(
                data["Profession"].map(profession_counts) < 6,  # If occurrence < 10, set to "Missing Profession"
                "Missing Profession",
                data["Profession"]
            )
        )
    )
    return data


Work Pressure

In [1609]:
def work_pressure(data):
    data["Work Pressure"] = np.where(
        data["Work Pressure"].isnull(), 0.0, data["Work Pressure"]
    )
    return data

CGPA

In [1610]:
def cgpa(data):
    data["CGPA"] = np.where(
        data["CGPA"].isnull(), data["CGPA"].mean(), data["CGPA"]
    )
    return data

Study- and Job Satisfaction

In [1611]:
def satisfaction(data):
    satisfaction = np.where(
        data["Study Satisfaction"].notnull(), data["Study Satisfaction"], np.where(
            data["Job Satisfaction"].notnull(), data["Job Satisfaction"], 0
        )
    )
    data["Job Satisfaction"] = satisfaction
    data = data.drop(columns="Study Satisfaction")
    data.rename(columns={"Job Satisfaction": "Satisfaction"}, inplace=True)
    return data

Sleep Duration

In [1612]:
def sleep_duration(data):
    data["Sleep Duration"] = np.where(
        data["Sleep Duration"] == "Less than 5 hours", 1, np.where(
            data["Sleep Duration"] == "5-6 hours", 2, np.where(
                data["Sleep Duration"] == "7-8 hours", 3, np.where(
                    data["Sleep Duration"] == "More than 8 hours", 4, 1
                )
            )
        )
    )
    return data

Dietary Habits

In [1613]:
def diet(data):
    data["Dietary Habits"] = np.where(
        data["Dietary Habits"] == "Healthy", 2, np.where(
            data["Dietary Habits"] == "Moderate", 1, np.where(
                data["Dietary Habits"] == "Unhealthy", 0, 1
            )
        )
    )
    return data

In [1614]:
def categorize_degree(degree):

    """
    Categorizes the differnet degrees into Schoo, Bachelors, Masters, Doctrate, professional and other
    """
    
    degree = str(degree).strip().lower()

    # School-Level
    if "class 11" in degree or "class 12" in degree:
        return "School"

    # Bachelor's Degrees
    bachelor_keywords = ["b.", "b ", "bachelor", "bcom", "bsc", "btech", "be", "bba", "bca", "ba", "b.ed", "b.arch"]
    if any(keyword in degree for keyword in bachelor_keywords):
        return "Bachelors"

    # Master's Degrees
    master_keywords = ["m.", "m ", "master", "mba", "mcom", "msc", "mtech", "me", "mca", "m.ed", "mpharm", "m.arch"]
    if any(keyword in degree for keyword in master_keywords):
        return "Masters"

    # Doctoral Degrees
    if "phd" in degree:
        return "Doctorate"

    # Professional Degrees
    professional_keywords = ["mbbs", "md", "llb", "llm"]
    if any(keyword in degree for keyword in professional_keywords):
        return "Professional"

    # Unknown / Noisy Data
    return "Other"

def degree(data):
    
    # Apply categorization
    data["Degree"] = data["Degree"].apply(categorize_degree)


    # Define an ordinal mapping
    degree_mapping = {
        "School": 0,
        "Bachelors": 1,
        "Masters": 2,
        "Professional": 3,
        "Doctorate": 4,
        "Other": -1  # Keep 'Other' at the highest level or remove it depending on the approach
    }

    # Apply mapping
    data["Degree"] = data["Degree"].map(degree_mapping)

    return data 



Categorizing Professions

In [1615]:
# Function to categorize professions into broader industry groups
def categorize_profession(profession):
    if pd.isna(profession) or "missing" in profession.lower():
        return "Unknown"
    
    profession = profession.lower().strip()

    # Technology
    tech_keywords = ["software", "data scientist", "ux/ui", "developer", "engineer"]
    if any(keyword in profession for keyword in tech_keywords):
        return "Technology"

    # Finance
    finance_keywords = ["accountant", "financial", "investment", "banker", "analyst"]
    if any(keyword in profession for keyword in finance_keywords):
        return "Finance"

    # Healthcare
    healthcare_keywords = ["doctor", "pharmacist", "dentist", "nurse"]
    if any(keyword in profession for keyword in healthcare_keywords):
        return "Healthcare"

    # Education
    education_keywords = ["teacher", "professor", "educational"]
    if any(keyword in profession for keyword in education_keywords):
        return "Education"

    # Engineering
    engineering_keywords = ["civil engineer", "mechanical engineer", "architect"]
    if any(keyword in profession for keyword in engineering_keywords):
        return "Engineering"

    # Marketing & Sales
    marketing_keywords = ["marketing", "sales", "digital marketer", "content writer"]
    if any(keyword in profession for keyword in marketing_keywords):
        return "Marketing/Sales"

    # Trade & Manual Work
    trade_keywords = ["electrician", "plumber", "chef", "mechanic"]
    if any(keyword in profession for keyword in trade_keywords):
        return "Trade"

    # Legal
    legal_keywords = ["lawyer", "judge", "legal"]
    if any(keyword in profession for keyword in legal_keywords):
        return "Legal"

    # Consulting
    consulting_keywords = ["consultant", "business analyst"]
    if any(keyword in profession for keyword in consulting_keywords):
        return "Consulting"

    # Other / Unknown
    return "Other"

def Prefession_categorization(data):
    # Apply categorization
    data["Profession_Category"] = data["Profession"].apply(categorize_profession)

    # Encoding Options

    ## Option 1: Ordinal Encoding (useful if there's a natural order)
    profession_mapping = {
        "Unknown": 0,
        "Other": 1,
        "Trade": 2,
        "Marketing/Sales": 3,
        "Consulting": 4,
        "Education": 5,
        "Finance": 6,
        "Engineering": 7,
        "Healthcare": 8,
        "Legal": 9,
        "Technology": 10
    }

    data["Profession_Encoded"] = data["Profession_Category"].map(profession_mapping)

    data["Profession"] = data["Profession_Encoded"]
    return data
    ## Option 2: One-Hot Encoding (better for categorical data)
    # data = pd.get_dummies(data, columns=["Profession_Category"], prefix="Profession")


Pre-processing function:

In [1616]:
def pre_processing(data):

    data = dropId(data)
    data = name_freq(data)
    data = gender_encode(data)
    data = age_integer(data)
    data = academic_pressure(data)
    data = family_mental_illness(data)
    data = suicidal_thoughts(data)
    data = working_or_student(data)
    data = profession(data)
    data = work_pressure(data)
    data = cgpa(data)
    data = satisfaction(data)
    data = sleep_duration(data)
    data = diet(data)
    data = degree(data)
    data = Prefession_categorization(data)
    return data


data = pre_processing(data)

print(data.head())


name_counts = data["Degree"].value_counts()

print("Unique names sorted by count (most to least):")
for name, count in name_counts.items():
    print(f"{name}: {count}")

print(f"Total unique names: {len(name_counts)}")

data.to_csv("preprocessed_data.csv", index=False)

   Name  Gender  Age           City Working Professional or Student  \
0  2045       0   49       Ludhiana                               0   
1   963       1   26       Varanasi                               0   
2   730       1   33  Visakhapatnam                               1   
3   730       1   22         Mumbai                               0   
4   499       0   30         Kanpur                               0   

   Profession  Academic Pressure  Work Pressure      CGPA  Satisfaction  \
0           2                0.0            5.0  7.658636           2.0   
1           5                0.0            4.0  7.658636           3.0   
2           1                5.0            0.0  8.970000           2.0   
3           5                0.0            5.0  7.658636           1.0   
4           6                0.0            1.0  7.658636           1.0   

   Sleep Duration  Dietary Habits  Degree  \
0               4               2      -1   
1               1               