# IMPORT DEPENDENCIES

In [1]:
import os
import pandas as pd
import numpy as np

#### CREATING FILE PATHS

In [3]:
# Directory where the original, unmodified data files are stored.
raw_path = "../data/raw"
# Directory where the cleaned, transformed will be saved for use by the model training pipeline.
processed_path = "../data/processed"
# Constructs the full, absolute path for the final processed dataset file.
output_file = os.path.join(processed_path, "hybrid_dataset.csv")

In [5]:
# path to the student dataset file.
student_file = "../data/raw/student_depression_dataset.csv"
# path to the Reddit mental health dataset file.
reddit_file =  "../data/raw/reddit_mental_health.csv"
student_df = pd.read_csv(student_file)
reddit_df = pd.read_csv(reddit_file)

# DATA CLEANING AND PREPROCESSING (STUDENT DATA)

In [7]:
# Display summary of the DataFrame, including the data types of each column,
# the number of non-null values, and memory usage. 
# to check for missing data and confirm column data types.
student_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [9]:
#shape of the data 
student_df.shape

(27901, 18)

In [11]:
# the first 5 rows of the data set
student_df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


#### Dropping columns that are not needed

In [13]:
# The following columns are being dropped from the DataFrame as they are not necessarilt contributing model's mental health risk prediction capabilities.
# Dropping the primary identifier as it provides no predictive information(id)
# Dropping geographical data, as it won't add value to the model.(city)
# Dropping profession because the focus is on behavioral data, not profession type.(profession)
# Dropping satisfaction scores that are less relevant than behavioral/symptom data.(job and study satisfaction)
# Dropping educational level, which is considered not necessarily relevant to the prediction task.(degree)

student_df = student_df.drop(columns=["id","City","Profession","Study Satisfaction","Job Satisfaction","Degree"])


In [15]:
# this is removed to avoid leakage, it is highly correlated to depression
student_df = student_df.drop(columns= "Have you ever had suicidal thoughts ?")

In [17]:
# Iterate through every column remaining in the 'student_df' DataFrame.
# Print the column name and all the unique values found in that column.
for col in student_df.columns:
    print(f"{col} --> {student_df[col].unique()}\n")

Gender --> ['Male' 'Female']

Age --> [33. 24. 31. 28. 25. 29. 30. 27. 19. 20. 23. 18. 21. 22. 34. 32. 26. 39.
 35. 42. 36. 58. 49. 38. 51. 44. 43. 46. 59. 54. 48. 56. 37. 41.]

Academic Pressure --> [5. 2. 3. 4. 1. 0.]

Work Pressure --> [0. 5. 2.]

CGPA --> [ 8.97    5.9     7.03    5.59    8.13    5.7     9.54    8.04    9.79
  8.38    6.1     7.04    8.52    5.64    8.58    6.51    7.25    7.83
  9.93    8.74    6.73    5.57    8.59    7.1     6.08    5.74    9.86
  6.7     6.21    5.87    6.37    9.72    5.88    9.56    6.99    5.24
  9.21    7.85    6.95    5.86    7.92    9.66    8.94    9.71    7.87
  5.6     7.9     5.46    6.79    8.7     7.38    8.5     7.09    9.82
  8.89    7.94    9.11    6.75    7.53    9.49    9.01    7.64    5.27
  6.      9.44    5.75    7.51    9.05    6.38    8.95    9.88    5.32
  6.27    7.7     8.1     9.59    8.96    5.51    7.43    8.79    9.95
  5.37    6.86    8.32    9.74    5.66    7.48    8.23    8.81    6.03
  5.56    5.68    5.14    7.61

In [19]:
# is there any null values in the data?
print(student_df.isnull().sum())

Gender                              0
Age                                 0
Academic Pressure                   0
Work Pressure                       0
CGPA                                0
Sleep Duration                      0
Dietary Habits                      0
Work/Study Hours                    0
Financial Stress                    0
Family History of Mental Illness    0
Depression                          0
dtype: int64


In [21]:
# Calculate and display the descriptive (summary) statistics of the numerical columns in the DataFrame.
student_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,27901.0,25.8223,4.905687,18.0,21.0,25.0,30.0,59.0
Academic Pressure,27901.0,3.141214,1.381465,0.0,2.0,3.0,4.0,5.0
Work Pressure,27901.0,0.00043,0.043992,0.0,0.0,0.0,0.0,5.0
CGPA,27901.0,7.656104,1.470707,0.0,6.29,7.77,8.92,10.0
Work/Study Hours,27901.0,7.156984,3.707642,0.0,4.0,8.0,10.0,12.0
Depression,27901.0,0.585499,0.492645,0.0,0.0,1.0,1.0,1.0


###### CGPA CONVERTION TO 5 POINT GRADING SYSTEM

In [23]:
# Converting the CGPA from its original scale 10-point to a 5-point grading scale by dividing the value by 2 and rounded to 2 decimal places 
student_df['CGPA'] = (student_df['CGPA'] / 2).round(2)

# Display the first few rows after CGPA adjustment
student_df.head()

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Sleep Duration,Dietary Habits,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,5.0,0.0,4.49,'5-6 hours',Healthy,3.0,1.0,No,1
1,Female,24.0,2.0,0.0,2.95,'5-6 hours',Moderate,3.0,2.0,Yes,0
2,Male,31.0,3.0,0.0,3.52,'Less than 5 hours',Healthy,9.0,1.0,Yes,0
3,Female,28.0,3.0,0.0,2.8,'7-8 hours',Moderate,4.0,5.0,Yes,1
4,Female,25.0,4.0,0.0,4.07,'5-6 hours',Moderate,1.0,1.0,No,0


In [25]:
# convert the data type of the 'Age' column from float to integer .
student_df['Age'] = student_df['Age'].astype(int)
# remove the decimal places
student_df['Age'] = (student_df['Age']).round()

student_df.head()


Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Sleep Duration,Dietary Habits,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33,5.0,0.0,4.49,'5-6 hours',Healthy,3.0,1.0,No,1
1,Female,24,2.0,0.0,2.95,'5-6 hours',Moderate,3.0,2.0,Yes,0
2,Male,31,3.0,0.0,3.52,'Less than 5 hours',Healthy,9.0,1.0,Yes,0
3,Female,28,3.0,0.0,2.8,'7-8 hours',Moderate,4.0,5.0,Yes,1
4,Female,25,4.0,0.0,4.07,'5-6 hours',Moderate,1.0,1.0,No,0


In [27]:
# Define a mapping dictionary to convert the string labels "Yes" and "No" into their binary numerical representations (1 and 0)
yes_no_map = {"Yes": 1, "No": 0}
#Apply the mapping to the "Family History of Mental Illness" column.
student_df["Family History of Mental Illness"] =student_df["Family History of Mental Illness"].map(yes_no_map)

# DATA CLEANING AND PREPROCESSING (REDDIT DATA)

In [29]:
# Display the first five rows of the data
reddit_df.head()

Unnamed: 0.1,Unnamed: 0,text,title,target
0,0,Welcome to /r/depression's check-in post - a p...,"Regular check-in post, with information about ...",1
1,1,We understand that most people who reply immed...,Our most-broken and least-understood rules is ...,1
2,2,Anyone else just miss physical touch? I crave ...,"I haven’t been touched, or even hugged, in so ...",1
3,3,I’m just so ashamed. Everyone and everything f...,Being Depressed is Embarrassing,1
4,4,I really need a friend. I don't even have a si...,I'm desperate for a friend and to feel loved b...,1


In [31]:
# Display summary of the DataFrame, including the data types of each column,
# the number of non-null values, and memory usage. 
# to check for missing data and confirm column data types.
reddit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5957 entries, 0 to 5956
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5957 non-null   int64 
 1   text        5607 non-null   object
 2   title       5957 non-null   object
 3   target      5957 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 186.3+ KB


In [33]:
# the shape of the dataset(rows and columns)
reddit_df.shape

(5957, 4)

In [35]:
# Calculate and display the descriptive (summary) statistics of the numerical columns in the DataFrame.
reddit_df.describe()

Unnamed: 0.1,Unnamed: 0,target
count,5957.0,5957.0
mean,595.230653,2.002182
std,344.00945,1.412252
min,0.0,0.0
25%,297.0,1.0
50%,595.0,2.0
75%,893.0,3.0
max,1201.0,4.0


In [37]:
# checl for null values in the dataset
reddit_df.isnull().sum()

Unnamed: 0      0
text          350
title           0
target          0
dtype: int64

In [39]:
#the null 350 null values was dropped because it less than 1% of the data
reddit_df=reddit_df.dropna()

In [41]:
reddit_df.shape

(5607, 4)

In [43]:
# unnammed column dropped
reddit_df = reddit_df.drop(columns="Unnamed: 0")

In [45]:
reddit_df.isnull().sum()

text      0
title     0
target    0
dtype: int64

In [47]:
reddit_df.columns

Index(['text', 'title', 'target'], dtype='object')

# Text Normalization (Lowercase)

In [51]:
# the columns is lowercased for consistent matching
reddit_df["text"] = reddit_df["text"].astype(str).str.strip().str.lower()
reddit_df["title"] = reddit_df["title"].astype(str).str.strip().str.lower()

# we keep only depression class[1] from the entire dataset since we are only concern with depression only
# Then we Filter the DataFrame to keep only the rows where the binary label is equal to 1. 
# This action ensures that 'reddit_df' contains only records classified as the 
# target class (depression) for further text processing.
reddit_df = reddit_df[reddit_df[reddit_df.columns[2]] == 1]


In [53]:
# Create a new column named "binary_label" in the 'reddit_df' DataFrame.
# Since the previous filtering step ensured that only records classified as the depression class were retained,
# we assign the value 1 to every row in this new column.
reddit_df["binary_label"] = 1  # all retained records are depressed

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reddit_df["binary_label"] = 1  # all retained records are depressed


# TEXT CLEANING FOR TRANSFORMER MODEL

In [55]:
# Text cleaning for transformer model
import re
# function to remove Urls, tags..
def clean_reddit_text(t: str) -> str:
    if not isinstance(t, str):
        return ""
    # this remove URLs
    t = re.sub(r"http\S+|www\.\S+", " ", t)
    # this remove Reddit user / sub tags
    t = re.sub(r"r/\w+", " ", t)
    t = re.sub(r"u/\w+", " ", t)
    # Normalize multiple whitespace characters (tabs, newlines, multiple spaces) into a single space.
    t = re.sub(r"\s+", " ", t)
    # this remove any leading or trailing whitespace and return the cleaned text.
    return t.strip()
    
# Apply the cleaning function to the first column (text) of 'reddit_df' and store the result in a new column called 'clean_text'.
reddit_df["clean_text"] = reddit_df[reddit_df.columns[0]].apply(clean_reddit_text)

#Create a new DataFrame ('reddit_clean') containing only the prepared features: the 'clean_text' and the assigned 'binary_label' (which is always 1).
# The index is reset and dropped to create a clean index.
reddit_clean = reddit_df[["clean_text", "binary_label"]].reset_index(drop=True)
# dispaly the first 5 rows of the new dataframe
reddit_df[["clean_text", "binary_label"]].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reddit_df["clean_text"] = reddit_df[reddit_df.columns[0]].apply(clean_reddit_text)


Unnamed: 0,clean_text,binary_label
0,welcome to / 's check-in post - a place to tak...,1
1,we understand that most people who reply immed...,1
2,anyone else just miss physical touch? i crave ...,1
3,i’m just so ashamed. everyone and everything f...,1
4,i really need a friend. i don't even have a si...,1


# Text serialisation and encoding


In [57]:
# function to convert each student row into a descriptive sentence
# we Define a function to convert the numerical and categorical features of a single student row into a coherent, descriptive natural language sentence.
def serialize_student_row(row):
    return (
        # Start the sentence with Age, Gender, and the normalized CGPA.
        f"A {row['Age']}-year-old {row['Gender']} student with CGPA {row['CGPA']}, "
        # Include pressure and sleep data points.
        f"{row['Academic Pressure']} academic pressure, {row['Sleep Duration']} of sleep, "
        # Conclude with financial stress and the binary family history status.
        f"financial stress level {row['Financial Stress']}, and family history of mental illness: {row['Family History of Mental Illness']}"
    )

# we Apply the serialization function to each row in the student dataframe
# Apply the serialization function row-wise (axis=1) across the entire DataFrame.
# we store The descriptive sentence for each student in a new column called 'serialized_text'.
student_df['serialized_text'] = student_df.apply(serialize_student_row, axis=1)
# Display the first few serialized sentences alongside the original depression label to verify the output and ensure the features are correctly formatted for the model.
student_df[['serialized_text', 'Depression']].head()

Unnamed: 0,serialized_text,Depression
0,"A 33-year-old Male student with CGPA 4.49, 5.0...",1
1,"A 24-year-old Female student with CGPA 2.95, 2...",0
2,"A 31-year-old Male student with CGPA 3.52, 3.0...",0
3,"A 28-year-old Female student with CGPA 2.8, 3....",1
4,"A 25-year-old Female student with CGPA 4.07, 4...",0


In [65]:
# we dropped the columns not needed for prediction
reddit_df = reddit_df.drop(columns=["text","title","target"])

In [59]:
# we save the cleaned student data to processed folder
student_df.to_csv("../data/processed/cleaned_student_dataset.csv", index=False)

In [67]:
# we save the cleaned reddit data to processed folder
reddit_df.to_csv("../data/processed/cleaned_reddit_dataset.csv", index=False)

In [69]:
print(reddit_df.shape)
reddit_df.head()

(1202, 2)


Unnamed: 0,binary_label,clean_text
0,1,welcome to / 's check-in post - a place to tak...
1,1,we understand that most people who reply immed...
2,1,anyone else just miss physical touch? i crave ...
3,1,i’m just so ashamed. everyone and everything f...
4,1,i really need a friend. i don't even have a si...
