In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Relative locations of the raw input directory and the processed output directory.
raw_path, processed_path = "../data/raw", "../data/processed"

# Full path of the CSV file placed inside the processed directory.
output_file = os.path.join(processed_path, "hybrid_dataset.csv")

In [3]:
# Build both input paths from raw_path (set in the configuration cell) so the
# raw-data directory is specified in exactly one place.
student_file = os.path.join(raw_path, "student_depression_dataset.csv")
reddit_file = os.path.join(raw_path, "reddit_mental_health.csv")

# Load the two raw CSV sources into separate DataFrames.
student_df = pd.read_csv(student_file)
reddit_df = pd.read_csv(reddit_file)

In [15]:
# Report (rows, columns) of the student frame.
# NOTE(review): this cell carries execution count 15, so its saved output
# (27901, 12) was produced after the column drop further down; the head()
# output of the following cell still shows 18 columns. On a fresh
# Restart & Run All this cell would report the pre-drop column count.
student_df.shape

(27901, 12)

In [4]:
# Preview the first rows of the student data as loaded from the CSV.
student_df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


In [5]:
# Remove columns that are not used as model features:
#   - id: an identifier that carries no useful information for predictions.
#   - City: not expected to add value to the model.
#   - Profession: the focus is on behavioural data rather than profession type.
#   - Degree: not considered relevant to mental health prediction.
# NOTE(review): Study Satisfaction and Job Satisfaction are dropped as well, but
# no rationale is recorded for them - TODO confirm this is intentional.
# NOTE(review): the cell reassigns student_df, so running it a second time
# raises KeyError because the columns are already gone.
student_df = student_df.drop(
    [
        "id",
        "City",
        "Profession",
        "Study Satisfaction",
        "Job Satisfaction",
        "Degree",
    ],
    axis="columns",
)

In [6]:
# Show column dtypes and non-null counts after the column drop.
# NOTE(review): the saved output lists Financial Stress as object even though
# head() shows numeric-looking values; presumably the column contains some
# non-numeric entries - verify before using it as a numeric feature.
student_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 12 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Gender                                 27901 non-null  object 
 1   Age                                    27901 non-null  float64
 2   Academic Pressure                      27901 non-null  float64
 3   Work Pressure                          27901 non-null  float64
 4   CGPA                                   27901 non-null  float64
 5   Sleep Duration                         27901 non-null  object 
 6   Dietary Habits                         27901 non-null  object 
 7   Have you ever had suicidal thoughts ?  27901 non-null  object 
 8   Work/Study Hours                       27901 non-null  float64
 9   Financial Stress                       27901 non-null  object 
 10  Family History of Mental Illness       27901 non-null  object 
 11  De

In [7]:
# Print the distinct values found in every column, one column per entry.
for column_name, column_values in student_df.items():
    print(f"{column_name} --> {column_values.unique()}\n")

Gender --> ['Male' 'Female']

Age --> [33. 24. 31. 28. 25. 29. 30. 27. 19. 20. 23. 18. 21. 22. 34. 32. 26. 39.
 35. 42. 36. 58. 49. 38. 51. 44. 43. 46. 59. 54. 48. 56. 37. 41.]

Academic Pressure --> [5. 2. 3. 4. 1. 0.]

Work Pressure --> [0. 5. 2.]

CGPA --> [ 8.97    5.9     7.03    5.59    8.13    5.7     9.54    8.04    9.79
  8.38    6.1     7.04    8.52    5.64    8.58    6.51    7.25    7.83
  9.93    8.74    6.73    5.57    8.59    7.1     6.08    5.74    9.86
  6.7     6.21    5.87    6.37    9.72    5.88    9.56    6.99    5.24
  9.21    7.85    6.95    5.86    7.92    9.66    8.94    9.71    7.87
  5.6     7.9     5.46    6.79    8.7     7.38    8.5     7.09    9.82
  8.89    7.94    9.11    6.75    7.53    9.49    9.01    7.64    5.27
  6.      9.44    5.75    7.51    9.05    6.38    8.95    9.88    5.32
  6.27    7.7     8.1     9.59    8.96    5.51    7.43    8.79    9.95
  5.37    6.86    8.32    9.74    5.66    7.48    8.23    8.81    6.03
  5.56    5.68    5.14    7.61

In [8]:
# Print the number of missing entries per column.
print(student_df.isna().sum())

Gender                                   0
Age                                      0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Sleep Duration                           0
Dietary Habits                           0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [9]:
# Preview the first rows again, now with the unused columns removed.
student_df.head()

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,5.0,0.0,8.97,'5-6 hours',Healthy,Yes,3.0,1.0,No,1
1,Female,24.0,2.0,0.0,5.9,'5-6 hours',Moderate,No,3.0,2.0,Yes,0
2,Male,31.0,3.0,0.0,7.03,'Less than 5 hours',Healthy,No,9.0,1.0,Yes,0
3,Female,28.0,3.0,0.0,5.59,'7-8 hours',Moderate,Yes,4.0,5.0,Yes,1
4,Female,25.0,4.0,0.0,8.13,'5-6 hours',Moderate,Yes,1.0,1.0,No,0


In [10]:
# Summary statistics, transposed so each column becomes one row.
# The saved output covers only the numeric-dtype columns; Financial Stress
# (object dtype in the info() output) does not appear.
student_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,27901.0,25.8223,4.905687,18.0,21.0,25.0,30.0,59.0
Academic Pressure,27901.0,3.141214,1.381465,0.0,2.0,3.0,4.0,5.0
Work Pressure,27901.0,0.00043,0.043992,0.0,0.0,0.0,0.0,5.0
CGPA,27901.0,7.656104,1.470707,0.0,6.29,7.77,8.92,10.0
Work/Study Hours,27901.0,7.156984,3.707642,0.0,4.0,8.0,10.0,12.0
Depression,27901.0,0.585499,0.492645,0.0,0.0,1.0,1.0,1.0


In [11]:
# Rescale CGPA onto a 5-point grading system: halve each value and keep
# two decimal places.
# NOTE(review): the column is overwritten in place, so running this cell a
# second time halves the values again.
student_df['CGPA'] = student_df['CGPA'].div(2).round(2)

# Preview the first rows after the CGPA adjustment.
student_df.head()

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,5.0,0.0,4.49,'5-6 hours',Healthy,Yes,3.0,1.0,No,1
1,Female,24.0,2.0,0.0,2.95,'5-6 hours',Moderate,No,3.0,2.0,Yes,0
2,Male,31.0,3.0,0.0,3.52,'Less than 5 hours',Healthy,No,9.0,1.0,Yes,0
3,Female,28.0,3.0,0.0,2.8,'7-8 hours',Moderate,Yes,4.0,5.0,Yes,1
4,Female,25.0,4.0,0.0,4.07,'5-6 hours',Moderate,Yes,1.0,1.0,No,0


In [12]:
# Convert Age from float to a whole-number integer column.
# Round first, then cast: astype(int) truncates the fractional part, so the
# previous cast-then-round order made round() a no-op and would have
# truncated any fractional age instead of rounding it.
student_df['Age'] = student_df['Age'].round().astype(int)

# Preview the first rows after the Age conversion.
student_df.head()

Unnamed: 0,Gender,Age,Academic Pressure,Work Pressure,CGPA,Sleep Duration,Dietary Habits,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33,5.0,0.0,4.49,'5-6 hours',Healthy,Yes,3.0,1.0,No,1
1,Female,24,2.0,0.0,2.95,'5-6 hours',Moderate,No,3.0,2.0,Yes,0
2,Male,31,3.0,0.0,3.52,'Less than 5 hours',Healthy,No,9.0,1.0,Yes,0
3,Female,28,3.0,0.0,2.8,'7-8 hours',Moderate,Yes,4.0,5.0,Yes,1
4,Female,25,4.0,0.0,4.07,'5-6 hours',Moderate,Yes,1.0,1.0,No,0
