In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import train_test_split

In [2]:
# save filepath to variable for easier access
mentalhealth_file_path = '/home/steph/mental-health-research-proj/student_mental_health.csv'
# read the data and store it in DataFrame (mentalhealth_data)
mentalhealth_data = pd.read_csv(mentalhealth_file_path)

In [3]:
mentalhealth_data

Unnamed: 0,Timestamp,Choose_your_gender,Age,What_is_your_course,Your_current_year_of_Study,What_is_your_CGPA,Marital_status,Do_you_have_Depression,Do_you_have_Anxiety,Do_you_have_Panic_attack,Did_you_seek_any_specialist_for_a_treatment
0,8/7/2020 12:02,1,18.0,Engineering,1,3,0,1,0,1,0
1,8/7/2020 12:04,0,21.0,Islamic education,2,3,0,0,1,0,0
2,8/7/2020 12:05,0,19.0,BIT,1,3,0,1,1,1,0
3,8/7/2020 12:06,1,22.0,Laws,3,3,1,1,0,0,0
4,8/7/2020 12:13,0,23.0,Mathemathics,4,3,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
96,13/07/2020 19:56:49,1,21.0,BCS,1,4,0,0,1,0,0
97,13/07/2020 21:21:42,0,18.0,Engineering,2,3,0,1,1,0,0
98,13/07/2020 21:22:56,1,19.0,Nursing,3,4,1,1,0,1,0
99,13/07/2020 21:23:57,1,23.0,Pendidikan Islam,4,4,0,0,0,0,0


In [4]:
# displays questionnaire questions
mentalhealth_data.columns

Index(['Timestamp', 'Choose_your_gender', 'Age', 'What_is_your_course',
       'Your_current_year_of_Study', 'What_is_your_CGPA', 'Marital_status',
       'Do_you_have_Depression', 'Do_you_have_Anxiety',
       'Do_you_have_Panic_attack',
       'Did_you_seek_any_specialist_for_a_treatment'],
      dtype='object')

In [5]:
# dropna drops data with missing values 
mentalhealth_data = mentalhealth_data.dropna(axis=0)

In [6]:
# gives a rundown of the data in the dataset
mentalhealth_data.describe(include ='all')

Unnamed: 0,Timestamp,Choose_your_gender,Age,What_is_your_course,Your_current_year_of_Study,What_is_your_CGPA,Marital_status,Do_you_have_Depression,Do_you_have_Anxiety,Do_you_have_Panic_attack,Did_you_seek_any_specialist_for_a_treatment
count,100,100.0,100.0,100,100.0,100.0,100.0,100.0,100.0,100.0,100.0
unique,91,,,49,,,,,,,
top,8/7/2020 12:39,,,BCS,,,,,,,
freq,3,,,18,,,,,,,
mean,,0.75,20.53,,1.98,3.31,0.16,0.35,0.34,0.33,0.06
std,,0.435194,2.49628,,0.994734,0.884148,0.368453,0.479372,0.476095,0.472582,0.238683
min,,0.0,18.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,0.75,18.0,,1.0,3.0,0.0,0.0,0.0,0.0,0.0
50%,,1.0,19.0,,2.0,3.0,0.0,0.0,0.0,0.0,0.0
75%,,1.0,23.0,,3.0,4.0,0.0,1.0,1.0,1.0,0.0


In [7]:
# popularity of each course (how many students are in each)
course_popularity = mentalhealth_data['What_is_your_course'].value_counts().reset_index().head()
print(course_popularity)

                index  What_is_your_course
0                 BCS                   18
1         Engineering                   17
2                 BIT                    9
3  Biomedical science                    4
4                 KOE                    4


In [8]:
# number of female and males
# 0 is male, 1 is female
count_gender = mentalhealth_data['Choose_your_gender'].value_counts().reset_index()
print(count_gender)

   index  Choose_your_gender
0      1                  75
1      0                  25


In [9]:
# mean age of both males and females
# 0 is male, 1 is female
mentalhealth_data['Age'].groupby(mentalhealth_data['Choose_your_gender']).mean().reset_index()

Unnamed: 0,Choose_your_gender,Age
0,0,20.52
1,1,20.533333


In [10]:
# number of students married vs not married
# 0 is not married, 1 is married
marital_status = mentalhealth_data['Marital_status'].value_counts().reset_index()
print(marital_status)

   index  Marital_status
0      0              84
1      1              16


In [11]:
# percentage of total students facing depression
depression = mentalhealth_data['Do_you_have_Depression'][mentalhealth_data['Do_you_have_Depression'] == 1].count()
depression_percentage = (depression/mentalhealth_data.shape[0])*100
print("%d percent of students in this study stated they have depression." %(depression_percentage))

35 percent of students in this study stated they have depression.


In [12]:
# percentage of students facing anxiety
anxiety = mentalhealth_data['Do_you_have_Anxiety'][mentalhealth_data['Do_you_have_Anxiety'] == 1].count()
anxiety_percentage = (anxiety/mentalhealth_data.shape[0])*100
print("%d percent of students in this study stated they have anxiety." %(anxiety_percentage))

34 percent of students in this study stated they have anxiety.


In [13]:
# percentage of students that endure panic attacks
panic_attacks = mentalhealth_data['Do_you_have_Panic_attack'][mentalhealth_data['Do_you_have_Panic_attack'] == 1].count()
panic_attacks_percentage = (panic_attacks/mentalhealth_data.shape[0])*100
print("%d percent of students in this study stated they experience panic attacks." %(panic_attacks))

33 percent of students in this study stated they experience panic attacks.


In [14]:
# percentage of students that have seeked treatment
treatment = mentalhealth_data['Did_you_seek_any_specialist_for_a_treatment'][mentalhealth_data['Did_you_seek_any_specialist_for_a_treatment'] == 1].count()
treatment_percentage = (treatment/mentalhealth_data.shape[0])*100
print("%d percent of students in this study stated they have seeked a specialist for treatment." %(treatment_percentage))

6 percent of students in this study stated they have seeked a specialist for treatment.


In [15]:
#creating a dataframe of students facing all three problems
students_facingall_df= mentalhealth_data[mentalhealth_data[['Do_you_have_Depression','Do_you_have_Anxiety','Do_you_have_Panic_attack']].nunique(axis=1)==1]
students_facingall_df[mentalhealth_data['Do you have Depression?']=='Yes']

KeyError: 'Do you have Depression?'

In [None]:
# prediction targets (what we are trying to predict with the data provided)
a = mentalhealth_data.Do_you_have_Depression
b = mentalhealth_data.Do_you_have_Anxiety
c = mentalhealth_data.Do_you_have_Panic_attack
d = mentalhealth_data.Did_you_seek_any_specialist_for_a_treatment

In [None]:
# columns to be used to make predictions that may affect the outcome
# removed "What is your course" to ensure data can be read correctly
mentalhealth_features = ['Choose_your_gender', 'Age', 'Your_current_year_of_Study', 'What_is_your_CGPA', 'Marital_status']
#took out course bc its string
X = mentalhealth_data[mentalhealth_features]
X.describe(include = 'all')

In [None]:
# defines the model - specify num for random state to ensure same results each run
mentalhealth_model = DecisionTreeRegressor(random_state = 1)
# Fit model
mentalhealth_model.fit(X,a)
#DecisionTreeRegressor(random_state = 1)

In [None]:
print("Making predictions for the following 5 samples:")
print(X.head())
print("Predictions are the following: ")
print(mentalhealth_model.predict(X.head()))

In [None]:
print("Mean in-sample absolute error: ")
predicted_depression_status = mentalhealth_model.predict(X)
# in-sample mean abs error
# for initial parameter estimation and model selection
mean_absolute_error(a, predicted_depression_status)

In [None]:
# this will split data into training and validation data for features & target 
# random_state = 1 to ensure same split every time script is run
train_X, val_X, train_a, val_a = train_test_split(X, a, random_state = 1)
# fit model
mentalhealth_model.fit(train_X, train_a)
# predicted depression values based on validation data:
val_predictions = mentalhealth_model.predict(val_X)
# out of sample mean abs error:
# used to evaluate forecasting performance
print("Mean out-of-sample absolute error: ")
print(mean_absolute_error(val_a, val_predictions))
print("The mean in-sample abs error is smaller than the mean out-of-sample abs error, so in-sample is better to use.")