In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

ModuleNotFoundError: No module named 'plotly'

In [None]:
# Location of the survey CSV, held in a variable so it only has to be changed in one place.
mentalhealth_file_path = '/home/steph/mental-health-research-proj/student_mental_health.csv'
# Load the CSV into the DataFrame used by the rest of the notebook (mentalhealth_data).
mentalhealth_data = pd.read_csv(filepath_or_buffer=mentalhealth_file_path)

In [None]:
# Bare expression: the notebook renders the loaded DataFrame as this cell's output.
mentalhealth_data

In [None]:
# Show the DataFrame's column labels.
# NOTE(review): presumably each column corresponds to one questionnaire question - confirm against the CSV.
mentalhealth_data.columns

In [None]:
# Discard every row (axis='index') that contains at least one missing value.
# The cleaned frame replaces mentalhealth_data for all later cells.
mentalhealth_data = mentalhealth_data.dropna(axis='index')

In [None]:
# Summary statistics for every column; include='all' asks describe() to cover
# all columns rather than only the numeric ones.
mentalhealth_data.describe(include ='all')

In [None]:
# Course popularity: number of students per course, limited to the five
# most common courses (value_counts sorts by frequency, highest first).
course_counts = mentalhealth_data['What_is_your_course'].value_counts()
course_popularity = course_counts.head().reset_index()
print(course_popularity)

In [None]:
# How many respondents there are of each gender.
# Encoding: 0 is male, 1 is female
gender_column = mentalhealth_data['Choose_your_gender']
count_gender = gender_column.value_counts().reset_index()
print(count_gender)

In [None]:
# Average age within each gender group, shown as a two-column table.
# Encoding: 0 is male, 1 is female
mentalhealth_data.groupby('Choose_your_gender')['Age'].mean().reset_index()

In [None]:
# How many students are married versus not married.
# Encoding: 0 is not married, 1 is married
marital_column = mentalhealth_data['Marital_status']
marital_status = marital_column.value_counts().reset_index()
print(marital_status)

In [None]:
# Share of all students in the study who report depression (value 1).
has_depression = mentalhealth_data['Do_you_have_Depression'] == 1
# Summing a boolean mask counts the rows where it is True.
depression = has_depression.sum()
depression_percentage = (depression / len(mentalhealth_data)) * 100
# %d truncates the percentage to a whole number.
print("%d percent of students in this study stated they have depression." % depression_percentage)

In [None]:
# Share of all students in the study who report anxiety (value 1).
has_anxiety = mentalhealth_data['Do_you_have_Anxiety'] == 1
# Summing a boolean mask counts the rows where it is True.
anxiety = has_anxiety.sum()
anxiety_percentage = (anxiety / len(mentalhealth_data)) * 100
# %d truncates the percentage to a whole number.
print("%d percent of students in this study stated they have anxiety." % anxiety_percentage)

In [None]:
# Percentage of students that endure panic attacks (value 1).
panic_attacks = mentalhealth_data['Do_you_have_Panic_attack'][mentalhealth_data['Do_you_have_Panic_attack'] == 1].count()
panic_attacks_percentage = (panic_attacks/mentalhealth_data.shape[0])*100
# Report the percentage, not the raw head count: the sentence says "percent",
# so interpolating `panic_attacks` here would print a misleading number.
print("%d percent of students in this study stated they experience panic attacks." %(panic_attacks_percentage))

In [None]:
# Percentage of students in the entire study that sought treatment.
# Rows with value 1 are counted; the notebook treats 1 as "yes" for this column
# (NOTE(review): confirm the encoding against the source CSV).
treatment_total = mentalhealth_data['Did_you_seek_any_specialist_for_a_treatment'][mentalhealth_data['Did_you_seek_any_specialist_for_a_treatment'] == 1].count()
treatment_total_percentage = (treatment_total/mentalhealth_data.shape[0])*100
# The figure is the share that DID seek treatment, so the sentence must say so
# (the previous wording claimed the opposite group).
print("%d percent of students in the entire study stated they sought a specialist for treatment." %(treatment_total_percentage))

In [None]:
# Creating a dataframe of students facing all three problems
# (depression, anxiety and panic attacks all answered with 1).
#
# A row-wise `nunique(axis=1) == 1` test is not enough on its own: it only checks
# that the three answers are equal, so it also keeps students who answered 0 to
# all three. Requiring every column to equal 1 selects exactly the intended group,
# and the filtered frame is what gets stored for the following cells.
condition_columns = ['Do_you_have_Depression', 'Do_you_have_Anxiety', 'Do_you_have_Panic_attack']
facing_all_mask = (mentalhealth_data[condition_columns] == 1).all(axis=1)
students_facingall_df = mentalhealth_data[facing_all_mask]
# Display the selected students as the cell output.
students_facingall_df

In [None]:
# Among the students in students_facingall_df, the share who sought a
# specialist for treatment (value 1).
sought_help = students_facingall_df['Did_you_seek_any_specialist_for_a_treatment'] == 1
# Summing a boolean mask counts the rows where it is True.
treatment = sought_help.sum()
treatment_percentage = (treatment / len(students_facingall_df)) * 100
# %d truncates the percentage to a whole number.
print("%d percent of students enduring all three in this study stated they seeked treatment." % treatment_percentage)

In [None]:
# Histogram of the gender column across the whole study.
# Encoding reminder:
#   0 - male
#   1 - female
fig = plt.figure(figsize=(10, 10))
# Theme is applied after the figure exists but before the axes are created.
sns.set_theme(style="darkgrid")
ax = fig.add_subplot(1, 1, 1)
ax.hist(mentalhealth_data['Choose_your_gender'], color='purple')
ax.set_title('Gender Distribution of Study')

In [None]:
# Histogram of the anxiety answers.
#   0 - no
#   1 - yes
fig = plt.figure(figsize=(10, 11))
# Theme is applied after the figure exists but before the axes are created.
sns.set_theme(style="darkgrid")
ax = fig.add_subplot(1, 1, 1)
ax.hist(mentalhealth_data['Do_you_have_Anxiety'], color='cornflowerblue')
ax.set_title('Students Experiencing Anxiety')

In [None]:
# Histogram of the panic-attack answers.
#   0 - no
#   1 - yes
fig = plt.figure(figsize=(10, 11))
# Theme is applied after the figure exists but before the axes are created.
sns.set_theme(style="darkgrid")
ax = fig.add_subplot(1, 1, 1)
ax.hist(mentalhealth_data['Do_you_have_Panic_attack'], color='darkorange')
ax.set_title('Students Experiencing Panic Attacks')

In [None]:
# Histogram of the depression answers.
#   0 - no
#   1 - yes
fig = plt.figure(figsize=(10, 11))
# Theme is applied after the figure exists but before the axes are created.
sns.set_theme(style="darkgrid")
ax = fig.add_subplot(1, 1, 1)
ax.hist(mentalhealth_data['Do_you_have_Depression'], color='lime')
ax.set_title('Students Experiencing Depression')

In [None]:
# Pie chart of how the students are spread across courses.
# Build a two-column table (course label, number of students) and hand it to plotly.
course_distribution = (
    mentalhealth_data['What_is_your_course']
    .value_counts()
    .rename_axis('What_is_your_course')
    .reset_index(name='value_counts')
)
px.pie(
    course_distribution,
    names='What_is_your_course',
    values='value_counts',
    width=600,
    height=1500,
)

In [None]:
# Prediction targets: the columns the models try to predict from the features.
a = mentalhealth_data['Do_you_have_Depression']                        # depression
b = mentalhealth_data['Do_you_have_Anxiety']                           # anxiety
c = mentalhealth_data['Do_you_have_Panic_attack']                      # panic attacks
d = mentalhealth_data['Did_you_seek_any_specialist_for_a_treatment']   # sought treatment

In [None]:
# Feature columns used as model inputs.
# "What is your course" is left out because it holds strings rather than numbers.
mentalhealth_features = [
    'Choose_your_gender',
    'Age',
    'Your_current_year_of_Study',
    'What_is_your_CGPA',
    'Marital_status',
]
# Feature matrix, followed by a quick summary of its columns.
X = mentalhealth_data[mentalhealth_features]
X.describe(include='all')

In [None]:
# Build the decision-tree model; a fixed random_state gives the same tree on every run.
mentalhealth_model = DecisionTreeRegressor(random_state=1)
# Train on all rows, with depression (a) as the target.
# fit() returns the estimator, so it is shown as the cell output.
mentalhealth_model.fit(X, y=a)

In [None]:
# Show the first five feature rows alongside the model's predictions for them.
sample_rows = X.head()
print("Making predictions for the following 5 samples:")
print(sample_rows)
print("Predictions are the following: ")
print(mentalhealth_model.predict(sample_rows))

In [None]:
print("Mean in-sample absolute error: ")
# Predict on the same rows the model was fitted on ...
predicted_depression_status = mentalhealth_model.predict(X)
# ... and measure the mean absolute error against the true depression values.
# This is the in-sample error, shown as the cell output.
mean_absolute_error(y_true=a, y_pred=predicted_depression_status)

In [None]:
# Split features and target into training and validation sets.
# random_state = 1 ensures the same split every time the script is run.
train_X, val_X, train_a, val_a = train_test_split(X, a, random_state = 1)
# Re-fit the model on the training rows only, so the validation rows stay unseen.
mentalhealth_model.fit(train_X, train_a)
# Predicted depression values for the validation data:
val_predictions = mentalhealth_model.predict(val_X)
# Error on the rows the model was trained on (in-sample) and on the held-out
# rows (out-of-sample), computed from the same fitted model so they are comparable.
train_mae = mean_absolute_error(train_a, mentalhealth_model.predict(train_X))
val_mae = mean_absolute_error(val_a, val_predictions)
print("Mean in-sample (training) absolute error: ")
print(train_mae)
# The out-of-sample error is the one used to evaluate forecasting performance.
print("Mean out-of-sample absolute error: ")
print(val_mae)
# A smaller in-sample error does not make it the better metric: it is measured on
# data the model has already seen, so it understates the error on new data.
print("The in-sample error is measured on data the model has already seen, so it is optimistic; "
      "the out-of-sample error is the one to use when judging how well the model predicts new students.")