# Exploring Mental Health Data

## Import Dependencies

In [1]:
import numpy as np 
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

## Setup Basic Variables

In [2]:
# Locations of the competition input files and of the submission file to write.
train_path = "/kaggle/input/playground-series-s4e11/train.csv"
test_path = "/kaggle/input/playground-series-s4e11/test.csv"
sample_submission_path = "/kaggle/input/playground-series-s4e11/sample_submission.csv"
submission_path = "/kaggle/working/submission.csv"

# Shared random seed (passed as `random_state` to the train/validation split).
rs = 25

## Import Data

In [3]:
# Read both competition CSVs into DataFrames, in (train, test) order.
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

## Data Analysis

In [4]:
# Preview the first five rows of the training data.
train.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,Aaradhya,Female,49.0,Ludhiana,Working Professional,Chef,,5.0,,,2.0,More than 8 hours,Healthy,BHM,No,1.0,2.0,No,0
1,1,Vivan,Male,26.0,Varanasi,Working Professional,Teacher,,4.0,,,3.0,Less than 5 hours,Unhealthy,LLB,Yes,7.0,3.0,No,1
2,2,Yuvraj,Male,33.0,Visakhapatnam,Student,,5.0,,8.97,2.0,,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
3,3,Yuvraj,Male,22.0,Mumbai,Working Professional,Teacher,,5.0,,,1.0,Less than 5 hours,Moderate,BBA,Yes,10.0,1.0,Yes,1
4,4,Rhea,Female,30.0,Kanpur,Working Professional,Business Analyst,,1.0,,,1.0,5-6 hours,Unhealthy,BBA,Yes,9.0,4.0,Yes,0


In [5]:
# Summarise the training columns: dtype and non-null count per column.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140700 entries, 0 to 140699
Data columns (total 20 columns):
 #   Column                                 Non-Null Count   Dtype  
---  ------                                 --------------   -----  
 0   id                                     140700 non-null  int64  
 1   Name                                   140700 non-null  object 
 2   Gender                                 140700 non-null  object 
 3   Age                                    140700 non-null  float64
 4   City                                   140700 non-null  object 
 5   Working Professional or Student        140700 non-null  object 
 6   Profession                             104070 non-null  object 
 7   Academic Pressure                      27897 non-null   float64
 8   Work Pressure                          112782 non-null  float64
 9   CGPA                                   27898 non-null   float64
 10  Study Satisfaction                     27897 non-null   

In [6]:
# Preview the first five rows of the test data.
test.head()

Unnamed: 0,id,Name,Gender,Age,City,Working Professional or Student,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness
0,140700,Shivam,Male,53.0,Visakhapatnam,Working Professional,Judge,,2.0,,,5.0,Less than 5 hours,Moderate,LLB,No,9.0,3.0,Yes
1,140701,Sanya,Female,58.0,Kolkata,Working Professional,Educational Consultant,,2.0,,,4.0,Less than 5 hours,Moderate,B.Ed,No,6.0,4.0,No
2,140702,Yash,Male,53.0,Jaipur,Working Professional,Teacher,,4.0,,,1.0,7-8 hours,Moderate,B.Arch,Yes,12.0,4.0,No
3,140703,Nalini,Female,23.0,Rajkot,Student,,5.0,,6.84,1.0,,More than 8 hours,Moderate,BSc,Yes,10.0,4.0,No
4,140704,Shaurya,Male,47.0,Kalyan,Working Professional,Teacher,,5.0,,,5.0,7-8 hours,Moderate,BCA,Yes,3.0,4.0,No


In [7]:
# Summarise the test columns: dtype and non-null count per column.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93800 entries, 0 to 93799
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     93800 non-null  int64  
 1   Name                                   93800 non-null  object 
 2   Gender                                 93800 non-null  object 
 3   Age                                    93800 non-null  float64
 4   City                                   93800 non-null  object 
 5   Working Professional or Student        93800 non-null  object 
 6   Profession                             69168 non-null  object 
 7   Academic Pressure                      18767 non-null  float64
 8   Work Pressure                          75022 non-null  float64
 9   CGPA                                   18766 non-null  float64
 10  Study Satisfaction                     18767 non-null  float64
 11  Jo

-> **The test dataset does not contain the target column `Depression`; that is the value we have to predict.**

## Data Manipulation

In [8]:
# Drop columns that are not used as features.
# Note: `train` is rebound here, so re-running this cell without reloading
# the data raises a KeyError (the columns are already gone).
train = train.drop(columns=['id', 'Name'])

# Name of the column we are trying to predict.
target_column = 'Depression'

In [9]:
# Column names of all object-dtype (categorical) features.
categorical_columns = train.select_dtypes(include="object").columns

# Column names of all remaining (numeric) features, with the target removed
# so it is never fed to the model as an input.
numerical_columns = (
    train.select_dtypes(exclude="object")
    .columns
    .drop(target_column)
)

In [10]:
# Target labels, taken from the training frame.
y_train = train[target_column] 

In [11]:
# Preprocessing pipelines, one for each of the two column groups
# (numeric and categorical) separated above.

# Numeric features: fill missing values with the column mean, then standardise.
numerical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill missing values with the literal "missing", then
# map each category to an integer code. Categories not seen while fitting are
# encoded as -1 instead of raising an error.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    (
        "ordinal",
        OrdinalEncoder(
            handle_unknown="use_encoded_value",
            unknown_value=-1,
            dtype=np.int32,
        ),
    ),
])

In [12]:
# Combine the numeric and categorical pipelines into a single preprocessor.
# Columns that belong to neither group (e.g. `id` and `Name`, which are still
# present in `test`) are dropped: ColumnTransformer defaults to remainder='drop'.

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_columns),
        ('cat', categorical_pipeline, categorical_columns)
    ]
)

# Learn the preprocessing parameters (imputation means, scaling statistics,
# category -> integer codes) from the training data only ...
X_train_preprocessed = preprocessor.fit_transform(train)

# ... and apply those same fitted parameters to the test data. The test set
# must NOT be re-fitted: doing so would learn a different scaling and a
# different category -> integer mapping, so the model would be scored on
# features encoded inconsistently with the ones it was trained on.
X_test_preprocessed = preprocessor.transform(test)

In [13]:
# Isolation Forest to detect and remove outliers from the training data.
# contamination=0.01 sets the expected share of outliers to 1% of the rows.
# NOTE(review): random_state is hard-coded to 26 here, while the
# train/validation split uses the shared seed `rs` (25) — confirm this is intentional.
# NOTE: this cell rebinds X_train_preprocessed and y_train, so re-running it
# without re-running the preprocessing cell filters already-filtered data again.

isolation_forest = IsolationForest(contamination=0.01, random_state=26)
# fit_predict labels each row: -1 marks an outlier.
outlier_labels = isolation_forest.fit_predict(X_train_preprocessed)
# Boolean mask that keeps every row not labelled as an outlier.
non_outliers_mask = outlier_labels != -1 
# Apply the same mask to features and labels so they stay aligned.
X_train_preprocessed = X_train_preprocessed[non_outliers_mask]
y_train = y_train[non_outliers_mask]

## Apply and Predict using XGBoost

In [14]:
# Separate the training data into a training part and a 20% validation part.
# Note: `y_train` is rebound to the training portion of the split, so this
# cell cannot be re-run on its own without re-running the cells above.

X_train, X_val, y_train, y_val = train_test_split(
    X_train_preprocessed,
    y_train,
    test_size=0.2,
    random_state=rs,
)

In [15]:
# Create the XGBoost classifier and fit it on the training split.
# NOTE(review): `use_label_encoder` is deprecated in recent XGBoost releases —
# confirm whether it is still needed for the installed version.

xgb_model = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
)

xgb_model.fit(X_train, y_train)

In [16]:
# Predict labels for the held-out validation split.
y_pred = xgb_model.predict(X_val)

# Fraction of validation rows whose predicted label matches the true label.
accuracy = accuracy_score(y_val, y_pred)
print(accuracy) # validation accuracy; the recorded run printed 0.9345633368031875

0.9345633368031875


## Submission

In [17]:
# Predict class labels for the preprocessed test features.
test_preds = xgb_model.predict(X_test_preprocessed)

In [18]:
# Load the sample submission as a template (columns: id, Depression).
submission = pd.read_csv(sample_submission_path)

# Preview the template before filling it in.
submission.head()

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,0
4,140704,0


In [19]:
# Replace the template's target values with the model's predictions.
# NOTE(review): assumes the sample submission rows are in the same order as
# test.csv — the first five ids match in the displayed outputs; confirm for the full file.
submission['Depression'] = test_preds

In [20]:
# Write the submission CSV without the DataFrame index, then preview the result.
submission.to_csv(submission_path, index=False)
submission.head()

Unnamed: 0,id,Depression
0,140700,0
1,140701,0
2,140702,0
3,140703,1
4,140704,0
