In [None]:
import numpy as np 
import pandas as pd
from statsmodels.stats.proportion import proportions_ztest
from matplotlib import pyplot as plt
import seaborn as sns

# load the dataset

In [None]:
de_train = pd.read_csv('train.csv')
de_test = pd.read_csv('test.csv')

de_train.describe()

In [None]:
de_test.describe()

In [None]:
de_train.head(5)

In [None]:
de_train = de_train.drop(['id'],axis=1)

As suggested in a post on this dataset, the Name column contains a lot of noise, so we may want to drop it.

In [None]:
de_train.drop(columns=['Name'], inplace=True)
de_test.drop(columns=['Name'], inplace=True)

# Visualize nulls in this dataset

In [None]:
import missingno

In [None]:
missingno.matrix(de_train)

In [None]:
missingno.matrix(de_test)

In [None]:
missingno.bar(de_train)

In [None]:
missingno.bar(de_test)

In [None]:
missingno.heatmap(de_train)

It looks like, except for Profession, the nulls in the other columns are strongly correlated, with a correlation of absolute value $1$. This may be due to specific properties of the Profession column, so let's take a closer look at the distribution of this column later. The test set shows similar null correlations:

In [None]:
missingno.heatmap(de_test)

## Gender columns

In [None]:
de_train.Gender.unique()

In [None]:
de_test.Gender.unique()

In [None]:
gender_tr = de_train.Gender.value_counts()
plt.bar(gender_tr.index, gender_tr)

In [None]:
gender_te = de_test.Gender.value_counts()
plt.bar(gender_te.index, gender_te)

We can be assured that the train and test data have similar gender distributions. This column is better represented as a boolean (True/False).

In [None]:
de_train.Gender = de_train.Gender.map({'Male':True, 'Female':False})
de_train = de_train.rename(columns={'Gender':'Is Male'})

In [None]:
de_train.head(5)

In [None]:
de_test.Gender = de_test.Gender.map({'Male':True, 'Female':False})
de_test = de_test.rename(columns={'Gender':'Is Male'})
de_test.head(5)

In [None]:
# Two-sample z-test on the share of males in test vs. train.
# 'Is Male' is boolean after the mapping above, so value_counts() is indexed by
# True/False. The male count is looked up by its label: a bare [0] is either a
# positional lookup (the most frequent class) or a match on False, and neither
# is guaranteed to be the male count in both frames.
male_tr = de_train['Is Male'].value_counts()
male_te = de_test['Is Male'].value_counts()
count_male = [male_te.get(True, 0), male_tr.get(True, 0)]
count_n = [de_test.shape[0], de_train.shape[0]]

proportions_ztest(count_male, count_n)

We can conclude that the gender proportions in train and test are the same.

## Age

We are just going to check if all values in this column is numeric

In [None]:
from pandas.api.types import is_numeric_dtype
is_numeric_dtype(de_train.Age)

In [None]:
is_numeric_dtype(de_test.Age)

In [None]:
plt.plot(de_train.Age, de_train.index, '.')

In [None]:
plt.plot(de_test.Age, de_test.index, '.')

In conclusion, the distribution of Age in both sets are also similar: samples are about even in all ages

## City
The point here is to see if 
- it's just city names in this column
- the distribution of people with respect to cities

In [None]:
de_train.City.unique()

In [None]:
de_test.City.unique()

In conclusion, the erroneous values in the City column of the train set are:
- M.Tech
- Less Delhi
- M.Com
- City
- 3.0
- Less than 5 Kalyan
- MCA
- MSc
- No
- ME
- Researcher

Erroneous values in city columns from the test set:
- Less than 5 hours
- Lawyer
- City
- Less Delhi
- Is Kanpur
- No
- Chemist
- More Delhi
- No.12

In [None]:
def train_non_loc_2loc_first_step(x):
    """Strip the stray prefix from a City value that still contains a real city.

    'Less Delhi' / 'More Delhi' become 'Delhi', 'Less than 5 Kalyan' becomes
    'Kalyan' and 'Is Kanpur' becomes 'Kanpur'. Any other value (including
    missing values) is returned unchanged.
    """
    if x in ('Less Delhi', 'More Delhi'):
        return 'Delhi'
    if x == 'Less than 5 Kalyan':
        return 'Kalyan'
    if x == 'Is Kanpur':
        return 'Kanpur'
    return x
de_train.City = de_train.City.apply(train_non_loc_2loc_first_step)
de_train.City.unique()

Erroneous value list for the City column of the train set:
- M.Tech
- ~~Less Delhi~~
- M.Com
- City
- 3.0
- ~~Less than 5 Kalyan~~
- MCA
- MSc
- No
- ME
- Researcher

In [None]:
error_cities = ['M.Tech', 'M.Com', '3.0', 'MCA', 'MSc', 'No', 'ME', 'Researcher']

de_train[de_train.City.isin(error_cities)]

It's very lucky that we don't have many of those people hence we only need to fill these with mode

In [None]:
(de_train.City==de_train.City.mode()[0]).sum()

In [None]:
de_train.loc[de_train.City.isin(error_cities), 'City'] = de_train.City.mode()[0]


In [None]:
de_train.City.unique()

In [None]:
de_train[de_train.City=='City']

In [None]:
de_train.loc[de_train.City=='City', 'City'] = de_train.City.mode()[0]

In [None]:
de_train.City.unique()

Cleaning of `de_train.City` is done! Now we should work on `de_test.City`.

Erroneous values in city columns from the test set:
- Less than 5 hours
- Lawyer
- City
- Less Delhi
- Is Kanpur
- No
- Chemist
- More Delhi
- No.12

In [None]:
de_test.City = de_test.City.apply(train_non_loc_2loc_first_step)
de_test.City.unique()

In [None]:
error_cities = ['Less than 5 hours', 'Lawyer', 'City', 'Less Delhi', 'No', 'Chemist', 'No.12']

de_test[de_test.City.isin(error_cities)]

In [None]:
de_test.City.mode()[0]

In [None]:
de_test.loc[de_test.City.isin(error_cities), 'City'] = de_test.City.mode()[0]

In [None]:
de_test.City.unique()

In [None]:
(de_test.City=='San Vasai-Virar').sum()

In [None]:
de_test.loc[de_test.City=='San Vasai-Virar', 'City'] = 'Vasai-Virar'

In [None]:
de_test.City.unique()

In [None]:
test_city = de_test.City.unique()
train_city = de_train.City.unique()

list of values in test City but not in train City:

In [None]:
in_test_not_train = test_city[~np.isin(test_city, train_city)]

In [None]:
in_test_not_train

list of values in train City but not in Test City

In [None]:
in_train_not_test = train_city[~np.isin(train_city, test_city)]

In [None]:
in_train_not_test

In [None]:
de_test[de_test.City.isin(in_test_not_train)].shape

In [None]:
de_train[de_train.City.isin(in_train_not_test)].shape

For now we just map those values as others.

In [None]:
de_train.loc[de_train.City.isin(in_train_not_test), 'City'] = 'Other'

In [None]:
de_train.City.unique()

In [None]:
de_test.loc[de_test.City.isin(in_test_not_train), 'City'] = 'Other'
de_test.City.unique()

Before we continue on, to make better comparisons of cities, we might need to shift cities with lower number of people to others

In [None]:
small_sample_tr = de_train.City.value_counts()[de_train.City.value_counts()<=10].index
small_sample_te = de_test.City.value_counts()[de_test.City.value_counts()<=10].index

In [None]:
np.setdiff1d(small_sample_tr, small_sample_te)

In [None]:
de_train.loc[de_train.City.isin(small_sample_tr), 'City'] = 'Other'
de_test.loc[de_test.City.isin(small_sample_te), 'City'] = 'Other'

In [None]:
# Per-city row counts, with the test counts aligned to the train ordering.
# reindex (instead of .loc) keeps the cell from raising KeyError when a city
# that is present in train has no rows in test; such cities are counted as 0.
count_citytr = de_train.City.value_counts()
count_cityte = de_test.City.value_counts()
count_cityte = count_cityte.reindex(count_citytr.index, fill_value=0)

# Overlay both series; transparency and a legend keep them distinguishable.
plt.bar(count_citytr.index, count_citytr, alpha=0.6, label='train')
plt.bar(count_citytr.index, count_cityte, alpha=0.6, label='test')
plt.ylabel('rows')
plt.title('Rows per city: train vs. test')
plt.legend()
_ = plt.xticks(rotation = 90)

In [None]:
from scipy.stats import chi2_contingency, ks_2samp  # ks_2samp is still used by a later cell

# Chi-square test of homogeneity on the city x dataset contingency table.
# ks_2samp on the two count vectors compares how large the per-city counts are
# (which mostly reflects the different sizes of train and test), not whether
# the rows are spread over the cities in the same proportions.
city_table = pd.concat([count_citytr, count_cityte], axis=1, keys=['train', 'test']).fillna(0)
chi2_contingency(city_table)

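Note on interpretation: a p-value far below 0.05 (or 0.01) is evidence *against* the null hypothesis that the two samples share the same distribution. The distributions of samples among cities can be considered alike only if the p-value is large.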

Next question: are there any cities having more people in depression?

In [None]:
city_vs_depression = de_train.groupby('City').Depression.value_counts().unstack()
city_vs_depression

In [None]:
city_vs_depression_p = de_train.groupby('City').Depression.value_counts(normalize=True).unstack()
city_vs_depression_p.plot(kind='bar')

In [None]:
from scipy.stats import chi2_contingency

# Pairwise chi-square tests between cities on their depressed / non-depressed
# counts. The dissimilarity of a pair is 1 - p, stored symmetrically.
cities = city_vs_depression.index
n_cities = len(cities)
disim_matrix = [[0] * n_cities for _ in range(n_cities)]

for row, city_a in enumerate(cities):
    for col in range(row, n_cities):
        pair_table = [
            city_vs_depression.loc[city_a],
            city_vs_depression.loc[cities[col]]
        ]
        p_val = chi2_contingency(pair_table)[1]
        dissimilarity = 1-p_val
        disim_matrix[row][col] = dissimilarity
        disim_matrix[col][row] = dissimilarity

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram

# Hierarchical (Ward) clustering of the cities based on disim_matrix.
# NOTE(review): linkage() receives the square matrix as-is. SciPy documents a
# 2-D input as a set of observation vectors (one row per city), not as a
# precomputed distance matrix; a precomputed matrix would have to be passed in
# condensed form (scipy.spatial.distance.squareform). Confirm this is intended,
# since the city groups used further down were read off this dendrogram.
city_Z = linkage(disim_matrix, method='ward')

# Dendrogram with the city names on the leaves.
_ = dendrogram(city_Z, labels = cities)
_ = plt.xticks(rotation=90)
# Reference line at height 1 on the dendrogram's distance axis.
plt.axhline(y=1, color='coral', linestyle='--', label='Distance = 1')

In [None]:
cities

We can see some cities are drastically different from the others (and distance between them and other clusters are greater or close to 1):
1. Kanpur
2. Bhopal
3. Thane
4. Other
5. Vadodara
6. Jaipur
7. category1: ['Vasai-Virar', 'Kalyan', 'Surat'],
8. category2: ['Chennai', 'Ghaziabad', 'Delhi', 'Agra', 'Ludhiana']
9. category3: ['Ahmedabad', 'Lucknow', 'Srinagar']
10. category4: ['Varanasi', 'Faridabad', 'Nagpur']
11. category5: ['Meerut', 'Mumbai']
12. category6: ['Patna', 'Indore', 'Pune', 'Nashik', 'Visakhapatnam']
13. category7: ['Bangalore', 'Kolkata', 'Rajkot']

In [None]:
# Groups of cities that are merged into a single level each; the group at
# position k is relabelled 'city<k>'.
city_cat = [
    ['Vasai-Virar', 'Kalyan', 'Surat'],
    ['Chennai', 'Ghaziabad', 'Delhi', 'Agra', 'Ludhiana'],
    ['Ahmedabad', 'Lucknow', 'Srinagar'],
    ['Varanasi', 'Faridabad', 'Nagpur'],
    ['Meerut', 'Mumbai'],
    ['Patna', 'Indore', 'Pune', 'Nashik', 'Visakhapatnam'],
    ['Bangalore', 'Kolkata', 'Rajkot']
]

for group_no, members in enumerate(city_cat):
    in_group = de_train.City.isin(members)
    de_train.loc[in_group, 'City'] = 'city'+str(group_no)
de_train.City.value_counts()

In [None]:
de_train.City = de_train.City.astype('category')

In [None]:
de_train.columns

In [None]:
# Apply the same city grouping to the test set: group k becomes 'city<k>'.
for group_no, members in enumerate(city_cat):
    in_group = de_test.City.isin(members)
    de_test.loc[in_group, 'City'] = 'city'+str(group_no)
de_test.City.value_counts()

# Working professional or student

In [None]:
de_train['Working Professional or Student'].unique()

In [None]:
de_test['Working Professional or Student'].unique()

Glad that those two columns don't have value errors. 

In [None]:
count_str = de_train['Working Professional or Student'].value_counts()
count_ste = de_test['Working Professional or Student'].value_counts()

In [None]:
count_str

In [None]:
count_ste

In [None]:
# Two-sample z-test on the share of students in train vs. test.
# The counts are looked up by the label 'Student': a positional [1] on a
# string-indexed Series is deprecated in pandas and only selects the student
# count while 'Student' happens to be the second most frequent value.
count_student = [count_str['Student'], count_ste['Student']]
n_both = [de_train.shape[0], de_test.shape[0]]

proportions_ztest(count_student, n_both)

In [None]:
# 2x2 contingency table (rows: train, test; columns: status) for a chi-square
# test. Counts are selected by label so both rows are guaranteed to use the same
# column order, regardless of which status is more frequent in each frame.
status_levels = ['Working Professional', 'Student']
table = [
    [count_str[level] for level in status_levels],
    [count_ste[level] for level in status_levels]
]
chi2_contingency(table)

We can conclude that train and test have similar distributions on this column as p value is large

It's tempting at this moment to see if student and working professionals have the same distribution on depression counts

In [None]:
# split working_tr and student_tr by 'working professional or studnet' column
working_tr = de_train[de_train['Working Professional or Student']=='Working Professional']
student_tr = de_train[de_train['Working Professional or Student']=='Student']

In [None]:
count_depress_working_tr = working_tr['Depression'].value_counts()
count_depress_student_tr = student_tr['Depression'].value_counts()


In [None]:
count_depress_student_tr

In [None]:
count_depress_working_tr

The counts alone already show that the distributions differ, so a statistical test is not strictly needed, but we run one anyway to confirm:

In [None]:
table_depress = [
    [count_depress_student_tr[0], count_depress_student_tr[1]],
    [count_depress_working_tr[0], count_depress_working_tr[1]]
]

chi2_contingency(table_depress)

It's clear that student and non-students have different depression distribution

In [None]:
working_tr = working_tr.drop(columns= ['Academic Pressure', 'CGPA', 'Study Satisfaction', 'Working Professional or Student'],
                             axis=1)
working_tr.rename(columns={'Work/Study Hours':'Work Hours'}, inplace=True)
working_tr.columns

In [None]:
student_tr = student_tr.drop(columns = ['Profession', 'Work Pressure', 'Job Satisfaction', 'Working Professional or Student'], axis=1)
student_tr.rename(columns={'Work/Study Hours':'Study Hours'}, inplace=True)
student_tr.columns

And do the same thing on test set

In [None]:
working_te = de_test[de_test['Working Professional or Student']=='Working Professional']
student_te = de_test[de_test['Working Professional or Student']=='Student']

In [None]:
working_te = working_te.drop(columns= ['Academic Pressure', 'CGPA', 'Study Satisfaction', 'Working Professional or Student'],
                             axis=1)
working_te.rename(columns={'Work/Study Hours':'Work Hours'}, inplace=True)
working_te.columns

In [None]:
student_te = student_te.drop(columns = ['Profession', 'Work Pressure', 'Job Satisfaction', 'Working Professional or Student'], axis=1)
student_te.rename(columns={'Work/Study Hours':'Study Hours'}, inplace=True)
student_te.columns

## Working Professional: Profession

In [None]:
working_tr.Profession.unique()

Error values in Profession:
1. nan
2. 'B.Com', 
3. 'BE', 
4. 'Student', 
5. 'Yogesh', 
6. 'Dev',
7. 'MBA', 
8. 'LLM', 
9. 'BCA', 
10. 'Academic', 
11. 'Profession'
12. 'BBA'
13. 'Working Professional'
14. 'MBBS'
15. 'Patna', 
16. 'Unveil', 
17. 'B.Ed', 
18. 'Nagpur', 
19. 'Moderate', 
20. 'M.Ed',
21. 'Analyst', 
22. 'Pranav', 
23. 'Visakhapatnam', 
24. 'PhD', 
25. 'Yuvraj'

Several questions:
1. Does nan mean unemployed? Can they have similar distributions of data?
2. What about other error values? Are they also unemployed or random?

In [None]:
unemployed = working_tr[working_tr.Profession=='Unemployed']

In [None]:
unemployed

In [None]:
unemployed

In [None]:
unfilled = working_tr[working_tr.Profession.isnull()]

In [None]:
unemployed['Work Hours']

In [None]:
unfilled['Work Hours']

In [None]:
ks_2samp(unemployed['Work Hours'], unfilled['Work Hours'])

In [None]:
unfilled.columns

In [None]:
unfilled.Age.describe()

In [None]:
error_value = ['Yogesh', 'Dev', 'Profession','Working Professional', 'Patna', 'Unveil',  'Nagpur', 'Moderate', 
                'Pranav', 'Visakhapatnam',  'Yuvraj']
error_profession = working_tr[working_tr.Profession.isin(error_value)]
student_value = ['B.Com', 'BE', 'Student', 'MBA', 'LLM', 'BCA', 'Academic',
                'BBA', 'MBBS', 'B.Ed', 'M.Ed','PhD']
student_profession = working_tr[working_tr.Profession.isin(student_value)]

In [None]:
# Share of depressed / non-depressed rows in each of the three groups of
# suspicious Profession values.
prop_unfill = unfilled.Depression.value_counts(normalize=True)
prop_error = error_profession.Depression.value_counts(normalize=True)
prop_student = student_profession.Depression.value_counts(normalize=True)

# Build the frame from the Series themselves so rows are aligned on the
# Depression label (0 / 1). Using .values would line the columns up by frequency
# rank instead, which mislabels a group whose majority class differs. A class
# that is absent from a group shows up as NaN and is reported as 0.
compare = pd.DataFrame({'depress in unfill':prop_unfill, 'depress in error':prop_error,
                        'depress in student':prop_student}).fillna(0).sort_index()
compare.plot(kind='bar')

In [None]:
# Treat erroneous, student-like and missing Profession values as 'Unemployed'.
working_tr.loc[error_profession.index, 'Profession'] = 'Unemployed'
working_tr.loc[student_profession.index, 'Profession'] = 'Unemployed'
# Assign the filled column back: fillna(inplace=True) on a selected column is a
# chained assignment that pandas warns about and that no longer updates the
# frame once Copy-on-Write is enabled.
working_tr['Profession'] = working_tr['Profession'].fillna('Unemployed')
working_tr.Profession.unique()

The p value is large, and we can conclude that Researchers, Chemist and Data Scientist have similar depression distribution

In [None]:
working_tr.loc[working_tr.Profession=='Medical Doctor', 'Profession'] = 'Doctor'
working_tr.loc[working_tr.Profession=='Family Consultant', 'Profession'] = 'Consultant'
working_tr.loc[working_tr.Profession=='FamilyVirar', 'Profession'] = 'Unemployed'
working_tr.loc[working_tr.Profession=='City Manager', 'Profession'] = 'Manager'

working_tr.loc[working_tr.Profession=='Travel Consultant', 'Profession'] = 'Consultant'
working_tr.loc[working_tr.Profession=='Finanancial Analyst', 'Profession'] = 'Financial Analyst'

In [None]:
working_tr.loc[working_tr.Profession=='Analyst', 'Profession'] = 'Business Analyst'

In [None]:
profession_count = working_tr.Profession.value_counts()
profession_count

In [None]:
depression_vs_profession = working_tr.groupby("Profession").Depression.value_counts(normalize=True).unstack()

In [None]:
# Overall share of depressed working professionals, drawn as a reference line
# over the per-profession depression ratios.
p_depressed = working_tr.Depression.value_counts(normalize=True)[1]
depression_vs_profession[1].plot(kind='bar')
plt.title('Depression Ratio in Professions')
# Use p_depressed here: the name `p` is never defined in this notebook.
plt.axhline(y=p_depressed, color='coral', linestyle='--')

In [None]:
# Pairwise chi-square tests between professions on their depressed /
# non-depressed counts; 1 - p is stored as the dissimilarity of each pair.
depression_vs_profession = working_tr.groupby("Profession").Depression.value_counts().unstack()
jobs = depression_vs_profession.index
matrix = [[0 for _ in range(len(jobs))] for _ in range(len(jobs))]
for i in range(0, len(jobs)):
    # Start at i: the test does not depend on the order of the two rows, so each
    # unordered pair is evaluated once and mirrored instead of being run twice.
    for j in range(i, len(jobs)):
        table = [
            depression_vs_profession.loc[jobs[i]].values,
            depression_vs_profession.loc[jobs[j]].values
        ]
        _, p_val, _, _ = chi2_contingency(table)
        matrix[i][j] = 1-p_val
        matrix[j][i] = 1-p_val

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram

Z = linkage(matrix, method='ward')
_ = dendrogram(Z, labels=jobs, leaf_rotation=90)
plt.axhline(y=1, color='coral', linestyle='--', label='Distance = 1')

In [None]:
len(working_tr.Profession.unique())

It's very obvious that Graphic Designer and Unemployed are drastically different from other occupations in terms of depression distribution; thus, we should divide the professions into:

1. Graphic Designer
2. Unemployed
3. Investment Banker
4. job category1: ['Business Analyst', 'Teacher'], 
5. job category2: ['Doctor', 'Marketing Manager', 'Pilot']
6. job category3: ['Electrician', 'Researcher', 'UX/UI Designer']
7. job category4: ['Customer Support', 'Digital Marketer']
8. job category5: ['Consultant', 'Chef', 'Sales Executive']
9. job category6: ['HR Manager', 'Judge']
10. job category7: ['Architect', 'Mechanical Engineer']
11. job category8: ['Chemist', 'Pharmacist']
12. job category9: ['Content Writer', 'Entrepreneur'],
13. job category10: ['Financial Analyst', 'Accountant', 'Plumber', 'Research Analyst', 'Software Engineer']
14. job category11: ['Data Scientist', 'Civil Engineer', 'Manager']
15. job category12: ['Educational Consultant', 'Lawyer']

In [None]:
job_cat = [
    ['Business Analyst', 'Teacher'], 
    ['Doctor', 'Marketing Manager', 'Pilot'],
    ['Electrician', 'Researcher', 'UX/UI Designer'],
    ['Customer Support', 'Digital Marketer'],
    ['Consultant', 'Chef', 'Sales Executive'],
    ['HR Manager', 'Judge'],
    ['Architect', 'Mechanical Engineer'],
    ['Chemist', 'Pharmacist'],
    ['Content Writer', 'Entrepreneur'],
    ['Financial Analyst', 'Accountant', 'Plumber', 'Research Analyst', 'Software Engineer'],
    ['Data Scientist', 'Civil Engineer', 'Manager'],
    ['Educational Consultant', 'Lawyer']
]

for i in range(0, len(job_cat)):
    jobs = job_cat[i]
    working_tr.loc[working_tr.Profession.isin(jobs), 'Profession'] = 'job'+str(i)

working_tr.Profession.value_counts()


In [None]:
working_te.Profession.unique()

In [None]:
working_te.loc[working_te.Profession=='Surgeon', 'Profession'] = 'Doctor'

In [None]:
might_be_unemployed = ['B.Ed', 'Student', 'Working Professional', 'ME', 'B.Pharm', '24th', 'Manvi', 'Yogesh',
                       'Samar', 'Surat', 'PhD', 'M.Ed', 'MD', 'Name', 'MCA', 'Simran','Profession', 'Unhealthy',
                       'BBA', 'LLM', 'No', 'Unveil', 'M.Pharm']
working_te.loc[working_te.Profession.isin(might_be_unemployed), 'Profession'] = 'Unemployed'

In [None]:
# Missing professions in the test set are treated as 'Unemployed'. The result
# is assigned back because fillna(inplace=True) on a selected column is a
# chained assignment that stops updating the frame under Copy-on-Write.
working_te['Profession'] = working_te['Profession'].fillna('Unemployed')

In [None]:
# Normalise profession spellings in the test set.
# 'Analyst' is mapped to 'Business Analyst', the same target the train set uses,
# so both frames end up in the same job category.
working_te.loc[working_te.Profession=='Analyst', 'Profession'] = 'Business Analyst'
working_te.loc[working_te.Profession=='City Consultant', 'Profession'] = 'Consultant'
working_te.loc[working_te.Profession=='Travel Consultant', 'Profession'] = 'Consultant'
working_te.loc[working_te.Profession=='Finanancial Analyst', 'Profession'] = 'Financial Analyst'

# Collapse professions into the job categories defined for the train set:
# the group at position i of job_cat becomes 'job<i>'.
for i in range(0, len(job_cat)):
    jobs = job_cat[i]
    working_te.loc[working_te.Profession.isin(jobs), 'Profession'] = 'job'+str(i)

working_te.Profession.value_counts()

In [None]:
working_tr.Profession = working_tr.Profession.astype('category')
working_te.Profession = working_te.Profession.astype('category')

## Working Professional: Work Pressure, work hours and job satisfaction

We should consider that if work pressure relate to work hours

In [None]:
working_tr[working_tr['Work Pressure'].isnull()]

In [None]:
sns.violinplot(working_tr['Work Pressure'])

In [None]:
import math

# Impute missing work pressure with the column mean rounded up to the next
# whole level. The filled column is assigned back: fillna(inplace=True) on a
# selected column is a chained assignment that no longer updates the frame
# under Copy-on-Write.
mean_pressure = math.ceil(working_tr['Work Pressure'].mean())
working_tr['Work Pressure'] = working_tr['Work Pressure'].fillna(mean_pressure)

In [None]:
# Depressed / non-depressed counts per work-pressure level. The table is built
# in this cell so that displaying it works on a fresh kernel (Restart & Run All)
# instead of relying on a later cell having been executed first.
pressure_vs_depression = working_tr.groupby('Work Pressure').Depression.value_counts().unstack()
pressure_vs_depression

In [None]:
# Chi-square test for every pair of work-pressure levels on their depressed /
# non-depressed counts; one p-value is printed per pair.
pressure_vs_depression = working_tr.groupby('Work Pressure').Depression.value_counts().unstack()
pressure_level = pressure_vs_depression.index
# Derive the loop bounds from the data instead of assuming exactly 5 levels.
n_levels = len(pressure_level)
for i in range(0, n_levels - 1):
    for j in range(i+1, n_levels):
        table = [
            pressure_vs_depression.loc[pressure_level[i]],
            pressure_vs_depression.loc[pressure_level[j]]
        ]
        _, p_val, _, _ = chi2_contingency(table)
        print("pressure: {}, {}, p_val: {}".format(pressure_level[i], pressure_level[j], p_val))

We can conclude that the distribution of each pressure levels are distinct, making it a good predictor on depression

In [None]:
# Fill nulls in the test data with the test-set mean rounded up to the next
# whole level. The filled column is assigned back: fillna(inplace=True) on a
# selected column is a chained assignment that no longer updates the frame
# under Copy-on-Write.
mean_pressure = math.ceil(working_te['Work Pressure'].mean())
working_te['Work Pressure'] = working_te['Work Pressure'].fillna(mean_pressure)

In [None]:
working_tr['Work Pressure'] = working_tr['Work Pressure'].astype('category')
working_te['Work Pressure'] = working_te['Work Pressure'].astype('category')

In [None]:
working_tr['Work Hours'].isnull().sum()

In [None]:
working_tr['Work Hours'].describe()

In [None]:
depression_vs_work_hr = working_tr.groupby('Work Hours').Depression.value_counts(normalize=True).unstack()

In [None]:
depression_vs_work_hr.plot(kind='bar')

In [None]:
# Pairwise chi-square tests between work-hour values on their depressed /
# non-depressed counts, followed by hierarchical clustering of the values.
depression_vs_work_hr = working_tr.groupby('Work Hours').Depression.value_counts().unstack()
hours = depression_vs_work_hr.index
# Size the matrix from the data instead of hard-coding 13 distinct values.
n = len(hours)
disim_matrix = [[0 for i in range(n)] for j in range(n)]

for i in range(0, n):
    for j in range(i, n):
        table = [
            depression_vs_work_hr.loc[hours[i]],
            depression_vs_work_hr.loc[hours[j]]
        ]
        _, p_val, _, _ = chi2_contingency(table)
        # Dissimilarity of the pair, stored symmetrically.
        disim_matrix[i][j] = disim_matrix[j][i] = 1-p_val 

# NOTE(review): linkage() gets the square matrix directly; SciPy treats a 2-D
# input as observation vectors rather than a precomputed distance matrix.
# Confirm this is intended.
link_matrix = linkage(disim_matrix, method='ward')
_ = dendrogram(link_matrix, labels=hours, leaf_rotation=90)
plt.axhline(y=1, color='coral', linestyle='--')

In [None]:
working_tr['Heavy_overtime'] = working_tr['Work Hours']>9
working_te['Heavy_overtime'] = working_te['Work Hours']>9

In [None]:
overtime_table = working_tr.groupby('Heavy_overtime').Depression.value_counts().unstack()
chi2_contingency(overtime_table)

In [None]:
# Work-hour values that are merged into one level each; the group at position i
# is relabelled 'time_zone<i>'. Values not listed here keep their numeric value.
hours_cat = [
    [10.0, 12.0],
    [4.0, 5.0],
    [2.0, 3.0],
    [0.0, 1.0],
    [7.0, 8.0]
]

# Cast to object before writing string labels: assigning a string into a
# float64 column is deprecated in pandas and raises in newer versions.
working_tr['Work Hours'] = working_tr['Work Hours'].astype(object)
for i in range(len(hours_cat)):
    hour = hours_cat[i]
    working_tr.loc[working_tr['Work Hours'].isin(hour), 'Work Hours'] = 'time_zone'+str(i)
working_tr['Work Hours'].value_counts()

In [None]:
working_tr['Work Hours'] = working_tr['Work Hours'].astype('category')

In [None]:
# Apply the same work-hour grouping to the test set.
# Cast to object before writing string labels: assigning a string into a
# float64 column is deprecated in pandas and raises in newer versions.
working_te['Work Hours'] = working_te['Work Hours'].astype(object)
for i in range(len(hours_cat)):
    hour = hours_cat[i]
    working_te.loc[working_te['Work Hours'].isin(hour), 'Work Hours'] = 'time_zone'+str(i)
working_te['Work Hours'].value_counts()

In [None]:
working_te['Work Hours'] = working_te['Work Hours'].astype('category')

In [None]:
working_tr['Job Satisfaction'].isnull().sum()

In [None]:
working_tr['Job Satisfaction'].describe()

In [None]:
# Impute missing job satisfaction with the most frequent level. The filled
# column is assigned back: fillna(inplace=True) on a selected column is a
# chained assignment that no longer updates the frame under Copy-on-Write.
m = working_tr['Job Satisfaction'].mode()[0]
working_tr['Job Satisfaction'] = working_tr['Job Satisfaction'].fillna(m)

In [None]:
working_tr.groupby('Job Satisfaction').Depression.value_counts(normalize=True).unstack().plot(kind='bar')

In [None]:
# Pairwise chi-square tests between job-satisfaction levels on their depressed /
# non-depressed counts, followed by hierarchical clustering of the levels.
depression_vs_js = working_tr.groupby('Job Satisfaction').Depression.value_counts().unstack()
js = depression_vs_js.index
# Size the matrix from the data instead of hard-coding 5 levels.
n = len(js)
disim_matrix = [[0 for i in range(n)] for j in range(n)]

for i in range(0, n):
    for j in range(i, n):
        table = [
            depression_vs_js.loc[js[i]],
            depression_vs_js.loc[js[j]]
        ]
        _, p_val, _, _ = chi2_contingency(table)
        # Dissimilarity of the pair, stored symmetrically.
        disim_matrix[i][j] = disim_matrix[j][i] = 1-p_val 

# NOTE(review): linkage() gets the square matrix directly; SciPy treats a 2-D
# input as observation vectors rather than a precomputed distance matrix.
# Confirm this is intended.
link_matrix = linkage(disim_matrix, method='ward')
_ = dendrogram(link_matrix, labels=js, leaf_rotation=90)
plt.axhline(y=1, color='coral', linestyle='--')

In [None]:
working_tr.loc[working_tr['Job Satisfaction']==5, 'Job Satisfaction'] = 4

In [None]:
working_te['Job Satisfaction'].describe()

In [None]:
# Impute missing job satisfaction in the test set with its most frequent level.
# mode() returns a Series, so take its first entry: passing the Series itself to
# fillna aligns on the row index and leaves almost every missing value unfilled.
m = working_te['Job Satisfaction'].mode()[0]
# Assign back instead of fillna(inplace=True) on a selected column, which no
# longer updates the frame under Copy-on-Write.
working_te['Job Satisfaction'] = working_te['Job Satisfaction'].fillna(m)

In [None]:
working_te.loc[working_te['Job Satisfaction']==5, 'Job Satisfaction'] = 4

In [None]:
working_tr['Job Satisfaction'] = working_tr['Job Satisfaction'].astype('category')
working_te['Job Satisfaction'] = working_te['Job Satisfaction'].astype('category')

In [None]:
working_tr.columns

## Working Professionals: Sleep Duration

In [None]:
working_tr['Sleep Duration'].value_counts()

In [None]:
normalized = ['Less than 5 hours','5-6 hours', '7-8 hours', 'More than 8 hours']

working_tr[working_tr['Sleep Duration'].isin(normalized)].groupby('Sleep Duration').Depression.value_counts(normalize=True).unstack()

In [None]:
working_tr[~working_tr['Sleep Duration'].isin(normalized)].Depression.value_counts(normalize=True)

In [None]:
normal_sleep = working_tr[working_tr['Sleep Duration'].isin(normalized)]
sns.violinplot(x='Sleep Duration', y = 'Age', data=normal_sleep)

In [None]:
working_tr.loc[~working_tr['Sleep Duration'].isin(normalized), 'Sleep Duration'] = 'Other'

In [None]:
working_tr['Unhealthy Sleep'] = working_tr['Sleep Duration'].isin(['Less than 5 hours'])

In [None]:
sleep_table = working_tr.groupby('Unhealthy Sleep').Depression.value_counts().unstack()

In [None]:
chi2_contingency(sleep_table)

In [None]:
sleep_table

In [None]:
working_tr.drop(columns='Sleep Duration', axis=1, inplace=True)
working_tr.columns

In [None]:
working_te['Unhealthy Sleep']= working_te['Sleep Duration']=='Less than 5 hours'
working_te.drop(columns='Sleep Duration', axis=1, inplace=True)
working_te.columns

## Working Profession: Dietary Habits

In [None]:
working_tr['Dietary Habits'].value_counts()

In [None]:
working_tr['Dietary Habits'].isnull().sum()

In [None]:
# Impute missing dietary habits with the most frequent value. The filled column
# is assigned back: fillna(inplace=True) on a selected column is a chained
# assignment that no longer updates the frame under Copy-on-Write.
diet_mode = working_tr['Dietary Habits'].mode()[0]
working_tr['Dietary Habits'] = working_tr['Dietary Habits'].fillna(diet_mode)

# Any value outside the three recognised levels is replaced by the mode as well.
normalized = ['Moderate', 'Healthy', 'Unhealthy']
working_tr.loc[~working_tr['Dietary Habits'].isin(normalized), 'Dietary Habits'] = diet_mode

In [None]:
depression_vs_diet = working_tr.groupby('Dietary Habits').Depression.value_counts().unstack()
diet = depression_vs_diet.index
n = len(diet)
disim_matrix = [[0 for i in range(n)] for j in range(n)]

for i in range(0, n):
    for j in range(i, n):
        table = [
            depression_vs_diet.loc[diet[i]],
            depression_vs_diet.loc[diet[j]]
        ]
        _, p_val, _, _ = chi2_contingency(table)
        disim_matrix[i][j] = disim_matrix[j][i] = 1-p_val 

link_matrix = linkage(disim_matrix, method='ward')
_ = dendrogram(link_matrix, labels=diet, leaf_rotation=90)
plt.axhline(y=1, color='coral', linestyle='--')

In [None]:
working_tr['Dietary Habits'] = working_tr['Dietary Habits'].astype('category')

In [None]:
working_te['Dietary Habits'].value_counts()

In [None]:
diet_mode = working_te['Dietary Habits'].mode()[0]
working_te.loc[~working_te['Dietary Habits'].isin(normalized), 'Dietary Habits'] = diet_mode
working_te['Dietary Habits'] = working_te['Dietary Habits'].astype('category')
working_te['Dietary Habits'].value_counts()

### Working Profession: Degree

We shall just drop this column as it's a bit messy. 

In [None]:
working_tr.drop(columns='Degree', inplace=True)
working_te.drop(columns='Degree', inplace=True)

working_tr.columns

In [None]:
working_te.columns

### Working Profession: suicidal thoughts

In [None]:
working_tr['Have you ever had suicidal thoughts ?'].unique()

Should just mark it into true and false

In [None]:
working_tr['Have you ever had suicidal thoughts ?'] = working_tr['Have you ever had suicidal thoughts ?'].map({'No': False, 'Yes': True})

In [None]:
working_te['Have you ever had suicidal thoughts ?'] = working_te['Have you ever had suicidal thoughts ?'].map({'No': False, 'Yes': True})

In [None]:
working_tr.columns

## Working Profession: Financial stress

In [None]:
working_tr['Financial Stress'].value_counts()

In [None]:
working_tr['Financial Stress'].isnull().sum()

In [None]:
# Impute missing financial stress with the most frequent level. The filled
# column is assigned back: fillna(inplace=True) on a selected column is a
# chained assignment that no longer updates the frame under Copy-on-Write.
working_tr['Financial Stress'] = working_tr['Financial Stress'].fillna(working_tr['Financial Stress'].mode()[0])

In [None]:
working_tr.groupby('Financial Stress').Depression.value_counts(normalize=True).unstack()

In [None]:
depression_vs_finance = working_tr.groupby('Financial Stress').Depression.value_counts().unstack()
finan = depression_vs_finance.index
n = len(finan)
disim_matrix = [[0 for i in range(n)] for j in range(n)]

for i in range(0, n):
    for j in range(i, n):
        table = [
            depression_vs_finance.loc[finan[i]],
            depression_vs_finance.loc[finan[j]]
        ]
        _, p_val, _, _ = chi2_contingency(table)
        disim_matrix[i][j] = disim_matrix[j][i] = 1-p_val 

link_matrix = linkage(disim_matrix, method='ward')
_ = dendrogram(link_matrix, labels=finan, leaf_rotation=90)
plt.axhline(y=1, color='coral', linestyle='--')

In [None]:
working_tr['Financial Stress'] = working_tr['Financial Stress'].astype('category')

In [None]:
working_te['Financial Stress'].isnull().sum()

In [None]:
working_te['Financial Stress'].value_counts()

In [None]:
working_te['Financial Stress'] = working_te['Financial Stress'].astype('category')

## Working professional: Family history

In [None]:
working_tr['Family History of Mental Illness'].unique()

In [None]:
working_te['Family History of Mental Illness'].unique()

In [None]:
working_tr['Family History of Mental Illness'] = working_tr['Family History of Mental Illness'].map({'No': False, 'Yes': True})
working_te['Family History of Mental Illness'] = working_te['Family History of Mental Illness'].map({'No': False, 'Yes': True})

## Working Professional: Summary

In [None]:
working_tr.info()

In [None]:
working_te.info()

# Baseline Model on non-student dataset

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score

In [None]:
# pop() removes the 'Depression' column from working_tr in place and returns it as the target.
y = working_tr.pop('Depression')
# X is the same object as working_tr (no copy), now without the target column.
X = working_tr

In [None]:
# One-hot encode the non-numeric feature columns; drop_first=True omits the
# first level of each encoded column.
X = pd.get_dummies(X, drop_first=True)

In [None]:
# Baseline classifier for working professionals, scored by ROC AUC over
# 10 cross-validation folds.
model_reg = LogisticRegression(max_iter=1000)
scores = cross_val_score(
    estimator=model_reg,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=10,
)

In [None]:
# Report the per-fold scores and their average.
mean_score = np.mean(scores)
print(f'roc auc scores in 10 cv: {scores}')
print(f'mean roc auc scores in 10 cv: {mean_score}')

Our ROC AUC looks good. Now let's predict on the non-student test dataset.

In [None]:
# Remove 'id' from the test features (in place) and keep it for the prediction table.
working_id = working_te.pop('id')

In [None]:
# One-hot encode the test features with the same options as the training features.
# NOTE(review): the encoding is derived from the values present in working_te
# itself, so the resulting columns are not guaranteed to match those of X.
working_te = pd.get_dummies(working_te, drop_first=True)

In [None]:
# Fit the baseline on the full working-professional training set.
model_reg.fit(X, y)

# X and working_te were one-hot encoded separately, so their dummy columns can
# differ in membership or order. Reindex the test frame to the training
# columns: dummies missing from the test split are added as all-zero columns,
# and test-only dummies (levels the model never saw) are dropped.
working_te = working_te.reindex(columns=X.columns, fill_value=0)
working_pred = model_reg.predict(working_te)

In [None]:
# Collect the working-professional ids and predictions as a plain dict of columns.
result = {'id': working_id, 'Depression': working_pred}

# Student: Overview

Now let's move on to the student dataset.

In [None]:
student_tr.info()

In [None]:
student_te.info()

In [None]:
missingno.bar(student_tr)

In [None]:
missingno.bar(student_te)

## Student: Academic Pressure

In [None]:
student_tr['Academic Pressure'].isnull().sum()

In [None]:
student_tr['Academic Pressure'].value_counts()

In [None]:
# Impute missing Academic Pressure with the most frequent level.
# mode() returns a Series; passing it to fillna() fills by index label, so only
# rows whose label appears in that Series would be filled. The scalar mode()[0]
# fills every missing row. The result is assigned back because an in-place
# fillna on a selected column does not reach the frame under Copy-on-Write.
p_mode_tr = student_tr['Academic Pressure'].mode()[0]
student_tr['Academic Pressure'] = student_tr['Academic Pressure'].fillna(p_mode_tr)

In [None]:
# Proportion of each Depression value within every Academic Pressure level
# (normalize=True), laid out with one column per Depression value.
depression_vs_a_pressure = student_tr.groupby('Academic Pressure').Depression.value_counts(normalize=True).unstack()
depression_vs_a_pressure

No further statistical test is needed here: the table makes it clear that academic pressure and depression are related.

In [None]:
# Store Academic Pressure as a pandas categorical column.
student_tr['Academic Pressure'] = student_tr['Academic Pressure'].astype('category')

In [None]:
# Impute missing Academic Pressure in the test split with its most frequent level.
# mode() returns a Series; passing it to fillna() fills by index label, so only
# rows whose label appears in that Series would be filled. The scalar mode()[0]
# fills every missing row. The result is assigned back because an in-place
# fillna on a selected column does not reach the frame under Copy-on-Write.
p_mode_te = student_te['Academic Pressure'].mode()[0]
student_te['Academic Pressure'] = student_te['Academic Pressure'].fillna(p_mode_te)

In [None]:
# Store Academic Pressure as a pandas categorical column in the test split too.
student_te['Academic Pressure'] = student_te['Academic Pressure'].astype('category')

## Student: CGPA

In [None]:
student_tr['CGPA'].unique()

In [None]:
# Impute missing CGPA with the training-split mean.
# The filled column is assigned back: fillna(inplace=True) on a selected column
# is a chained in-place update that leaves the frame unchanged under pandas
# Copy-on-Write, and any remaining NaN would reach the model.
mean_tr = student_tr['CGPA'].mean()
student_tr['CGPA'] = student_tr['CGPA'].fillna(mean_tr)

In [None]:
# Impute missing CGPA in the test split with the test-split mean.
# The filled column is assigned back: fillna(inplace=True) on a selected column
# is a chained in-place update that leaves the frame unchanged under pandas
# Copy-on-Write, and any remaining NaN would reach the model.
mean_te = student_te['CGPA'].mean()
student_te['CGPA'] = student_te['CGPA'].fillna(mean_te)

In [None]:
ks_2samp(student_tr.CGPA, student_te.CGPA)

The p-value is high, so we cannot reject the hypothesis that CGPA follows the same distribution in train and test.

In [None]:
# Put the train and test CGPA values side by side for a violin plot.
# The two Series are aligned on their index labels when the frame is built, so
# a label present in only one split gets NaN in the other column.
cgpa = pd.DataFrame({
    'train': student_tr['CGPA'],
    'test': student_te['CGPA']
})
# One violin per column.
sns.violinplot(cgpa)

Just out of curiosity, we want to see CGPA's effect on depression.

In [None]:
# Compare the CGPA distributions of students with and without depression
# using a two-sample Kolmogorov-Smirnov test.
is_depressed = student_tr['Depression'] == 1
is_not_depressed = student_tr['Depression'] == 0

cgpa_depress = student_tr.loc[is_depressed, 'CGPA']
cgpa_not_depress = student_tr.loc[is_not_depressed, 'CGPA']

ks_2samp(cgpa_depress, cgpa_not_depress)

The p-value is small, meaning the CGPA distributions of depressed and non-depressed students differ, so CGPA might be a useful predictor.

## Student: Study Satisfaction

In [None]:
student_tr['Study Satisfaction'].unique()

In [None]:
student_tr['Study Satisfaction'].value_counts()

In [None]:
student_tr['Study Satisfaction'].isnull().sum()

In [None]:
# Impute missing Study Satisfaction with the most frequent level.
# mode() returns a Series; passing it to fillna() fills by index label, so only
# rows whose label appears in that Series would be filled. The scalar mode()[0]
# fills every missing row. The result is assigned back because an in-place
# fillna on a selected column does not reach the frame under Copy-on-Write.
mode = student_tr['Study Satisfaction'].mode()[0]
student_tr['Study Satisfaction'] = student_tr['Study Satisfaction'].fillna(mode)

In [None]:
# Impute missing Study Satisfaction in the test split with its most frequent level.
# mode() returns a Series; passing it to fillna() fills by index label, so only
# rows whose label appears in that Series would be filled. The scalar mode()[0]
# fills every missing row. The result is assigned back because an in-place
# fillna on a selected column does not reach the frame under Copy-on-Write.
mode = student_te['Study Satisfaction'].mode()[0]
student_te['Study Satisfaction'] = student_te['Study Satisfaction'].fillna(mode)

In [None]:
# Proportion of each Depression value within every Study Satisfaction level.
depression_vs_ss = student_tr.groupby('Study Satisfaction').Depression.value_counts(normalize=True).unstack()

In [None]:
depression_vs_ss

It is still worth taking a look at how similar the levels are to each other.

In [None]:
# Cluster Study Satisfaction levels by how similar their Depression counts are.
# Dissimilarity between two levels is 1 - p, where p is the chi-square p-value
# of the 2-row contingency table built from their Depression counts.
from scipy.spatial.distance import squareform

# fill_value=0 keeps a level with no rows for one Depression value as a real
# zero count instead of NaN (NaN would propagate into the p-values).
depression_vs_ss = (
    student_tr.groupby('Study Satisfaction').Depression.value_counts().unstack(fill_value=0)
)
ss = depression_vs_ss.index
n = len(ss)
disim_matrix = [[0 for i in range(n)] for j in range(n)]

# Only off-diagonal pairs are tested; a level compared with itself stays at 0.
for i in range(0, n):
    for j in range(i + 1, n):
        table = [
            depression_vs_ss.loc[ss[i]],
            depression_vs_ss.loc[ss[j]]
        ]
        _, p_val, _, _ = chi2_contingency(table)
        disim_matrix[i][j] = disim_matrix[j][i] = 1 - p_val

# linkage() treats a 2-D array as observation vectors, so the square matrix is
# condensed first to be interpreted as pairwise distances.
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; the method is kept as in the original - consider 'average'.
link_matrix = linkage(squareform(disim_matrix), method='ward')
_ = dendrogram(link_matrix, labels=ss, leaf_rotation=90)
plt.axhline(y=1, color='coral', linestyle='--')

In [None]:
# Store Study Satisfaction as a pandas categorical column in both splits.
for frame in (student_tr, student_te):
    frame['Study Satisfaction'] = frame['Study Satisfaction'].astype('category')

## Student Train: Sleep Duration

In [None]:
student_tr['Sleep Duration'].unique()

In [None]:
student_tr['Sleep Duration'].value_counts()

In [None]:
# Raw 'Sleep Duration' labels of the training split, grouped by the bucket
# each group is rewritten to afterwards.
# Labels rewritten to 'Less than 5 hours'.
lt_5 = ['2-3 hours', '3-4 hours', '1-2 hours', '4-5 hours', 'than 5 hours']
# Labels rewritten to '7-8 hours'.
# NOTE(review): '40-45 hours', '55-66 hours', 'Moderate' and '45' do not read as
# nightly durations; presumably they are noise folded into a default bucket - confirm.
to8s = ['8 hours', '40-45 hours', '55-66 hours', 'Moderate', '6-7 hours', '45']
# Labels rewritten to 'More than 8 hours'.
gt_8 = ['10-11 hours']

In [None]:
# Rewrite each group of raw Sleep Duration labels to its bucket name,
# applying the groups in the order: short, medium, long.
for raw_labels, bucket in (
    (lt_5, 'Less than 5 hours'),
    (to8s, '7-8 hours'),
    (gt_8, 'More than 8 hours'),
):
    in_group = student_tr['Sleep Duration'].isin(raw_labels)
    student_tr.loc[in_group, 'Sleep Duration'] = bucket

In [None]:
student_tr['Sleep Duration'].unique()

In [None]:
student_tr.groupby('Sleep Duration').Depression.value_counts(normalize=True).unstack()

In [None]:
# Cluster Sleep Duration buckets by how similar their Depression counts are.
# Dissimilarity between two buckets is 1 - p, where p is the chi-square p-value
# of the 2-row contingency table built from their Depression counts.
from scipy.spatial.distance import squareform

# fill_value=0 keeps a bucket with no rows for one Depression value as a real
# zero count instead of NaN (NaN would propagate into the p-values).
depression_vs_sleep = (
    student_tr.groupby('Sleep Duration').Depression.value_counts().unstack(fill_value=0)
)
sleep = depression_vs_sleep.index
n = len(sleep)
disim_matrix = [[0 for i in range(n)] for j in range(n)]

# Only off-diagonal pairs are tested; a bucket compared with itself stays at 0.
for i in range(0, n):
    for j in range(i + 1, n):
        table = [
            depression_vs_sleep.loc[sleep[i]],
            depression_vs_sleep.loc[sleep[j]]
        ]
        _, p_val, _, _ = chi2_contingency(table)
        disim_matrix[i][j] = disim_matrix[j][i] = 1 - p_val

# linkage() treats a 2-D array as observation vectors, so the square matrix is
# condensed first to be interpreted as pairwise distances.
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; the method is kept as in the original - consider 'average'.
link_matrix = linkage(squareform(disim_matrix), method='ward')
_ = dendrogram(link_matrix, labels=sleep, leaf_rotation=90)
plt.axhline(y=1, color='coral', linestyle='--')

In [None]:
# Store Sleep Duration as a pandas categorical column.
student_tr['Sleep Duration'] = student_tr['Sleep Duration'].astype('category')

In [None]:
student_te['Sleep Duration'].unique()

In [None]:
# Raw Sleep Duration labels found in the test split, grouped by target bucket.
# lt_5 and gt_8 are rebound here and replace the lists used for the train split.
lt_5 = ['1-6 hours', '4-5 hours', '2-3 hours']
gt_8 = ['60-65 hours', '8-9 hours', '9-6 hours']
to_8 = '6-7 hours'

# Rewrite each group to its bucket name, in the order: short, long, medium.
for raw_labels, bucket in (
    (lt_5, 'Less than 5 hours'),
    (gt_8, 'More than 8 hours'),
):
    in_group = student_te['Sleep Duration'].isin(raw_labels)
    student_te.loc[in_group, 'Sleep Duration'] = bucket

is_to_8 = student_te['Sleep Duration'] == to_8
student_te.loc[is_to_8, 'Sleep Duration'] = '7-8 hours'

In [None]:
student_te['Sleep Duration'].value_counts()

In [None]:
# Store Sleep Duration as a pandas categorical column in the test split too.
student_te['Sleep Duration'] = student_te['Sleep Duration'].astype('category')

In [None]:
student_tr.info()

## Student: Dietary Habits

In [None]:
student_tr['Dietary Habits'].unique()

In [None]:
# The three accepted Dietary Habits labels; every other value (missing values
# included, since isin() is False for them) is overwritten with the most
# frequent label of the training split.
normalize = ['Healthy', 'Moderate', 'Unhealthy']
mode = student_tr['Dietary Habits'].mode()[0]

not_accepted = ~student_tr['Dietary Habits'].isin(normalize)
student_tr.loc[not_accepted, 'Dietary Habits'] = mode

In [None]:
# Same clean-up for the test split: values outside `normalize` are overwritten
# with the most frequent label of the test split itself.
mode = student_te['Dietary Habits'].mode()[0]
student_te.loc[~student_te['Dietary Habits'].isin(normalize), 'Dietary Habits'] = mode

In [None]:
student_tr.groupby('Dietary Habits').Depression.value_counts(normalize=True).unstack()

In [None]:
# Depression counts per Dietary Habits label (one row per label).
depression_vs_diet = student_tr.groupby('Dietary Habits').Depression.value_counts().unstack()

# Pairwise chi-square tests between dietary habits.
# chi2_contingency takes ONE contingency table; its second positional argument
# is `correction`. Passing the two rows as separate arguments therefore tests
# only the first row (a 1-D table with zero degrees of freedom, p always 1),
# so each pair of rows is stacked into a single 2-row table instead.
for first, second in (
    ('Healthy', 'Unhealthy'),
    ('Moderate', 'Healthy'),
    ('Moderate', 'Unhealthy'),
):
    pair_table = [depression_vs_diet.loc[first], depression_vs_diet.loc[second]]
    print(chi2_contingency(pair_table))

In [None]:
# Remove the Dietary Habits column from both student splits, in place.
for frame in (student_tr, student_te):
    frame.drop(columns='Dietary Habits', inplace=True)

In [None]:
student_tr.info()

## Student: Degree

In [None]:
# Remove the Degree column from both student splits, in place.
for frame in (student_tr, student_te):
    frame.drop(columns='Degree', inplace=True)

## Student: Suicide Thoughts

In [None]:
student_tr['Have you ever had suicidal thoughts ?'].unique()

In [None]:
student_te['Have you ever had suicidal thoughts ?'].unique()

In [None]:
# Encode the 'Yes'/'No' answers as booleans in both splits.
# Any value other than 'Yes'/'No' becomes NaN, as Series.map does for
# unmapped keys.
suicidal_col = 'Have you ever had suicidal thoughts ?'
yes_no_to_bool = {'No': False, 'Yes': True}
for frame in (student_tr, student_te):
    frame[suicidal_col] = frame[suicidal_col].map(yes_no_to_bool)

## Student Train: Study Hours

In [None]:
# Proportion of each Depression value within every Study Hours value.
dep_vs_s_hr = student_tr.groupby('Study Hours').Depression.value_counts(normalize=True).unstack()

In [None]:
dep_vs_s_hr.plot(kind='bar')

In [None]:
# Cluster Study Hours values by how similar their Depression counts are.
# Dissimilarity between two values is 1 - p, where p is the chi-square p-value
# of the 2-row contingency table built from their Depression counts.
from scipy.spatial.distance import squareform

# fill_value=0 keeps a value with no rows for one Depression outcome as a real
# zero count instead of NaN (NaN would propagate into the p-values).
depression_vs_s_hr = (
    student_tr.groupby('Study Hours').Depression.value_counts().unstack(fill_value=0)
)
s_hr = depression_vs_s_hr.index
n = len(s_hr)
disim_matrix = [[0 for i in range(n)] for j in range(n)]

# Only off-diagonal pairs are tested; a value compared with itself stays at 0.
for i in range(0, n):
    for j in range(i + 1, n):
        table = [
            depression_vs_s_hr.loc[s_hr[i]],
            depression_vs_s_hr.loc[s_hr[j]]
        ]
        _, p_val, _, _ = chi2_contingency(table)
        disim_matrix[i][j] = disim_matrix[j][i] = 1 - p_val

# linkage() treats a 2-D array as observation vectors, so the square matrix is
# condensed first to be interpreted as pairwise distances.
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; the method is kept as in the original - consider 'average'.
link_matrix = linkage(squareform(disim_matrix), method='ward')
_ = dendrogram(link_matrix, labels=s_hr, leaf_rotation=90)
plt.axhline(y=1, color='coral', linestyle='--')

Based on the clustering, study hours 11 and 12 are merged into one category, and 7 and 9 into another.

In [None]:
# Merge selected Study Hours values into shared text labels, then store the
# column as categorical.
# The column is cast to object first so the text labels can be written into it
# explicitly; writing strings into a numeric column relies on an implicit
# upcast that pandas has deprecated. The remaining values are left untouched.
student_tr['Study Hours'] = student_tr['Study Hours'].astype(object)
student_tr.loc[student_tr['Study Hours'].isin([11, 12]), 'Study Hours'] = 'timezone 1'
student_tr.loc[student_tr['Study Hours'].isin([7, 9]), 'Study Hours'] = 'timezone 2'

student_tr['Study Hours'] = student_tr['Study Hours'].astype('category')

In [None]:
# Merge selected Study Hours values of the test split into shared text labels,
# then store the column as categorical.
# The column is cast to object first so the text labels can be written into it
# explicitly; writing strings into a numeric column relies on an implicit
# upcast that pandas has deprecated. The remaining values are left untouched.
student_te['Study Hours'] = student_te['Study Hours'].astype(object)
student_te.loc[student_te['Study Hours'].isin([11, 12]), 'Study Hours'] = 'timezone 1'
student_te.loc[student_te['Study Hours'].isin([7, 9]), 'Study Hours'] = 'timezone 2'

student_te['Study Hours'] = student_te['Study Hours'].astype('category')

## Student: Financial Stress

In [None]:
student_tr['Financial Stress'].unique()

In [None]:
# Impute missing Financial Stress with the most frequent level.
# The filled column is assigned back: fillna(inplace=True) on a selected column
# is a chained in-place update that leaves the frame unchanged under pandas
# Copy-on-Write.
mode = student_tr['Financial Stress'].mode()[0]
student_tr['Financial Stress'] = student_tr['Financial Stress'].fillna(mode)

In [None]:
# Impute missing Financial Stress in the test split with its most frequent level.
# The filled column is assigned back: fillna(inplace=True) on a selected column
# is a chained in-place update that leaves the frame unchanged under pandas
# Copy-on-Write.
mode = student_te['Financial Stress'].mode()[0]
student_te['Financial Stress'] = student_te['Financial Stress'].fillna(mode)

In [None]:
# Depression counts per Financial Stress level, one column per Depression value.
dep_vs_finan = student_tr.groupby('Financial Stress').Depression.value_counts().unstack()

In [None]:
dep_vs_finan

In [None]:
dep_vs_finan.plot(kind='bar')

In [None]:
# Store Financial Stress as a pandas categorical column in both splits.
for frame in (student_tr, student_te):
    frame['Financial Stress'] = frame['Financial Stress'].astype('category')

## Student: Family History

In [None]:
student_tr['Family History of Mental Illness'].unique()

In [None]:
student_te['Family History of Mental Illness'].unique()

In [None]:
# Encode the 'Yes'/'No' family-history answers as booleans in both splits.
# Any value other than 'Yes'/'No' becomes NaN, as Series.map does for
# unmapped keys.
yes_no_to_bool = {'No': False, 'Yes': True}
for frame in (student_tr, student_te):
    frame['Family History of Mental Illness'] = (
        frame['Family History of Mental Illness'].map(yes_no_to_bool)
    )

## Student: Summary

In [None]:
student_tr.info()

In [None]:
student_te.info()

In [None]:
# One-hot encode both student splits; each split is encoded independently,
# from the values it contains.
student_tr, student_te = (
    pd.get_dummies(frame) for frame in (student_tr, student_te)
)

# Student: Baseline model

In [None]:
# pop() removes 'Depression' from student_tr in place and returns it as the
# target; this rebinds `y`, which previously held the working-professional target.
y = student_tr.pop('Depression')



In [None]:
# Baseline classifier for students, evaluated over 10 cross-validation folds.
# No `scoring` is passed, so the estimator's default scorer is used.
model_reg_s = LogisticRegression(max_iter=1000)
scores = cross_val_score(
    estimator=model_reg_s,
    X=student_tr,
    y=y,
    cv=10,
)

In [None]:
scores

That's not very satisfying...but let's wrap up our work for now

In [None]:
# Remove 'id' from the test features (in place) and keep it for the prediction table.
student_id = student_te.pop('id')

In [None]:
# Fit the baseline on the full student training set.
model_reg_s.fit(student_tr, y)

# student_tr and student_te were one-hot encoded separately, so their dummy
# columns can differ in membership or order. Reindex the test frame to the
# training columns: dummies missing from the test split are added as all-zero
# columns, and test-only dummies (levels the model never saw) are dropped.
student_te = student_te.reindex(columns=student_tr.columns, fill_value=0)
student_pred = model_reg_s.predict(student_te)

In [None]:
# Turn the working-professional id/prediction columns into a DataFrame so they
# can be concatenated with the student predictions.
result = pd.DataFrame(result)

In [None]:
# Student predictions, in the same two-column layout as `result`.
student_result = pd.DataFrame({'id': student_id, 'Depression': student_pred})

# Stack both prediction tables, order the rows by id and renumber the index.
temporal_result = (
    pd.concat([result, student_result])
    .sort_values(by='id')
    .reset_index(drop=True)
)

In [None]:
temporal_result.head()

In [None]:
# Write the combined predictions to disk without the row index.
temporal_result.to_csv('temp_submission.csv', index=False)