### Preprocessing

##### Read Data

In [1]:
# Read Data
import pandas as pd

# Both files are read from the current working directory.
# low_memory=False makes pandas parse each file in one pass, so a column's
# dtype is inferred from all of its rows rather than chunk by chunk.
train_path, test_path = 'train.csv', 'test.csv'
train = pd.read_csv(train_path, low_memory=False)
test = pd.read_csv(test_path, low_memory=False)

##### Split Working Professional and Student

In [2]:
# student and working

# Partition train and test by respondent type.
# `.copy()` makes every partition an independent DataFrame. The cells below
# assign whole columns on these frames; doing that on a bare filtered
# selection of train/test is what raises pandas' SettingWithCopyWarning.
ROLE_COLUMN = 'Working Professional or Student'

train_stu = train[train[ROLE_COLUMN] == 'Student'].copy()
train_wor = train[train[ROLE_COLUMN] == 'Working Professional'].copy()

test_stu = test[test[ROLE_COLUMN] == 'Student'].copy()
test_wor = test[test[ROLE_COLUMN] == 'Working Professional'].copy()


##### Replace Missing Value

In [3]:
# Replace Value with 0

# Columns overwritten with the constant 0 in the student frames and in the
# working-professional frames respectively.
# NOTE(review): presumably these are the questions answered only by the other
# group, so they would otherwise be missing here - confirm against the data.
student_zero_columns = ('Profession', 'Work Pressure', 'Job Satisfaction')
worker_zero_columns = ('Academic Pressure', 'CGPA', 'Study Satisfaction')

for frame in (train_stu, test_stu):
    for column in student_zero_columns:
        frame.loc[:, column] = 0

for frame in (train_wor, test_wor):
    for column in worker_zero_columns:
        frame.loc[:, column] = 0

In [4]:
# Replace Missing Value
import numpy as np
from sklearn import impute

# One imputer object, re-fitted on every frame it is applied to:
#   strategy='most_frequent'   -> NaN becomes the column's most frequent value
#   copy=False                 -> impute in place where scikit-learn is able to
#   keep_empty_features=True   -> columns that are entirely NaN are kept, so the
#                                 output has as many columns as the input
imp = impute.SimpleImputer(missing_values=np.nan, strategy='most_frequent', copy=False, keep_empty_features=True)


def _fill_missing(frame):
    """Fit `imp` on `frame` and return the imputed values as a DataFrame.

    The imputer returns a plain array, so the result is wrapped again with
    the index and column labels of `frame`.
    """
    filled = imp.fit_transform(frame)
    return pd.DataFrame(data=filled, index=frame.index, columns=frame.columns)


# Each of the four frames is imputed from its own value counts.
train_stu = _fill_missing(train_stu)
train_wor = _fill_missing(train_wor)

test_stu = _fill_missing(test_stu)
test_wor = _fill_missing(test_wor)

##### Replace Strange Value

In [5]:
# Replace Strange Value in train
#
# Every column is cleaned in up to two passes:
#   1. known misspellings / odd answers are rewritten to a canonical label
#      (`Series.replace` matches whole cell values here, not substrings);
#   2. values that are still not an accepted label are overwritten with the
#      column's most frequent value (`mode()[0]`).

sleep_levels = ['Less than 5 hours', '5-6 hours', '6-7 hours', '7-8 hours', 'More than 8 hours']
diet_levels = ['Unhealthy', 'Moderate', 'Healthy']

# ------------------------------------------------------------------ students

# City: fix the two recoverable spellings, then send the junk entries to the mode.
city = train_stu['City'].replace({'Less Delhi': 'Delhi', 'Less than 5 Kalyan': 'Kalyan'})
train_stu['City'] = city.replace(['3.0', 'City', 'M.Tech', 'ME', 'M.Com'], city.mode()[0])

sleep = train_stu['Sleep Duration'].replace({
    '1-2 hours': 'Less than 5 hours', '2-3 hours': 'Less than 5 hours',
    '4-5 hours': 'Less than 5 hours', 'than 5 hours': 'Less than 5 hours',
    '55-66 hours': '5-6 hours',
    '8 hours': '7-8 hours',
    '10-11 hours': 'More than 8 hours', '40-45 hours': 'More than 8 hours',
})
train_stu['Sleep Duration'] = sleep.where(sleep.isin(sleep_levels), sleep.mode()[0])

diet = train_stu['Dietary Habits'].replace({
    'Less Healthy': 'Unhealthy', 'No Healthy': 'Unhealthy',
    'Yes': 'Healthy',
})
train_stu['Dietary Habits'] = diet.where(diet.isin(diet_levels), diet.mode()[0])

degree = train_stu['Degree'].replace({
    'BArch': 'B.Arch', 'LL B.Ed': 'B.Ed', 'BPharm': 'B.Pharm', 'BSc': 'B.Sc',
    'BA': 'B.A', 'BBA': 'B.BA', 'BE': 'B.E', 'Class 11': 'Class 12',
    'MA': 'M.A', 'MBA': 'M.BA', 'ME': 'M.E', 'MSc': 'M.Sc',
    'BCA': 'B.CA', 'LLB': 'LL.B', 'BHM': 'B.HM', 'MCA': 'M.CA',
    'MD': 'M.D', 'LLM': 'LL.M', 'MHM': 'M.HM', 'MPA': 'M.PA',
    'BH': 'B.H', 'BPA': 'B.PA',
})
student_degrees = ['B.Arch', 'B.Com', 'B.Ed', 'B.Pharm', 'B.Sc', 'B.Tech', 'B.A', 'B.BA', 'B.E', 'Class 12',
                   'M.Com', 'M.Ed', 'M.Pharm', 'M.Tech', 'M.A', 'M.BA', 'M.E', 'M.Sc', 'B.CA', 'PhD', 'LL.B',
                   'B.HM', 'M.CA', 'M.D', 'MBBS', 'LL.M', 'M.HM', 'M.PA', 'B.H', 'B.PA']
train_stu['Degree'] = degree.where(degree.isin(student_degrees), degree.mode()[0])

# ----------------------------------------------------- working professionals

# City: nothing to re-spell here, the junk entries go straight to the mode.
city = train_wor['City']
train_wor['City'] = city.replace(['City', 'M.Com', 'MSc', 'MCA', 'No', 'Researcher'], city.mode()[0])

# Profession: the mode is taken BEFORE the spelling fix, so the fallback value
# is chosen from the raw counts (same order as the original cell).
profession_mode = train_wor['Profession'].mode()[0]
train_wor['Profession'] = (
    train_wor['Profession']
    .replace('Finanancial Analyst', 'Financial Analyst')
    .replace(['Yogesh', 'Dev', 'Profession', 'FamilyVirar', 'Working Professional', 'Patna',
              'Unveil', 'Nagpur', 'Moderate', 'Pranav', 'Visakhapatnam', 'Yuvraj'],
             profession_mode)
)

sleep = train_wor['Sleep Duration'].replace({
    '1-3 hours': 'Less than 5 hours', '2-3 hours': 'Less than 5 hours',
    '3-4 hours': 'Less than 5 hours', '4-5 hours': 'Less than 5 hours',
    '1-6 hours': 'Less than 5 hours', '3-6 hours': 'Less than 5 hours',
    '4-6 hours': '5-6 hours', '35-36 hours': '5-6 hours',
    '45-48 hours': '6-7 hours', '49 hours': '6-7 hours',
    '6-8 hours': '7-8 hours', '9-5 hours': '7-8 hours', '10-6 hours': '7-8 hours',
    '8-9 hours': 'More than 8 hours', '9-11 hours': 'More than 8 hours',
    '9-6 hours': 'More than 8 hours',
})
train_wor['Sleep Duration'] = sleep.where(sleep.isin(sleep_levels), sleep.mode()[0])

diet = train_wor['Dietary Habits'].replace({
    'More Healthy': 'Healthy', 'Yes': 'Healthy',
    'No': 'Unhealthy',
})
train_wor['Dietary Habits'] = diet.where(diet.isin(diet_levels), diet.mode()[0])

degree = train_wor['Degree'].replace({
    'B.B.Arch': 'B.Arch', 'BArch': 'B.Arch', 'B B.Com': 'B.Com',
    'BEd': 'B.Ed', 'LL B.Ed': 'B.Ed', 'BPharm': 'B.Pharm',
    'BSc': 'B.Sc', 'Data Scientist': 'B.Sc', 'BA': 'B.A',
    'B BA': 'B.BA', 'BBA': 'B.BA', 'Business Analyst': 'B.BA',
    'BCA': 'B.CA', 'BE': 'B.E', 'BHM': 'B.HM',
    'LLB': 'LL.B', 'LLM': 'LL.M', 'M. Business Analyst': 'M.BA',
    'MEd': 'M.Ed', 'MPharm': 'M.Pharm', 'M.S': 'M.Sc', 'MSc': 'M.Sc',
    'M_Tech': 'M.Tech', 'MTech': 'M.Tech', 'MA': 'M.A', 'MBA': 'M.BA',
    'MCA': 'M.CA', 'MD': 'M.D', 'ME': 'M.E', 'MHM': 'M.HM',
})
# 'M.HM' is accepted here: 'MHM' is normalised to it just above, and without
# this entry every such row would be overwritten by the mode straight away.
worker_degrees = ['B.Arch', 'B.Com', 'B.Ed', 'B.Pharm', 'B.Sc', 'B.Tech', 'B.A', 'B.BA', 'B.CA',
                  'B.E', 'B.HM', 'Class 12', 'LL.B', 'LL.M', 'M.Arch', 'M.Com', 'M.Ed',
                  'M.Pharm', 'M.Sc', 'M.Tech', 'M.A', 'M.BA', 'MBBS', 'M.CA', 'M.D', 'M.E',
                  'M.HM', 'PhD']
train_wor['Degree'] = degree.where(degree.isin(worker_degrees), degree.mode()[0])


In [6]:
# Replace Strange Value in test
#
# Same two-pass clean-up per column as for the training frames: rewrite the
# known odd spellings to a canonical label, then overwrite whatever is still
# not an accepted label with the column's most frequent value.
import pandas

pandas.set_option("future.no_silent_downcasting", True)

sleep_levels = ['Less than 5 hours', '5-6 hours', '6-7 hours', '7-8 hours', 'More than 8 hours']
diet_levels = ['Unhealthy', 'Moderate', 'Healthy']

# ------------------------------------------------------------------ students

test_stu['Age'] = test_stu['Age'].replace(32.08, 32)

# City: junk entries go to the mode of the column as it stands.
city = test_stu['City']
test_stu['City'] = city.replace(['City', 'Less than 5 hours'], city.mode()[0])

sleep = test_stu['Sleep Duration'].replace({
    '1-6 hours': 'Less than 5 hours', '2-3 hours': 'Less than 5 hours',
    '4-5 hours': 'Less than 5 hours', '3-6 hours': 'Less than 5 hours',
    '20-21 hours': 'Less than 5 hours',
    '4-6 hours': '5-6 hours', '6 hours': '5-6 hours',
    '50-75 hours': '7-8 hours', '9-5 hours': '7-8 hours', '10-6 hours': '7-8 hours',
    '8-9 hours': 'More than 8 hours', '60-65 hours': 'More than 8 hours',
    '9-6 hours': 'More than 8 hours', '9-10 hours': 'More than 8 hours',
    '9-11 hours': 'More than 8 hours',
})
test_stu['Sleep Duration'] = sleep.where(sleep.isin(sleep_levels), sleep.mode()[0])

diet = test_stu['Dietary Habits'].replace({
    '5 Unhealthy': 'Unhealthy', 'Less Healthy': 'Unhealthy', 'No': 'Unhealthy',
    'More Healthy': 'Healthy',
})
test_stu['Dietary Habits'] = diet.where(diet.isin(diet_levels), diet.mode()[0])

degree = test_stu['Degree'].replace({
    'BPharm': 'B.Pharm', 'BA': 'B.A', 'BBA': 'B.BA', 'BCA': 'B.CA',
    'BE': 'B.E', 'BHM': 'B.HM', 'BSc': 'B.Sc',
    'LLB': 'LL.B', 'LLM': 'LL.M',
    'M.B.Ed': 'M.Ed', 'MPharm': 'M.Pharm', 'MA': 'M.A', 'MBA': 'M.BA',
    'MCA': 'M.CA', 'MD': 'M.D', 'ME': 'M.E', 'MHM': 'M.HM', 'MSc': 'M.Sc',
})
student_degrees = ['B.Arch', 'B.Com', 'B.Ed', 'B.Pharm', 'B.Tech', 'B.A', 'B.BA', 'B.CA', 'B.E', 'B.HM',
                   'B.Sc', 'Class 12', 'LL.B', 'LL.M', 'M.Arch', 'M.Ed', 'M.Com', 'M.Pharm', 'M.Tech',
                   'M.A', 'M.BA', 'M.CA', 'M.D', 'M.E', 'M.HM', 'M.Sc', 'PhD', 'MBBS', 'M.UI']
test_stu['Degree'] = degree.where(degree.isin(student_degrees), degree.mode()[0])

# ----------------------------------------------------- working professionals

# City: fix the recoverable spellings first, then take the mode and use it for the junk entries.
city = test_wor['City'].replace({'Less Delhi': 'Delhi', 'More Delhi': 'Delhi'})
test_wor['City'] = city.replace(
    ['City', 'Lawyer', 'Chemist', 'No', 'No.12', 'Unhealthy', '24th', 'Name'],
    city.mode()[0],
)

# Profession: the mode is taken before the spelling fix is applied.
profession_mode = test_wor['Profession'].mode()[0]
test_wor['Profession'] = (
    test_wor['Profession']
    .replace('Finanancial Analyst', 'Financial Analyst')
    .replace(['Unhealthy', 'Working Professional', '24th', 'Manvi', 'Yogesh', 'Samar',
              'Surat', 'Name', 'Simran', 'Profession', 'No', 'Unveil'],
             profession_mode)
)

sleep = test_wor['Sleep Duration'].replace({
    'than 5 hours': 'Less than 5 hours', '1-2 hours': 'Less than 5 hours',
    '1-3 hours': 'Less than 5 hours', '2-3 hours': 'Less than 5 hours',
    '3-4 hours': 'Less than 5 hours', '4-5 hours': 'Less than 5 hours',
    '3-6 hours': 'Less than 5 hours', '20-21 hours hours hours': 'Less than 5 hours',
    '4-6 hours': '5-6 hours', '6 hours': '5-6 hours',
    '9-5 hours': '7-8 hours', '10-6 hours': '7-8 hours', '50-75 hours': '7-8 hours',
    '8-9 hours': 'More than 8 hours', '8-89 hours hours': 'More than 8 hours',
    '9-10 hours': 'More than 8 hours', '9-11 hours': 'More than 8 hours',
})
test_wor['Sleep Duration'] = sleep.where(sleep.isin(sleep_levels), sleep.mode()[0])

diet = test_wor['Dietary Habits'].replace({
    'Less Healthy': 'Unhealthy', 'No': 'Unhealthy', '5 Unhealthy': 'Unhealthy',
})
test_wor['Dietary Habits'] = diet.where(diet.isin(diet_levels), diet.mode()[0])

degree = test_wor['Degree'].replace({
    'BTech': 'B.Tech', 'B B.Tech': 'B.Tech', 'B Financial Analyst': 'B.F',
    'B._Pharm': 'B.Pharm', 'BPharm': 'B.Pharm', 'BArch': 'B.Arch',
    'BBA': 'B.BA', 'Business Analyst': 'B.BA', 'BCA': 'B.CA', 'B BCA': 'B.CA',
    'B_Com': 'B.Com', 'BEd': 'B.Ed', 'BH': 'B.H', 'B.M.Com': 'M.Com',
    'BSc': 'B.Sc', 'Mechanical Engineer': 'B.Sc', 'BA': 'B.A', 'BE': 'B.E',
    'BHCA': 'B.HCA', 'BHM': 'B.HM', 'LLB': 'LL.B', 'LLM': 'LL.M',
    'M.M.Ed': 'M.Ed', 'MPharm': 'M.Pharm', 'MA': 'M.A', 'MBA': 'M.BA',
    'MCA': 'M.CA', 'MD': 'M.D', 'ME': 'M.E', 'MHM': 'M.HM', 'MSc': 'M.Sc',
})
worker_degrees = ['B.Arch', 'B.Com', 'B.Ed', 'B.Pharm', 'B.Tech', 'B.A', 'B.BA', 'B.CA', 'B.E', 'B.F',
                  'B.H', 'B.HCA', 'B.HM', 'B.Sc', 'Class 12', 'LL.B', 'LL.M', 'M.Arch', 'M.Ed', 'M.Com',
                  'M.Pharm', 'M.Tech', 'M.A', 'M.BA', 'M.CA', 'M.D', 'M.E', 'M.HM', 'M.Sc', 'PhD', 'MBBS']
test_wor['Degree'] = degree.where(degree.isin(worker_degrees), degree.mode()[0])


##### Categorical to Integer

In [9]:
# Gender

# Integer code per gender label. Series.map turns any value that is not a
# key of the dict into NaN.
gender_codes = {'Male': 1, 'Female': 2}

for frame in (train_stu, train_wor, test_stu, test_wor):
    frame['Gender'] = frame['Gender'].map(gender_codes)

In [10]:
# City

# Integer code for every City value that is kept after cleaning.
# Series.map turns any value that is not a key of this dict into NaN.
city_codes = {
    'Aaradhya': 1, 'Aditi': 2, 'Aditya': 3, 'Abhinav': 4, 'Agra': 5,
    'Ahmedabad': 6, 'Aishwarya': 7, 'Anvi': 8, 'Armaan': 9, 'Atharv': 10,
    'Ayansh': 11, 'Ayush': 12, 'Bangalore': 13, 'Bhavna': 14, 'Bhopal': 15,
    'Chennai': 16, 'Chhavi': 17, 'Delhi': 18, 'Dhruv': 19, 'Faridabad': 20,
    'Galesabad': 21, 'Gaurav': 22, 'Ghaziabad': 23, 'Ghopal': 24, 'Golkata': 25,
    'Gurgaon': 26, 'Harsh': 27, 'Harsha': 28, 'Hrithik': 29, 'Hyderabad': 30,
    'Indore': 31, 'Ira': 32, 'Ishanabad': 33, 'Ishkarsh': 34, 'Ithal': 35,
    'Itheg': 36, 'Ivaan': 37, 'Is Kanpur': 38, 'Jaipur': 39, 'Jhanvi': 40,
    'Kagan': 41, 'Kalyan': 42, 'Kanpur': 43, 'Kashish': 44, 'Kashk': 45,
    'Keshav': 46, 'Khaziabad': 47, 'Khushi': 48, 'Kibara': 49, 'Kolkata': 50,
    'Krinda': 51, 'Krishna': 52, 'Leela': 53, 'Lucknow': 54, 'Ludhiana': 55,
    'Mahi': 56, 'Malyan': 57, 'Malyansh': 58, 'Meerut': 59, 'Mhopal': 60,
    'Mihir': 61, 'Mira': 62, 'Molkata': 63, 'Moreadhyay': 64, 'Morena': 65,
    'Mumbai': 66, 'Nagpur': 67, 'Nalini': 68, 'Nalyan': 69, 'Nandini': 70,
    'Nashik': 71, 'Parth': 72, 'Patna': 73, 'Plata': 74, 'Pooja': 75,
    'Pratham': 76, 'Pratyush': 77, 'Pune': 78, 'Raghavendra': 79, 'Rajkot': 80,
    'Rashi': 81, 'Reyansh': 82, 'Rolkata': 83, 'Saanvi': 84, 'San Vasai-Virar': 85,
    'Sara': 86, 'Saurav': 87, 'Siddhesh': 88, 'Shrey': 89, 'Srinagar': 90,
    'Surat': 91, 'Thane': 92, 'Thani': 93, 'Tolkata': 94, 'Tushar': 95,
    'Unaly': 96, 'Unirar': 97, 'Vaanya': 98, 'Vadodara': 99, 'Vaikot': 100,
    'Vaishnavi': 101, 'Varanasi': 102, 'Vasai-Virar': 103, 'Vidhi': 104, 'Vidya': 105,
    'Vikram': 106, 'Visakhapatnam': 107, 'Avni': 108,
}

for frame in (train_stu, train_wor, test_stu, test_wor):
    frame['City'] = frame['City'].map(city_codes)

In [11]:
# Profession

# Integer code per Profession label. Series.map turns any value that is not
# a key of this dict into NaN.
#
# NOTE(review): the original literal listed 'HR Manager' twice (30 and 50).
# Python keeps the last value of a repeated key, so 50 has always been the
# effective code; the shadowed 30 entry is removed and code 30 is unused.
# Code 50 may have been meant for a different profession - confirm.
mapping = {
    'Academic': 1, 'Accountant': 2, 'Analyst': 3, 'Architect': 4, 'B.Com': 5,
    'B.Ed': 6, 'BBA': 7, 'BCA': 8, 'BE': 9, 'Business Analyst': 10,
    'Chef': 11, 'Chemist': 12, 'City Consultant': 13, 'City Manager': 14, 'Civil Engineer': 15,
    'Consultant': 16, 'Content Writer': 17, 'Customer Support': 18, 'Data Scientist': 19, 'M.Pharm': 20,
    'Digital Marketer': 21, 'Doctor': 22, 'Educational Consultant': 23, 'Electrician': 24, 'Entrepreneur': 25,
    'Family Consultant': 26, 'FamilyVirar': 27, 'Financial Analyst': 28, 'Graphic Designer': 29,
    'Investment Banker': 31, 'Judge': 32, 'Lawyer': 33, 'LLM': 34, 'M.Ed': 35,
    'Manager': 36, 'Marketing Manager': 37, 'MBA': 38, 'MBBS': 39, 'Mechanical Engineer': 40,
    'Medical Doctor': 41, 'MCA': 42, 'MD': 43, 'ME': 44, 'Pharmacist': 45,
    'PhD': 46, 'Pilot': 47, 'Plumber': 48, 'B.Pharm': 49, 'HR Manager': 50,
    'Research Analyst': 51, 'Researcher': 52, 'Sales Executive': 53, 'Software Engineer': 54, 'Student': 55,
    'Teacher': 56, 'Travel Consultant': 57, 'Unemployed': 58, 'Surgeon': 59, 'UX/UI Designer': 60,
}

# Only the working-professional frames are encoded in this cell; the student
# frames are not touched here.
train_wor['Profession'] = train_wor['Profession'].map(mapping)
test_wor['Profession'] = test_wor['Profession'].map(mapping)

In [12]:
# Working Professional or Student

# Integer code per respondent type; values that are not a key become NaN.
role_column = 'Working Professional or Student'
role_codes = {'Working Professional': 1, 'Student': 2}

for frame in (train_stu, train_wor, test_stu, test_wor):
    frame[role_column] = frame[role_column].map(role_codes)

In [13]:
# Sleep Duration

# The five accepted labels, shortest to longest, coded 1..5 in that order.
# Values that are not one of these labels become NaN.
sleep_order = ['Less than 5 hours', '5-6 hours', '6-7 hours', '7-8 hours', 'More than 8 hours']
sleep_codes = {label: rank for rank, label in enumerate(sleep_order, start=1)}

for frame in (train_stu, train_wor, test_stu, test_wor):
    frame['Sleep Duration'] = frame['Sleep Duration'].map(sleep_codes)

In [14]:
# Dietary Habits

# The three accepted labels coded 0..2 in the order listed.
# Values that are not one of these labels become NaN.
diet_order = ['Unhealthy', 'Moderate', 'Healthy']
diet_codes = {label: rank for rank, label in enumerate(diet_order)}

for frame in (train_stu, train_wor, test_stu, test_wor):
    frame['Dietary Habits'] = frame['Dietary Habits'].map(diet_codes)

In [15]:
# Degree

# Integer code per canonical Degree label.
# Series.map turns any value that is not a key of this dict into NaN.
degree_codes = {
    'B.A': 1, 'B.Arch': 2, 'B.BA': 3, 'B.CA': 4, 'B.Com': 5,
    'B.E': 6, 'B.Ed': 7, 'B.F': 8, 'B.H': 9, 'B.HCA': 10,
    'B.HM': 11, 'B.PA': 12, 'B.Pharm': 13, 'B.Sc': 14, 'B.Tech': 15,
    'Class 12': 16, 'LL.B': 17, 'LL.M': 18, 'M.A': 19, 'M.Arch': 20,
    'M.BA': 21, 'M.CA': 22, 'M.Com': 23, 'M.D': 24, 'M.E': 25,
    'M.Ed': 26, 'M.HM': 27, 'M.PA': 28, 'M.Pharm': 29, 'M.Sc': 30,
    'M.Tech': 31, 'M.UI': 32, 'MBBS': 33, 'PhD': 34,
}

for frame in (train_stu, train_wor, test_stu, test_wor):
    frame['Degree'] = frame['Degree'].map(degree_codes)

In [16]:
# Have you ever had suicidal thoughts ?

# Yes/No answer coded as 1/0; any other value becomes NaN.
thoughts_column = 'Have you ever had suicidal thoughts ?'
yes_no_codes = {'No': 0, 'Yes': 1}

for frame in (train_stu, train_wor, test_stu, test_wor):
    frame[thoughts_column] = frame[thoughts_column].map(yes_no_codes)

In [17]:
# Family History of Mental Illness

# Yes/No answer coded as 1/0; any other value becomes NaN.
history_column = 'Family History of Mental Illness'
history_codes = {'No': 0, 'Yes': 1}

for frame in (train_stu, train_wor, test_stu, test_wor):
    frame[history_column] = frame[history_column].map(history_codes)

##### Integer data to csv

In [18]:
# Write the four encoded frames to the working directory.
# index=False keeps the DataFrame row index out of the files.
for path, frame in (
    ('int_train_stu.csv', train_stu),
    ('int_train_wor.csv', train_wor),
    ('int_test_stu.csv', test_stu),
    ('int_test_wor.csv', test_wor),
):
    frame.to_csv(path, index=False)

In [1]:
# Read Data
import pandas as pd

# Reload the four encoded frames from the CSV files in the working directory.
train_stu = pd.read_csv('int_train_stu.csv', low_memory=False)
train_wor = pd.read_csv('int_train_wor.csv', low_memory=False)

test_stu = pd.read_csv('int_test_stu.csv', low_memory=False)
test_wor = pd.read_csv('int_test_wor.csv', low_memory=False)

### Classification

In [3]:
# Separate X and Y

# 'Name' is removed from all four frames.
train_stu, train_wor, test_stu, test_wor = (
    frame.drop(columns="Name")
    for frame in (train_stu, train_wor, test_stu, test_wor)
)

# X is every column except the last; Y is the last column, stored as a
# pandas categorical.
# NOTE(review): presumably the last column is the 'Depression' target - confirm.
train_stu_X, train_stu_Y = train_stu.iloc[:, :-1], train_stu.iloc[:, -1].astype('category')
train_wor_X, train_wor_Y = train_wor.iloc[:, :-1], train_wor.iloc[:, -1].astype('category')

##### KNeighborsClassifier

In [None]:
# Test parameters
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier
from time import perf_counter

# Hyper-parameters under test. They stay plain notebook variables because
# the "Output to csv" cell builds the result file name from them.
nVal = 18
weight = 'distance'
algor = 'kd_tree'
leaf = 20
pVal = 1
met = 'minkowski'
para = None
job = 2
KneighClf = KNeighborsClassifier(n_neighbors=nVal, weights=weight, algorithm=algor, \
                                 leaf_size=leaf, p=pVal, metric=met, \
                                 metric_params=para, n_jobs=job)

start_time = perf_counter()

# `fit` returns the estimator it was called on. Fitting KneighClf twice would
# therefore leave Kclf_stu and Kclf_wor as two names for one object that only
# holds the second (worker) fit. The student model is fitted on an unfitted
# clone with the same hyper-parameters so that it remains a separate model;
# KneighClf itself still ends up fitted on the worker data, as before.
#
# iloc[:, 1:] leaves the first column out of the features.
# NOTE(review): presumably that first column is 'id' - confirm.
Kclf_stu = clone(KneighClf).fit(train_stu_X.iloc[:,1:], train_stu_Y)
testX = test_stu
predictKclf = Kclf_stu.predict(testX.iloc[:,1:])
result1 = pd.DataFrame(testX['id'])
result1['Depression'] = predictKclf

Kclf_wor = KneighClf.fit(train_wor_X.iloc[:,1:], train_wor_Y)
testX = test_wor
predictKclf = Kclf_wor.predict(testX.iloc[:,1:])
result2 = pd.DataFrame(testX['id'])
result2['Depression'] = predictKclf

# Stack the student and worker predictions and order the rows by id.
result = pd.concat([result1, result2], ignore_index=True)
sorted_df = result.sort_values(by='id')

end_time = perf_counter()
execution_time = end_time - start_time
print(f"executeion time：{execution_time:.4f} s")

executeion time：15.5738 s


In [169]:
# Output to csv
# The file name is the comma-separated list of hyper-parameter values,
# in the order they were passed to KNeighborsClassifier.
hyper_values = (nVal, weight, algor, leaf, pVal, met, para, job)
fname = ','.join(str(value) for value in hyper_values) + '.csv'
sorted_df.to_csv(fname, index=False)


In [156]:
# drop attributes containing 0 values only
from sklearn.neighbors import KNeighborsClassifier
from time import perf_counter

nVal = 18            # n_neighbors
weight = 'distance'  # weights
algor = 'kd_tree'    # algorithm
leaf = 20            # leaf_size
pVal = 2             # p
met = 'minkowski'    # metric
para = None          # metric_params
job = None           # n_jobs
KneighClf = KNeighborsClassifier(n_neighbors=nVal, weights=weight, algorithm=algor,
                                 leaf_size=leaf, p=pVal, metric=met,
                                 metric_params=para, n_jobs=job)

start_time = perf_counter()

# Columns that the preprocessing step filled with 0 for the whole group:
# work-related columns for students, study-related columns for workers.
# All three columns of a group are dropped in ONE call. The previous version
# restarted from the un-dropped frame on every line, so only the last column
# of each group ("Job Satisfaction" / "Study Satisfaction") was removed.
stu_zero_cols = ["Profession", "Work Pressure", "Job Satisfaction"]
wor_zero_cols = ["Academic Pressure", "CGPA", "Study Satisfaction"]

train_stu_X_d = train_stu_X.drop(columns=stu_zero_cols)
train_wor_X_d = train_wor_X.drop(columns=wor_zero_cols)
test_stu_d = test_stu.drop(columns=stu_zero_cols)
test_wor_d = test_wor.drop(columns=wor_zero_cols)

# Students: fit on every remaining column except the first one (presumably
# `id` - confirm), then predict the student test rows.
Kclf_stu = KneighClf.fit(train_stu_X_d.iloc[:, 1:], train_stu_Y)
testX = test_stu_d
predictKclf = Kclf_stu.predict(testX.iloc[:, 1:])
result1 = pd.DataFrame(testX['id'])
result1['Depression'] = predictKclf

# Working professionals: same procedure. fit() returns the estimator itself,
# so Kclf_wor is the same object as Kclf_stu, now refit on the worker split;
# the student predictions were already taken above.
Kclf_wor = KneighClf.fit(train_wor_X_d.iloc[:, 1:], train_wor_Y)
testX = test_wor_d
predictKclf = Kclf_wor.predict(testX.iloc[:, 1:])
result2 = pd.DataFrame(testX['id'])
result2['Depression'] = predictKclf

# Stack both groups and order the rows by id.
result = pd.concat([result1, result2], ignore_index=True)
sorted_df = result.sort_values(by='id')

end_time = perf_counter()
execution_time = end_time - start_time
print(f"executeion time：{execution_time:.4f} s")

# File name: 'drop_' prefix followed by the comma-separated hyper-parameters.
fname = 'drop_' + str(nVal) + ',' + str(weight) + ',' + str(algor) + ',' \
        + str(leaf) + ',' + str(pVal) + ',' + str(met) \
        + ',' + str(para) + ',' + str(job) + '.csv'
sorted_df.to_csv(fname, index=False)

executeion time：9.2337 s


In [216]:
# try to drop one more attribute
from sklearn.neighbors import KNeighborsClassifier

nVal = 18            # n_neighbors
weight = 'distance'  # weights
algor = 'kd_tree'    # algorithm
leaf = 20            # leaf_size
pVal = 2             # p
met = 'minkowski'    # metric
para = None          # metric_params
job = None           # n_jobs
KneighClf = KNeighborsClassifier(
    n_neighbors=nVal,
    weights=weight,
    algorithm=algor,
    leaf_size=leaf,
    p=pVal,
    metric=met,
    metric_params=para,
    n_jobs=job,
)

# Extra attribute removed in this experiment. Only the student split loses
# one; drop_wor is empty and no extra column is dropped for workers.
drop_stu = "Work/Study Hours"
drop_wor = ""

# Full list of columns removed per group: the baseline columns plus the
# group indicator, plus the experiment-specific column for students.
stu_dropped = ["Profession", "Work Pressure", "Job Satisfaction",
               "Working Professional or Student", drop_stu]
wor_dropped = ["Academic Pressure", "CGPA", "Study Satisfaction",
               "Working Professional or Student"]

train_stu_X_d = train_stu_X.drop(columns=stu_dropped)
test_stu_d = test_stu.drop(columns=stu_dropped)
train_wor_X_d = train_wor_X.drop(columns=wor_dropped)
test_wor_d = test_wor.drop(columns=wor_dropped)

# Students: fit on everything after the first column, predict the test rows.
Kclf_stu = KneighClf.fit(train_stu_X_d.iloc[:, 1:], train_stu_Y)
result1 = test_stu_d[['id']].assign(
    Depression=Kclf_stu.predict(test_stu_d.iloc[:, 1:])
)

# Working professionals: the same estimator object is refit on this split
# (fit() returns the estimator itself), after the student predictions exist.
Kclf_wor = KneighClf.fit(train_wor_X_d.iloc[:, 1:], train_wor_Y)
result2 = test_wor_d[['id']].assign(
    Depression=Kclf_wor.predict(test_wor_d.iloc[:, 1:])
)

result = pd.concat([result1, result2], ignore_index=True)
sorted_df = result.sort_values(by='id')

# The student part of the file name is the literal "Study Hours", not
# drop_stu (presumably because drop_stu contains '/', which a file name
# cannot hold - confirm).
fname = ''.join(['drop_stu_', "Study Hours", '_wor_', str(drop_wor), '.csv'])
sorted_df.to_csv(fname, index=False)

In [21]:
# try to drop two to four more attributes
from sklearn.neighbors import KNeighborsClassifier

nVal = 18            # n_neighbors
weight = 'distance'  # weights
algor = 'kd_tree'    # algorithm
leaf = 20            # leaf_size
pVal = 2             # p
met = 'minkowski'    # metric
para = None          # metric_params
job = None           # n_jobs
KneighClf = KNeighborsClassifier(
    n_neighbors=nVal,
    weights=weight,
    algorithm=algor,
    leaf_size=leaf,
    p=pVal,
    metric=met,
    metric_params=para,
    n_jobs=job,
)

# Extra attributes removed in this experiment: two per group.
drop_stu1 = "City"
drop_stu2 = "Degree"
drop_wor1 = "Profession"
drop_wor2 = "Degree"

# Full list of columns removed per group: the baseline columns, the group
# indicator, and the two experiment-specific columns.
stu_dropped = ["Profession", "Work Pressure", "Job Satisfaction",
               "Working Professional or Student", drop_stu1, drop_stu2]
wor_dropped = ["Academic Pressure", "CGPA", "Study Satisfaction",
               "Working Professional or Student", drop_wor1, drop_wor2]

train_stu_X_d = train_stu_X.drop(columns=stu_dropped)
test_stu_d = test_stu.drop(columns=stu_dropped)
train_wor_X_d = train_wor_X.drop(columns=wor_dropped)
test_wor_d = test_wor.drop(columns=wor_dropped)

# Students: fit on everything after the first column, predict the test rows.
Kclf_stu = KneighClf.fit(train_stu_X_d.iloc[:, 1:], train_stu_Y)
result1 = test_stu_d[['id']].assign(
    Depression=Kclf_stu.predict(test_stu_d.iloc[:, 1:])
)

# Working professionals: the same estimator object is refit on this split
# (fit() returns the estimator itself), after the student predictions exist.
Kclf_wor = KneighClf.fit(train_wor_X_d.iloc[:, 1:], train_wor_Y)
result2 = test_wor_d[['id']].assign(
    Depression=Kclf_wor.predict(test_wor_d.iloc[:, 1:])
)

result = pd.concat([result1, result2], ignore_index=True)
sorted_df = result.sort_values(by='id')

# File name lists the extra dropped columns of each group. The pieces are
# kept exactly as before, including the '_' + '_wor_' pair that yields a
# double underscore.
fname = ''.join([
    'drop_stu_', str(drop_stu1), '_', str(drop_stu2), '_',
    '_wor_', str(drop_wor1), '_', str(drop_wor2),
    '.csv',
])
sorted_df.to_csv(fname, index=False)

In [32]:
# try to drop five more attributes
from sklearn.neighbors import KNeighborsClassifier

nVal = 18            # n_neighbors
weight = 'distance'  # weights
algor = 'kd_tree'    # algorithm
leaf = 20            # leaf_size
pVal = 2             # p
met = 'minkowski'    # metric
para = None          # metric_params
job = None           # n_jobs
KneighClf = KNeighborsClassifier(
    n_neighbors=nVal,
    weights=weight,
    algorithm=algor,
    leaf_size=leaf,
    p=pVal,
    metric=met,
    metric_params=para,
    n_jobs=job,
)

# Extra attributes removed in this experiment: two for students, three for
# working professionals.
drop_stu1 = "City"
drop_stu2 = "Degree"
drop_wor1 = "City"
drop_wor2 = "Degree"
drop_wor3 = "Profession"

# Full list of columns removed per group: the baseline columns, the group
# indicator, and the experiment-specific columns.
stu_dropped = ["Profession", "Work Pressure", "Job Satisfaction",
               "Working Professional or Student", drop_stu1, drop_stu2]
wor_dropped = ["Academic Pressure", "CGPA", "Study Satisfaction",
               "Working Professional or Student",
               drop_wor1, drop_wor2, drop_wor3]

train_stu_X_d = train_stu_X.drop(columns=stu_dropped)
test_stu_d = test_stu.drop(columns=stu_dropped)
train_wor_X_d = train_wor_X.drop(columns=wor_dropped)
test_wor_d = test_wor.drop(columns=wor_dropped)

# Students: fit on everything after the first column, predict the test rows.
Kclf_stu = KneighClf.fit(train_stu_X_d.iloc[:, 1:], train_stu_Y)
result1 = test_stu_d[['id']].assign(
    Depression=Kclf_stu.predict(test_stu_d.iloc[:, 1:])
)

# Working professionals: the same estimator object is refit on this split
# (fit() returns the estimator itself), after the student predictions exist.
Kclf_wor = KneighClf.fit(train_wor_X_d.iloc[:, 1:], train_wor_Y)
result2 = test_wor_d[['id']].assign(
    Depression=Kclf_wor.predict(test_wor_d.iloc[:, 1:])
)

result = pd.concat([result1, result2], ignore_index=True)
sorted_df = result.sort_values(by='id')

# File name lists the extra dropped columns of each group. The pieces are
# kept exactly as before, including the '_' + '_wor_' pair that yields a
# double underscore.
fname = ''.join([
    'drop3_stu_', str(drop_stu1), '_', str(drop_stu2), '_',
    '_wor_', str(drop_wor1), '_', str(drop_wor2), '_',
    str(drop_wor3), '.csv',
])
sorted_df.to_csv(fname, index=False)