In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from functions_preprocessing import printing_column, show_invalid_entries, replacing_invalid

In [2]:
## Read the student depression dataset from the CSV file (path is relative to the working directory)
df = pd.read_csv("student_depression_dataset.csv")

Since the **id** column contains only unique values and has no relevance to the analysis, we can drop it.

In [3]:
## Remove the id column, then preview the first rows to confirm it is gone
df = df.drop(columns=['id'])
df.head()

Unnamed: 0,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,'5-6 hours',Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,'5-6 hours',Moderate,BSc,No,3.0,2.0,Yes,0
2,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,'Less than 5 hours',Healthy,BA,No,9.0,1.0,Yes,0
3,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,'7-8 hours',Moderate,BCA,Yes,4.0,5.0,Yes,1
4,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,'5-6 hours',Moderate,M.Tech,Yes,1.0,1.0,No,0


In [4]:
## Print the unique values of every column so that odd or invalid entries can be spotted by eye.
## printing_column is imported from functions_preprocessing.

printing_column(df)

Gender unique values:
['Male' 'Female']

Age unique values:
[33. 24. 31. 28. 25. 29. 30. 27. 19. 20. 23. 18. 21. 22. 34. 32. 26. 39.
 35. 42. 36. 58. 49. 38. 51. 44. 43. 46. 59. 54. 48. 56. 37. 41.]

City unique values:
['Visakhapatnam' 'Bangalore' 'Srinagar' 'Varanasi' 'Jaipur' 'Pune' 'Thane'
 'Chennai' 'Nagpur' 'Nashik' 'Vadodara' 'Kalyan' 'Rajkot' 'Ahmedabad'
 'Kolkata' 'Mumbai' 'Lucknow' 'Indore' 'Surat' 'Ludhiana' 'Bhopal'
 'Meerut' 'Agra' 'Ghaziabad' 'Hyderabad' 'Vasai-Virar' 'Kanpur' 'Patna'
 'Faridabad' 'Delhi' 'Saanvi' 'M.Tech' 'Bhavna' "'Less Delhi'" 'City'
 '3.0' "'Less than 5 Kalyan'" 'Mira' 'Harsha' 'Vaanya' 'Gaurav' 'Harsh'
 'Reyansh' 'Kibara' 'Rashi' 'ME' 'M.Com' 'Nalyan' 'Mihir' 'Nalini'
 'Nandini' 'Khaziabad']

Profession unique values:
['Student' "'Civil Engineer'" 'Architect' "'UX/UI Designer'"
 "'Digital Marketer'" "'Content Writer'" "'Educational Consultant'"
 'Teacher' 'Manager' 'Chef' 'Doctor' 'Lawyer' 'Entrepreneur' 'Pharmacist']

Academic Pressure unique values

After printing the unique values of each column, we can see that several columns contain incorrect or placeholder entries:
- City: 'M.Tech', "'Less Delhi'", 'City', '3.0', "'Less than 5 Kalyan'", 'ME', 'M.Com'
- Sleep Duration: 'Others'
- Dietary Habits: 'Others'
- Degree: 'Others'
- Financial Stress: '?'

### <u>City Column</u>

Dealing with the City column first: looking closely, we can observe that it contains incorrect values which weren't supposed to be there. Two of these values still identify a city: "Less than 5 Kalyan" and "Less Delhi". We can replace them with the appropriate values, i.e. "Less than 5 Kalyan" with "Kalyan" and "Less Delhi" with "Delhi".

In [5]:
## City values flagged above as not being city names. The inner single quotes in
## "'Less Delhi'" and "'Less than 5 Kalyan'" are part of the values as printed above.
invalid_cities = ['M.Tech', "'Less Delhi'", 'City', '3.0', "'Less than 5 Kalyan'", 'ME', 'M.Com']
## Report how many City entries match one of these values, with a per-value count
show_invalid_entries(df, 'City', invalid_cities)

Number of invalid City entries: 8
City
City                    2
M.Tech                  1
'Less Delhi'            1
3.0                     1
'Less than 5 Kalyan'    1
ME                      1
M.Com                   1
Name: count, dtype: int64


All invalid values will be replaced with the mode of the City column, except "Less Delhi" and "Less than 5 Kalyan", which are replaced with the corresponding city names.

In [6]:
## Map the two garbled entries back to the city they name:
## "'Less Delhi'" -> "Delhi" and "'Less than 5 Kalyan'" -> "Kalyan"
df['City'] = df['City'].replace({
    "'Less Delhi'": 'Delhi',
    "'Less than 5 Kalyan'": 'Kalyan',
})

In [7]:
## Re-run the check: the two repaired spellings should no longer be reported
show_invalid_entries(df, 'City', invalid_cities)

Number of invalid City entries: 6
City
City      2
M.Tech    1
3.0       1
ME        1
M.Com     1
Name: count, dtype: int64


In [8]:
## Replace the remaining invalid City values. Per the notebook text the replacement is the
## column's mode; the logic lives in replacing_invalid (functions_preprocessing) and is not shown here.
df = replacing_invalid(df, 'City', invalid_cities)

In [9]:
## Confirm that none of the values in invalid_cities remain in City
show_invalid_entries(df, 'City', invalid_cities)

Number of invalid City entries: 0
Series([], Name: count, dtype: int64)


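All values listed in `invalid_cities` have been replaced in the **City** column. Note that the column still contains other suspicious values that look like person names or misspellings (e.g. 'Saanvi', 'Nalyan', 'Khaziabad'); these are not handled here.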

___

### <u>Sleep Duration</u>

As discussed earlier, there is an invalid value **"Others"** in the **Sleep Duration** column.

In [10]:
## 'Others' is treated as an invalid Sleep Duration value
invalid_sleep = ['Others']
## Count the Sleep Duration entries equal to a value in invalid_sleep
show_invalid_entries(df, "Sleep Duration", invalid_sleep)

Number of invalid Sleep Duration entries: 18
Sleep Duration
Others    18
Name: count, dtype: int64


Replacing the **18 "Others"** values with the most frequent value (the mode).

In [11]:
## Replace the 'Others' entries via replacing_invalid (the notebook text says the mode is used)
df = replacing_invalid(df, 'Sleep Duration', invalid_sleep)

In [12]:
## Confirm that no 'Others' entries remain in Sleep Duration
show_invalid_entries(df, "Sleep Duration", invalid_sleep)

Number of invalid Sleep Duration entries: 0
Series([], Name: count, dtype: int64)


---

### <u>Dietary Habits</u>

As discussed earlier, there is an invalid value **"Others"** in the **Dietary Habits** column.

In [13]:
## 'Others' is treated as an invalid Dietary Habits value
invalid_habit = ['Others']
## Count the Dietary Habits entries equal to a value in invalid_habit
show_invalid_entries(df, "Dietary Habits", invalid_habit)

Number of invalid Dietary Habits entries: 12
Dietary Habits
Others    12
Name: count, dtype: int64


In [14]:
## Replace the 'Others' entries via replacing_invalid
## (presumably with the column's mode, as for the earlier columns; helper not shown here)
df = replacing_invalid(df, 'Dietary Habits', invalid_habit)

In [15]:
## Confirm that no 'Others' entries remain in Dietary Habits
show_invalid_entries(df, "Dietary Habits", invalid_habit)

Number of invalid Dietary Habits entries: 0
Series([], Name: count, dtype: int64)


---

### <u>Degree</u>

As discussed earlier, there is an invalid value **"Others"** in the **Degree** column.

In [16]:
## 'Others' is treated as an invalid Degree value
invalid_degree = ['Others']
## Count the Degree entries equal to a value in invalid_degree
show_invalid_entries(df, "Degree", invalid_degree)

Number of invalid Degree entries: 35
Degree
Others    35
Name: count, dtype: int64


In [17]:
## Replace the 'Others' entries via replacing_invalid
## (presumably with the column's mode, as for the earlier columns; helper not shown here)
df = replacing_invalid(df, 'Degree', invalid_degree)

In [18]:
## Confirm that no 'Others' entries remain in Degree
show_invalid_entries(df, "Degree", invalid_degree)

Number of invalid Degree entries: 0
Series([], Name: count, dtype: int64)


---

### <u>Financial Stress</u>

As discussed earlier, there is an invalid value **"?"** in the **Financial Stress** column.

In [19]:
## '?' is treated as an invalid Financial Stress value
invalid_stress = ['?']
## Count the Financial Stress entries equal to a value in invalid_stress
show_invalid_entries(df, "Financial Stress", invalid_stress)

Number of invalid Financial Stress entries: 3
Financial Stress
?    3
Name: count, dtype: int64


In [20]:
## Replace the '?' entries via replacing_invalid
## (presumably with the column's mode, as for the earlier columns; helper not shown here)
df = replacing_invalid(df, "Financial Stress", invalid_stress)

In [21]:
## Confirm that no '?' entries remain in Financial Stress
show_invalid_entries(df, "Financial Stress", invalid_stress)

Number of invalid Financial Stress entries: 0
Series([], Name: count, dtype: int64)


In [22]:
## The column held the text '?' (see the check above), so it is presumably stored as strings.
## Values are displayed as 1.0, 2.0, ... in df.head(), hence the cast goes through float
## before int64. TODO confirm that a direct int cast fails on the raw values.
df['Financial Stress'] = df['Financial Stress'].astype(float).astype('int64')


___

In [None]:
## Print the unique values of every column again to check that nothing was missed.
printing_column(df)

Gender unique values:
['Male' 'Female']

Age unique values:
[33. 24. 31. 28. 25. 29. 30. 27. 19. 20. 23. 18. 21. 22. 34. 32. 26. 39.
 35. 42. 36. 58. 49. 38. 51. 44. 43. 46. 59. 54. 48. 56. 37. 41.]

City unique values:
['Visakhapatnam' 'Bangalore' 'Srinagar' 'Varanasi' 'Jaipur' 'Pune' 'Thane'
 'Chennai' 'Nagpur' 'Nashik' 'Vadodara' 'Kalyan' 'Rajkot' 'Ahmedabad'
 'Kolkata' 'Mumbai' 'Lucknow' 'Indore' 'Surat' 'Ludhiana' 'Bhopal'
 'Meerut' 'Agra' 'Ghaziabad' 'Hyderabad' 'Vasai-Virar' 'Kanpur' 'Patna'
 'Faridabad' 'Delhi' 'Saanvi' 'Bhavna' 'Mira' 'Harsha' 'Vaanya' 'Gaurav'
 'Harsh' 'Reyansh' 'Kibara' 'Rashi' 'Nalyan' 'Mihir' 'Nalini' 'Nandini'
 'Khaziabad']

Profession unique values:
['Student' "'Civil Engineer'" 'Architect' "'UX/UI Designer'"
 "'Digital Marketer'" "'Content Writer'" "'Educational Consultant'"
 'Teacher' 'Manager' 'Chef' 'Doctor' 'Lawyer' 'Entrepreneur' 'Pharmacist']

Academic Pressure unique values:
[5. 2. 3. 4. 1. 0.]

Work Pressure unique values:
[0. 5. 2.]

CGPA uniqu

In [None]:
## Print each column's dtype to check that the columns have the expected types
print(df.dtypes)

Gender                                    object
Age                                      float64
City                                      object
Profession                                object
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                            object
Dietary Habits                            object
Degree                                    object
Have you ever had suicidal thoughts ?     object
Work/Study Hours                         float64
Financial Stress                           int64
Family History of Mental Illness          object
Depression                                 int64
dtype: object


In [26]:
## These columns are float64 but, per the author's inspection, hold only whole numbers,
## so they are converted to int64.
cols_to_convert = [
    'Age',
    'Academic Pressure',
    'Work Pressure',
    'Study Satisfaction',
    'Job Satisfaction',
    'Work/Study Hours',
]
df[cols_to_convert] = df[cols_to_convert].astype(float).astype('int64')

In [27]:
## Print the dtypes again to confirm the columns in cols_to_convert are now int64
print(df.dtypes)

Gender                                    object
Age                                        int64
City                                      object
Profession                                object
Academic Pressure                          int64
Work Pressure                              int64
CGPA                                     float64
Study Satisfaction                         int64
Job Satisfaction                           int64
Sleep Duration                            object
Dietary Habits                            object
Degree                                    object
Have you ever had suicidal thoughts ?     object
Work/Study Hours                           int64
Financial Stress                           int64
Family History of Mental Illness          object
Depression                                 int64
dtype: object


In [28]:
## Save the cleaned data. index=False keeps the default row index out of the file; without it
## pandas writes the index as an extra unnamed first column, which comes back as "Unnamed: 0"
## on the next read_csv, re-adding an id-like column after `id` was dropped on purpose.
## NOTE(review): any existing reader that relies on that index column (e.g. index_col=0) must be adjusted.
df.to_csv("Final.csv", index=False)