## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

## Loading the data

In [2]:
df=pd.read_csv('Data/data.csv')
df.shape

(27901, 18)

In [3]:
df.head()

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,30,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,32,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [4]:
# 'id' is only a row identifier. Dropping it by reassignment (not inplace=True)
# with errors='ignore' keeps this cell safe to run more than once.
df = df.drop(columns='id', errors='ignore')

In [5]:
print(df.dtypes)

Gender                                    object
Age                                      float64
City                                      object
Profession                                object
Academic Pressure                        float64
Work Pressure                            float64
CGPA                                     float64
Study Satisfaction                       float64
Job Satisfaction                         float64
Sleep Duration                            object
Dietary Habits                            object
Degree                                    object
Have you ever had suicidal thoughts ?     object
Work/Study Hours                         float64
Financial Stress                         float64
Family History of Mental Illness          object
Depression                                 int64
dtype: object


In [6]:
print(df.isnull().sum())

Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
Work Pressure                            0
CGPA                                     0
Study Satisfaction                       0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [7]:
df.head()

Unnamed: 0,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,Female,28.0,Varanasi,Student,3.0,0.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,Female,25.0,Jaipur,Student,4.0,0.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


In [8]:
# Print the distinct values of every column, one column per line, to spot
# malformed or unexpected categories.
for col in df.columns:
    vals = df[col].unique()
    print(f"{col}: {vals}")

Gender: ['Male' 'Female']
Age: [33. 24. 31. 28. 25. 29. 30. 27. 19. 20. 23. 18. 21. 22. 34. 32. 26. 39.
 35. 42. 36. 58. 49. 38. 51. 44. 43. 46. 59. 54. 48. 56. 37. 41.]
City: ['Visakhapatnam' 'Bangalore' 'Srinagar' 'Varanasi' 'Jaipur' 'Pune' 'Thane'
 'Chennai' 'Nagpur' 'Nashik' 'Vadodara' 'Kalyan' 'Rajkot' 'Ahmedabad'
 'Kolkata' 'Mumbai' 'Lucknow' 'Indore' 'Surat' 'Ludhiana' 'Bhopal'
 'Meerut' 'Agra' 'Ghaziabad' 'Hyderabad' 'Vasai-Virar' 'Kanpur' 'Patna'
 'Faridabad' 'Delhi' 'Saanvi' 'M.Tech' 'Bhavna' 'Less Delhi' 'City' '3.0'
 'Less than 5 Kalyan' 'Mira' 'Harsha' 'Vaanya' 'Gaurav' 'Harsh' 'Reyansh'
 'Kibara' 'Rashi' 'ME' 'M.Com' 'Nalyan' 'Mihir' 'Nalini' 'Nandini'
 'Khaziabad']
Profession: ['Student' 'Civil Engineer' 'Architect' 'UX/UI Designer'
 'Digital Marketer' 'Content Writer' 'Educational Consultant' 'Teacher'
 'Manager' 'Chef' 'Doctor' 'Lawyer' 'Entrepreneur' 'Pharmacist']
Academic Pressure: [5. 2. 3. 4. 1. 0.]
Work Pressure: [0. 5. 2.]
CGPA: [ 8.97    5.9     7.03    5.59    

## Data Cleaning & Preprocessing (with Missing-Value Handling)

In [9]:
# Remove rows whose city appears in fewer than MIN_ROWS_PER_CITY rows. The rare
# labels seen in the unique-value listing ('Saanvi', 'M.Tech', '3.0', ...) look
# like data-entry noise rather than real cities.
MIN_ROWS_PER_CITY = 400
city_counts = df['City'].value_counts()
cities_to_remove = city_counts[city_counts < MIN_ROWS_PER_CITY]
# .copy() makes the filtered frame independent, so later column assignments do
# not operate on a slice of the unfiltered data.
df = df[~df['City'].isin(cities_to_remove.index)].copy()
df['City'].value_counts()

City
Kalyan           1570
Srinagar         1372
Hyderabad        1340
Vasai-Virar      1290
Lucknow          1155
Thane            1139
Ludhiana         1111
Agra             1094
Surat            1078
Kolkata          1066
Jaipur           1036
Patna            1007
Visakhapatnam     969
Pune              968
Ahmedabad         951
Bhopal            934
Chennai           885
Meerut            825
Rajkot            816
Delhi             768
Bangalore         767
Ghaziabad         745
Mumbai            699
Vadodara          694
Varanasi          685
Nagpur            651
Indore            643
Kanpur            609
Nashik            547
Faridabad         461
Name: count, dtype: int64

In [10]:
# Report how many distinct (non-missing) values each column holds.
for column, unique_count in df.nunique().items():
    print(f"Number of unique values for {column}: {unique_count}")

Number of unique values for Gender: 2
Number of unique values for Age: 34
Number of unique values for City: 30
Number of unique values for Profession: 14
Number of unique values for Academic Pressure: 6
Number of unique values for Work Pressure: 3
Number of unique values for CGPA: 332
Number of unique values for Study Satisfaction: 6
Number of unique values for Job Satisfaction: 5
Number of unique values for Sleep Duration: 5
Number of unique values for Dietary Habits: 4
Number of unique values for Degree: 28
Number of unique values for Have you ever had suicidal thoughts ?: 2
Number of unique values for Work/Study Hours: 13
Number of unique values for Financial Stress: 5
Number of unique values for Family History of Mental Illness: 2
Number of unique values for Depression: 2


In [11]:
# Replace missing 'Financial Stress' values with 0. The result is assigned back
# to df: calling replace(..., inplace=True) on a selected column is deprecated
# chained assignment and leaves df unchanged once pandas Copy-on-Write is active.
# NOTE(review): the rows displayed so far use values 1.0-5.0 for this column;
# confirm that 0 is intended as a "missing" sentinel rather than an imputed value.
df = df.fillna({'Financial Stress': 0})

In [12]:
# Encode Gender as integers: Male -> 0, Female -> 1.
# Writing ints into the string column through .loc leaves its dtype as object;
# replace() + astype() yields a true int64 column. Re-running the cell is a
# no-op because already-encoded values match neither key.
df = df.assign(
    Gender=df['Gender'].replace({'Male': 0, 'Female': 1}).astype('int64')
)

In [13]:
df['Profession'].value_counts() # checking the unique values in the column

Profession
Student                   27844
Architect                     8
Teacher                       6
Digital Marketer              3
Content Writer                2
Chef                          2
Doctor                        2
Pharmacist                    2
Civil Engineer                1
UX/UI Designer                1
Educational Consultant        1
Manager                       1
Lawyer                        1
Entrepreneur                  1
Name: count, dtype: int64

In [14]:
df = df.loc[df['Profession'] == 'Student'] 
df['Profession'].value_counts()

Profession
Student    27844
Name: count, dtype: int64

In [15]:
df = df.drop(['Profession'], axis=1) # dropping the column as it has only one unique value

In [16]:
df['Work Pressure'].value_counts() # checking the unique values in the column 

Work Pressure
0.0    27841
5.0        2
2.0        1
Name: count, dtype: int64

In [17]:
# 'Work Pressure' is 0.0 for all but 3 rows (see the value counts above), so it
# carries almost no information for the student subset and is dropped.
df = df.drop(['Work Pressure'], axis=1)
df.head()

Unnamed: 0,Gender,Age,City,Academic Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,0,33.0,Visakhapatnam,5.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,1,24.0,Bangalore,2.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,0,31.0,Srinagar,3.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0
3,1,28.0,Varanasi,3.0,5.59,2.0,0.0,7-8 hours,Moderate,BCA,Yes,4.0,5.0,Yes,1
4,1,25.0,Jaipur,4.0,8.13,3.0,0.0,5-6 hours,Moderate,M.Tech,Yes,1.0,1.0,No,0


## Feature Engineering

In [18]:
import pandas as pd

def preprocess_data(df):
    """Filter the survey frame and encode its categorical columns as numbers.

    The input frame is never modified; a new DataFrame is returned.

    Processing steps:
      * keep rows with ``Age <= 30``, ``Academic Pressure > 0`` and
        ``Study Satisfaction > 0``;
      * discard rows whose 'Sleep Duration', 'Dietary Habits' or 'Degree'
        equals 'Others';
      * drop the 'City' and 'Job Satisfaction' columns;
      * encode 'Sleep Duration', 'Dietary Habits' and the two Yes/No columns
        as integers;
      * replace 'Degree' with 'New_Degree'
        (0 = graduate, 1 = post-graduate, 2 = 'Class 12').

    Any value without an entry in its mapping (for example a degree that is
    not listed below) becomes NaN in the encoded column; such rows are NOT
    removed here, so the caller decides how to handle them.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'City', 'Age', 'Academic Pressure',
        'Study Satisfaction', 'Job Satisfaction', 'Sleep Duration',
        'Dietary Habits', 'Degree', 'Have you ever had suicidal thoughts ?'
        and 'Family History of Mental Illness'. Other columns pass through.

    Returns
    -------
    pandas.DataFrame
        Filtered, encoded copy with the original index labels of the kept rows
        and 'New_Degree' appended as the last column.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    sleep_mapping = {
        'Less than 5 hours': 0,
        '5-6 hours': 1,
        '7-8 hours': 2,
        'More than 8 hours': 3,
    }
    # NOTE(review): these codes are labels, not a ranking -- 'Moderate' (2) is
    # coded above 'Unhealthy' (1). Confirm before treating the column as ordinal.
    diet_mapping = {
        'Healthy': 0,
        'Unhealthy': 1,
        'Moderate': 2,
    }
    # Exact degree names instead of substring regexes: with unanchored patterns
    # 'BA' also matched 'MBA' (correct only because of dict iteration order),
    # '.' matched any character, and NaN degrees raised on boolean masking.
    graduate_degrees = (
        'BSc', 'BCA', 'B.Ed', 'BHM', 'B.Pharm', 'B.Com',
        'BE', 'BA', 'B.Arch', 'B.Tech', 'BBA', 'LLB',
    )
    post_graduate_degrees = (
        'MSc', 'MCA', 'M.Ed', 'M.Pharm', 'M.Com',
        'ME', 'MA', 'M.Arch', 'M.Tech', 'MBA', 'LLM',
    )
    degree_value_mapping = {
        **dict.fromkeys(graduate_degrees, 0),       # Graduated
        **dict.fromkeys(post_graduate_degrees, 1),  # Post Graduated
        'Class 12': 2,                              # Higher Secondary
    }
    yes_no_mapping = {'Yes': 1, 'No': 0}

    # One combined mask is equivalent to applying the filters one after another.
    keep = (
        (df['Age'] <= 30)
        & (df['Academic Pressure'] > 0)
        & (df['Study Satisfaction'] > 0)
        & (df['Sleep Duration'] != 'Others')
        & (df['Dietary Habits'] != 'Others')
        & (df['Degree'] != 'Others')
    )

    # NOTE(review): 'Job Satisfaction' was originally dropped as "having only
    # one unique value"; the unfiltered data shows 5 distinct values, so verify
    # that it really is constant for the rows kept here.
    # .copy() gives an independent frame, so the assignments below neither
    # touch the caller's data nor trigger SettingWithCopyWarning.
    out = df.loc[keep].drop(columns=['City', 'Job Satisfaction']).copy()

    out['Sleep Duration'] = out['Sleep Duration'].map(sleep_mapping)
    out['Dietary Habits'] = out['Dietary Habits'].map(diet_mapping)

    # Surrounding whitespace is tolerated; anything else unlisted maps to NaN.
    out['New_Degree'] = out['Degree'].str.strip().map(degree_value_mapping)
    out = out.drop(columns=['Degree'])

    for col in ('Have you ever had suicidal thoughts ?',
                'Family History of Mental Illness'):
        out[col] = out[col].map(yes_no_mapping)

    return out

# Apply the preprocessing function to your DataFrame
df = preprocess_data(df)


In [19]:

# Categories with no entry in the encoding maps (e.g. degrees the degree
# mapping does not cover) are NaN at this point. Print the per-column counts
# explicitly -- a bare expression in the middle of a cell is not displayed --
# and then discard those rows.
print(df.isnull().sum())
df = df.dropna()

# Column subset kept under its own name; it still carries the original (long)
# column names because it is taken before the rename below.
df_data = df[['Gender', 'Age', 'Academic Pressure', 'CGPA',
        'Study Satisfaction', 'Sleep Duration', 'Dietary Habits',
        'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
        'Financial Stress', 'Family History of Mental Illness', 'Depression',
        'New_Degree']]

# Shorter names for the two long survey-question columns (reassigned rather
# than renamed in place).
df = df.rename(columns={'Have you ever had suicidal thoughts ?': 'Suicidal thoughts', 'Family History of Mental Illness':'Family history'})

# Last expression of the cell, so the preview is actually rendered.
df_data.head()

In [20]:
import plotly.graph_objects as go

# Pairwise correlations between all columns of the cleaned frame.
corr_matrix = df.corr()

# Heatmap trace: one cell per variable pair, colour scale pinned to [-1, 1].
heatmap_trace = go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.index,
    colorscale='Viridis',
    colorbar={'title': 'Correlation Coefficient'},
    zmin=-1,
    zmax=1,
)

# Put one tick on every category position so each variable name is shown.
x_axis = {
    'tickmode': 'array',
    'tickvals': list(range(len(corr_matrix.columns))),
    'ticktext': corr_matrix.columns,
}
y_axis = {
    'tickmode': 'array',
    'tickvals': list(range(len(corr_matrix.index))),
    'ticktext': corr_matrix.index,
}

fig = go.Figure(data=heatmap_trace)
fig.update_layout(
    title='Correlation Heatmap',
    xaxis_title='Variables',
    yaxis_title='Variables',
    xaxis=x_axis,
    yaxis=y_axis,
    width=800,
    height=800,
)

fig.show()


In [21]:
df

Unnamed: 0,Gender,Age,Academic Pressure,CGPA,Study Satisfaction,Sleep Duration,Dietary Habits,Suicidal thoughts,Work/Study Hours,Financial Stress,Family history,Depression,New_Degree
1,1,24.0,2.0,5.90,5.0,1,2,0,3.0,2.0,1,0,0.0
3,1,28.0,3.0,5.59,2.0,2,2,1,4.0,5.0,1,1,0.0
4,1,25.0,4.0,8.13,3.0,1,2,1,1.0,1.0,0,0,1.0
6,0,30.0,3.0,9.54,4.0,2,0,0,1.0,2.0,0,0,0.0
7,1,30.0,2.0,8.04,4.0,0,1,0,0.0,1.0,1,0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
27893,1,24.0,3.0,6.02,2.0,2,2,0,8.0,2.0,0,0,0.0
27896,1,27.0,5.0,5.75,5.0,1,1,1,7.0,1.0,1,0,2.0
27897,0,27.0,2.0,9.40,3.0,0,0,0,0.0,3.0,1,0,1.0
27899,1,18.0,5.0,6.88,2.0,0,0,1,10.0,5.0,0,1,2.0


## Exporting Cleaned Dataset

In [22]:
df.to_csv('Data/Cleaned_dataset.csv', index=False)