### Imports and Configurations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Configure visualizations
%matplotlib inline
sns.set(style="whitegrid")
plt.style.use('seaborn-v0_8-whitegrid') 
warnings.filterwarnings('ignore')

print("Libraries imported successfully.")

Libraries imported successfully.


### Load the dataset

In [2]:
try:
    df = pd.read_csv('data.csv')      
    print("\nDataset loaded successfully.")
    print(f"Shape of the dataset: {df.shape}")

except FileNotFoundError:
    print("Error: 'data.csv' not found. Please ensure the file is in the correct directory.")


Dataset loaded successfully.
Shape of the dataset: (27901, 18)


### Understanding the Data

In [3]:
# Viewing First and Last 3 data from the dataset
df.head(3)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
0,2,Male,33.0,Visakhapatnam,Student,5.0,0.0,8.97,2.0,0.0,5-6 hours,Healthy,B.Pharm,Yes,3.0,1.0,No,1
1,8,Female,24.0,Bangalore,Student,2.0,0.0,5.9,5.0,0.0,5-6 hours,Moderate,BSc,No,3.0,2.0,Yes,0
2,26,Male,31.0,Srinagar,Student,3.0,0.0,7.03,5.0,0.0,Less than 5 hours,Healthy,BA,No,9.0,1.0,Yes,0


In [4]:
df.tail(3)

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
27898,140689,Male,31.0,Faridabad,Student,3.0,0.0,6.61,4.0,0.0,5-6 hours,Unhealthy,MD,No,12.0,2.0,No,0
27899,140690,Female,18.0,Ludhiana,Student,5.0,0.0,6.88,2.0,0.0,Less than 5 hours,Healthy,Class 12,Yes,10.0,5.0,No,1
27900,140699,Male,27.0,Patna,Student,4.0,0.0,9.24,1.0,0.0,Less than 5 hours,Healthy,BCA,Yes,2.0,3.0,Yes,1


In [5]:
# Displaying basic data information 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27901 entries, 0 to 27900
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     27901 non-null  int64  
 1   Gender                                 27901 non-null  object 
 2   Age                                    27901 non-null  float64
 3   City                                   27901 non-null  object 
 4   Profession                             27901 non-null  object 
 5   Academic Pressure                      27901 non-null  float64
 6   Work Pressure                          27901 non-null  float64
 7   CGPA                                   27901 non-null  float64
 8   Study Satisfaction                     27901 non-null  float64
 9   Job Satisfaction                       27901 non-null  float64
 10  Sleep Duration                         27901 non-null  object 
 11  Di

In [6]:
# Displaying descriptive statistics for numerical columns
df.describe(include='all')

Unnamed: 0,id,Gender,Age,City,Profession,Academic Pressure,Work Pressure,CGPA,Study Satisfaction,Job Satisfaction,Sleep Duration,Dietary Habits,Degree,Have you ever had suicidal thoughts ?,Work/Study Hours,Financial Stress,Family History of Mental Illness,Depression
count,27901.0,27901,27901.0,27901,27901,27901.0,27901.0,27901.0,27901.0,27901.0,27901,27901,27901,27901,27901.0,27898.0,27901,27901.0
unique,,2,,52,14,,,,,,5,4,28,2,,,2,
top,,Male,,Kalyan,Student,,,,,,Less than 5 hours,Unhealthy,Class 12,Yes,,,No,
freq,,15547,,1570,27870,,,,,,8310,10317,6080,17656,,,14398,
mean,70442.149421,,25.8223,,,3.141214,0.00043,7.656104,2.943837,0.000681,,,,,7.156984,3.139867,,0.585499
std,40641.175216,,4.905687,,,1.381465,0.043992,1.470707,1.361148,0.044394,,,,,3.707642,1.437347,,0.492645
min,2.0,,18.0,,,0.0,0.0,0.0,0.0,0.0,,,,,0.0,1.0,,0.0
25%,35039.0,,21.0,,,2.0,0.0,6.29,2.0,0.0,,,,,4.0,2.0,,0.0
50%,70684.0,,25.0,,,3.0,0.0,7.77,3.0,0.0,,,,,8.0,3.0,,1.0
75%,105818.0,,30.0,,,4.0,0.0,8.92,4.0,0.0,,,,,10.0,4.0,,1.0


### Data Cleaning & Initial Transformations

In [7]:
# Dropping unnecessary/uninformative columns

columns_to_drop = ['id', 'Work Pressure', 'Job Satisfaction'] # since `work pressure` and `job satisfaction` are not present and are irrelevant to Students
columns_to_drop = [col for col in columns_to_drop if col in df.columns] 

if columns_to_drop:
    df.drop(columns=columns_to_drop, inplace=True)
    print(f"\nDropped columns: {columns_to_drop}")
    print(f"New shape: {df.shape}")
else:
    print("\nNo columns to drop (already dropped or not found).")


Dropped columns: ['id', 'Work Pressure', 'Job Satisfaction']
New shape: (27901, 15)


In [8]:
# Checking for missing values
print(df.isnull().sum())

Gender                                   0
Age                                      0
City                                     0
Profession                               0
Academic Pressure                        0
CGPA                                     0
Study Satisfaction                       0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         3
Family History of Mental Illness         0
Depression                               0
dtype: int64


In [9]:
# Removing the 3 rows that didnt have value of `Financial Stress`
df.dropna(inplace=True)
print(f"New shape: {df.shape}")

New shape: (27898, 15)


In [10]:
# Renaming the column first for easier access
df.rename(columns={'Have you ever had suicidal thoughts ?': 'Suicidal_Thoughts'}, inplace=True)

In [11]:
# Standardize Yes/No columns to binary (1/0)
yes_no_cols = ['Suicidal_Thoughts', 'Family History of Mental Illness']
for col in yes_no_cols:
    if col in df.columns:
        if df[col].dtype == 'object': # Only convert if it's still object type
             print(f"\nConverting '{col}' to binary (1=Yes, 0=No)")
             print(f"Original unique values: {df[col].unique()}")
             df[col] = df[col].map({'Yes': 1, 'No': 0}).astype(int)
             print(f"New unique values: {df[col].unique()}")
        elif pd.api.types.is_numeric_dtype(df[col]):
             print(f"Column '{col}' is already numeric.")
    else:
        print(f"Column '{col}' not found for conversion.")
        
# Conversion Verification
print("\n--- Data Info after Yes/No Conversion ---")
df.info()


Converting 'Suicidal_Thoughts' to binary (1=Yes, 0=No)
Original unique values: ['Yes' 'No']
New unique values: [1 0]

Converting 'Family History of Mental Illness' to binary (1=Yes, 0=No)
Original unique values: ['No' 'Yes']
New unique values: [0 1]

--- Data Info after Yes/No Conversion ---
<class 'pandas.core.frame.DataFrame'>
Index: 27898 entries, 0 to 27900
Data columns (total 15 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Gender                            27898 non-null  object 
 1   Age                               27898 non-null  float64
 2   City                              27898 non-null  object 
 3   Profession                        27898 non-null  object 
 4   Academic Pressure                 27898 non-null  float64
 5   CGPA                              27898 non-null  float64
 6   Study Satisfaction                27898 non-null  float64
 7   Sleep Duration                

### Feature Engineering 

In [12]:
# Getting Unique Values per columns
object_columns = df.select_dtypes(include=['object'])
unique_values_per_column = [object_columns[col].unique().tolist() for col in object_columns.columns]

for idx, unique_values in enumerate(unique_values_per_column):
    print(f"{object_columns.columns[idx]}: {unique_values}\n")

Gender: ['Male', 'Female']

City: ['Visakhapatnam', 'Bangalore', 'Srinagar', 'Varanasi', 'Jaipur', 'Pune', 'Thane', 'Chennai', 'Nagpur', 'Nashik', 'Vadodara', 'Kalyan', 'Rajkot', 'Ahmedabad', 'Kolkata', 'Mumbai', 'Lucknow', 'Indore', 'Surat', 'Ludhiana', 'Bhopal', 'Meerut', 'Agra', 'Ghaziabad', 'Hyderabad', 'Vasai-Virar', 'Kanpur', 'Patna', 'Faridabad', 'Delhi', 'Saanvi', 'M.Tech', 'Bhavna', 'Less Delhi', 'City', '3.0', 'Less than 5 Kalyan', 'Mira', 'Harsha', 'Vaanya', 'Gaurav', 'Harsh', 'Reyansh', 'Kibara', 'Rashi', 'ME', 'M.Com', 'Nalyan', 'Mihir', 'Nalini', 'Nandini', 'Khaziabad']

Profession: ['Student', 'Civil Engineer', 'Architect', 'UX/UI Designer', 'Digital Marketer', 'Content Writer', 'Educational Consultant', 'Teacher', 'Manager', 'Chef', 'Doctor', 'Lawyer', 'Entrepreneur', 'Pharmacist']

Sleep Duration: ['5-6 hours', 'Less than 5 hours', '7-8 hours', 'More than 8 hours', 'Others']

Dietary Habits: ['Healthy', 'Moderate', 'Unhealthy', 'Others']

Degree: ['B.Pharm', 'BSc', 'BA

In [13]:
# Removing Junk cities 

junk_cities = [
        'Saanvi', 'M.Tech', 'Bhavna', 'Less Delhi', 'City', '3.0', 
        'Less than 5 Kalyan', 'Mira', 'Harsha', 'Vaanya', 'Gaurav', 
        'Harsh', 'Reyansh', 'Kibara', 'Rashi', 'ME', 'M.Com', 
        'Nalyan', 'Mihir', 'Nalini', 'Nandini', 'Khaziabad'
    ]

junk_cities_set = set(junk_cities) 


initial_rows = df.shape[0]
print(f"Shape before filtering 'City': {initial_rows} rows")

mask_keep = ~df['City'].isin(junk_cities_set) 

df = df[mask_keep] 

rows_removed = initial_rows - df.shape[0]
print(f"Shape after filtering 'City': {df.shape[0]} rows")
print(f"Removed {rows_removed} rows based on junk 'City' values.")

print("\nUnique 'City' values AFTER cleaning (first 50):")
print(f"{df['City'].unique().tolist()[:50]}...")
print(f"Total unique cities after cleaning: {df['City'].nunique()}")

Shape before filtering 'City': 27898 rows
Shape after filtering 'City': 27872 rows
Removed 26 rows based on junk 'City' values.

Unique 'City' values AFTER cleaning (first 50):
['Visakhapatnam', 'Bangalore', 'Srinagar', 'Varanasi', 'Jaipur', 'Pune', 'Thane', 'Chennai', 'Nagpur', 'Nashik', 'Vadodara', 'Kalyan', 'Rajkot', 'Ahmedabad', 'Kolkata', 'Mumbai', 'Lucknow', 'Indore', 'Surat', 'Ludhiana', 'Bhopal', 'Meerut', 'Agra', 'Ghaziabad', 'Hyderabad', 'Vasai-Virar', 'Kanpur', 'Patna', 'Faridabad', 'Delhi']...
Total unique cities after cleaning: 30


In [14]:
# Regional Seperation of cities 

print("\nMapping cities to regions...")

region_map = {
    # Northern India
    'Delhi': 'Northern',
    'Srinagar': 'Northern',
    'Ludhiana': 'Northern',
    'Chandigarh': 'Northern', 
    'Jaipur': 'Northern', 
    'Ghaziabad': 'Northern',
    'Faridabad': 'Northern',
    'Meerut': 'Northern',
    'Kanpur': 'Northern', 
    'Lucknow': 'Northern', 
    'Varanasi': 'Northern', 
    'Agra': 'Northern', 
    
    # Central India
    'Bhopal': 'Central',
    'Indore': 'Central',
    'Nagpur': 'Central', 
    'Raipur': 'Central', 
    
    # Western India
    'Mumbai': 'Western',
    'Pune': 'Western',
    'Ahmedabad': 'Western',
    'Surat': 'Western',
    'Vadodara': 'Western',
    'Rajkot': 'Western',
    'Thane': 'Western',
    'Nashik': 'Western',
    'Kalyan': 'Western', 
    'Vasai-Virar': 'Western', 
    
    # Southern India
    'Bangalore': 'Southern',
    'Chennai': 'Southern',
    'Hyderabad': 'Southern',
    'Visakhapatnam': 'Southern',
    'Coimbatore': 'Southern', 
    'Kochi': 'Southern',
    
    # Eastern India
    'Kolkata': 'Eastern',
    'Patna': 'Eastern',
    'Ranchi': 'Eastern', 
    'Bhubaneswar': 'Eastern', 
}

# Apply the mapping
df['Region'] = df['City'].map(region_map).fillna('Other/Unknown') # Fill missing maps

print("Created 'Region' column.")
print("Region distribution:")
print(df['Region'].value_counts())


Mapping cities to regions...
Created 'Region' column.
Region distribution:
Region
Northern    9860
Western     9752
Southern    3960
Central     2228
Eastern     2072
Name: count, dtype: int64


In [15]:
# Feature Engineering Degree

In [16]:
# Function to determine Degree Level
def get_degree_level(degree):
    degree = str(degree).strip()
    if degree == 'Class 12':
        return 'Class 12'
    elif degree == 'PhD':
        return 'PhD'
    elif degree == 'Others' or degree == 'nan':
         return 'Other/Unknown'
    elif degree.startswith(('M.', 'ME', 'MD', 'MS', 'LLM', 'MA', 'MBA', 'MCA', 'MHM', 'MSc')): 
         if degree == 'ME' and not degree.startswith('M.E'): 
             return 'Masters'
         return 'Masters'
    elif degree.startswith(('B.', 'BE', 'BA', 'BBA', 'BCA', 'BHM', 'BSc', 'LLB')):
         return 'Bachelors'
    else: 
        return 'Other/Unknown'

In [17]:
# Function to determine Degree Field
def get_degree_field(degree):
    degree = str(degree).strip().upper() 
    if degree in ['CLASS 12', 'OTHERS', 'NAN']:
        return 'General/Other'
    elif re.search(r'B\.?TECH|BE|ME|M\.?TECH|BCA|MCA', degree):
         return 'Engineering/Tech'
    elif re.search(r'MBBS|MD|B\.?PHARM|M\.?PHARM|BDS', degree): 
         return 'Medical/Pharma'
    elif re.search(r'LLB|LLM', degree):
         return 'Law'
    elif re.search(r'BBA|MBA|B\.?COM|M\.?COM|BHM|MHM', degree):
         return 'Business/Mgmt'
    elif re.search(r'BA|MA|B\.?ED|M\.?ED', degree): 
         return 'Arts/Humanities/Edu'
    elif re.search(r'BSC|MSC', degree):
         return 'Science'
    elif re.search(r'B\.?ARCH', degree):
        return 'Architecture' 
    elif degree == 'PHD':
         return 'PhD'
    else:
        return 'General/Other'

In [18]:
# Function to determine Science vs Non-Science
def get_degree_type(degree_field): # Base this on the derived field
    science_tech_fields = ['Engineering/Tech', 'Medical/Pharma', 'Science', 'Architecture', 'PhD'] # PhD often science/tech focused
    if degree_field in science_tech_fields:
        return 'Science/Tech'
    elif degree_field == 'General/Other':
         return 'General/Other'
    else:
        return 'Non-Science/Arts/Business/Law'

In [19]:
# Apply the functions
df['Degree_Level'] = df['Degree'].apply(get_degree_level)
df['Degree_Field'] = df['Degree'].apply(get_degree_field)
df['Degree_Type'] = df['Degree_Field'].apply(get_degree_type) # Derive type from field

print("Created 'Degree_Level', 'Degree_Field', and 'Degree_Type' columns.")
print("\nDegree Level distribution:")
print(df['Degree_Level'].value_counts())
print("\nDegree Field distribution:")
print(df['Degree_Field'].value_counts())
print("\nDegree Type distribution:")
print(df['Degree_Type'].value_counts())

print("\nFeature Engineering Complete\n")
print("DataFrame head with new features:")
print(df[['Degree', 'Degree_Level', 'Degree_Field', 'Degree_Type']].head())
print(f"\nDataFrame shape after Feature Engineering: {df.shape}")

NameError: name 're' is not defined

In [None]:
# Creating Age Groups
age_bins = [17, 24, 30, 40] 
age_labels = ['18-24', '25-30', '31-39'] 
df['Age_Group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=True)
print(df[['Age', 'Age_Group']].head())
print(f"\nAge Group distribution:\n{df['Age_Group'].value_counts()}")


In [None]:
# Converting Sleep Duration to Ordinal
sleep_map = {
    'Less than 5 hours': 0,
    '5-6 hours': 1,
    '7-8 hours': 2,
    'More than 8 hours': 3,
    'Others': 4,
}
current_sleep_values = df['Sleep Duration'].unique()
missing_in_map = [val for val in current_sleep_values if val not in sleep_map]
if missing_in_map:
    print(f"Warning: Values in 'Sleep Duration' not found in sleep_map: {missing_in_map}")
else:
    df['Sleep_Ordinal'] = df['Sleep Duration'].map(sleep_map)
    print(df[['Sleep Duration', 'Sleep_Ordinal']].head())
    print(f"\nSleep Ordinal distribution:\n{df['Sleep_Ordinal'].value_counts()}")

In [None]:
# Creating a Combined Stress Score
df['Total_Stress'] = df['Academic Pressure'] + df['Financial Stress']
print(df[['Academic Pressure', 'Financial Stress', 'Total_Stress']].head())
print(f"\nTotal Stress distribution:\n{df['Total_Stress'].value_counts().sort_index()}")

### Data Exploration & Visualization (Numerical Features)

In [None]:
# Identifying numerical columns (excluding binary/target for distribution plots)

numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
# Removing binary/target/ordinal columns
cols_to_exclude_from_dist_plots = ['Suicidal_Thoughts', 'Family History of Mental Illness', 
                                    'Depression', 'Sleep_Ordinal'] 
numerical_cols_for_dist = [col for col in numerical_cols if col not in cols_to_exclude_from_dist_plots]

print(f"\nNumerical columns for distribution analysis: {numerical_cols_for_dist}")

In [None]:
# Plotting histograms for numerical features
df[numerical_cols_for_dist].hist(figsize=(15, 10), bins=20, edgecolor='black')
plt.suptitle('Distribution of Numerical Features', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Plotting box plots for numerical features to check for outliers
plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols_for_dist):
    plt.subplot((len(numerical_cols_for_dist) + 1) // 2, 2, i + 1) 
    sns.boxplot(y=df[col])
    plt.title(col)
plt.suptitle('Box Plots of Numerical Features', y=1.02)
plt.tight_layout()
plt.show()

### Data Exploration & Visualization (Categorical Features)

In [None]:
# Identifying categorical columns (including newly created ones)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_cols.extend(['Suicidal_Thoughts', 'Family History of Mental Illness', 
                         'Sleep_Ordinal', 'Age_Group', 'Depression']) # Including target for hue
categorical_cols = list(dict.fromkeys(categorical_cols)) 
if 'Sleep Duration' in categorical_cols and 'Sleep_Ordinal' in df.columns:
     categorical_cols.remove('Sleep Duration')

print(f"\nCategorical columns for count analysis: {categorical_cols}")

In [None]:
# Plotting count plots for categorical features (vs. Depression)
n_cols = 2
n_rows = (len(categorical_cols) + n_cols - 1) // n_cols

plt.figure(figsize=(15, n_rows * 5)) # Adjust height based on number of rows
for i, col in enumerate(categorical_cols):
    if col != 'Depression': # Not ploting Depression against itself
        plt.subplot(n_rows, n_cols, i + 1)
        if df[col].nunique() > 10:
             ax = sns.countplot(data=df, y=col, hue='Depression', order=df[col].value_counts().index[:20], palette='viridis') # Showing top 20 for high cardinality
             plt.title(f'{col} (Top 20) vs Depression')
             plt.xticks(rotation=90)
        else:
             ax = sns.countplot(data=df, x=col, hue='Depression', order=df[col].value_counts().index, palette='viridis')
             plt.title(f'{col} vs Depression')
             plt.xticks(rotation=45, ha='right')

plt.suptitle('Distribution of Categorical Features by Depression Status', y=1.02)
plt.tight_layout()
plt.show()

### Additional Pre-processing via graph info

In [None]:
# Remove Outlyers

outlier_cols = ['Age', 'CGPA']
df_original_shape = df.shape[0]

for col in outlier_cols:
    if col in df.columns and pd.api.types.is_numeric_dtype(df[col]):
        print(f"\nProcessing outliers for: {col}")
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        
        print(f"  Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
        print(f"  Lower Bound: {lower_bound:.2f}, Upper Bound: {upper_bound:.2f}")
        
        original_count = df.shape[0]
        
        # Filter out outliers
        df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
        removed_count = original_count - df.shape[0]
        
        if removed_count > 0:
            print(f"  Removed {removed_count} outliers for {col}.")
        else:
            print(f"  No outliers detected/removed for {col} based on IQR.")
    else:
        print(f"Column {col} not found or not numeric, skipping outlier removal.")
        
print(f"\nShape after outlier removal: {df.shape}")
print(f"Total rows removed due to outliers: {df_original_shape - df.shape[0]}")

In [None]:
# Handling the `profession Column`

profession_counts = df['Profession'].value_counts(normalize=True)
print("Profession distribution (Top 5):")
print(profession_counts.head())

# Checking if 'Student' is overwhelmingly dominant 
student_percentage = profession_counts.get('Student', 0) * 100
print(f"\nPercentage of 'Student': {student_percentage:.2f}%")

threshold = 95.0 
if student_percentage > threshold:
    df.drop(columns=['Profession'], inplace=True)
    print(f"\n'Profession' column dropped because 'Student' category constitutes > {threshold}% of the data.")
    print(f"New shape: {df.shape}")
else:
    print("\n'Profession' column kept as 'Student' dominance is below threshold")

In [None]:
# Feature Engineering again

# Creating Age Groups
age_bins = [17, 24, 30, 40] 
age_labels = ['18-24', '25-30', '31-39'] 
df['Age_Group'] = pd.cut(df['Age'], bins=age_bins, labels=age_labels, right=True)
print(df[['Age', 'Age_Group']].head())
print(f"\nAge Group distribution:\n{df['Age_Group'].value_counts()}")

# Converting Sleep Duration to Ordinal
sleep_map = {
    'Less than 5 hours': 0,
    '5-6 hours': 1,
    '7-8 hours': 2,
    'More than 8 hours': 3,
    'Others': 4,
}
current_sleep_values = df['Sleep Duration'].unique()
missing_in_map = [val for val in current_sleep_values if val not in sleep_map]
if missing_in_map:
    print(f"Warning: Values in 'Sleep Duration' not found in sleep_map: {missing_in_map}")
else:
    df['Sleep_Ordinal'] = df['Sleep Duration'].map(sleep_map)
    print(df[['Sleep Duration', 'Sleep_Ordinal']].head())
    print(f"\nSleep Ordinal distribution:\n{df['Sleep_Ordinal'].value_counts()}")

# Creating a Combined Stress Score
df['Total_Stress'] = df['Academic Pressure'] + df['Financial Stress']
print(df[['Academic Pressure', 'Financial Stress', 'Total_Stress']].head())
print(f"\nTotal Stress distribution:\n{df['Total_Stress'].value_counts().sort_index()}")

In [None]:
# Verify Changes

plt.figure(figsize=(12, 5))

# Age After
plt.subplot(2, 2, 1)
sns.boxplot(y=df['Age'])
plt.title('Age (After Outlier Removal)')
plt.ylabel('Age')


# CGPA After
plt.subplot(2, 2, 2)
sns.boxplot(y=df['CGPA'])
plt.title('CGPA (After Outlier Removal)')
plt.ylabel('CGPA')

plt.suptitle('Verification of Outlier Removal for Age and CGPA', y=1.03)
plt.tight_layout(rect=[0, 0.03, 1, 0.97]) 
plt.show()

### Visualize Correlation

In [None]:
numerical_cols_corr = df.select_dtypes(include=np.number).columns.tolist()

correlation_matrix = df[numerical_cols_corr].corr()

# Plot heatmap
plt.figure(figsize=(14, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.title('Correlation Matrix of Numerical Features (After Outlier/Profession Handling)')
plt.show()

# Specifically look at correlations with the target variable 'Depression'
print("\nCorrelation with Depression\n")

corr_with_target = correlation_matrix['Depression'].sort_values(ascending=False)
print(corr_with_target)

In [None]:
print("\nMean Depression Rate per Age Group")
mean_depression = df.groupby('Age_Group')['Depression'].mean().sort_values(ascending=False)
print(mean_depression)

plt.figure(figsize=(8, 4))
mean_depression.plot(kind='bar', color=sns.color_palette('viridis', len(mean_depression)))
plt.title('Mean Depression Rate by Age Group')
plt.ylabel('Mean Depression Rate (0=No, 1=Yes)')
plt.xlabel('Age Group')
plt.xticks(rotation=0)
plt.show()

### Final Preparation: Encoding Categorical Features

In [None]:
categorical_cols_to_encode = df.select_dtypes(include=['object', 'category']).columns.tolist()

# Ensuring Age_Group is included if it's categorical
if 'Age_Group' in df.columns and pd.api.types.is_categorical_dtype(df['Age_Group']):
     if 'Age_Group' not in categorical_cols_to_encode:
         categorical_cols_to_encode.append('Age_Group')
         
# Removing Sleep Duration if Sleep_Ordinal exists and it's still here
if 'Sleep Duration' in categorical_cols_to_encode and 'Sleep_Ordinal' in df.columns:
    categorical_cols_to_encode.remove('Sleep Duration')


print(f"\nRemaining categorical columns to be one-hot encoded: {categorical_cols_to_encode}")

In [None]:
if categorical_cols_to_encode:
    # Apply One-Hot Encoding
    df_processed = pd.get_dummies(df, columns=categorical_cols_to_encode, drop_first=True) 
    
    print("\nData after One-Hot Encoding\n")
    print(f"New shape: {df_processed.shape}")
    
else:
    print("\nNo remaining categorical columns require encoding.\n")
    df_processed = df.copy() # Using the current df if no encoding was needed

In [None]:
df_processed.head()

In [None]:
# Dropping original 'Sleep Duration' if 'Sleep_Ordinal' existed
if 'Sleep Duration' in df_processed.columns and 'Sleep_Ordinal' in df_processed.columns:
    df_processed.drop(columns=['Sleep Duration'], inplace=True)
    print("\nDropped original 'Sleep Duration' column.")

print("\nFinal Column Data Types")
print(df_processed.dtypes.value_counts())

print("\nPreprocessing complete. Ready for model training.")

### Export the cleaned df as csv

In [None]:
output_filename = 'cleaned_student_data.csv'

df_processed.to_csv(output_filename, index=False, encoding='utf-8')

print(f"\nSuccessfully Exported DataFrame\n")
print(f"DataFrame shape: {df_processed.shape}")
print(f"Data saved to: '{output_filename}'")
   

In [None]:
df.head()

### Data lost Percentage

In [None]:
original_rows = 27901
new_rows = 27851

# Number of col for original was 18, we removed the ;
# -> profession vol as 99.89 was student so we dont account the col change marking it as negligible.
# -> Irrelevant data (ID,Work Pressure,Job Satisfaction,)


# Calculate rows lost and percentage loss
rows_lost = original_rows - new_rows
percentage_loss = (rows_lost / original_rows) * 100

print(f"Rows lost: {rows_lost}")
print(f"Percentage loss: {percentage_loss:.2f}%")
