In [66]:
import os
import carbon_tracker

# Create the emissions folder if it doesn't exist.
# exist_ok=True makes this one idempotent call: there is no gap between
# checking for the directory and creating it, and re-running the cell is safe.
os.makedirs('./emissions', exist_ok=True)

# Import file in "Sad-Brain-Analytics-Hackathon\carbon_tracker.py"
# NOTE(review): carbon_tracker is already imported as a module above, and the
# call below goes through that module; confirm the extra %run of the same
# script is still needed.
%run carbon_tracker.py
carbon_tracker.start_tracker()

# **Import libraries:**

In [67]:
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns 
# NOTE(review): start_tracker() was already called in the first cell; confirm
# that calling it again here is intended and not a leftover.
carbon_tracker.start_tracker()
# Presumably tags the tracked work with this stage name; verify against carbon_tracker.py.
carbon_tracker.update_project_name('Import Libraries')

# **Prepare Data:**

In [68]:
# Load the dataset from CSV.
df = pd.read_csv("final_depression_dataset_1.csv")
carbon_tracker.update_project_name('Read dataset')

# Keep the preview as the cell's last expression: Jupyter only renders the
# value of the final expression, so a bare df.head() mid-cell shows nothing.
df.head()

In [69]:
# Dataset dimensions as (rows, columns).
df.shape

(2556, 19)

In [70]:
import numpy as np

def print_dtype_info(dtypes=None):
    """Print a table of item size and value range for NumPy numeric types.

    Args:
        dtypes: Optional iterable of NumPy scalar types (e.g. ``np.int8``,
            ``np.float32``) to describe. When omitted, the signed and unsigned
            8/16/32/64-bit integers and the 16/32/64-bit floats are listed.

    Returns:
        None. The table is written to standard output.
    """
    if dtypes is None:
        # Default set of types to inspect
        dtypes = [np.int8, np.int16, np.int32, np.int64, np.uint8, np.uint16,
                  np.uint32, np.uint64, np.float16, np.float32, np.float64]

    print(f"{'Data Type':<10} {'Size (Bytes)':<12} {'Min Value':<25} {'Max Value':<25}")
    print("-" * 75)

    for dtype in dtypes:
        # Integer limits come from np.iinfo; floating-point limits from np.finfo.
        info = np.iinfo(dtype) if np.issubdtype(dtype, np.integer) else np.finfo(dtype)
        print(f"{dtype.__name__:<10} {np.dtype(dtype).itemsize:<12} {info.min:<25} {info.max:<25}")

print_dtype_info()


Data Type  Size (Bytes) Min Value                 Max Value                
---------------------------------------------------------------------------
int8       1            -128                      127                      
int16      2            -32768                    32767                    
int32      4            -2147483648               2147483647               
int64      8            -9223372036854775808      9223372036854775807      
uint8      1            0                         255                      
uint16     2            0                         65535                    
uint32     4            0                         4294967295               
uint64     8            0                         18446744073709551615     
float16    2            -65504.0                  65504.0                  
float32    4            -3.4028234663852886e+38   3.4028234663852886e+38   
float64    8            -1.7976931348623157e+308  1.7976931348623157e+308  


In [71]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 19 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Name                                   2556 non-null   object 
 1   Gender                                 2556 non-null   object 
 2   Age                                    2556 non-null   int64  
 3   City                                   2556 non-null   object 
 4   Working Professional or Student        2556 non-null   object 
 5   Profession                             1883 non-null   object 
 6   Academic Pressure                      502 non-null    float64
 7   Work Pressure                          2054 non-null   float64
 8   CGPA                                   502 non-null    float64
 9   Study Satisfaction                     502 non-null    float64
 10  Job Satisfaction                       2054 non-null   float64
 11  Slee

### *Handling null values:*

In [72]:
# Count the missing values in each column.
df.isnull().sum()

Name                                        0
Gender                                      0
Age                                         0
City                                        0
Working Professional or Student             0
Profession                                673
Academic Pressure                        2054
Work Pressure                             502
CGPA                                     2054
Study Satisfaction                       2054
Job Satisfaction                          502
Sleep Duration                              0
Dietary Habits                              0
Degree                                      0
Have you ever had suicidal thoughts ?       0
Work/Study Hours                            0
Financial Stress                            0
Family History of Mental Illness            0
Depression                                  0
dtype: int64

Out of 2556 entries, the columns Academic Pressure, CGPA, and Study Satisfaction have 2054 null values each. Since these columns contain mostly missing data, removing them will not significantly impact our model training. On the other hand, Profession and Job Satisfaction are important features for our model and should be retained.

In [73]:
# Drop unnecessary columns (the three that are mostly null)

df = df.drop(columns=['Academic Pressure', 'CGPA', 'Study Satisfaction'])
df.shape


(2556, 16)

In [74]:
# Replace missing professions with a placeholder label instead of leaving them null.
# NOTE(review): the placeholder is spelled 'Uknown' (probably meant 'Unknown');
# left unchanged here because later cells may match on this exact string.
df['Profession'] = df['Profession'].fillna('Uknown')

In [75]:
# Re-count missing values after filling 'Profession'.
df.isnull().sum()

Name                                       0
Gender                                     0
Age                                        0
City                                       0
Working Professional or Student            0
Profession                                 0
Work Pressure                            502
Job Satisfaction                         502
Sleep Duration                             0
Dietary Habits                             0
Degree                                     0
Have you ever had suicidal thoughts ?      0
Work/Study Hours                           0
Financial Stress                           0
Family History of Mental Illness           0
Depression                                 0
dtype: int64

In [76]:
# Replace null values with the most common value (mode) of each column
def _fill_with_mode(column):
    """Return ``column`` with its nulls replaced by its most frequent value.

    Columns without nulls are returned untouched. A column with no non-null
    values has no mode, so it is also returned unchanged rather than failing
    on an empty ``mode()`` result.
    """
    if not column.isna().any():
        return column
    modes = column.mode()
    if modes.empty:
        return column
    # mode() returns its values sorted, so the first entry is a deterministic
    # pick on ties; .iloc selects by position rather than by label.
    return column.fillna(modes.iloc[0])


df = df.apply(_fill_with_mode)

In [77]:
# Confirm that no missing values remain.
df.isnull().sum()

Name                                     0
Gender                                   0
Age                                      0
City                                     0
Working Professional or Student          0
Profession                               0
Work Pressure                            0
Job Satisfaction                         0
Sleep Duration                           0
Dietary Habits                           0
Degree                                   0
Have you ever had suicidal thoughts ?    0
Work/Study Hours                         0
Financial Stress                         0
Family History of Mental Illness         0
Depression                               0
dtype: int64

## *Datatype Casting:*

In [78]:
def calculate_memory(df):
    """Return the total memory footprint of ``df`` in bytes.

    ``memory_usage(deep=True)`` is used so that the contents of object columns
    are measured as well; the per-column figures are then summed.
    """
    return df.memory_usage(deep=True).sum()
    
# Inspect the column dtypes before casting.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2556 entries, 0 to 2555
Data columns (total 16 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Name                                   2556 non-null   object 
 1   Gender                                 2556 non-null   object 
 2   Age                                    2556 non-null   int64  
 3   City                                   2556 non-null   object 
 4   Working Professional or Student        2556 non-null   object 
 5   Profession                             2556 non-null   object 
 6   Work Pressure                          2556 non-null   float64
 7   Job Satisfaction                       2556 non-null   float64
 8   Sleep Duration                         2556 non-null   object 
 9   Dietary Habits                         2556 non-null   object 
 10  Degree                                 2556 non-null   object 
 11  Have

In [79]:
# Baseline footprint, measured before any column is cast.
initial_memory = calculate_memory(df)
print(f"Memory usage before type conversions: {initial_memory} bytes")

# The two rating columns are floats: zero-fill any gaps and round first so the
# int8 cast below operates on whole numbers.
df = df.assign(**{
    'Work Pressure': df['Work Pressure'].fillna(0).round(),
    'Job Satisfaction': df['Job Satisfaction'].fillna(0).round(),
})

# Cast every column in a single pass.
df = df.astype({
    # Text columns
    'Name': 'string',
    'Sleep Duration': 'string',
    'Dietary Habits': 'string',
    'Degree': 'string',
    'City': 'string',
    # Integer columns, downcast from the 64-bit default
    'Age': 'int16',
    'Work/Study Hours': 'int16',
    'Financial Stress': 'int8',
    'Work Pressure': 'int8',
    'Job Satisfaction': 'int8',
    # Categorical columns
    'Gender': 'category',
    'Working Professional or Student': 'category',
    'Profession': 'category',
    'Depression': 'category',
    'Family History of Mental Illness': 'category',
    'Have you ever had suicidal thoughts ?': 'category',
})

final_memory = calculate_memory(df)
print(f"Memory usage after type conversions: {final_memory} bytes")

# Report the absolute and relative savings, then the resulting dtypes.
memory_saved = initial_memory - final_memory
print(f"Memory saved: {memory_saved} bytes ({(memory_saved / initial_memory) * 100:.2f}% reduction)")
print(df.dtypes)

Memory usage before type conversions: 1676894 bytes
Memory usage after type conversions: 760434 bytes
Memory saved: 916460 bytes (54.65% reduction)
Name                                     string[python]
Gender                                         category
Age                                               int16
City                                     string[python]
Working Professional or Student                category
Profession                                     category
Work Pressure                                      int8
Job Satisfaction                                   int8
Sleep Duration                           string[python]
Dietary Habits                           string[python]
Degree                                   string[python]
Have you ever had suicidal thoughts ?          category
Work/Study Hours                                  int16
Financial Stress                                   int8
Family History of Mental Illness               category
Depression  

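For text columns, the `string` dtype is preferable to `object` because it is a dedicated text type, whereas `object` can hold mixed types. Note that with the default Python-backed storage (`string[python]`) the two use about the same memory, so most of the reduction above comes from the `category` conversions and the smaller integer types.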

### *Handle duplicates:*

In [80]:
# Count rows that exactly duplicate an earlier row.
df.duplicated().sum()

np.int64(0)

In [81]:
# Record the 'Data Cleaning' project name with the tracker (a single call).
carbon_tracker.update_project_name('Data Cleaning')