In [73]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchaudio
import numpy as np
import scipy
import sklearn
import pandas as pd
import PIL
import matplotlib
import tokenizers
import datasets
import transformers
import matplotlib.pyplot as plt
from collections import Counter
import datetime

# Data exploration

Here we explore each feature column and comment on how we are going to pre-process it. We have a total of 140700 samples. The last column, depression, is the label that we will train the model to predict.

## Id column



In [74]:
# Read the training set from disk; the cells below refer to it as `data`
data = pd.read_csv("./data/train.csv")

# Preview the first five entries of the 'id' column
print("Examples of 'id' column:")
print(data["id"].head(5))

print("")

# Report how many entries the 'id' column holds, i.e. the sample count
print("We have ", data["id"].count(), " amount of samples")

Examples of 'id' column:
0    0
1    1
2    2
3    3
4    4
Name: id, dtype: int64

We have  140700  amount of samples


We will drop the id column: it is just a running index for the rows, so it has no meaningful correlation with the label.

## Name column

In [75]:
# Preview the first five entries of the 'Name' column
print("Examples of 'Name' column:")
print(data["Name"].head(5))

print("")

# Count how many distinct names appear in the dataset
print("Number of unique names:", data["Name"].nunique())

Examples of 'Name' column:
0    Aaradhya
1       Vivan
2      Yuvraj
3      Yuvraj
4        Rhea
Name: Name, dtype: object

Number of unique names: 422


We initially considered dropping the name column, since it seemed unlikely to have any correlation with the label. However, we realised that there might be a correlation, so we decided to keep it: for example, having an "unattractive" name could affect a person's life in a negative way.


To pre-process this column, we want to replace each name with a value that reflects how unique it is (here, how often the name occurs in the dataset).

## Gender

In [76]:
# Print examples of the 'Gender' column
# (the header previously said 'Name', a copy-paste leftover from the cell above)
print("Examples of 'Gender' column:")
print(data["Gender"].head())

print("")

# Count how many distinct gender values appear in the dataset
print("Number of unique genders:", data["Gender"].nunique())

Examples of 'Name' column:
0    Female
1      Male
2      Male
3      Male
4    Female
Name: Gender, dtype: object

Number of unique genders: 2


We will simply encode Male as 1 and Female as 0.

## Age

In [77]:
# Preview the first five entries of the 'Age' column
print("Examples of 'Age' column:")
print(data["Age"].head(5))

print("")

# Count how many distinct ages appear in the dataset
print("Number of unique ages:", data["Age"].nunique())

Examples of 'Age' column:
0    49.0
1    26.0
2    33.0
3    22.0
4    30.0
Name: Age, dtype: float64

Number of unique ages: 43


The only change we will make to this column is converting the values from float to integer.

# Pre-processing

Dropping the Id column

In [78]:
def dropId(data):
    """Return a new DataFrame with the 'id' column removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains an 'id' column.

    Returns
    -------
    pandas.DataFrame
        The result of dropping 'id'; the frame passed in is not modified.

    Raises
    ------
    KeyError
        If ``data`` has no 'id' column.
    """
    without_id = data.drop(columns="id")
    return without_id


Encoding the name column

In [79]:
def name_freq(data):
    """Replace every entry of the 'Name' column with that name's frequency.

    The frequency is the number of rows in ``data`` carrying the same name.
    The 'Name' column of ``data`` is overwritten and the same frame is
    returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a 'Name' column.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with 'Name' holding occurrence counts.
    """
    # Occurrence count per distinct name, indexed by the name itself
    occurrences = data["Name"].value_counts()
    # Look each row's name up in the count table
    data["Name"] = data["Name"].map(occurrences)
    return data

Encoding gender column

In [80]:
def gender_encode(data):
    """Encode the 'Gender' column numerically: "Female" -> 0, "Male" -> 1.

    The 'Gender' column of ``data`` is overwritten and the same frame is
    returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a 'Gender' column.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with 'Gender' replaced by its numeric codes.
    """
    gender_codes = {"Female": 0, "Male": 1}
    data["Gender"] = data["Gender"].map(gender_codes)
    return data

Age

In [81]:
def age_integer(data):
    """Convert the 'Age' column to integers.

    The 'Age' column of ``data`` is overwritten and the same frame is
    returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains an 'Age' column.

    Returns
    -------
    pandas.DataFrame
        ``data`` itself, with 'Age' cast via ``astype(int)``.
    """
    age_as_int = data["Age"].astype(int)
    data["Age"] = age_as_int
    return data

Pre-processing function:

In [82]:
def pre_processing(data):
    """Run every pre-processing step on ``data`` and return the result.

    The steps are applied in this order, each one receiving the output of
    the previous one: ``dropId``, ``name_freq``, ``gender_encode``,
    ``age_integer``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to pre-process.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last step.
    """
    steps = (dropId, name_freq, gender_encode, age_integer)
    for step in steps:
        data = step(data)
    return data


# Apply the full pre-processing pipeline; `data` now refers to the processed frame
data = pre_processing(data)

# Show the first five processed rows as a sanity check
print(data.head(5))

   Name  Gender  Age           City Working Professional or Student  \
0  2045       0   49       Ludhiana            Working Professional   
1   963       1   26       Varanasi            Working Professional   
2   730       1   33  Visakhapatnam                         Student   
3   730       1   22         Mumbai            Working Professional   
4   499       0   30         Kanpur            Working Professional   

         Profession  Academic Pressure  Work Pressure  CGPA  \
0              Chef                NaN            5.0   NaN   
1           Teacher                NaN            4.0   NaN   
2               NaN                5.0            NaN  8.97   
3           Teacher                NaN            5.0   NaN   
4  Business Analyst                NaN            1.0   NaN   

   Study Satisfaction  Job Satisfaction     Sleep Duration Dietary Habits  \
0                 NaN               2.0  More than 8 hours        Healthy   
1                 NaN               3.0 