In [202]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchaudio
import numpy as np
import scipy
import sklearn
import pandas as pd
import PIL
import matplotlib
import tokenizers
import datasets
import transformers
import matplotlib.pyplot as plt
from collections import Counter
import datetime

# Data exploration

Here we explore each feature column and comment on how we are going to preprocess each one. We have a total of 140700 samples. The last feature is the label depression which we will train on.

## Id column



In [203]:
# Load the dataset
data = pd.read_csv("./data/train.csv")

# Print examples of the 'id' column
print("Examples of 'id' column:")
print(data["id"].head())

print("")

print("We have " ,data["id"].count()," amount of samples")

Examples of 'id' column:
0    0
1    1
2    2
3    3
4    4
Name: id, dtype: int64

We have  140700  amount of samples


We will drop the id column since there is no correlation between this and the label.

## Name column

In [204]:
# Count occurrences of each unique name
name_counts = data["Name"].value_counts()

# Print results
print("Unique names sorted by count (most to least):")
for name, count in name_counts.items():
    print(f"{name}: {count}")

print(f"Total unique names: {len(name_counts)}")


Unique names sorted by count (most to least):
Rohan: 3178
Aarav: 2336
Rupak: 2176
Aaradhya: 2045
Anvi: 2035
Raghavendra: 1877
Vani: 1657
Tushar: 1596
Ritvik: 1589
Shiv: 1568
Riya: 1548
Rashi: 1547
Raunak: 1524
Anand: 1486
Ishaani: 1477
Ansh: 1423
Vidya: 1408
Ritika: 1313
Anushka: 1279
Sanya: 1272
Aarush: 1266
Aariv: 1254
Abhishek: 1252
Rupal: 1234
Harsha: 1230
Harsh: 1156
Vikram: 1154
Shivam: 1146
Raghav: 1120
Armaan: 1116
Prachi: 1104
Ivaan: 1090
Ayaan: 1090
Siddhesh: 1090
Ira: 1061
Prisha: 1055
Rahil: 1051
Rishi: 1040
Ritik: 1033
Pratham: 1023
Aniket: 1023
Chhavi: 1003
Vibha: 974
Vivan: 963
Aishwarya: 962
Gauri: 959
Nikita: 951
Naina: 946
Veda: 940
Arav: 925
Vidhi: 913
Jiya: 912
Advait: 910
Krishna: 875
Vedant: 872
Ayush: 869
Aditi: 851
Shaurya: 848
Kashish: 845
Gagan: 842
Eshita: 831
Pratyush: 816
Ila: 799
Simran: 790
Aadhya: 787
Shreya: 784
Rudransh: 776
Garima: 769
Yashvi: 767
Anjali: 757
Vihaan: 744
Keshav: 737
Yuvraj: 730
Ishan: 714
Tanisha: 713
Harshil: 712
Sanket: 710
Rajat: 7

We thought about dropping the name column since we thought that this may not have any correlation to the label. But we realised that there might be a correlation, so we decided to keep it. The correlation being that for example having a "unattractive" name can affect your life in a bad way. 


For pre-processing this we want to give the name a value of how unique it is in the dataset, so for now we change the name with how frequent it is.


Things to consider:

Find a way to "rate" each name instead of how frequent the name is. Impute or change wrong names to missing name.

## Gender

In [205]:
# Print examples of the 'Name' column
print("Examples of 'Name' column:")
print(data["Gender"].head())

print("")

print("Number of unique genders:", data["Gender"].nunique())

Examples of 'Name' column:
0    Female
1      Male
2      Male
3      Male
4    Female
Name: Gender, dtype: object

Number of unique genders: 2


We will just encode male to 1 and female to 0.

## Age

In [206]:
print("Examples of 'Age' column:")
print(data["Age"].head())

print("")

print("Number of unique ages:", data["Age"].nunique())

Examples of 'Age' column:
0    49.0
1    26.0
2    33.0
3    22.0
4    30.0
Name: Age, dtype: float64

Number of unique ages: 43


We will only change the number from float to integer.

## City

In [207]:
print("Examples of 'City' column:")
print(data["City"].head())

print("")

print("Number of unique cities:", data["Age"].nunique())

Examples of 'City' column:
0         Ludhiana
1         Varanasi
2    Visakhapatnam
3           Mumbai
4           Kanpur
Name: City, dtype: object

Number of unique cities: 43


In [208]:
# Count occurrences of each unique city
city_counts = data["City"].value_counts()

# Find first occurrence index for each unique city
first_occurrence = data.groupby("City").apply(lambda x: x.index[0])

# Sort cities by count in descending order
sorted_cities = city_counts.index  # Cities sorted by count (default sorting from most to least)

count = 0
# Print results
print("Unique cities sorted by count (most to least):")
for city in sorted_cities:
    if city_counts[city] > 0:
        count += 1
        print(f"{city}: First Index = {first_occurrence[city]}, Count = {city_counts[city]}")

print(f"Total unique cities: {count}")


Unique cities sorted by count (most to least):
Kalyan: First Index = 36, Count = 6591
Patna: First Index = 9, Count = 5924
Vasai-Virar: First Index = 49, Count = 5765
Kolkata: First Index = 28, Count = 5689
Ahmedabad: First Index = 5, Count = 5613
Meerut: First Index = 17, Count = 5528
Ludhiana: First Index = 0, Count = 5226
Pune: First Index = 13, Count = 5210
Rajkot: First Index = 10, Count = 5207
Visakhapatnam: First Index = 2, Count = 5176
Srinagar: First Index = 26, Count = 5074
Mumbai: First Index = 3, Count = 4966
Indore: First Index = 189, Count = 4872
Agra: First Index = 18, Count = 4684
Surat: First Index = 20, Count = 4636
Varanasi: First Index = 1, Count = 4606
Vadodara: First Index = 43, Count = 4568
Hyderabad: First Index = 23, Count = 4496
Kanpur: First Index = 4, Count = 4398
Jaipur: First Index = 12, Count = 4328
Thane: First Index = 6, Count = 4289
Lucknow: First Index = 16, Count = 4280
Nagpur: First Index = 37, Count = 4209
Bangalore: First Index = 8, Count = 4123
C

  first_occurrence = data.groupby("City").apply(lambda x: x.index[0])


We found population, density, literacy and sex ratio for each major city in our dataset. We then merged this in our data and removed city column. For the minor cities or the wrongly written cities we took the average of the other columns..

# Pre-processing

Dropping the Id column

In [209]:
def dropId(data):

    # Drop the 'id' column
    data = data.drop(columns=["id"])
    return data


Encoding the name column

In [210]:
def name_freq(data):
    # Replace each name with its frequency in the dataset
    data["Name"] = data["Name"].map(data["Name"].value_counts())
    return data

Encoding gender column

In [211]:
def gender_encode(data):
    data["Gender"] = data["Gender"].map({"Female": 0, "Male": 1 })
    return data

Age

In [212]:
def age_integer(data):
    data["Age"] = data["Age"].astype(int)
    return data

City

In [213]:
import pandas as pd

# Load city data (assuming it's already loaded in a DataFrame named df_cities)
city_data_path = "./data/indian_cities_data.csv"
df_cities = pd.read_csv(city_data_path)

def city_one_hot(data):
    # Find the index of the "City" column
    city_index = data.columns.get_loc("City")

    # Merge the data with city information, excluding "Main Language"
    merged_data = data.merge(df_cities.drop(columns=["Main Language"]), on="City", how="left")

    # Calculate the mean values for missing cities
    mean_values = merged_data[["Population", "Density (per km²)", "Literacy Rate (%)", "Sex Ratio"]].mean()

    # Replace NaN values with the mean of the respective column
    merged_data.fillna(mean_values, inplace=True)

    # Drop the original "City" column
    merged_data.drop(columns=["City"], inplace=True)

    # Reorder columns to place new city-related columns where "City" was
    city_columns = ["Population", "Density (per km²)", "Literacy Rate (%)", "Sex Ratio"]
    cols = merged_data.columns.tolist()

    # Move the new city-related columns to the correct index
    for col in reversed(city_columns):
        cols.insert(city_index, cols.pop(cols.index(col)))

    # Reorder the dataframe
    merged_data = merged_data[cols]

    return merged_data


Pre-processing function:

In [214]:
def pre_processing(data):

    data = dropId(data)
    data = name_freq(data)
    data = gender_encode(data)
    data = age_integer(data)
    data = city_one_hot(data)
    return data


data = pre_processing(data)

print(data.head())

   Name  Gender  Age  Population  Density (per km²)  Literacy Rate (%)  \
0  2045       0   49   1618879.0             5200.0              85.77   
1   963       1   26   1198491.0             7300.0              79.27   
2   730       1   33   1728128.0             2500.0              81.79   
3   730       1   22  12442373.0            20482.0              89.73   
4   499       0   30   2765348.0             6900.0              82.42   

   Sex Ratio Working Professional or Student        Profession  \
0      850.0            Working Professional              Chef   
1      887.0            Working Professional           Teacher   
2      978.0                         Student               NaN   
3      853.0            Working Professional           Teacher   
4      857.0            Working Professional  Business Analyst   

   Academic Pressure  ...  Study Satisfaction  Job Satisfaction  \
0                NaN  ...                 NaN               2.0   
1                NaN  ..