In [11]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder
import numpy as np

In [12]:
# Load the raw student-depression CSV from the resources/ directory.
data = pd.read_csv('resources/Student Depression Dataset.csv')

In [13]:
# Build the working frame by discarding the columns that are not carried forward.
preprocesed_data = data.drop(['id', 'City', 'Profession', 'Work Pressure'], axis=1)

In [14]:
# Show the distinct labels present in the Degree column.
values = preprocesed_data.loc[:, 'Degree'].unique()
print(values)

['B.Pharm' 'BSc' 'BA' 'BCA' 'M.Tech' 'PhD' 'Class 12' 'B.Ed' 'LLB' 'BE'
 'M.Ed' 'MSc' 'BHM' 'M.Pharm' 'MCA' 'MA' 'B.Com' 'MD' 'MBA' 'MBBS' 'M.Com'
 'B.Arch' 'LLM' 'B.Tech' 'BBA' 'ME' 'MHM' 'Others']


In [15]:
def map_degree_level(degree):
    """Map a degree label to an ordinal education level.

    Parameters
    ----------
    degree : str
        Degree label as found in the ``Degree`` column.

    Returns
    -------
    int
        0 for school ('Class 12'), 1 for undergraduate degrees,
        2 for postgraduate degrees, 3 for doctorate ('PhD') and
        -1 for anything not listed (e.g. 'Others' or a missing value).
    """
    # (level, labels) pairs; checked in order, first match wins.
    levels = (
        # 'MBBS' is a bachelor-level medical degree; it was previously missing
        # from every group and therefore fell through to -1.
        (1, ('B.Pharm', 'BSc', 'BA', 'BCA', 'B.Ed', 'LLB', 'BE', 'BHM',
             'B.Com', 'B.Arch', 'B.Tech', 'BBA', 'MBBS')),
        (2, ('M.Tech', 'M.Ed', 'MSc', 'M.Pharm', 'MCA', 'MA', 'MBA',
             'M.Com', 'ME', 'MHM', 'LLM', 'MD')),
        (3, ('PhD',)),
        (0, ('Class 12',)),
    )
    for level, labels in levels:
        if degree in labels:
            return level
    return -1  # Others / unclassified


In [16]:
# Encode the categorical columns of preprocesed_data as numbers, overwriting each
# column through .loc. The fitted encoder objects stay bound to their own names.
# NOTE(review): every step overwrites its own input column, so this cell presumably
# has to run exactly once on a freshly built frame — confirm before re-running it.

# Encode Gender: fit a LabelEncoder on the column and store the codes back in it.
gender_encoder = LabelEncoder()
preprocesed_data.loc[:, ['Gender']] = gender_encoder.fit_transform(preprocesed_data['Gender'])

# Encode Sleep Duration with an explicit category order; 'Others' is placed in the
# middle of the list, between '5-6 hours' and '7-8 hours'.
order_sleep = ['Less than 5 hours', '5-6 hours', 'Others', '7-8 hours', 'More than 8 hours']
sleep_duration_encoder = OrdinalEncoder(categories=[order_sleep])
preprocesed_data.loc[:, ['Sleep Duration']] = sleep_duration_encoder.fit_transform(preprocesed_data[['Sleep Duration']])

# Encode Dietary Habits: 'Others' is first rewritten to 'Moderate', so the encoder
# below is only given the three categories listed in order_dietary.
preprocesed_data.loc[:, ['Dietary Habits']] = preprocesed_data[['Dietary Habits']].replace('Others', 'Moderate')
order_dietary = ['Unhealthy', 'Moderate', 'Healthy']
dietary_habits_encoder = OrdinalEncoder(categories=[order_dietary])
preprocesed_data.loc[:, ['Dietary Habits']] = dietary_habits_encoder.fit_transform(preprocesed_data[['Dietary Habits']])

# Encode Have you ever had suicidal thoughts? (note the space before '?' in the column name)
suicidal_thoughts_encoder = LabelEncoder()
preprocesed_data.loc[:, ['Have you ever had suicidal thoughts ?']] = suicidal_thoughts_encoder.fit_transform(preprocesed_data['Have you ever had suicidal thoughts ?']) 

# Encode Family History of Mental Illness
family_history_encoder = LabelEncoder()
preprocesed_data.loc[:, ['Family History of Mental Illness']] = family_history_encoder.fit_transform(preprocesed_data['Family History of Mental Illness'])

## Encode Degree: replace each label with the integer returned by map_degree_level.
preprocesed_data.loc[:, ['Degree']] = preprocesed_data['Degree'].apply(map_degree_level)

In [17]:
def fillna_random(df, random_state=None):
    """Fill missing values with random draws from each column's observed values.

    For every column that contains at least one missing value, each missing
    entry is replaced by a value sampled uniformly (with replacement) from the
    non-missing entries of that same column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fill. It is modified in place.
    random_state : None, int or numpy.random.Generator, optional
        Seed or generator for reproducible draws. When ``None`` (default) the
        draws come from NumPy's global RNG, so ``np.random.seed(...)`` set by
        the caller is honoured.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Notes
    -----
    A column with no observed values has nothing to sample from and is left
    untouched instead of raising.
    """
    # Keep using the global RNG unless the caller asks for a dedicated one.
    if random_state is None:
        draw = np.random.choice
    else:
        draw = np.random.default_rng(random_state).choice

    for col in df.columns:
        missing = df[col].isna()
        if not missing.any():
            continue
        pool = df[col].dropna().to_numpy()
        if pool.size == 0:
            # Entire column is missing: there are no observed values to draw.
            continue
        # One vectorised draw per column instead of one RNG call per cell.
        df.loc[missing, col] = draw(pool, size=int(missing.sum()))
    return df

# 1) Fill every missing value in the frame with a random observed value from its column.
# Seed NumPy's global RNG first so the random fill gives the same result on every run.
np.random.seed(42)
preprocesed_data = fillna_random(preprocesed_data)

In [18]:
# Write the preprocessed frame to CSV under resources/, without the index column.
preprocesed_data.to_csv('resources/preprocessed_student_depression_data.csv', index=False)


In [None]:
# Print the smallest and largest value of every column as a quick range check.
for col, column_values in preprocesed_data.items():
    print(f"{col}: {column_values.min()}, {column_values.max()}")

Gender: [1 0]
Age: [33. 24. 31. 28. 25. 29. 30. 27. 19. 20. 23. 18. 21. 22. 34. 32. 26. 39.
 35. 42. 36. 58. 49. 38. 51. 44. 43. 46. 59. 54. 48. 56. 37. 41.]
Academic Pressure: [5. 2. 3. 4. 1. 0.]
CGPA: [ 8.97    5.9     7.03    5.59    8.13    5.7     9.54    8.04    9.79
  8.38    6.1     7.04    8.52    5.64    8.58    6.51    7.25    7.83
  9.93    8.74    6.73    5.57    8.59    7.1     6.08    5.74    9.86
  6.7     6.21    5.87    6.37    9.72    5.88    9.56    6.99    5.24
  9.21    7.85    6.95    5.86    7.92    9.66    8.94    9.71    7.87
  5.6     7.9     5.46    6.79    8.7     7.38    8.5     7.09    9.82
  8.89    7.94    9.11    6.75    7.53    9.49    9.01    7.64    5.27
  6.      9.44    5.75    7.51    9.05    6.38    8.95    9.88    5.32
  6.27    7.7     8.1     9.59    8.96    5.51    7.43    8.79    9.95
  5.37    6.86    8.32    9.74    5.66    7.48    8.23    8.81    6.03
  5.56    5.68    5.14    7.61    6.17    8.17    9.87    8.75    6.16
  9.5     7.99  