# Preparing the Data for ML Modeling

Import the necessary libraries

In [104]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

Import the Data

In [105]:
# Load the Excel workbook into a DataFrame. The path is relative, so the notebook
# has to be run from the directory that contains the `Data/` folder.
df = pd.read_excel("Data/Visualization_Data.xlsx")

Discarding the useless variables

In [106]:
# Names of the columns that are left out of the modeling data set
# (one per line so additions and removals show up as single-line diffs).
columns_to_remove = [
    "Ethnicity",
    "Currently_Single",
    "Exercise_Days",
    "Family_Income",
    "Family_Interaction",
    "Family_Visit_Frequency",
    "Job_Type",
    "Lecture_Preferance",
    "Life_Threats",
    "Living_Parents",
    "Love_Affair_Not_Satisfied",
    "Parents_Employment",
    "Religion",
    "Residence_Type",
    "Sleep_Hours",
    "Sports_Participation",
    "Uni_Entry_Attempt",
    "Smoke_Frequency",
    "Travel_Mode",
]

# Drop the listed columns; a name that is not present in the frame raises KeyError,
# which also happens if this cell is executed a second time.
df = df.drop(labels=columns_to_remove, axis="columns")

### Encoding the ordinal categorical variables

In [107]:
"""
Accommodation Satisfaction takes on the values 1, 2, 3, 4 and 5. However, it is more discriminative
in predicting depression when 1 and 2 are combined and 4 and 5 are combined.
"""

# Collapse the 5-point Accommodation_Satisfaction rating into three levels
def map_accommodation_satisfaction(value):
    """Return the collapsed satisfaction level for a raw rating.

    Ratings 1 and 2 become 1, rating 3 becomes 2, and ratings 4 and 5 become 3.
    Any other value (including NaN) yields None.
    """
    collapsed_levels = (
        ((1, 2), 1),
        ((3,), 2),
        ((4, 5), 3),
    )
    for raw_ratings, level in collapsed_levels:
        if value in raw_ratings:
            return level
    # Nothing matched: same implicit-None result as falling off the end of an if/elif chain
    return None

# Apply the mapping function to the Accommodation_Satisfaction column.
# The column is overwritten, so executing this cell twice re-maps the already
# collapsed codes (2 -> 1, 3 -> 2); re-run from the data-loading cell instead.
df['Accommodation_Satisfaction'] = df['Accommodation_Satisfaction'].apply(map_accommodation_satisfaction)


In [108]:
"""

The frequency of socializing takes on the values 'Less than once a month',
'Once a month', '2 to 3 times a month', 'Once a week' and 'More than once a week'.
However, it is more discriminative in predicting depression when
'Less than once a month' and 'Once a month' are combined
and 'Once a week' and 'More than once a week' are combined.

"""

# Ordinal codes for Socializing_Frequency: the two least frequent answers share
# code 1 and the two most frequent answers share code 3.
socializing_map = {
    **dict.fromkeys(['Less than once a month', 'Once a month'], 1),
    '2 to 3 times a month': 2,
    **dict.fromkeys(['Once a week', 'More than once a week'], 3),
}

# Replace each Socializing_Frequency label with its ordinal code.
# Series.map with a dict turns any label that is not a key of the dict into NaN.
df['Socializing_Frequency'] = df['Socializing_Frequency'].map(socializing_map)

In [109]:
"""

Meal type takes on the values 'Eat 3 meals a day but not necessarily a healthy diet',
'Nutritionally Balanced Diet', 'Unable to eat 3 meals a day', 'Mostly eating junk food' and
'On a diet'. However, it is more discriminative in predicting depression when
'Unable to eat 3 meals a day' and 'Mostly eating junk food' are combined (as both are unhealthy)
and the remaining categories are ordinally encoded from the least healthy category to the
healthiest category.

"""

# Ordinal codes for Meal_Type, written as (label, code) pairs.
# The two labels that share code 1 form a single merged level.
meal_type_map = dict([
    ('Eat 3 meals a day but not necessarily a healthy diet', 2),
    ('Nutritionally Balanced Diet', 4),
    ('Unable to eat 3 meals a day', 1),
    ('Mostly eating junk food', 1),
    ('On a diet', 3),
])

# Replace each Meal_Type label with its ordinal code.
# Series.map with a dict turns any label that is not a key of the dict into NaN.
df['Meal_Type'] = df['Meal_Type'].map(meal_type_map)

In [110]:
# Ordinal codes for Alcohol_Frequency: labels are listed in ascending order
# and numbered from 1.
alcohol_frequency_map = {
    label: code
    for code, label in enumerate(['Never', 'Rarely', 'Occasionally'], start=1)
}

# Replace each Alcohol_Frequency label with its ordinal code.
# Only the three labels in the dict are encoded; any other value becomes NaN.
df['Alcohol_Frequency'] = df['Alcohol_Frequency'].map(alcohol_frequency_map)


In [111]:
# Ordinal codes for Travel_Time: duration bands in ascending order paired with 1, 2, 3.
travel_time_map = dict(zip(
    ['Less than 1 hour', 'Between 1 and 3 hours', 'Between 3 and 5 hours'],
    range(1, 4),
))

# Replace each Travel_Time label with its ordinal code.
# Only the three bands in the dict are encoded; any other value becomes NaN.
df['Travel_Time'] = df['Travel_Time'].map(travel_time_map)


In [112]:
# Ordinal codes for Academic_Achievements_Satisfaction: 'No' is the lowest level,
# 'Yes' the highest, and both the neutral answer and 'No GPA as of yet' share the
# middle level.
academic_achievements_map = {
    'No': 1,
    **dict.fromkeys(['Neither satisfied nor dissatisfied', 'No GPA as of yet'], 2),
    'Yes': 3,
}

# Replace each Academic_Achievements_Satisfaction label with its ordinal code.
# Series.map with a dict turns any label that is not a key of the dict into NaN.
df['Academic_Achievements_Satisfaction'] = df['Academic_Achievements_Satisfaction'].map(academic_achievements_map)

In [113]:
# Ordinal codes for Weight_Status: categories listed from lightest to heaviest
# and numbered from 1.
weight_status_map = {
    category: rank
    for rank, category in enumerate(
        ('Underweight', 'Healthy Weight', 'Overweight', 'Obesity'),
        start=1,
    )
}

# Replace each Weight_Status label with its ordinal code.
# Series.map with a dict turns any label that is not a key of the dict into NaN.
df['Weight_Status'] = df['Weight_Status'].map(weight_status_map)


### Encoding the nominal categorical variables

In [114]:
# Nominal (unordered) categorical columns that are dummy-encoded below,
# one per line so changes show up as single-line diffs.
nominal_vars = [
    'Sex',
    'Sexual_Orientation',
    'Uni_Year',
    'Study_Stream',
    'Academic_Program_Satisfaction',
    'Societies_Participation',
    'English_Difficulty',
    'Ragging_Experience',
    'Staff_Support_Perception',
    'Awareness_Of_Help',
    'Siblings',
    'Financial_Support',
    'Financial_Support_Duty',
    'Family_Debts',
    'Recent_Death',
    'Family_Illness',
    'Power_Cut_Impact',
    'Home_Confinement',
    'Medical_Access_Difficulty',
    'Job_Loss_Family',
    'Love_Affair_Satisfied',
    'Never_Love',
    'Separated',
]

# Apply dummy encoding to these variables.
# drop_first=True omits the first level of each variable, so a variable with k
# categories produces k-1 indicator columns; all other columns pass through unchanged.
df_encoded = pd.get_dummies(df, columns=nominal_vars, drop_first=True)

### Encoding the Dependent Variable

In [115]:
# Binary target encoding for depression_status: 'No MDD' -> 0, 'MDD' -> 1
depression_status_map = {
    label: flag
    for flag, label in enumerate(('No MDD', 'MDD'))
}

# Replace the depression_status labels with the 0/1 target codes.
# Any label other than 'No MDD' / 'MDD' would become NaN (Series.map with a dict).
df_encoded['depression_status'] = df_encoded['depression_status'].map(depression_status_map)

Normalizing the Data

In [116]:
# Feature matrix: every column except the target and the post-stratification weight,
# both of which are kept out of the scaling step.
features = df_encoded.drop(columns=['depression_status', 'Post_Stratification_Weight'])

# Learn each column's minimum and maximum, then rescale every feature to [0, 1]
# (MinMaxScaler's default range).
# NOTE(review): the scaler is fit on all rows; if a train/test split follows later,
# confirm that fitting before the split is intended.
scaler = MinMaxScaler().fit(features)
normalized_features = scaler.transform(features)

# Rebuild a DataFrame from the scaled array, then re-attach the unscaled target and the
# post-stratification weight (stored under the shorter column name 'Weight').
# `.values` assigns by position, since the new frame carries its own default index.
df_normalized = pd.DataFrame(normalized_features, columns=features.columns)
df_normalized['depression_status'] = df_encoded['depression_status'].values
df_normalized['Weight'] = df_encoded['Post_Stratification_Weight'].values

Creating representative Data

In [118]:
# Display the final modeling frame; pandas abbreviates the rendering of large frames.
df_normalized

Unnamed: 0,Accommodation_Satisfaction,Socializing_Frequency,Life_Satisfaction,Appearance_Satisfaction,Meal_Type,Alcohol_Frequency,Travel_Time,Academic_Stress,Academic_Achievements_Satisfaction,Uni_Life_Enjoyment,...,Family_Illness_Yes,Power_Cut_Impact_Yes,Home_Confinement_Yes,Medical_Access_Difficulty_Yes,Job_Loss_Family_Yes,Love_Affair_Satisfied_Yes,Never_Love_Yes,Separated_Yes,depression_status,Weight
0,1.0,0.0,1.00,0.75,1.000000,0.0,0.5,0.75,0.5,0.75,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0,1.951133
1,0.5,0.0,0.75,1.00,0.333333,0.5,0.5,0.25,1.0,0.75,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0,0.782421
2,0.5,0.0,0.25,0.25,0.333333,1.0,0.5,0.75,0.0,0.50,...,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,1,1.079520
3,1.0,0.0,1.00,1.00,1.000000,0.0,0.0,1.00,1.0,0.75,...,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0,0.462825
4,0.5,0.5,0.25,0.25,0.333333,0.0,0.0,0.75,0.5,0.50,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0,0.462825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
355,1.0,0.0,0.50,0.75,1.000000,0.0,0.0,0.75,0.5,0.50,...,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0,1.671984
356,0.5,0.0,0.50,0.50,1.000000,0.0,0.0,1.00,1.0,0.50,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1,1.671984
357,0.5,0.5,0.75,0.50,0.000000,0.0,0.5,0.50,0.5,0.25,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1,1.215460
358,0.5,0.0,0.50,0.50,0.333333,0.0,1.0,0.50,1.0,0.50,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0.782421
