## Kaggle competition:<br>
https://www.kaggle.com/competitions/playground-series-s4e11

## import libraries and load data

In [1]:
## import the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import warnings
warnings.filterwarnings("ignore")

## load the training data (relative path: "train.csv" must be in the working directory)
train = pd.read_csv("train.csv")
## load the test data (relative path: "test.csv" must be in the working directory)
test = pd.read_csv("test.csv")

## Data Preprocessing

In [2]:
# Overview of the raw data: frame sizes, target class counts, column names
print(
    train.shape,
    test.shape,
    train["Depression"].value_counts(),
    train.columns,
    sep="\n",
)

(140700, 20)
(93800, 19)
Depression
0    115133
1     25567
Name: count, dtype: int64
Index(['id', 'Name', 'Gender', 'Age', 'City',
       'Working Professional or Student', 'Profession', 'Academic Pressure',
       'Work Pressure', 'CGPA', 'Study Satisfaction', 'Job Satisfaction',
       'Sleep Duration', 'Dietary Habits', 'Degree',
       'Have you ever had suicidal thoughts ?', 'Work/Study Hours',
       'Financial Stress', 'Family History of Mental Illness', 'Depression'],
      dtype='object')


### check for outliers (values that appear in test but never in train)

In [3]:
print(train.head())
## check possible values for possible categorical columns
categories = [
    "Gender",
    "City",
    "Working Professional or Student",
    "Profession",
    "Academic Pressure",
    "Work Pressure",
    "Study Satisfaction",
    "Job Satisfaction",
    "Sleep Duration",
    "Dietary Habits",
    "Degree",
    "Have you ever had suicidal thoughts ?",
    "Work/Study Hours",
    "Financial Stress",
    "Family History of Mental Illness",
]

# For each column, collect the test rows whose value never occurs in train.
# The per-column frames are gathered in a list and concatenated once after the
# loop, instead of re-concatenating a growing DataFrame on every iteration.
outlier_frames = []
for category in categories:
    test_unique = pd.Series(test[category].unique())  # Convert to Series
    train_unique = pd.Series(train[category].unique())  # Convert to Series
    # Values present in test but absent from train (computed once, reused below)
    unseen_values = test_unique[~test_unique.isin(train_unique)]
    if len(unseen_values) > 0:
        print(category)
        outlier_test = test[test[category].isin(unseen_values)]
        print(len(outlier_test))
        print(outlier_test)
        outlier_frames.append(outlier_test)
        print("\n")

# Get unique rows of the DataFrame: a row flagged for several columns is kept once.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# column had unseen values.
outlier_tests = (
    pd.concat(outlier_frames).drop_duplicates() if outlier_frames else pd.DataFrame()
)
print(outlier_tests)
print(outlier_tests.shape)

   id      Name  Gender   Age           City Working Professional or Student  \
0   0  Aaradhya  Female  49.0       Ludhiana            Working Professional   
1   1     Vivan    Male  26.0       Varanasi            Working Professional   
2   2    Yuvraj    Male  33.0  Visakhapatnam                         Student   
3   3    Yuvraj    Male  22.0         Mumbai            Working Professional   
4   4      Rhea  Female  30.0         Kanpur            Working Professional   

         Profession  Academic Pressure  Work Pressure  CGPA  \
0              Chef                NaN            5.0   NaN   
1           Teacher                NaN            4.0   NaN   
2               NaN                5.0            NaN  8.97   
3           Teacher                NaN            5.0   NaN   
4  Business Analyst                NaN            1.0   NaN   

   Study Satisfaction  Job Satisfaction     Sleep Duration Dietary Habits  \
0                 NaN               2.0  More than 8 hours     

**Finding:** 122 rows in the test data contain a value that never appears in the train data for at least one column.<br>
**Next step:** replace those unseen values with the most frequent value of the same column in the train data.


### use the most frequent value in train data for these outlier rows

In [4]:
# For every candidate column, overwrite test values that never occur in train
for category in categories:
    # Distinct values on each side, wrapped in a Series so .isin is available
    train_unique = pd.Series(train[category].unique())
    test_unique = pd.Series(test[category].unique())

    # Values that occur in test but not in train
    unmatched_values = test_unique[~test_unique.isin(train_unique)]
    if len(unmatched_values) == 0:
        continue  # every test value of this column is already known from train

    # Boolean mask of the test rows carrying one of the unmatched values
    affected_rows = test[category].isin(unmatched_values)
    outliers_count = len(test[affected_rows])
    print(f"{outliers_count} Outliers found in category: {category}")

    # mode() returns a Series; its first entry is used as the replacement
    most_frequent_value = train[category].mode()[0]

    # Overwrite the flagged cells of test in place
    test.loc[affected_rows, category] = most_frequent_value
    print(
        f"Replaced outlier values in category '{category}' with '{most_frequent_value}'\n"
    )

30 Outliers found in category: City
Replaced outlier values in category 'City' with 'Kalyan'

20 Outliers found in category: Profession
Replaced outlier values in category 'Profession' with 'Teacher'

11 Outliers found in category: Sleep Duration
Replaced outlier values in category 'Sleep Duration' with 'Less than 5 hours'

13 Outliers found in category: Dietary Habits
Replaced outlier values in category 'Dietary Habits' with 'Moderate'

48 Outliers found in category: Degree
Replaced outlier values in category 'Degree' with 'Class 12'



### check for and impute NAs (separately for students and working professionals)

In [6]:
## split data based on Working Professional or Student
# .copy() makes every subset an independent frame, so the imputation below
# writes to the subset itself and not to a filtered slice of train/test.
train_student = train[train["Working Professional or Student"] == "Student"].copy()
train_professional = train[
    train["Working Professional or Student"] == "Working Professional"
].copy()
test_student = test[test["Working Professional or Student"] == "Student"].copy()
test_professional = test[
    test["Working Professional or Student"] == "Working Professional"
].copy()

# Define the columns to process for each DataFrame
train_student_na_columns = ["Academic Pressure", "CGPA", "Study Satisfaction", "Dietary Habits", "Financial Stress"]
train_professional_na_columns = ["Work Pressure", "Job Satisfaction", "Profession", "Dietary Habits", "Degree", "Financial Stress"]
test_student_na_columns = ["Academic Pressure", "CGPA", "Study Satisfaction", "Dietary Habits", "Degree"]
test_professional_na_columns = ["Work Pressure", "Job Satisfaction", "Profession", "Dietary Habits", "Degree"]


def _fill_na_with_mode(frame, columns, reference):
    """Fill missing values of `columns` in `frame`, in place.

    Each listed column is filled with the most frequent value of the same
    column in `reference`. A column whose `reference` values are all missing
    has no mode and is left untouched.
    """
    for col in columns:
        modes = reference[col].mode()
        if modes.empty:
            continue
        frame[col] = frame[col].fillna(modes.iloc[0])


# NOTE(review): CGPA looks continuous in the sample output (e.g. 8.97); mode
# imputation is kept from the original, but a median may fit better - confirm.

# Replace missing values in the train subsets with their own modes
_fill_na_with_mode(train_student, train_student_na_columns, reference=train_student)
_fill_na_with_mode(train_professional, train_professional_na_columns, reference=train_professional)

# Replace missing values in the test subsets with the modes of the matching
# *train* subset, so train and test rows receive the same fill values
_fill_na_with_mode(test_student, test_student_na_columns, reference=train_student)
_fill_na_with_mode(test_professional, test_professional_na_columns, reference=train_professional)

# check nas in updated DataFrames
print("Updated train_student:")
print(train_student.isna().sum())

print("\nUpdated train_professional:")
print(train_professional.isna().sum())

print("\nUpdated test_student:")
print(test_student.isna().sum())

print("\nUpdated test_professional:")
print(test_professional.isna().sum())


Updated train_student:
id                                           0
Name                                         0
Gender                                       0
Age                                          0
City                                         0
Working Professional or Student              0
Profession                               27867
Academic Pressure                            0
Work Pressure                            27898
CGPA                                         0
Study Satisfaction                           0
Job Satisfaction                         27893
Sleep Duration                               0
Dietary Habits                               0
Degree                                       0
Have you ever had suicidal thoughts ?        0
Work/Study Hours                             0
Financial Stress                             0
Family History of Mental Illness             0
Depression                                   0
dtype: int64

Updated train_professio

### combine data

In [7]:
# Stack the student and professional subsets back into one frame per split
train_new = pd.concat([train_student, train_professional])
test_new = pd.concat([test_student, test_professional])

# Shapes of the original frames followed by the re-combined ones, for comparison
print(train.shape, test.shape, train_new.shape, test_new.shape, sep="\n")

# Preview the first rows of both combined frames
print(train_new.head(), test_new.head(), sep="\n")

# Write the combined frames to disk without the index column
train_new.to_csv("train_new.csv", index=False)
test_new.to_csv("test_new.csv", index=False)

(140700, 20)
(93800, 19)
(140700, 20)
(93800, 19)
    id       Name  Gender   Age           City  \
2    2     Yuvraj    Male  33.0  Visakhapatnam   
8    8  Aishwarya  Female  24.0      Bangalore   
26  26     Aditya    Male  31.0       Srinagar   
30  30     Prisha  Female  28.0       Varanasi   
32  32     Chhavi  Female  25.0         Jaipur   

   Working Professional or Student Profession  Academic Pressure  \
2                          Student        NaN                5.0   
8                          Student        NaN                2.0   
26                         Student        NaN                3.0   
30                         Student        NaN                3.0   
32                         Student        NaN                4.0   

    Work Pressure  CGPA  Study Satisfaction  Job Satisfaction  \
2             NaN  8.97                 2.0               NaN   
8             NaN  5.90                 5.0               NaN   
26            NaN  7.03                 5.0  

## Build Model

In [8]:
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd

def preprocess_data(train_data, test_data, features, target_column):
    """
    Build model inputs from a train/test pair.

    Both frames are restricted to `features`; `target_column` of the train
    frame becomes the label. Every object-dtype feature is target-encoded:
    the encoder is fit on the train rows together with their labels and then
    applied to the test rows.

    Returns a tuple (encoded train features, train labels, encoded test features).

    Note: the encoder is fit on all rows of `train_data`, so a validation
    subset split off from the returned train features afterwards has already
    contributed its labels to the encoding.
    """
    X_train, y_train, X_test = (
        train_data[features],
        train_data[target_column],
        test_data[features],
    )

    # Object-dtype columns are the ones handed to the target encoder
    categorical_cols = X_train.select_dtypes(include=['object']).columns
    encoder = TargetEncoder(cols=categorical_cols)

    # fit_transform learns the encoding from train; transform reuses it on test
    return encoder.fit_transform(X_train, y_train), y_train, encoder.transform(X_test)

def train_and_predict(X_train, y_train, X_test, output_column_name):
    """
    Fit a stacking classifier, report hold-out accuracy, and predict `X_test`.

    20% of the training rows are held out for validation (fixed random_state).
    The stack combines a random forest and a gradient-boosting classifier
    (100 estimators each) under a logistic-regression final estimator, with
    5-fold cross-validation inside the stack.

    `output_column_name` is used only as the label in the printed validation
    message. The model that predicts `X_test` is the one fit on the 80%
    portion; it is not refit on the full training data.

    Returns the predicted labels for `X_test`.
    """
    # Fixed 80/20 split so the reported accuracy is reproducible
    fit_X, holdout_X, fit_y, holdout_y = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )

    # Base learners and final estimator are declared inline in the stack
    stack = StackingClassifier(
        estimators=[
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ],
        final_estimator=LogisticRegression(),
        cv=5,
    )
    stack.fit(fit_X, fit_y)

    # Accuracy on the held-out rows
    holdout_accuracy = accuracy_score(holdout_y, stack.predict(holdout_X))
    print(f"Validation Accuracy ({output_column_name}): {holdout_accuracy:.2f}")

    return stack.predict(X_test)

# Features for student and professional models.
# Both lists start and end with the same columns; only the three columns in
# the middle differ (study-related for students, job-related for professionals).
# Column order is kept exactly as before.
_shared_leading_features = ['Gender', 'Age', 'City']
_shared_trailing_features = [
    'Sleep Duration',
    'Dietary Habits',
    'Degree',
    'Have you ever had suicidal thoughts ?',
    'Work/Study Hours',
    'Financial Stress',
    'Family History of Mental Illness',
]

student_features = [
    *_shared_leading_features,
    'Academic Pressure', 'CGPA', 'Study Satisfaction',
    *_shared_trailing_features,
]
professional_features = [
    *_shared_leading_features,
    'Profession', 'Work Pressure', 'Job Satisfaction',
    *_shared_trailing_features,
]

# Preprocess and train models: one model per group (students / working
# professionals), each predicting its own part of the test set
if __name__ == "__main__":
    # Train on train_student and predict on test_student
    X_train_student, y_train_student, X_test_student = preprocess_data(train_student, test_student, student_features, 'Depression')
    student_predictions = train_and_predict(X_train_student, y_train_student, X_test_student, 'Student')

    # Train on train_professional and predict on test_professional
    X_train_professional, y_train_professional, X_test_professional = preprocess_data(train_professional, test_professional, professional_features, 'Depression')
    professional_predictions = train_and_predict(X_train_professional, y_train_professional, X_test_professional, 'Professional')

    # Add predictions to respective test datasets.
    # DataFrame.assign returns a new frame, and the name is rebound to it; this
    # avoids inserting a column into frames that were obtained by filtering `test`.
    test_student = test_student.assign(Depression_Predicted=student_predictions)
    test_professional = test_professional.assign(Depression_Predicted=professional_predictions)

    # Combine the results (student rows first, then professional rows)
    test_combined = pd.concat([test_student, test_professional], ignore_index=True)

    # Output only 'id' and 'Depression_Predicted'
    # NOTE(review): the prediction column is named 'Depression_Predicted' and rows
    # are not in the original test order - confirm against the submission format
    # the competition expects before uploading this file.
    output_file = 'test_new_with_predictions.csv'
    output_df = test_combined[['id', 'Depression_Predicted']]
    output_df.to_csv(output_file, index=False)

    # Display the first few rows of the output
    print(output_df.head())


Validation Accuracy (Student): 0.84
Validation Accuracy (Professional): 0.96
       id  Depression_Predicted
0  140703                     1
1  140708                     0
2  140719                     1
3  140720                     1
4  140721                     1
