In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, name) for name in files):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 01. Importing Necessary Libraries

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler 
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# 02. Loading the Datasets

In [None]:
# Locations of the competition's training and test CSV files
train_path = "/kaggle/input/predicting-depression-machine-learning-challenge/train.csv"
test_path = "/kaggle/input/predicting-depression-machine-learning-challenge/test.csv"

# Read both files into DataFrames (train first, then test)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# 03. Checking the Data

##  Checking the Number of Unique Values in Each Column

In [None]:
# Report how many distinct (non-null) values each training column holds
for col in list(train_df):
    print(f"{col}: {train_df[col].nunique()} unique values")

##  Checking Data Types

In [None]:
# Print the dtype pandas assigned to each column of the training data
print(train_df.dtypes)

##  Displaying Unique Values for Categorical Columns

In [None]:
# Show every distinct value of each object/category column to spot dirty entries
text_like = train_df.select_dtypes(include=['object', 'category'])
for col in text_like.columns:
    print(f"\nColumn: {col}")
    print(train_df[col].unique())

## Displaying Unique Values for Numerical Columns

In [None]:
# Preview at most the first ten distinct values of each int64/float64 column
numeric_part = train_df.select_dtypes(include=['int64', 'float64'])
for col in numeric_part.columns:
    print(f"\nColumn: {col}, Unique Values: {train_df[col].unique()[:10]}")

## Checking for Duplicates

In [None]:
# Count rows that are exact repeats of an earlier row (the first occurrence is not counted)
print(f"Duplicate rows: {train_df.duplicated().sum()}")

## Target Class Distribution

In [None]:
# Bar chart of how many training rows fall into each Depression class
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=train_df['Depression'], ax=ax)
ax.set_title("Target Variable Distribution")
plt.show()

## Correlation Heatmap

In [None]:
# Correlation is only computed on the int64/float64 columns
numeric_cols = train_df.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = train_df[numeric_cols].corr()

# Annotated heatmap of the pairwise correlations
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title("Feature Correlations")
plt.show()

# Descriptive statistics of the training data
print(train_df.describe())

## Dataset Description

In [None]:
# Descriptive statistics from describe() with default settings, saved to CSV
numerical_summary = train_df.describe()
numerical_summary.to_csv("numerical_summary.csv")

# Descriptive statistics of the object (string) columns, saved to CSV
categorical_summary = train_df.describe(include=[object])
categorical_summary.to_csv("categorical_summary.csv")

# Number of nulls per column, saved under a "Missing Count" header
missing_values = train_df.isna().sum()
missing_values.to_csv("missing_values.csv", header=["Missing Count"])

# 04. Handling Data Issues

## Handling Inconsistencies

In [None]:
# Standardizing Degree Names
# Canonical spelling for each known degree variant, keyed by the stripped,
# lower-cased raw value. Built once instead of on every row.
_DEGREE_MAPPING = {
    "mtech": "M.Tech", "m.tech": "M.Tech", "m_tech": "M.Tech",
    "bsc": "B.Sc", "b.sc": "B.Sc", "b sc": "B.Sc",
    "btech": "B.Tech", "b.tech": "B.Tech", "b tech": "B.Tech",
    "msc": "M.Sc", "m.sc": "M.Sc", "m sc": "M.Sc",
    "llb": "LL.B", "ll.b": "LL.B",
    "mba": "MBA", "m.b.a": "MBA",
    "mbbs": "MBBS", "m.b.b.s": "MBBS"
}


def standardize_degree(value):
    """Normalize one raw degree entry to a canonical spelling.

    Parameters
    ----------
    value : object
        Raw cell value; converted with ``str`` before matching.

    Returns
    -------
    str or float
        * ``np.nan`` when ``value`` is missing (NaN/None), so the entry stays
          visible to later imputation instead of becoming the string "Nan".
        * The canonical spelling when the stripped, lower-cased value is a
          known variant (e.g. "mtech" -> "M.Tech").
        * The title-cased value when it is purely alphabetic (e.g. "phd" -> "Phd").
        * Otherwise the stripped, lower-cased value unchanged.
    """
    # str(np.nan) is "nan", which is alphabetic and would be title-cased into
    # a bogus "Nan" category; keep missing values missing instead.
    if pd.isna(value):
        return np.nan
    key = str(value).strip().lower()
    return _DEGREE_MAPPING.get(key, key.title() if key.isalpha() else key)

# Categorize Sleep Duration
def categorize_sleep(value):
    """Map one raw sleep-duration entry to a coarse sleep category.

    Parameters
    ----------
    value : object
        Raw cell value; converted with ``str``, lower-cased and stripped
        before matching.

    Returns
    -------
    str or float
        "Very Short Sleep", "Short Sleep", "Normal Sleep" or "Long Sleep" for
        recognised durations; ``np.nan`` for everything else. That includes
        implausible durations (e.g. "49 hours"), stray values from other
        columns (e.g. "pune", "moderate") and missing entries.
    """
    key = str(value).lower().strip()

    # Repair known mis-entered values. The replacements are lower-case so they
    # match the lower-case category lists below.
    corrections = {
        "than 5 hours": "less than 5 hours",
        "9-5 hours": "5-9 hours",
        "9-6 hours": "6-9 hours",
        "10-6 hours": "6-10 hours"
    }
    key = corrections.get(key, key)

    # Category label -> raw (lower-case) values that belong to it
    categories = {
        "Very Short Sleep": ("less than 5 hours", "1-2 hours", "2-3 hours", "3-4 hours",
                             "4-5 hours", "1-3 hours", "1-6 hours"),
        "Short Sleep": ("5-6 hours", "6-7 hours", "4-6 hours", "3-6 hours", "5-9 hours"),
        "Normal Sleep": ("6-8 hours", "7-8 hours", "8-9 hours", "8 hours", "6-9 hours",
                         "6-10 hours"),
        "Long Sleep": ("9-11 hours", "10-11 hours", "more than 8 hours"),
    }
    for label, members in categories.items():
        if key in members:
            return label

    # Anything unrecognised is marked as missing for handling later
    return np.nan

# Cleaning Dietary Habits
def clean_dietary_habits(value):
    """Return the canonical dietary-habit label for one raw entry, or NaN.

    Matching ignores letter case and surrounding or repeated whitespace.

    Parameters
    ----------
    value : object
        Raw cell value; converted with ``str`` before matching.

    Returns
    -------
    str or float
        One of "Healthy", "Unhealthy", "Moderate", "More Healthy",
        "Less Healthy"; ``np.nan`` for any other value, including missing ones.
    """
    valid_values = ["Healthy", "Unhealthy", "Moderate", "More Healthy", "Less Healthy"]
    # Look up by lower-cased text: str.capitalize() would lower-case the second
    # word ("More healthy"), so the two-word labels could never match.
    canonical = {label.lower(): label for label in valid_values}
    key = " ".join(str(value).split()).lower()
    return canonical.get(key, np.nan)

# Apply the cleaning functions to both the train and the test dataset
cleaners = {
    "Degree": standardize_degree,
    "Sleep Duration": categorize_sleep,
    "Dietary Habits": clean_dietary_habits,
}
for frame in (train_df, test_df):
    for column, cleaner in cleaners.items():
        frame[column] = frame[column].apply(cleaner)

# Fallback fill values, used only when a column has no non-null value to take
# a mode from. Defined before the loop so both the train and the test branch
# can reach them.
default_values = {"Sleep Duration": "Normal Sleep", "Dietary Habits": "Moderate", "Degree": "B.Sc"}

# Handle missing values safely: fill each frame with its own most frequent value
for column in ["Sleep Duration", "Dietary Habits", "Degree"]:
    for frame in (train_df, test_df):
        modes = frame[column].mode()  # mode() ignores NaN by default
        fill_value = modes.iloc[0] if not modes.empty else default_values[column]
        # Assign the result back instead of using a chained inplace fillna,
        # which does not update the parent frame under pandas copy-on-write.
        frame[column] = frame[column].fillna(fill_value)

## Checking for Outliers (Numerical Columns)

In [None]:
# One compact boxplot per int64/float64 column to eyeball outliers
numeric_frame = train_df.select_dtypes(include=['int64', 'float64'])
for col in numeric_frame.columns:
    fig, ax = plt.subplots(figsize=(5, 2))
    sns.boxplot(x=train_df[col], ax=ax)
    ax.set_title(col)
    plt.show()

## Checking for Skewness in the Data 

In [None]:
# Histogram (30 bins) with a KDE overlay for the Depression column
fig, ax = plt.subplots()
sns.histplot(train_df['Depression'], kde=True, bins=30, ax=ax)
ax.set_title("Distribution of Depression Column")
plt.show()

In [None]:
# Skewness of the Depression column; as the last expression it is shown as the cell output
train_df['Depression'].skew()

# 05. Data Preprocessing

##  Dropping Irrelevant Columns

In [None]:
# Remove the "id" and "Name" columns from both frames.
# errors='ignore' makes this a no-op for columns that are already gone.
for frame in (train_df, test_df):
    frame.drop(columns=["id", "Name"], inplace=True, errors='ignore')

## Filling Missing Values

In [None]:
# Fill missing values
# Categorical columns: mark missing entries with an explicit "Unknown" level
categorical_cols = ["Profession", "Degree", "Dietary Habits"]
train_df[categorical_cols] = train_df[categorical_cols].fillna("Unknown")
test_df[categorical_cols] = test_df[categorical_cols].fillna("Unknown")

# Numerical columns: impute with the median learned from the training data and
# apply those same medians to the test data.
numerical_cols = ["Academic Pressure", "Work Pressure", "CGPA", "Study Satisfaction", "Job Satisfaction"]
train_medians = train_df[numerical_cols].median()
# Assign the filled frame back rather than calling df[col].fillna(..., inplace=True):
# the chained inplace form is deprecated and does not modify the parent frame
# under pandas copy-on-write.
train_df[numerical_cols] = train_df[numerical_cols].fillna(train_medians)
test_df[numerical_cols] = test_df[numerical_cols].fillna(train_medians)

## Encoding Ordinal Categorical Variables

In [None]:
# Ordinal codes for Sleep Duration, from shortest (1) to longest (4)
sleep_mapping = {
    label: rank
    for rank, label in enumerate(
        ["Very Short Sleep", "Short Sleep", "Normal Sleep", "Long Sleep"], start=1
    )
}

# Encode the column in the train dataset first, then in the test dataset
for frame in (train_df, test_df):
    frame["Sleep Duration"] = frame["Sleep Duration"].map(sleep_mapping)

## Encoding Binary Categorical Features

In [None]:
# Turn the Yes/No columns into 1/0; any other value becomes NaN via map()
binary_features = ["Have you ever had suicidal thoughts ?", "Family History of Mental Illness"]
yes_no_codes = {"Yes": 1, "No": 0}
for col in binary_features:
    for frame in (train_df, test_df):
        frame[col] = frame[col].map(yes_no_codes)

## One-Hot Encoding Categorical Variables

In [None]:
# One-hot encode categorical variables
# Train: drop the first level of each categorical column as the baseline.
train_df = pd.get_dummies(train_df, drop_first=True)
# Test: keep every level. drop_first removes the first level *present in the
# frame being encoded*, so applying it to test separately could drop a
# different level than train did whenever the category sets differ. A real
# test category would then be encoded as the train baseline. The alignment
# step that follows keeps only the columns train has, which removes the
# train baseline level from test as well.
test_df = pd.get_dummies(test_df)

## Ensuring Both Datasets Have the Same Columns

In [None]:
# Give the test frame exactly the training feature columns, in the same order:
# columns test lacks are created and filled with 0, columns train lacks are dropped.
feature_columns = train_df.drop(columns=["Depression"]).columns
test_df = test_df.reindex(columns=feature_columns, fill_value=0)

# 06. Feature Engineering

In [None]:
# Fill remaining missing values in the numeric columns used below.
# Medians are learned from the training data and reused for the test data so
# both frames get identical preprocessing.
impute_cols = ["CGPA", "Financial Stress"]
train_medians = train_df[impute_cols].median()
train_df[impute_cols] = train_df[impute_cols].fillna(train_medians)
test_df[impute_cols] = test_df[impute_cols].fillna(train_medians)

#  Interaction Features
for frame in (train_df, test_df):
    frame["Overall_Stress"] = frame["Work Pressure"] * frame["Financial Stress"]

# Binning Continuous Variables
# Three quantile bins per column; edges are fitted on train and reused for test.
binning = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="quantile")

for col in ["Age", "CGPA", "Financial Stress"]:
    # The discretizer returns an (n_samples, 1) array; flatten it to 1-D
    # before storing it as a single column.
    train_df[f"{col}_Binned"] = binning.fit_transform(train_df[[col]]).ravel()
    test_df[f"{col}_Binned"] = binning.transform(test_df[[col]]).ravel()


# Feature Scaling (Only numerical features, not categorical ones)
# The scaler is fitted on train only and then applied unchanged to test.
scaler = StandardScaler()
num_cols = ["Work Pressure", "Financial Stress", "Overall_Stress"]
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])

# 07. Model Preparation

## Splitting features and target

In [None]:
# Separate the label column (y) from the predictor columns (X)
y = train_df["Depression"]
X = train_df.drop(columns=["Depression"])

## Splitting Training Data for Validation

In [None]:
# Hold out 20% of the training rows for validation, stratified on the target
# and with a fixed seed
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 08. Training & Evaluating the Model

## Training XGBoost Model

In [None]:
# Hyperparameters for the XGBoost classifier, gathered in one place
xgb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
    eval_metric='logloss',
)

# Build the classifier and fit it on the training split
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train, y_train)

## Predicting on Validation Set

In [None]:
# Predict on the validation set with the fitted model
y_val_pred = xgb_model.predict(X_val)

##  Computing Accuracy

In [None]:
# Compare the validation labels (y_val) with the model's predictions (y_val_pred)
val_accuracy = accuracy_score(y_val, y_val_pred)
report = classification_report(y_val, y_val_pred, digits=2)

# Heading, per-class report and overall accuracy, one item per line
print(
    "Validation Metrics:",
    report,
    f"Validation Accuracy: {val_accuracy:.4f}",
    sep="\n",
)

# 09. Making Predictions & Submission

##  Predicting on Test Set

In [None]:
# Predict on the preprocessed test set with the fitted model
test_predictions = xgb_model.predict(test_df)

## Preparing Submission File

In [None]:
# Re-read the test file to get the "id" column, which was dropped from test_df
# earlier; this relies on the row order being unchanged since loading
test_ids = pd.read_csv(test_path)["id"]

# Pair each id with its prediction and write the submission CSV
submission = pd.DataFrame({"id": test_ids, "Depression": test_predictions})
submission.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")