# Download Dataset

In [None]:
!gdown --id 1cQrNtrsgMhYtGSvEvU-Ox9YOHhXSdSkO

# Load Dataset

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# Read the raw Excel workbook into a DataFrame (absolute path under /content).
df = pd.read_excel('/content/Raw dataset_MJH.xlsx')

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# Summarise the columns: names, non-null counts and dtypes.
df.info()

# Data preparation and Feature Engineering

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [None]:
# Load data
# NOTE(review): this reads the same workbook that was already loaded into `df` above;
# `data` is the frame that the preprocessing cells below modify.
data =  pd.read_excel('/content/Raw dataset_MJH.xlsx')

In [None]:
# Remove the 'Sr No' column before any preprocessing.
data = data.drop(columns=['Sr No'])

In [None]:
# Handle missing values
# Object (categorical) columns are filled with their most frequent value,
# every other column with its mean.
for col_name in data.columns:
    series = data[col_name]
    fill_value = series.mode()[0] if series.dtype == 'object' else series.mean()
    data[col_name] = series.fillna(fill_value)


In [None]:
# Convert categorical variables using Label Encoding
# One encoder is fitted per object-typed column and kept in `label_encoders`
# (keyed by column name) so the integer codes can be mapped back to labels later.
label_encoders = {}
object_columns = [name for name in data.columns if data[name].dtype == 'object']
for name in object_columns:
    encoder = LabelEncoder()
    data[name] = encoder.fit_transform(data[name])
    label_encoders[name] = encoder



In [None]:
# Show the fitted encoders, keyed by the column each one encoded.
label_encoders

In [None]:
# Feature Engineering
# Deriving categorical bins for age and BMI.
# The top BMI bin is open-ended so that values above 40 are still labelled 'Obese'
# instead of falling outside every bin and becoming NaN.
data['Age Group'] = pd.cut(
    data['2. Age (years)=?'],
    bins=[0, 18, 35, 50, 65, 100],
    labels=['Child', 'Youth', 'Adult', 'Middle-aged', 'Senior'],
)
data['BMI Category'] = pd.cut(
    data['8. BMI-before COVID'],
    bins=[0, 18.5, 24.9, 29.9, float('inf')],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
)

# Encode new categorical features
# Use the ordered categorical codes instead of LabelEncoder: LabelEncoder numbers classes in
# sorted (alphabetical) label order, which would scramble the natural bin order
# (e.g. 'Adult' < 'Child' < 'Middle-aged'). Codes follow bin order (0 = lowest bin);
# a value that falls outside every bin gets the code -1.
# Cast to int64 so the later dtype filter ('float64'/'int64') still selects these columns for scaling.
data['Age Group'] = data['Age Group'].cat.codes.astype('int64')
data['BMI Category'] = data['BMI Category'].cat.codes.astype('int64')

In [None]:
# Preview the frame after the 'Age Group' and 'BMI Category' columns were added.
data.head()

In [None]:
# Evaluate correlations and drop highly correlated features if necessary
# Pairwise Pearson correlation between every pair of columns (the pandas default method, spelled out).
correlation_matrix = data.corr(method='pearson')
correlation_matrix

In [None]:
# Features whose correlation with the target exceeds the threshold in either direction.
# The absolute value keeps strongly negative correlates as well, and the target's own
# entry (correlation 1.0 with itself) is removed so only real features are listed.
target_corr = correlation_matrix["17. Feeling depressed"].drop("17. Feeling depressed")
high_corr = target_corr.index[target_corr.abs() > 0.5]  # arbitrary threshold
high_corr

In [None]:
# Normalize or scale continuous variables
# The target column is left out: min-max scaling would turn its class labels into floats
# in [0, 1], and with more than two classes the classifiers would see a continuous target.
# NOTE(review): the scaler is fitted on the full frame before the train/test split, so the
# test rows influence the min/max; fitting on the training split only would avoid that leakage.
scaler = MinMaxScaler()  # or use StandardScaler() for models like SVM
target_column = '17. Feeling depressed'
continuous_columns = [
    col for col in data.columns
    if col != target_column and data[col].dtype in ['float64', 'int64']
]
data[continuous_columns] = scaler.fit_transform(data[continuous_columns])


In [None]:
# Separate the prediction target from the feature matrix.
target_name = '17. Feeling depressed'
X = data.drop(columns=target_name)
y = data[target_name]

In [None]:
# Split data into training and testing sets
# 20% of the rows are held out for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Train Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [None]:
# Random Forest
# fit() returns the fitted estimator, so construction and training are chained.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Train Support Vector Machine (SVM)

In [None]:
# Support Vector Machine (SVM)
# Linear-kernel classifier; fit() returns the fitted estimator, so it is chained onto the constructor.
svm_model = SVC(kernel='linear').fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Train Neural Network

In [None]:
# Neural Network
# One hidden layer of 100 ReLU units trained with Adam for at most 300 iterations.
# random_state seeds the weight initialisation and batch shuffling so the reported metrics
# are reproducible, in line with the seeded train/test split and Random Forest.
nn_model = MLPClassifier(
    hidden_layer_sizes=(100,),
    activation='relu',
    solver='adam',
    max_iter=300,
    random_state=42,
)
nn_model.fit(X_train, y_train)
y_pred_nn = nn_model.predict(X_test)



# Train XGBoost

In [None]:
# XGBoost
# `use_label_encoder` is deprecated in recent XGBoost releases, so it is no longer passed.
# random_state=42 with eval_metric='logloss' matches the configuration of the XGBoost
# model that is re-fitted for the SHAP summary plots later in this notebook.
xgb_model = XGBClassifier(random_state=42, eval_metric='logloss')

xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)



# Train Logistic Regression

In [None]:
# Logistic Regression
# Default hyper-parameters; fit() returns the fitted estimator, so it is chained onto the constructor.
lr_model = LogisticRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Evaluation function to print the metrics

In [None]:
# Evaluation function to print the metrics
def evaluate_model(y_test, y_pred, model_name):
    """Print a confusion matrix and summary metrics for one model's predictions.

    Precision, recall and F1 are macro-averaged over the classes, and a class with
    no predicted/true samples contributes 0 instead of raising a warning
    (``zero_division=0``).

    Args:
        y_test: True labels of the evaluation rows.
        y_pred: Labels predicted by the model for the same rows.
        model_name: Display name used in the printed report.

    Returns:
        None. The report is written to standard output.
    """
    macro_kwargs = {'average': 'macro', 'zero_division': 0}
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1 = [
        metric(y_test, y_pred, **macro_kwargs)
        for metric in (precision_score, recall_score, f1_score)
    ]

    # Assemble the report first and emit it with a single print call.
    report_lines = [
        f"Model: {model_name}",
        str(confusion_matrix(y_test, y_pred)),
        f"{model_name} Performance:",
        f"Accuracy: {accuracy:.4f}",
        f"Precision: {precision:.4f}",
        f"Recall: {recall:.4f}",
        f"F1-Score: {f1:.4f}\n",
    ]
    print("\n".join(report_lines))


# Evaluating all models
# (display name, test-set predictions) pairs, reported in the order the models were trained.
predictions_by_model = [
    ('Random Forest', y_pred_rf),
    ('SVM', y_pred_svm),
    ('Neural Network', y_pred_nn),
    ('XGBoost', y_pred_xgb),
    ('Logistic Regression', y_pred_lr),
]
for model_label, model_predictions in predictions_by_model:
    evaluate_model(y_test, model_predictions, model_label)


# SHapley Additive exPlanations (SHAP)

Uses game theory to explain the output of any machine learning model by computing the contribution of each feature to a prediction.

In [None]:
!pip install shap

In [None]:
import xgboost as xgb
import shap
import matplotlib.pyplot as plt

# Understanding the SHAP Force Plot


The SHAP (SHapley Additive exPlanations) force_plot is an effective visualization tool for understanding the impact of each feature on a model's prediction for a single observation. It illustrates how each feature contributes, either positively or negatively, to the final prediction relative to the average prediction (baseline).



 Here’s a breakdown of the key components of a SHAP force plot:

- Base Value: This is the average prediction for the dataset, provided by the model over the training data. In classification, this would be the log odds of the baseline class, while in regression, it's the average outcome.

- Output Value: This is the model's actual prediction for the specific instance being explained. It is shown in bold on the plot's axis, at the point where the red and blue forces meet.

- Features: Each feature that affects the prediction is shown as a force. Each force can either:

  - Push to the right (positive effect): These are features that push the model’s prediction higher than the base value. They are usually shown in red.

  - Push to the left (negative effect): These are features that push the model’s prediction lower than the base value. They are usually shown in blue.

-  Size of the Force: The size or length of each force represents the magnitude of that feature’s impact on the prediction. Larger forces have a more significant impact.

In [None]:
# Train the model
# This replaces the `xgb_model` fitted earlier; fit() returns the fitted estimator.
xgb_model = XGBClassifier(random_state=42).fit(X_train, y_train)

# Load SHAP's JavaScript so the interactive force plot can render in the notebook.
shap.initjs()

# Create the SHAP explainer and calculate SHAP values
explainer_xgb = shap.TreeExplainer(xgb_model)
shap_values_xgb = explainer_xgb.shap_values(X_train)

# Visualize the first prediction's explanation
first_row = 0
shap.force_plot(
    explainer_xgb.expected_value,
    shap_values_xgb[first_row, :],
    X_train.iloc[first_row, :],
)


## Tips for Interpreting Force Plots

Look for Dominant Features: Identify which features have the longest bars (either red or blue). These are the features most influential for this particular prediction.

Understand Thresholds: In classification tasks, consider where the final output value lies relative to a decision threshold. This can inform how features collectively move the prediction across this threshold.

Compare Instances: By looking at multiple force plots for different instances, you can see how feature contributions vary across the dataset, which might indicate patterns or inconsistencies in model behavior.

# Explanation:
- The bar plot shows the average impact of each feature on the magnitude of the model output, with the features sorted by importance.
- The beeswarm plot provides deeper insight into the positive and negative relationships between the features and the target variable, along with the distribution of the SHAP values for each feature across all data points.

In [None]:
# Assuming X_train and y_train are already defined and preprocessed
# Re-fit the XGBoost model (this replaces the `xgb_model` from the previous cell).
xgb_model = xgb.XGBClassifier(eval_metric='logloss', verbosity=1, random_state=42).fit(X_train, y_train)

# SHAP values for every training row
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_train)

# Draw both summary views in turn:
#   "bar" - average impact of each feature on the model output magnitude
#   "dot" - beeswarm showing how the SHAP values of each feature are distributed
for summary_kind in ("bar", "dot"):
    shap.summary_plot(shap_values, X_train, plot_type=summary_kind)




## Examine how changes in a feature change the model’s prediction

In [None]:
import xgboost as xgb
import shap
import matplotlib.pyplot as plt

# Relies on `xgb_model` and `X_train` defined in the cells above.

# Build a tree explainer for the fitted model and compute SHAP values on the training rows
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_train)

# Column whose dependence plot is drawn; change this to inspect another feature
feature_name = '3. Education level=?'

# Feature used to colour the points; 'auto' lets SHAP choose one, or give a column name
interaction_index = 'auto'

shap.dependence_plot(
    feature_name,
    shap_values=shap_values,
    features=X_train,
    interaction_index=interaction_index,
)

# How to read the plot:
# - every point is one row of the dataset;
# - the x-axis is the feature's value, the y-axis the SHAP value for that feature;
# - the colour encodes the value of the interacting feature, showing how the two
#   features jointly affect the model output.


# Local Interpretable Model-agnostic Explanations (LIME)

Explains individual predictions by approximating the local decision boundary with an interpretable model (like linear models).

In [None]:
!pip install lime

## Understanding LIME Plot Components
- Prediction: The overall prediction of the classifier for the specific instance being explained. This is usually shown at the top of the plot.

- Local Model: LIME creates a simple model (like a linear model) that approximates the behavior of the complex model near the instance being explained. This local model is used to interpret the result.

Feature Contributions:

- Positive Contributions (Green Bars): Features that push the model’s prediction in the positive direction (towards the predicted class). The length of the bar indicates the strength of the contribution.
- Negative Contributions (Red Bars): Features that push the model’s prediction in the negative direction (away from the predicted class). Again, the length of the bar reflects the magnitude of the impact.
- Base Line/Intercept: This is the starting point of the prediction, assuming no features are contributing. It's akin to a baseline from which the contributions of all features are measured.

In [None]:
import xgboost as xgb
from lime import lime_tabular
import numpy as np

# Assume X_train, y_train are already defined and the XGBoost model is trained:
# xgb_model = xgb.XGBClassifier(...)
# xgb_model.fit(X_train, y_train)

# Create a LIME explainer object
explainer = lime_tabular.LimeTabularExplainer(
    training_data=np.array(X_train),
    feature_names=X_train.columns.tolist(),  # plain list of column names rather than a pandas Index
    class_names=['Negative', 'Positive'],  # Adjust based on your target classes
    mode='classification',
    random_state=42,  # LIME perturbs the row with random samples; seed it so the explanation is repeatable
)

# Select the instance to explain
i = 2  # Positional index of the row in the training set (X_train)
instance = X_train.iloc[i]

# Generate explanations
# LIME expects a 1-D NumPy array and indexes it by integer position; a Series labelled
# with column names does not reliably support that, so convert it explicitly.
exp = explainer.explain_instance(
    data_row=instance.to_numpy(),
    predict_fn=xgb_model.predict_proba
)

# Visualize the explanation
exp.show_in_notebook(show_table=True, show_all=False)

# Explanation:
# This will generate an HTML output showing the contribution of each feature to the prediction of the selected instance.


## Tips for Interpreting LIME Plots

- Consistency Check: Verify that the prediction shown in the LIME plot matches what your complex model outputs for the same instance. Inconsistencies might suggest that the local approximation isn't well fitted.

- Feature Values: Alongside the contribution bars, the feature values of the instance being explained are often displayed. This helps to contextualize why certain features are impacting the prediction in a specific way.

- Model Simplicity: The local model should be simple enough to understand but complex enough to accurately approximate the complex model near the instance. If the local model is too simple, it may not provide a faithful explanation.

- Exploring Multiple Instances: LIME explanations can vary significantly across different instances. To get a fuller understanding of the model’s behavior, generate and compare explanations for multiple different instances, especially those where the model performs poorly or unexpectedly.