In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [None]:
# Load dataset
# NOTE(review): this is an absolute path on one specific machine; the notebook
# will not run elsewhere until it is pointed at a local copy of the CSV.
dataset_path = 'C:/Users/HP/Desktop/suiciderisk.csv'

# Parse the comma-separated file using pandas' pure-Python engine
data = pd.read_csv(dataset_path, sep=',', engine='python')

# Preview the first 20 rows
data.head(20)

In [None]:
# Summarize responses to create an overall risk response

# The risk response is the mean of a respondent's numeric question responses,
# rounded to the nearest integer.
# Only numeric columns take part in the mean. The columns this notebook derives
# itself ('response', and 'risk_category' once it has been label-encoded to
# integers) are excluded, so re-running this cell does not fold a previously
# computed score or label back into the average.
numeric_data = (
    data.select_dtypes(include='number')
    .drop(columns=['response', 'risk_category'], errors='ignore')
)
data['response'] = numeric_data.mean(axis=1).round().astype(int)

# Mapping from a rounded score to a named risk band
def map_risk(response):
    """Return the risk band label for a rounded response score.

    Scores 1 and 2 map to 'Low_risk', 3 maps to 'Medium_risk', and 4 and 5
    map to 'High_risk'. Any other value falls through and yields None.
    """
    bands = (
        ((1, 2), 'Low_risk'),
        ((3,), 'Medium_risk'),
        ((4, 5), 'High_risk'),
    )
    for scores, label in bands:
        if response in scores:
            return label
    # Value outside the 1-5 scale: no band applies
    return None
# Label each respondent with the risk band that matches their rounded score
data['risk_category'] = data['response'].map(map_risk)

# Features: every column except the derived score and the derived label
X = data.drop(columns=['response', 'risk_category'])
# Labels: the risk band
y = data['risk_category']

# Preview the first 20 rows of the updated frame
data.head(20)

In [None]:
# Compare four classifiers on predicting 'risk_category' with k-fold
# cross-validation on the training split.
#
# Both derived columns are kept out of the features: 'response' (the rounded
# mean score) and 'risk_category' (the label mapped from it). Leaving the label
# inside X would hand the answer to every model.
# NOTE(review): the label is computed from the numeric answer columns that
# remain in X, so very high scores are still expected — confirm this setup is
# what the study intends.

# Encode all categorical (object-dtype) columns as integers, in place.
# Each column gets its own fitted encoder, stored in `label_encoders` so that
# encoded values can be decoded later. `le` stays bound to the encoder of the
# last encoded column (or to an unfitted encoder if nothing needed encoding).
le = LabelEncoder()
label_encoders = {}
for column in data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    data[column] = le.fit_transform(data[column])
    label_encoders[column] = le

# Split data into features and target (risk category)
X = data.drop(['response', 'risk_category'], axis=1)  # Features
y = data['risk_category']  # Target variable

# Split into training and testing sets (80% / 20%, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define models; stochastic estimators are seeded so repeated runs give the
# same scores
models = {
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(probability=True, random_state=42),  # probability=True is needed for ROC AUC
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=500)
}

# Evaluation metrics (scikit-learn scorer names)
metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc_ovr']

# Evaluate models using k-fold cross-validation
k = 5  # Number of folds
results = {}  # model name -> {scorer name -> mean score across folds}

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    # One multi-metric run per model: every metric is computed on the same
    # fitted folds instead of refitting the model once per metric.
    cv_results = cross_validate(model, X_train, y_train, cv=k, scoring=metrics)
    results[model_name] = {
        metric: cv_results[f'test_{metric}'].mean() for metric in metrics
    }

# Calculate the average score for each model (unweighted mean over all metrics)
average_scores = {
    model_name: sum(scores.values()) / len(scores)
    for model_name, scores in results.items()
}

# Find the best model: the one with the highest average score
best_model_name = max(average_scores, key=average_scores.get)

# Print cross-validation results
# The loop variable is named `model_scores` so the `metrics` list defined
# above is not overwritten.
print("\nCross-validation results:")
for model_name, model_scores in results.items():
    print(f"{model_name}:")
    for metric_name, score in model_scores.items():
        print(f"  {metric_name}: {score:.4f}")
    print()

print(f"Best model: {best_model_name}")


In [None]:
# Metrics to plot, in legend order
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC']

# Hard-coded evaluation scores: one row per model, values ordered like `metrics`
score_rows = {
    'Naive Bayes': (0.9950, 0.9951, 0.9950, 0.9943, 0.9556),
    'SVM': (0.9950, 0.9951, 0.9950, 0.9943, 1.0000),
    'Random Forest': (0.9975, 0.9975, 0.9975, 0.9971, 1.0000),
    'Logistic Regression': (0.9950, 0.9951, 0.9950, 0.9943, 1.0000),
}

# Same nested layout as before: model name -> {metric name -> score}
results = {name: dict(zip(metrics, row)) for name, row in score_rows.items()}
models = list(results)  # model names in display order

# Grouped vertical bars: one group per model, one bar per metric
x = np.arange(len(models))  # base position of each model's group
width = 0.10  # width of a single bar

fig, ax = plt.subplots(figsize=(7, 5))

# Each metric is drawn as one series, shifted right by its index
for offset, metric in enumerate(metrics):
    heights = [results[model][metric] for model in models]
    ax.bar(x + offset * width, heights, width, label=metric)

# Axis labels, title, and legend
ax.set(
    xlabel='Machine Learning Models',
    ylabel='Score',
    title='Evaluation Metrics for Suicide Ideation Prediction Models',
)
# Put each model's tick under the middle (third) of its five bars
ax.set_xticks(x + 2 * width)
ax.set_xticklabels(models)
ax.legend()

# Show plot
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Metrics to plot, in legend order
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC']

# Hard-coded evaluation scores: one row per model, values ordered like `metrics`
score_rows = {
    'Naive Bayes': (0.9950, 0.9951, 0.9950, 0.9943, 0.9556),
    'SVM': (0.9950, 0.9951, 0.9950, 0.9943, 1.0000),
    'Random Forest': (0.9975, 0.9975, 0.9975, 0.9971, 1.0000),
    'Logistic Regression': (0.9950, 0.9951, 0.9950, 0.9943, 1.0000),
}

# Same nested layout as before: model name -> {metric name -> score}
results = {name: dict(zip(metrics, row)) for name, row in score_rows.items()}
models = list(results)  # model names in display order

# Grouped horizontal bars: one group per model, one bar per metric
y_pos = np.arange(len(models))  # base position of each model's group

# Regroup the scores by metric: metric name -> scores in `models` order
metrics_scores = {}
for metric in metrics:
    metrics_scores[metric] = [results[model][metric] for model in models]

fig, ax = plt.subplots(figsize=(7, 4))

# Each metric is drawn as one series, shifted along the y-axis by its index
bar_width = 0.15
for offset, (metric, values) in enumerate(metrics_scores.items()):
    ax.barh(y_pos + offset * bar_width, values, bar_width, label=metric)

# Axis labels, title, and legend
ax.set(
    xlabel='Score',
    ylabel='Machine Learning Models',
    title='Evaluation Metrics for Suicide Ideation Prediction Models',
)
# Put each model's tick beside the middle (third) of its five bars
ax.set_yticks(y_pos + 2 * bar_width)
ax.set_yticklabels(models)
ax.legend()

# Show plot
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Metrics to plot, one subplot each
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC']

# Hard-coded evaluation scores (replace with your actual results):
# one row per model, values ordered like `metrics`
score_rows = {
    'Naive Bayes': (0.9950, 0.9951, 0.9950, 0.9943, 0.9556),
    'SVM': (0.9950, 0.9951, 0.9950, 0.9943, 1.0000),
    'Random Forest': (0.9975, 0.9975, 0.9975, 0.9971, 1.0000),
    'Logistic Regression': (0.9950, 0.9951, 0.9950, 0.9943, 1.0000),
}

# Same nested layout as before: model name -> {metric name -> score}
results = {name: dict(zip(metrics, row)) for name, row in score_rows.items()}
models = list(results)  # model names in display order

# Regroup the scores by metric: metric name -> scores in `models` order
metrics_scores = {}
for metric in metrics:
    metrics_scores[metric] = [results[model][metric] for model in models]

# One stacked subplot per metric, each with one vertical bar per model
fig, axs = plt.subplots(nrows=len(metrics), ncols=1, figsize=(7, 12))

# One colour per model from the 'Paired' colormap, reused in every subplot
bar_colors = plt.cm.Paired(np.arange(len(models)))

for ax, metric in zip(axs, metrics):
    ax.bar(models, metrics_scores[metric], color=bar_colors)
    ax.set_ylabel(metric)
    ax.set_ylim([0, 1.05])  # shared scale with a little headroom above 1.0

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Metrics to plot, one subplot each
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC']

# Hard-coded evaluation scores (replace with your actual results):
# one row per model, values ordered like `metrics`
score_rows = {
    'Naive Bayes': (0.9950, 0.9951, 0.9950, 0.9943, 0.9556),
    'SVM': (0.9950, 0.9951, 0.9950, 0.9943, 1.0000),
    'Random Forest': (0.9975, 0.9975, 0.9975, 0.9971, 1.0000),
    'Logistic Regression': (0.9950, 0.9951, 0.9950, 0.9943, 1.0000),
}

# Same nested layout as before: model name -> {metric name -> score}
results = {name: dict(zip(metrics, row)) for name, row in score_rows.items()}
models = list(results)  # model names in display order

# Regroup the scores by metric: metric name -> scores in `models` order
metrics_scores = {}
for metric in metrics:
    metrics_scores[metric] = [results[model][metric] for model in models]

# One stacked subplot per metric, each with one horizontal bar per model
fig, axs = plt.subplots(nrows=len(metrics), ncols=1, figsize=(5, 12))

# One colour per model from the 'Paired' colormap, reused in every subplot
bar_colors = plt.cm.Paired(np.arange(len(models)))

for ax, metric in zip(axs, metrics):
    ax.barh(models, metrics_scores[metric], color=bar_colors)
    ax.set_xlabel(metric)
    ax.set_xlim([0, 1.05])  # shared scale with a little headroom above 1.0
    # Flip the y-axis so the first model in `models` is drawn at the top
    ax.invert_yaxis()

plt.tight_layout()
plt.show()