In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import (
    KFold,
    StratifiedKFold,
    cross_val_predict,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [None]:
# Load the dataset from a comma-separated file into a DataFrame.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on a machine where the file exists at exactly this location.
dataset_path = 'C:/Users/HP/Desktop/suiciderisk.csv'
data = pd.read_csv(dataset_path, engine='python', sep=',')

# Preview the first 20 rows
data.head(20)

In [None]:
# Summarize responses to create an overall risk response.
#
# The overall response of a row is the mean of its numeric columns (missing
# values are skipped by DataFrame.mean), rounded to an integer. Series.round
# follows NumPy, so an exact .5 is rounded to the nearest even integer.
# NOTE(review): assumes every numeric column is a question response -- confirm
# that the file has no numeric id/age-like columns.
#
# The columns this notebook derives are excluded explicitly. Without that,
# re-running this cell would average the previous 'response' (and the
# label-encoded 'risk_category') into the new score and change the result.
derived_columns = ['response', 'risk_category']
numeric_data = (
    data
    .drop(columns=derived_columns, errors='ignore')
    .select_dtypes(include='number')
)

# astype(int) raises if a row has no numeric value at all (its mean is NaN)
data['response'] = numeric_data.mean(axis=1).round().astype(int)

# Define the mapping function
def map_risk(response):
    """Translate a response score into a risk-category label.

    Scores 1 and 2 give 'Low_risk', 3 gives 'Medium_risk', and 4 and 5 give
    'High_risk'. Any other value gives None.
    """
    if response in (1, 2):
        return 'Low_risk'
    if response == 3:
        return 'Medium_risk'
    if response in (4, 5):
        return 'High_risk'
    # Scores outside 1-5 have no category
    return None
# Label every row with the risk category that matches its response score
risk_labels = data['response'].apply(map_risk)
data['risk_category'] = risk_labels

# Separate the label from the predictors; both derived columns are left out
# of the predictors
y = data['risk_category']
X = data.drop(columns=['response', 'risk_category'])

# Preview the first 20 rows, now including the derived columns
data.head(20)

In [None]:
# 'risk_category' is the target variable we are trying to predict.
# 'response' is the rounded score that 'risk_category' is mapped from, so it
# is removed from the features together with the target.

# Label-encode every object-dtype column of `data` in place. The single
# encoder is refitted for each column, so afterwards it only remembers the
# classes of the last column it processed.
le = LabelEncoder()
categorical_columns = data.select_dtypes(include=['object']).columns
for column in categorical_columns:
    encoded_values = le.fit_transform(data[column])
    data[column] = encoded_values

# Separate target and features
y = data['risk_category']  # Target variable
X = data.drop(columns=['response', 'risk_category'])  # Features

# Keep 20% of the rows aside for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Define models. The estimators that use randomness get a fixed seed so that
# repeated runs of the notebook report the same scores.
models = {
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=500)
}

# Evaluation metrics (scikit-learn scorer names)
metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc_ovr']

# Evaluate models using k-fold cross-validation
k = 5  # Number of folds
results = {}

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    # One cross_validate call scores every metric on the same fitted folds,
    # instead of refitting the model from scratch once per metric.
    cv_output = cross_validate(model, X_train, y_train, cv=k, scoring=metrics)
    # results[model][metric] is the array of per-fold test scores
    results[model_name] = {metric: cv_output[f'test_{metric}'] for metric in metrics}
        
# Calculate the average score for each model: the mean of every metric's
# cross-validation scores, averaged over all metrics with equal weight
average_scores = {}
for model_name, model_scores in results.items():
    metric_means = [np.mean(fold_scores) for fold_scores in model_scores.values()]
    average_scores[model_name] = sum(metric_means) / len(metric_means)

# Find the best model (highest average score)
best_model_name = max(average_scores, key=average_scores.get)

# Print cross-validation results.
# The loop variable is deliberately not called `metrics`: that name holds the
# list of scorer names and is read again by the plotting cell.
print("\nCross-validation results:")
for model_name, model_scores in results.items():
    print(f"{model_name}:")
    for metric_name, fold_scores in model_scores.items():
        print(f"  {metric_name}: {np.mean(fold_scores):.4f}")
    print()

print(f"Best model: {best_model_name}")


In [None]:
# Plotting the results
# 1. Grouped ("compound") bar charts: one group of bars per model, one bar per metric
fig, ax = plt.subplots(2, 1, figsize=(12, 16))
fig.suptitle('Model Evaluation Metrics')
vertical_ax, horizontal_ax = ax  # top panel, bottom panel

# For every metric, the mean score of each model (in the order of `models`)
metrics_means = {}
for metric in metrics:
    metrics_means[metric] = [np.mean(results[model][metric]) for model in models]

bar_width = 0.15
index = np.arange(len(models))
# Position that centres a model's tick label on its group of bars
group_centres = index + bar_width * (len(metrics) - 1) / 2

# Draw the same bars on both panels, shifting each metric by one bar width
for i, metric in enumerate(metrics):
    bar_positions = index + i * bar_width
    vertical_ax.bar(bar_positions, metrics_means[metric], bar_width, label=metric)
    horizontal_ax.barh(bar_positions, metrics_means[metric], bar_width, label=metric)

# Vertical bar chart: models along x, scores along y
vertical_ax.set_title('Vertical Compound Bar Plot of Evaluation Metrics')
vertical_ax.set_xlabel('Model')
vertical_ax.set_ylabel('Score')
vertical_ax.set_xticks(group_centres)
vertical_ax.set_xticklabels(models.keys())
vertical_ax.set_ylim(0, 1)
vertical_ax.legend()

# Horizontal bar chart: models along y, scores along x
horizontal_ax.set_title('Horizontal Compound Bar Plot of Evaluation Metrics')
horizontal_ax.set_ylabel('Model')
horizontal_ax.set_xlabel('Score')
horizontal_ax.set_yticks(group_centres)
horizontal_ax.set_yticklabels(models.keys())
horizontal_ax.set_xlim(0, 1)
horizontal_ax.legend()

# Leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

# 2. One bar chart per evaluation metric, comparing the models on that metric
n_panels = len(metrics)
fig, axes = plt.subplots(n_panels, 1, figsize=(10, n_panels * 5))
fig.suptitle('Individual Evaluation Metrics by Model')

for i, metric in enumerate(metrics):
    metric_ax = axes[i]
    # Mean score of each model for this metric
    scores = [np.mean(results[model][metric]) for model in models]
    metric_ax.bar(models.keys(), scores)
    metric_ax.set_title(metric.capitalize())
    metric_ax.set_ylabel('Score')
    metric_ax.set_ylim(0, 1)

# Leave the top 4% of the figure free for the suptitle
plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()

In [None]:
# Encode all categorical variables (every object-dtype column, in place)
le = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = le.fit_transform(data[column])

# Split data into features and target.
# 'response' is dropped together with the target: 'risk_category' is mapped
# directly from 'response', so keeping it among the features would hand the
# label to the models. This matches the feature set used for the earlier
# cross-validation cell.
X = data.drop(['response', 'risk_category'], axis=1)  # Features
y = data['risk_category']  # Target variable

# Check the distribution of classes
print("Class distribution in original data:")
print(y.value_counts())

# Hold out 10% of the rows for final validation.
# test_size=0.9 sends 90% of the rows to the second output of each pair
# (the "remaining" data), which leaves 10% in the first output (the holdout).
X_holdout, X_remaining, y_holdout, y_remaining = train_test_split(X, y, test_size=0.9, random_state=42)

# Check the distribution of classes in the holdout set and the rest
print("\nClass distribution in holdout data:")
print(y_holdout.value_counts())
print("\nClass distribution in rest of the data:")
print(y_remaining.value_counts())


# Split the remaining data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_remaining, y_remaining, test_size=0.2, random_state=42)

# Encode the target labels as integers. The `_binary` suffix is kept for
# compatibility with the rest of the notebook, although the target may have
# more than two classes.
# The encoder is fitted on the full target, not just y_train, so that a class
# missing from the training split cannot make transform() fail with an
# "unseen labels" error on the other splits.
le_target = LabelEncoder()
le_target.fit(y)
y_train_binary = le_target.transform(y_train)
y_test_binary = le_target.transform(y_test)
y_holdout_binary = le_target.transform(y_holdout)
y_remaining_binary = le_target.transform(y_remaining)

# Define models. The estimators that use randomness get a fixed seed so that
# repeated runs of the notebook select the same best model.
models = {
    'Naive Bayes': GaussianNB(),
    'SVM': SVC(probability=True, random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=500)
}

# Evaluation metrics (scikit-learn scorer names)
metrics = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted', 'roc_auc_ovr']

# Evaluate models using k-fold cross-validation
k = 5  # Number of folds
results = {}

for model_name, model in models.items():
    print(f"Evaluating {model_name}...")
    # One cross_validate call scores every metric on the same fitted folds,
    # instead of refitting the model from scratch once per metric.
    cv_output = cross_validate(model, X_train, y_train, cv=k, scoring=metrics)
    # results[model][metric] is the mean test score over the k folds
    results[model_name] = {metric: cv_output[f'test_{metric}'].mean() for metric in metrics}


# Calculate the average score for each model (all metrics weighted equally)
average_scores = {model_name: sum(scores.values()) / len(scores) for model_name, scores in results.items()}

# Find the best model
best_model_name = max(average_scores, key=average_scores.get)

# Train the best model on the entire remaining data
best_model = models[best_model_name]
best_model.fit(X_remaining, y_remaining_binary)

# Validate the best model on the held-out examples
y_pred = best_model.predict(X_holdout)
class_probabilities = best_model.predict_proba(X_holdout)

if class_probabilities.shape[1] == 2:
    # Two classes: roc_auc_score expects only the positive-class probability
    y_pred_proba = class_probabilities[:, 1]
    roc_auc_options = {}
else:
    # More than two classes: a single probability column makes roc_auc_score
    # raise, so pass the full matrix and score one-vs-rest, like the
    # 'roc_auc_ovr' scorer used during cross-validation. `labels` tells
    # scikit-learn which class each probability column belongs to.
    y_pred_proba = class_probabilities
    roc_auc_options = {'multi_class': 'ovr', 'labels': best_model.classes_}

try:
    roc_auc_text = f"{roc_auc_score(y_holdout_binary, y_pred_proba, **roc_auc_options):.4f}"
except ValueError as error:
    # e.g. a class is absent from the small holdout set, so its AUC is undefined
    roc_auc_text = f"undefined ({error})"

print("\nValidation results:")
print(f"  Accuracy: {accuracy_score(y_holdout_binary, y_pred):.4f}")
print(f"  Precision: {precision_score(y_holdout_binary, y_pred, average='weighted'):.4f}")
print(f"  Recall: {recall_score(y_holdout_binary, y_pred, average='weighted'):.4f}")
print(f"  F1-score: {f1_score(y_holdout_binary, y_pred, average='weighted'):.4f}")
print(f"  ROC-AUC: {roc_auc_text}")

print(f"Best model: {best_model_name}")