In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve,  confusion_matrix, f1_score
import matplotlib.pyplot as plt

# Step 1: Read the sub-sequence-level predictions
df = pd.read_csv("Result/Biogrid_human_1.csv")

# Step 2: Collapse the sub-sequence predictions to one row per protein.
#   label      - first value in the group (assumed identical for every
#                sub-sequence of a protein)
#   Pred_label - majority vote; Series.mode() returns its values sorted, so a
#                tie resolves to the smallest label
#   Y_Prob     - mean predicted probability over the protein's sub-sequences
PROB_THRESHOLD = 0.5  # cut-off applied to the mean probability

agg_df = df.groupby('Protein').agg(
    label=('label', 'first'),
    Pred_label=('Pred_label', lambda x: x.mode().iloc[0]),
    Y_Prob=('Y_Prob', 'mean'),
).reset_index()
agg_df['Y_Prob_thres'] = (agg_df['Y_Prob'] >= PROB_THRESHOLD).astype(int)

# Step 3: Calculate performance metrics
y_true = agg_df['label']
y_pred = agg_df['Pred_label']          # hard labels from majority voting
y_prob = agg_df['Y_Prob']              # mean probability, used for ROC / AUC
y_prob_thres = agg_df['Y_Prob_thres']  # hard labels from the thresholded mean probability


def report_binary_metrics(title, y_true, y_hat):
    """Print and return the classification metrics for one set of hard labels.

    The confusion matrix and every metric are derived from the same
    ``y_hat`` so the printed numbers always describe the printed matrix.

    Parameters
    ----------
    title : str
        Heading printed above the results.
    y_true : array-like
        Ground-truth 0/1 labels.
    y_hat : array-like
        Predicted 0/1 labels.

    Returns
    -------
    dict
        Metric name -> value, in the order they are printed.
    """
    # labels=[0, 1] keeps the matrix 2x2 even when one class is absent, so the
    # four-way unpacking below cannot fail on a degenerate split.
    cm = confusion_matrix(y_true, y_hat, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    metrics = {
        "Specificity": tn / (tn + fp),
        "F1-score": f1_score(y_true, y_hat),
        "Accuracy": accuracy_score(y_true, y_hat),
        "Precision": precision_score(y_true, y_hat),
        "Recall (Sensitivity)": recall_score(y_true, y_hat),
    }
    print(title)
    print(cm)
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value}")
    return metrics


# Report the two aggregation schemes separately. Previously the heading said
# "Majority voting" while the metrics came from the thresholded mean
# probability and the confusion matrix came from the majority vote.
majority_metrics = report_binary_metrics("Majority voting", y_true, y_pred)
print()
threshold_metrics = report_binary_metrics(
    f"Mean probability >= {PROB_THRESHOLD}", y_true, y_prob_thres
)

# AUC-ROC is threshold-free and is computed from the mean probability
auc_roc = roc_auc_score(y_true, y_prob)

# Plot AUC-ROC curve
fpr, tpr, _ = roc_curve(y_true, y_prob)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc_roc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()

print(f"AUC-ROC: {auc_roc}")

# Save the aggregated results to a new CSV file
agg_df.to_csv('Result/seq_level_predictions.csv', index=False)


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, roc_curve, f1_score
import matplotlib.pyplot as plt

# Read the per-subsequence prediction table
test_df = pd.read_csv('Result/Biogrid_human_1.csv')

# Reduce to one row per protein:
#   label      -> first value in the group (expected to be constant within a protein)
#   Pred_label -> one-hit rule: 1 if any subsequence was predicted 1, otherwise 0
#   Y_Prob     -> mean over the protein's subsequences
seq_level_df = test_df.groupby('Protein').agg(
    label=('label', 'first'),
    Pred_label=('Pred_label', lambda preds: int(1 in preds.values)),
    Y_Prob=('Y_Prob', 'mean'),
).reset_index()

# Columns used for scoring
y_true, y_pred, y_prob = (
    seq_level_df[column] for column in ('label', 'Pred_label', 'Y_Prob')
)

# Score the one-hit labels; AUC is the only metric that uses the mean probability
print("One hit")
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
auc_roc = roc_auc_score(y_true, y_prob)
f1 = f1_score(y_true, y_pred)

# A single confusion matrix feeds both the specificity and the printed table
conf_matrix = confusion_matrix(y_true, y_pred)
tn, fp, fn, tp = conf_matrix.ravel()
spec = tn / (tn + fp)

print(f"Specificity: {spec}")
print(f"F1-score: {f1}")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall (Sensitivity): {recall}")
print(f"AUC-ROC: {auc_roc}")
print("\nConfusion Matrix:")
print(conf_matrix)

# Persist the protein-level table.
# NOTE(review): this is written to the working directory, whereas the input is
# read from Result/ - confirm the location is intended.
seq_level_df.to_csv('seq_level_predictions_OH.csv', index=False)

# ROC curve of the mean probability; `thresholds` is kept for later use
fpr, tpr, thresholds = roc_curve(y_true, y_prob)
plt.figure()
plt.plot(fpr, tpr, color='blue', label=f'AUC-ROC = {auc_roc}')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC-ROC Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
import numpy as np

# Youden's J statistic (TPR - FPR) at every ROC point; the point that maximises
# it gives the operating threshold.
# NOTE(review): fpr, tpr, thresholds, y_true, y_prob and auc_roc are not defined
# in this cell - they are expected to come from an earlier cell.
J = np.subtract(tpr, fpr)
optimal_idx = np.argmax(J)
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal Threshold: {optimal_threshold:.4f}")

# Hard labels: positive when the probability reaches the chosen threshold
y_pred_thresholded = y_prob.ge(optimal_threshold).astype(int)

# One confusion matrix is reused for the printout and for the specificity
conf_matrix = confusion_matrix(y_true, y_pred_thresholded)

print("\nConfusion Matrix after Thresholding:")
print(conf_matrix)

tn, fp, fn, tp = conf_matrix.ravel()
spec = tn / (tn + fp)
print(f"Specificity: {spec}")

accuracy = accuracy_score(y_true, y_pred_thresholded)
precision = precision_score(y_true, y_pred_thresholded)
recall = recall_score(y_true, y_pred_thresholded)
f1 = f1_score(y_true, y_pred_thresholded)
print(f"F1-score: {f1}")

# The figures below are computed from y_pred_thresholded (thresholded
# probabilities), whatever the heading text says.
print("Accuracy from Y_pred")
print(f"Accuracy: {accuracy:.8f}")
print(f"Precision: {precision:.8f}")
print(f"Recall: {recall:.8f}")

# Redraw the ROC curve and mark the selected operating point on it
plt.figure()
plt.plot(fpr, tpr, color='blue', label=f'AUC-ROC = {auc_roc}')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('AUC-ROC Curve with Optimal Threshold')
plt.plot(
    fpr[optimal_idx],
    tpr[optimal_idx],
    marker='o',
    color='black',
    label=f'Optimal Threshold: {optimal_threshold:.4f}',
)
plt.legend(loc='lower right')
plt.show()