In [1]:
import streamlit as st
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, mean_squared_error, precision_recall_curve
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load SBERT model
@st.cache_resource
def load_model(model_name: str = 'all-MiniLM-L6-v2') -> SentenceTransformer:
    """Build the sentence-embedding model used for similarity scoring.

    The function is wrapped in ``st.cache_resource``, so the model object is
    constructed once per distinct ``model_name`` and reused afterwards instead
    of being rebuilt on every call.

    Args:
        model_name: Name (or path) passed straight to ``SentenceTransformer``.
            Defaults to ``'all-MiniLM-L6-v2'``, the checkpoint this app has
            always used, so existing ``load_model()`` calls are unaffected.

    Returns:
        The ``SentenceTransformer`` instance for ``model_name``.
    """
    return SentenceTransformer(model_name)

In [3]:
# Fetch the encoder; load_model is cached, so this does not rebuild the model
# on every script run.
model = load_model()

# Page header and short usage hint.
st.title("📄 Smart Similarity Detection System")
st.write("Upload two text documents to compare semantic similarity using SBERT.")

# File upload: one .txt uploader per document, rendered in this order.
file1, file2 = (
    st.file_uploader(uploader_label, type=["txt"])
    for uploader_label in ("Upload first document", "Upload second document")
)

# Cut-off applied to the cosine similarity when labelling the pair as
# similar / not similar (range 0.0-1.0, initial value 0.5, step 0.01).
threshold = st.slider(
    "Similarity threshold",
    min_value=0.0,
    max_value=1.0,
    value=0.5,
    step=0.01,
)

2025-08-23 11:00:52.108 
  command:

    streamlit run C:\Users\lucky\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\LocalCache\local-packages\Python312\site-packages\ipykernel_launcher.py [ARGUMENTS]


In [9]:
# Main flow: runs only once both documents have been uploaded.
if file1 and file2:
    # Read text files.
    # getvalue() returns the full buffer independent of the current read
    # position, so the text cannot come back empty if the same upload object is
    # read again on a later rerun (e.g. after the radio button below changes).
    # errors="replace" keeps the app running on uploads that are not valid
    # UTF-8 instead of crashing with UnicodeDecodeError.
    text1 = file1.getvalue().decode("utf-8", errors="replace")
    text2 = file2.getvalue().decode("utf-8", errors="replace")

    # Show only the first 1000 characters of each document as a preview.
    st.subheader("Document Preview")
    st.text_area("Document 1", text1[:1000], height=150)
    st.text_area("Document 2", text2[:1000], height=150)

    # Encode & compute similarity (each document is embedded as one string).
    emb1 = model.encode(text1, convert_to_tensor=True)
    emb2 = model.encode(text2, convert_to_tensor=True)
    similarity = util.cos_sim(emb1, emb2).item()

    st.subheader("🔎 Similarity Score")
    st.metric("Cosine Similarity", f"{similarity:.4f}")

    # Classification based on threshold: 1 = similar, 0 = not similar.
    prediction = int(similarity >= threshold)
    st.write(f"Prediction (threshold={threshold}):",
             "Similar ✅" if prediction == 1 else "Not Similar ❌")

    # Ask user for ground truth label
    st.subheader("Evaluation Metrics")
    st.write("Provide ground truth label to evaluate performance.")
    true_label = st.radio("Are the documents truly similar?", ["Yes", "No"])
    y_true = [1 if true_label == "Yes" else 0]
    y_pred = [prediction]

    # Metrics. NOTE: these are computed on a single (label, prediction) pair,
    # so they describe this one comparison only.
    acc = accuracy_score(y_true, y_pred)
    # zero_division=0 yields the same 0.0 as the default when no positive
    # sample exists, but without emitting an undefined-metric warning.
    f1 = f1_score(y_true, y_pred, zero_division=0)
    try:
        roc = roc_auc_score(y_true, [similarity])
    except ValueError:
        # Raised when y_true does not contain both classes.
        roc = None
    # Some scikit-learn releases appear to return NaN (with a warning) instead
    # of raising for one-class input; treat that as "not available" as well.
    if roc is not None and np.isnan(roc):
        roc = None
    # Arguments in (y_true, y_pred) order; the squared error is symmetric, so
    # the value is unchanged from the previous swapped call.
    rmse = np.sqrt(mean_squared_error(y_true, [similarity]))

    st.write(f"**Accuracy:** {acc:.3f}")
    st.write(f"**F1-Score:** {f1:.3f}")
    # Compare against None explicitly: a legitimate ROC-AUC of 0.0 is falsy and
    # was previously reported as "N/A".
    st.write(f"**ROC-AUC:** {roc:.3f}" if roc is not None else "ROC-AUC: N/A (need both classes)")
    st.write(f"**RMSE:** {rmse:.3f}")

    # Precision-Recall curve
    st.subheader("Precision-Recall Visualization")
    y_scores = [similarity]
    precisions, recalls, _ = precision_recall_curve(y_true, y_scores)

    fig, ax = plt.subplots()
    ax.plot(recalls, precisions, marker='.')
    ax.set_xlabel("Recall")
    ax.set_ylabel("Precision")
    ax.set_title("Precision-Recall Curve")
    st.pyplot(fig)
    # Release the figure after rendering; otherwise every rerun of the script
    # leaves another open Matplotlib figure behind.
    plt.close(fig)