In [1]:
import spacy
import nltk
from nltk.corpus import stopwords


# Fetch the NLTK stopword corpus BEFORE it is first read: stopwords.words()
# raises LookupError on a machine where the corpus has not been downloaded,
# so the download has to precede the STOPWORDS definition below.
nltk.download("stopwords", quiet=True)

# Number of nearest documents considered when scoring a query.
TOP_N = 5
PATH_TO_DATASET = "../data/medical_questions_pairs.csv"
# English stopword set from NLTK (not referenced by the cells visible here).
STOPWORDS = set(stopwords.words("english"))
# Unicode-aware pattern matching tokens of two or more word characters.
TOKEN_PATTERN = r"(?u)\b\w\w+\b"
# Distance metric and parallelism passed to the nearest-neighbour search.
METRIC = "cosine"
N_JOBS = -1

# Load the spaCy English model
nlp = spacy.load("en_core_web_sm")

True

In [2]:
import os

import pandas as pd


# Stop early when the CSV is not where the config cell says it should be.
dataset_present = os.path.isfile(PATH_TO_DATASET)
if not dataset_present:
    raise FileNotFoundError("Dataset not found")

# Read the question pairs and give the columns fixed names
# (the assignment fails if the file does not have exactly four columns).
df = pd.read_csv(PATH_TO_DATASET)
df.columns = ["dr_id", "q1", "q2", "label"]

# Keep one row per distinct (q1, q2) pair.
df = df.drop_duplicates(subset=["q1", "q2"])

# Non-null count per column, shown as the cell output.
df.count()

dr_id    3048
q1       3048
q2       3048
label    3048
dtype: int64

In [3]:
# Search corpus: every distinct question text appearing in either column.
all_questions = pd.concat([df["q1"], df["q2"]])
documents = all_questions.unique()

# Rows whose label is 1 are the pairs used as ground truth.
true_questions = df.loc[df["label"] == 1]

# For each source question (q1), collect the list of its paired q2 texts.
validate = (
    true_questions
    .groupby("q1")["q2"]
    .apply(list)
    .reset_index()
)
validate.columns = ["question", "target"]

# Shapes of the corpus and of the validation table
documents.shape, validate.shape

((4567,), (1524, 2))

In [4]:
import numpy as np
from sklearn.neighbors import NearestNeighbors

from src.vectorizer import Vectorizer


def accuracy_at_n(
    x_question: pd.Series,
    true_docs: pd.Series,
    pred_docs: np.ndarray,
    n: int,
    corpus: "np.ndarray | None" = None,
) -> float:
    """
    Computes Accuracy@n: the share of questions for which at least one of the
    true similar questions appears among the top-n predicted documents.
    :param x_question: Series of input questions
    :param true_docs: Series of true similar questions (one collection per input question)
    :param pred_docs: Array of predicted document indexes into the corpus, one row per question
    :param n: Top-N accuracy param
    :param corpus: Indexable collection of documents that ``pred_docs`` refers to.
        Defaults to the notebook-level ``documents`` when omitted, which keeps
        existing four-argument calls working.
    :return: Accuracy@n metric; 0.0 when there are no questions to evaluate
    """
    if corpus is None:
        # Backward-compatible fallback to the global corpus built earlier in the notebook.
        corpus = documents

    total_predictions = len(true_docs)
    if total_predictions == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    correct_predictions = 0
    for doc, y_true, indexes in zip(x_question, true_docs, pred_docs):
        # Get the N similar documents excluding the document query itself (if any)
        y_pred = [corpus[index] for index in indexes if corpus[index] != doc][:n]

        # A single overlap with the true set counts as a hit.
        if set(y_true).intersection(y_pred):
            correct_predictions += 1

    return correct_predictions / total_predictions


# Vectorise the whole corpus with the project vectorizer.
# NOTE(review): presumably TF-IDF, judging by the variable name - confirm in src.vectorizer.
vectorizer = Vectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Ask for one neighbour more than TOP_N: the query may itself be in the corpus
# and is filtered out inside accuracy_at_n.
model = NearestNeighbors(
    n_neighbors=TOP_N + 1,
    metric=METRIC,
    n_jobs=N_JOBS,
).fit(tfidf_matrix)

# Project the validation questions into the same space and fetch neighbour indexes only.
vectors = vectorizer.transform(validate["question"])
predictions = model.kneighbors(vectors, return_distance=False)

# Accuracy@TOP_N over the validation questions, shown as the cell output.
accuracy_at_n(
    x_question=validate["question"],
    true_docs=validate["target"],
    pred_docs=predictions,
    n=TOP_N,
)

0.8877952755905512