# Extrinsic Evaluation
Post - Filter approach:

- we use the sensitivity labels from our intrinsic evaluation 
  
- in the post filter approach we rank the documents according to the  coordinate ascent algorithm optimizing towards normalized
Discounted Cumulative Gain (nDCG)

- for that we use predefined functions from https://github.com/rueycheng/CoordinateAscent/blob/master as the implementation was not mentioned in the paper

-> the general workflow will be the same as in extrinsic_LogisticRegression

In [2]:
import pandas as pd
bert_results = pd.read_csv("intermediate_results/sensitivity_predictions_comparison.csv")

In [3]:
judged_df = pd.read_csv("./data/judged.txt", sep="\t", header=None,
                        names=["Query", "Document-UI", "Document-Index", "Relevance1", "Relevance2", "Relevance3"])
# Verify the loaded DataFrame
print(judged_df.head())

   Query  Document-UI  Document-Index Relevance1 Relevance2 Relevance3
0      1     87097544           40626          d        NaN          d
1      1     87153566           11852          n        NaN          n
2      1     87157536           12693          d        NaN        NaN
3      1     87157537           12694          d        NaN        NaN
4      1     87184723           15450          n        NaN        NaN


get the query documents

In [4]:
def parse_queries(file_path):
    """
    Parses query files with the format:
    .I <Query ID>
    .B <Background>
    .W <Query Text>
    """
    query_list = []
    current_query_id = None
    current_query_text = None
    
    with open(file_path, "r") as file:
        for line in file:
            line = line.strip()
            if line.startswith(".I"):
                if current_query_id is not None and current_query_text is not None:
                    query_list.append({"Query": current_query_id, "query_text": current_query_text})
                current_query_id = int(line.split()[1])  # Extract Query ID
                current_query_text = None  # Reset query text
            elif line.startswith(".W"):
                current_query_text = ""  # Initialize query text
            elif current_query_text is not None:
                current_query_text += " " + line  # Append to query text
        
        # Append the last query
        if current_query_id is not None and current_query_text is not None:
            query_list.append({"Query": current_query_id, "query_text": current_query_text.strip()})
    
    return pd.DataFrame(query_list)

# Parse queries from files
queries1 = parse_queries("data/Queries1.txt")
queries2 = parse_queries("data/Queries2.txt")

# Combine the queries into a single DataFrame
queries_df = pd.concat([queries1, queries2], ignore_index=True)

# Verify the parsed queries
print(queries_df.head())


   Query                                         query_text
0      1   Are there adverse effects on lipids when prog...
1      2   pathophysiology and treatment of disseminated...
2      3   anticardiolipin and lupus anticoagulants, pat...
3      4                    reviews on subdurals in elderly
4      5   effectiveness of etidronate in treating hyper...


assign relevance labels from the judged.txt

In [5]:
def compute_relevance(row):
    # Count how many of the relevance columns contain 'd' (relevant)
    return sum(1 for val in [row["Relevance1"], row["Relevance2"], row["Relevance3"]] if val == "d")

# Add a total relevance score to judged_df
judged_df["Relevance_total"] = judged_df.apply(compute_relevance, axis=1)

# Keep only the necessary columns
judged_df_cleaned = judged_df[["Query", "Document-Index", "Relevance_total"]].rename(
    columns={"Document-Index": "sequential identifier"}
)

# Verify the processed DataFrame
print(judged_df_cleaned.head())

   Query  sequential identifier  Relevance_total
0      1                  40626                2
1      1                  11852                0
2      1                  12693                1
3      1                  12694                1
4      1                  15450                0


In [6]:
bert_with_relevance = pd.merge(bert_results, judged_df_cleaned, on="sequential identifier", how="left")

# Verify the merged DataFrame
print(bert_with_relevance.head())

   sequential identifier                                     title_abstract  \
0                    126  Prospective study of liver function in childre...   
1                    154  Postpartum thyroiditis--an underdiagnosed dise...   
2                    223  Primary renal actinomycosis in the presence of...   
3                    283  Clinical course of breast cancer patients with...   
4                    300  Cardiac abnormalities in patients with diffuse...   

   actual_sensitivity  predicted_sensitivity  Query  Relevance_total  
0                   0                      0     36                0  
1                   1                      1     76                0  
2                   1                      1      8                0  
3                   0                      0     22                1  
4                   0                      0     40                0  


In [7]:
bert_full = pd.merge(bert_with_relevance, queries_df, on="Query", how="left")

# Verify the final merged DataFrame
print(bert_full.head())

   sequential identifier                                     title_abstract  \
0                    126  Prospective study of liver function in childre...   
1                    126  Prospective study of liver function in childre...   
2                    154  Postpartum thyroiditis--an underdiagnosed dise...   
3                    154  Postpartum thyroiditis--an underdiagnosed dise...   
4                    223  Primary renal actinomycosis in the presence of...   

   actual_sensitivity  predicted_sensitivity  Query  Relevance_total  \
0                   0                      0     36                0   
1                   0                      0     36                0   
2                   1                      1     76                0   
3                   1                      1     76                0   
4                   1                      1      8                0   

                                          query_text  
0   CAN DILANTIN or PHENOBARBITAL CAU

# Post - Filter approach
- calculate bm25 and proximity count accordingly for the analysis
- as we already calculated the BM250 and the proximity score in the extrinsic evaluation of Logistic Regression, we are going to reuse that

In [19]:
test_predicted_with_scores = pd.read_csv("intermediate_results/test_predicted_withProximity.csv")

In [20]:
bert_full_with_bm25 = pd.merge(
    bert_full, 
    test_predicted_with_scores [["sequential identifier", "bm25_score","proximity_count"]],
    on="sequential identifier",
    how="left"
)

# Verify the merged DataFrame
print(bert_full_with_bm25.head())

   sequential identifier                                     title_abstract  \
0                    126  Prospective study of liver function in childre...   
1                    126  Prospective study of liver function in childre...   
2                    126  Prospective study of liver function in childre...   
3                    126  Prospective study of liver function in childre...   
4                    154  Postpartum thyroiditis--an underdiagnosed dise...   

   actual_sensitivity  predicted_sensitivity  Query  Relevance_total  \
0                   0                      0     36                0   
1                   0                      0     36                0   
2                   0                      0     36                0   
3                   0                      0     36                0   
4                   1                      1     76                0   

                                          query_text  proximity_count_x  \
0   CAN DILANTIN 

In [31]:
feature_columns = ["bm25_score", "proximity_count"]
X_test = bert_full_with_bm25[feature_columns].values

# Define relevance labels and Query IDs
y_test = bert_full_with_bm25["Relevance_total"].values
qid_test = bert_full_with_bm25["Query"].values

In [32]:
from coordinate_ascent import CoordinateAscent
from metrics import NDCGScorer
from scipy.sparse import csr_matrix


scorer = NDCGScorer(k=10, idcg_cache={})

X_test_sparse = csr_matrix(X_test)

model = CoordinateAscent(n_restarts=2, max_iter=25, verbose=True, scorer=scorer).fit(X_test_sparse, y_test, qid_test)

pred = model.predict(X_test_sparse, qid_test)

bert_full_with_bm25["predicted_scores"] = pred

1	1	1	0.14686533331255644
2	1	1	0.14686533331255644


In [35]:
bert_full_with_bm25_filtered= bert_full_with_bm25[bert_full_with_bm25["predicted_sensitivity"] == 0]

X_test_filtered = bert_full_with_bm25_filtered[["bm25_score", "proximity_count"]].values
y_test_filtered = bert_full_with_bm25_filtered["Relevance_total"].values  # Relevance labels
qid_test_filtered = bert_full_with_bm25_filtered["Query"].values  # Query IDs

In [36]:
ndcg_score = scorer(y_test_filtered, bert_full_with_bm25_filtered["predicted_scores"].values, qid_test_filtered).mean()

# Print the final score
print(f"Postfilter Average nCS-DCG@10: {ndcg_score:.4f}")

Postfilter Average nCS-DCG@10: 0.1444


-> these results are way off and not reproducable for us as we do not have enough information in the paper

# Joint-approach
- here we want to find a balanced result between sensitivity and relevance 
- for that we apply the penalty for sensitvity directly during the ranking process
- we again use the features from above, but also take sensitivity directly into account

In [38]:
feature_columns = ["bm25_score", "proximity_count"]
X_joint = bert_full_with_bm25[feature_columns].values

y_joint = bert_full_with_bm25[["Relevance_total", "predicted_sensitivity"]].values
qid_joint = bert_full_with_bm25["Query"].values

In [39]:
X_joint = csr_matrix(X_joint)

in the paper it was mentioned that they applied a penalty of 12

In [40]:
from metrics import NDCGScorer

class nCS_DCGScorer:
    def __init__(self, y_sensitivity, k=10, sensitivity_penalty=12, idcg_cache={}):
        self.y_sensitivity = y_sensitivity  # Store sensitivity labels
        self.k = k
        self.sensitivity_penalty = sensitivity_penalty
        self.ndcg_scorer = NDCGScorer(k=k, idcg_cache=idcg_cache)

    def __call__(self, y_relevance, pred, qid):
        # Apply sensitivity penalty to predicted scores
        penalized_pred = pred.copy()
        penalized_pred[self.y_sensitivity == 1] -= self.sensitivity_penalty

        # Compute nDCG@10 with penalized predictions
        return self.ndcg_scorer(y_relevance, penalized_pred, qid)

In [41]:
y_relevance = y_joint[:, 0]
y_sensitivity = y_joint[:, 1]

ncs_dcg_scorer = nCS_DCGScorer(y_sensitivity=y_sensitivity, k=10, sensitivity_penalty=12)

model = CoordinateAscent(
    n_restarts=5,
    max_iter=50,
    verbose=True,
    scorer=ncs_dcg_scorer  
).fit(X_joint, y_relevance, qid_joint)


1	1	1	0.1468821487688108
2	1	1	0.1468821487688108
3	1	0	0.1467969080124722
3	2	1	0.14689512113470213
3	1	0	0.14692537062626018
3	2	1	0.1470190111129167
4	1	1	0.1468821487688108
5	1	0	0.1467969080124722
5	2	1	0.14689512113470213
5	1	0	0.14692537062626018
5	2	1	0.1470190111129167


In [42]:
pred_joint = model.predict(X_joint, qid_joint)

bert_full_with_bm25["joint_scores"] = pred_joint

average_ncs_dcg = ncs_dcg_scorer(y_relevance, pred_joint, qid_joint).mean()
print(f"Joint Approach - Average nCS-DCG@10: {average_ncs_dcg:.4f}")

Joint Approach - Average nCS-DCG@10: 0.1470


these values are way below the results from the paper
-> as we dont have the exact approach they used in the paper, we cant reproduce the reults