In [None]:

!pip install pandas transformers bert-score scikit-learn torch

import pandas as pd
from bert_score import score
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
import torch

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
import os
from google.colab import drive
# Mount Google Drive at /gdrive so the project files are reachable from this runtime.
drive.mount('/gdrive')
# Switch into the project folder; the relative CSV paths used in later cells
# are resolved against this working directory.
os.chdir('/gdrive/My Drive/nlp_project/')

Mounted at /gdrive


In [None]:
!ls

cv_nlp_ml_2023-2024_cleaned_142k.csv	   processed_data
embeddings.npy				   prompts
evaluation				   rag_results_30.csv
evaluation_results_old30_top_10.csv	   rag_results_30.gsheet
evaluation_results_old_8b_30.csv	   rag_results_current_50.csv
evaluation_results_spectre2_50_top_10.csv  rag_results_current_50.gsheet
evaluation_results_vague_50.csv		   rag_results_current_without_rf_50.csv
evaluation_results_vague_50.gsheet	   rag_results_current_without_rf_50.gsheet
evaluation_results_vague_50_top_5.csv	   rag_results_current_without_rf_50_top_20.csv
evaluation_spectre2_old30_top_10.csv	   rag_results_current_without_rf_50_top_5.csv
evaluation_spectre2_vague_top_20.csv	   rag_results_detailed_30.csv
faiss_index.bin				   rag_results_with_abstracts_30.csv
graphrag_eval_data.csv			   rag_results_with_abstracts_30.gsheet
graphrag_eval_data.gsheet		   research_prompts_15.csv
lit_review.log				   specter2_embeddings.npy
paper_ids_and_titles_RAG.csv		   specter2_faiss_index.bin


In [None]:
# Encoders already loaded in this kernel, keyed by checkpoint name.
# Without this cache the tokenizer and model were re-instantiated on every
# call, i.e. once per CSV row when driven from a per-row loop.
_ENCODER_CACHE = {}


def _get_encoder(model_name):
    """Return the ``(tokenizer, model)`` pair for ``model_name``, loading it on first use only."""
    if model_name not in _ENCODER_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        _ENCODER_CACHE[model_name] = (tokenizer, model)
    return _ENCODER_CACHE[model_name]


def _mean_pooled_embedding(text, tokenizer, model):
    """Encode ``text`` and average the encoder's last hidden state over dim=1."""
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Inference only: no gradients are needed for an embedding lookup.
    with torch.no_grad():
        return model(**encoded).last_hidden_state.mean(dim=1)


# Function to evaluate metrics
def evaluate_metrics(input_query, generated_review, model_name="distilbert-base-uncased"):
    """Score how closely a generated review matches the query that produced it.

    Two measures are computed with the same checkpoint:

    * BERTScore, with ``generated_review`` passed as the candidate and
      ``input_query`` as the reference.
    * Cosine similarity between the mean-pooled last-hidden-state embeddings
      of the query and the review.

    Args:
        input_query: Query text.
        generated_review: Generated literature-review text.
        model_name: Checkpoint name passed both to BERTScore (``model_type``)
            and to the tokenizer/model loader. The loaded pair is cached per
            name, so repeated calls reuse the same instances.

    Returns:
        dict with keys ``BERTScore_Precision``, ``BERTScore_Recall``,
        ``BERTScore_F1`` (Python floats) and ``Cosine_Similarity`` (the single
        entry of the 1x1 similarity matrix).
    """
    # BERTScore Evaluation
    P, R, F1 = score([generated_review], [input_query], lang="en", model_type=model_name)
    bert_scores = {
        "BERTScore_Precision": P.mean().item(),
        "BERTScore_Recall": R.mean().item(),
        "BERTScore_F1": F1.mean().item()
    }

    # Cosine Similarity Evaluation (encoder is loaded once per model_name)
    tokenizer, model = _get_encoder(model_name)
    query_embedding = _mean_pooled_embedding(input_query, tokenizer, model)
    review_embedding = _mean_pooled_embedding(generated_review, tokenizer, model)

    # cosine_similarity returns a matrix; with one row on each side it is 1x1.
    cosine_sim = cosine_similarity(query_embedding.numpy(), review_embedding.numpy())[0][0]

    return {
        **bert_scores,
        "Cosine_Similarity": cosine_sim
    }



In [None]:
# Function to process CSV and calculate metrics
def process_csv(file_path, output_path, model_name="distilbert-base-uncased"):
    """Score every row of a CSV and write the rows plus their metrics to a new CSV.

    Args:
        file_path: CSV to read; each row must provide ``query`` and
            ``literature_review``.
        output_path: Destination CSV, written without the index column.
        model_name: Forwarded to ``evaluate_metrics`` for every row.

    Returns:
        None. The result is written to ``output_path`` and a confirmation
        line is printed.
    """
    source_df = pd.read_csv(file_path)

    # One metrics dict per input row, in row order.
    per_row_scores = [
        evaluate_metrics(record['query'], record['literature_review'], model_name)
        for _, record in source_df.iterrows()
    ]

    # Place the metric columns to the right of the original columns.
    scored_df = pd.concat([source_df, pd.DataFrame(per_row_scores)], axis=1)

    scored_df.to_csv(output_path, index=False)
    print(f"Results saved to {output_path}")



In [None]:

# Input: RAG generations to score. Output: the same rows with metric columns added.
# Both paths are relative to the current working directory.
input_csv, output_csv = (
    "evaluation/rag_results_15_3b.csv",
    "evaluation_results_15_3b.csv",
)

In [None]:
# Score every row of input_csv and write the augmented table to output_csv.
process_csv(
    file_path=input_csv,
    output_path=output_csv,
    model_name="distilbert-base-uncased",
)

Results saved to evaluation_results_15_3b.csv


In [None]:
# Read the scored rows back from disk for aggregation.
results_path = "evaluation_results_15_3b.csv"
results = pd.read_csv(filepath_or_buffer=results_path)

In [None]:
# Metric columns to aggregate.
required_columns = [
    'BERTScore_Precision',
    'BERTScore_Recall',
    'BERTScore_F1',
    'Cosine_Similarity',
]

In [None]:
# Column-wise mean of each metric over all scored rows.
metrics = results.loc[:, required_columns].mean()

In [None]:
# Bare expression so the notebook renders the averaged metrics as the cell output.
metrics

Unnamed: 0,0
BERTScore_Precision,0.655811
BERTScore_Recall,0.789952
BERTScore_F1,0.71651
Cosine_Similarity,0.832633
