In [None]:
!pip install ragas langchain-openai

Collecting ragas
  Downloading ragas-0.2.8-py3-none-any.whl.metadata (9.1 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.2.12-py3-none-any.whl.metadata (2.7 kB)
Collecting datasets (from ragas)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting tiktoken (from ragas)
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain-community (from ragas)
  Downloading langchain_community-0.3.11-py3-none-any.whl.metadata (2.9 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting pysbd>=0.3.4 (from ragas)
  Downloading pysbd-0.3.4-py3-none-any.whl.metadata (6.1 kB)
Collecting openai>1 (from ragas)
  Downloading openai-1.57.4-py3-none-any.whl.metadata (24 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->ragas)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets->ragas)
  Downloading xxhash-

In [None]:
import os

import pandas as pd
from google.colab import userdata
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper

# The OpenAI-backed LangChain clients need OPENAI_API_KEY at construction
# time. On a fresh kernel this cell runs before the cell that exports the key,
# so make sure it is present here (no-op when it is already set).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY_2")

# LLM judge handed to the rubric metrics, plus a wrapped embeddings model.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini"))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings())

In [None]:
import ragas.metrics as metrics

# Exploration aid: show every name the ragas.metrics module exposes.
exported_names = dir(metrics)
print(exported_names)

['AgentGoalAccuracyWithReference', 'AgentGoalAccuracyWithoutReference', 'AnswerCorrectness', 'AnswerRelevancy', 'AnswerSimilarity', 'AspectCritic', 'BleuScore', 'ContextEntityRecall', 'ContextPrecision', 'ContextRecall', 'ContextUtilization', 'DataCompyScore', 'DistanceMeasure', 'ExactMatch', 'FactualCorrectness', 'Faithfulness', 'FaithfulnesswithHHEM', 'InstanceRubrics', 'LLMContextPrecisionWithReference', 'LLMContextPrecisionWithoutReference', 'LLMContextRecall', 'LLMSQLEquivalence', 'Metric', 'MetricOutputType', 'MetricType', 'MetricWithEmbeddings', 'MetricWithLLM', 'MultiModalFaithfulness', 'MultiModalRelevance', 'MultiTurnMetric', 'NoiseSensitivity', 'NonLLMContextPrecisionWithReference', 'NonLLMContextRecall', 'NonLLMStringSimilarity', 'ResponseRelevancy', 'RougeScore', 'RubricsScore', 'SemanticSimilarity', 'SimpleCriteriaScore', 'SingleTurnMetric', 'StringPresence', 'SummarizationScore', 'ToolCallAccuracy', 'TopicAdherenceScore', '__all__', '__builtins__', '__cached__', '__doc__

In [None]:
import os
from google.colab import drive

# Mount Google Drive at /gdrive, then switch the working directory to the
# project folder so relative file names resolve inside it.
drive.mount('/gdrive')
os.chdir('/gdrive/My Drive/nlp_project')

In [None]:
import os
from google.colab import userdata

# Copy the Colab secret "OPENAI_API_KEY_2" into the OPENAI_API_KEY environment
# variable; the secret value is never bound to a notebook variable.
os.environ.update({"OPENAI_API_KEY": userdata.get("OPENAI_API_KEY_2")})

In [None]:

# Read the RAG results to be scored from the CSV in the working directory.
file_path = "research_prompts_15_graphrag.csv"
results_df = pd.read_csv(filepath_or_buffer=file_path)


In [None]:
# Preview the first five rows of the loaded results.
results_df.head(n=5)

In [None]:
import pandas as pd
from ragas import evaluate
from ragas.metrics import Faithfulness
from ragas import SingleTurnSample

In [None]:
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import RubricsScore

# Define updated rubrics
# Coherence rubric: one description per score level, from 1 (worst) to 5 (best).
coherence_rubrics = dict(
    score1_description="The literature survey lacks logical flow and structure, making it difficult to follow. Ideas are disjointed and do not connect meaningfully.",
    score2_description="The survey has some coherence but contains significant logical gaps, with poorly connected ideas or transitions.",
    score3_description="The survey is moderately coherent, with minor gaps or inconsistencies in the flow. Some transitions may be abrupt or unclear.",
    score4_description="The survey is mostly coherent, with well-connected ideas and a logical structure. Minor issues in transitions are present but do not hinder comprehension.",
    score5_description="The literature survey is fully coherent, with seamless flow, clear transitions, and a strong logical structure throughout.",
)

# Informativeness rubric: descriptions listed in order for score levels 1-5;
# the keys "score<N>_description" are generated from the position.
informativeness_rubrics = {
    f"score{level}_description": description
    for level, description in enumerate(
        (
            "The review omits critical information from the retrieved contexts and does not address important aspects of the query.",
            "The review provides limited information, leaving out key details from the retrieved contexts that are relevant to the query.",
            "The review is moderately informative, addressing the query but missing some important insights from the retrieved contexts.",
            "The review is highly informative, covering most key points from the retrieved contexts and addressing the query well.",
            "The review is exhaustive, addressing all key points from the retrieved contexts, fully answering the query with detailed insights.",
        ),
        start=1,
    )
}

# Redundancy rubric: score-level keys zipped with their descriptions, ordered
# from level 1 to level 5.
redundancy_rubrics = dict(
    zip(
        (
            "score1_description",
            "score2_description",
            "score3_description",
            "score4_description",
            "score5_description",
        ),
        (
            "The review is excessively repetitive, with redundant statements overshadowing meaningful content from the retrieved contexts.",
            "The review has noticeable redundancy, which detracts from its overall quality and reduces efficiency.",
            "The review is moderately repetitive but maintains sufficient unique content to address the query.",
            "The review has some redundancy and focuses on delivering unique, relevant content from the retrieved contexts.",
            "The review has minimal redundancy, with every statement contributing meaningfully to the query response.",
        ),
    )
)

# Citation-coverage rubric: (key, description) pairs for score levels 1-5.
citation_coverage_rubrics = dict(
    [
        (
            "score1_description",
            "The review lacks citations or includes irrelevant and improperly integrated citations from the retrieved contexts.",
        ),
        (
            "score2_description",
            "The review includes citations but they are sparse, improperly integrated, or do not align well with the retrieved contexts.",
        ),
        (
            "score3_description",
            "The review has moderate citation coverage, but some citations are irrelevant or missing from key retrieved contexts.",
        ),
        (
            "score4_description",
            "The review includes relevant citations that are mostly well-integrated and appropriately linked to the retrieved contexts.",
        ),
        (
            "score5_description",
            "The review comprehensively integrates all relevant citations, seamlessly aligning with the retrieved contexts and adding value to the narrative.",
        ),
    ]
)

# Wrap each rubric dictionary in a RubricsScore metric graded by evaluator_llm.
(
    coherence_rubric,
    informativeness_rubric,
    redundancy_rubric,
    citation_coverage_rubric,
) = (
    RubricsScore(rubrics=rubric_definition, llm=evaluator_llm)
    for rubric_definition in (
        coherence_rubrics,
        informativeness_rubrics,
        redundancy_rubrics,
        citation_coverage_rubrics,
    )
)


In [None]:
# One accumulator list per rubric dimension; each starts out empty.
coherence_scores = []
informativeness_scores = []
redundancy_scores = []
citation_scores = []


In [None]:
import time

In [None]:
# Score every row of results_df against the four rubric metrics.
#
# The accumulators are reset here because the loop always starts from the
# first row: appending onto entries left behind by an earlier (possibly
# interrupted) run would duplicate scores and misalign them with the rows.
coherence_scores, informativeness_scores, redundancy_scores, citation_scores = [], [], [], []

# Pause between rows (seconds); presumably to stay under API rate limits.
SECONDS_BETWEEN_ROWS = 60

last_position = len(results_df) - 1
for position, (index, row) in enumerate(results_df.iterrows()):
    # One single-turn sample per row; the row's context value is passed as the
    # only retrieved context.
    sample = SingleTurnSample(
        user_input=row['query'],
        response=row['response'],
        retrieved_contexts=[row['context']]
    )

    coherence_score = coherence_rubric.single_turn_score(sample)
    informativeness_score = informativeness_rubric.single_turn_score(sample)
    redundancy_score = redundancy_rubric.single_turn_score(sample)
    citation_score = citation_coverage_rubric.single_turn_score(sample)

    # NOTE(review): the raw scores are halved before being stored, while the
    # rubric dictionaries define levels 1-5 -- confirm this rescaling is intended.
    coherence_scores.append(coherence_score/2)
    informativeness_scores.append(informativeness_score/2)
    redundancy_scores.append(redundancy_score/2)
    citation_scores.append(citation_score/2)

    # Progress/debug output: row label followed by the four raw scores.
    print(index, coherence_score, informativeness_score, redundancy_score, citation_score)

    # Nothing is scored after the final row, so do not wait there.
    if position != last_position:
        time.sleep(SECONDS_BETWEEN_ROWS)

In [None]:
# This cell used to re-append the last row's scores by hand. On a top-to-bottom
# run that left every list one entry longer than results_df, which breaks the
# column assignment that follows. The scoring loop already stores each row's
# scores, so only verify that the lists line up with the dataframe.
for _label, _scores in (
    ("coherence_scores", coherence_scores),
    ("informativeness_scores", informativeness_scores),
    ("redundancy_scores", redundancy_scores),
    ("citation_scores", citation_scores),
):
    assert len(_scores) == len(results_df), (
        f"{_label} holds {len(_scores)} entries for {len(results_df)} rows; "
        "re-create the score lists and re-run the scoring loop"
    )

In [None]:
# Attach each score list to results_df as a new column, in this order.
score_columns = {
    'Coherence': coherence_scores,
    'Informativeness': informativeness_scores,
    'Redundancy': redundancy_scores,
    'Citation Coverage': citation_scores,
}
for column_name, column_scores in score_columns.items():
    results_df[column_name] = column_scores

In [None]:
# Write the dataframe to a new CSV file, leaving out the index column.
output_path = "graphrag_eval_with_scores.csv"
results_df.to_csv(output_path, index=False)

In [None]:
# Arithmetic mean of each stored score list, then one report line per dimension.
def _mean(values):
    """Return sum(values) / len(values)."""
    return sum(values) / len(values)


avg_coherence = _mean(coherence_scores)
avg_informativeness = _mean(informativeness_scores)
avg_redundancy = _mean(redundancy_scores)
avg_citation = _mean(citation_scores)

print(
    f"Average Coherence: {avg_coherence}",
    f"Average Informativeness: {avg_informativeness}",
    f"Average Redundancy: {avg_redundancy}",
    f"Average Citation Coverage: {avg_citation}",
    sep="\n",
)

Average Coherence: 4.029411764705882
Average Informativeness: 4.029411764705882
Average Redundancy: 4.029411764705882
Average Citation Coverage: 4.0588235294117645
