In [1]:
import json
import os

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

In [2]:

# gCube token sent with every WAT request. Set the GCUBE_TOKEN environment
# variable to override it; the literal is kept only as a backward-compatible
# fallback.
# NOTE(review): a credential committed inside a notebook should be rotated and
# removed from the file once every user has the environment variable set.
MY_GCUBE_TOKEN = os.environ.get('GCUBE_TOKEN', 'c7f14730-4c9b-4da5-aec2-d9af2579d505-843339462')

class WATAnnotation:
    """An entity annotated by WAT, plus the sentence it was found in.

    Parameters:
        d (dict): One annotation entry of a WAT response. The keys read are
            'start', 'end', 'rho', 'spot', 'id', 'title' and
            'explanation' -> 'prior_explanation' -> 'entity_mention_probability'.
        sentence: The text that was annotated (stored as-is).
        index: Index of that text in its source collection (stored as-is).
    """

    # Output key of json_dict() -> attribute holding its value (order is the
    # order of the keys in the produced dictionary).
    _JSON_FIELDS = (
        ('original_sentence', 'sentence'),
        ('sentence_index', 'index'),
        ('wiki_title', 'wiki_title'),
        ('wiki_id', 'wiki_id'),
        ('start', 'start'),
        ('end', 'end'),
        ('rho', 'rho'),
        ('prior_prob', 'prior_prob'),
    )

    def __init__(self, d, sentence=None, index=None):
        self.sentence = sentence
        self.index = index

        # Character offsets of the mention: start is included, end is not.
        self.start, self.end = d['start'], d['end']

        # Annotation accuracy.
        self.rho = d['rho']

        # Spot-entity probability, nested inside the explanation payload.
        prior_explanation = d['explanation']['prior_explanation']
        self.prior_prob = prior_explanation['entity_mention_probability']

        # Annotated text.
        self.spot = d['spot']

        # Wikipedia entity info.
        self.wiki_id, self.wiki_title = d['id'], d['title']

    def json_dict(self):
        """Return a simple dictionary representation of the annotation."""
        return {key: getattr(self, attribute) for key, attribute in self._JSON_FIELDS}

def wat_entity_linking(text, index, timeout=60):
    """Annotate ``text`` with the WAT entity linking service.

    Parameters:
        text: The text to annotate.
        index: Index of the text in its source collection; it is copied into
            every returned annotation as 'sentence_index'.
        timeout (float): Seconds to wait for the WAT server before giving up.
            Without a timeout a stalled connection would block the whole
            annotation loop forever.

    Returns:
        list: One ``WATAnnotation.json_dict()`` dictionary per annotation, or
        an empty list when the request fails, the response cannot be decoded,
        or the response carries no 'annotations' key.
    """
    wat_url = 'https://wat.d4science.org/wat/tag/tag'
    payload = [("gcube-token", MY_GCUBE_TOKEN),
               ("text", text),
               ("lang", 'en'),
               ("tokenizer", "nlp4j"),
               # presumably required so the response includes the 'explanation'
               # block that WATAnnotation reads -- TODO confirm against the WAT docs
               ('debug', 9),
               ("method",
                "spotter:includeUserHint=true:includeNamedEntity=true:includeNounPhrase=true,prior:k=50,filter-valid,centroid:rescore=true,topk:k=5,voting:relatedness=lm,ranker:model=0046.model,confidence:model=pruner-wiki.linear")]

    # A network failure on one sentence must not abort a long batch run, so it
    # is reported and treated like any other failed request.
    try:
        response = requests.get(wat_url, params=payload, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to fetch data:", exc)
        return []

    if response.status_code != 200:
        print("Failed to fetch data:", response.status_code)
        return []

    # requests raises a ValueError subclass when the body is not valid JSON.
    try:
        json_response = response.json()
    except ValueError as exc:
        print("Failed to fetch data:", exc)
        return []

    if 'annotations' not in json_response:
        print("No annotations found for this sentence.")
        return []

    return [WATAnnotation(a, sentence=text, index=index).json_dict()
            for a in json_response['annotations']]


In [None]:
#### Annotation of the generated questions
# reference.npy

predict_sentences = np.load('/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/predictionstopic2_T5_squad_once.npy')
#predict_sentences = np.load('/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/predictions_T5_squad_once.npy')

# Entity-link every predicted sentence; sentences that yield no annotations
# are reported and left out of the collection.
annotations = []
for idx, sentence in enumerate(predict_sentences):
    linked_entities = wat_entity_linking(sentence, idx)
    if not linked_entities:
        print(f"No annotations for sentence {idx}: {sentence}")
        continue
    annotations.append(linked_entities)

# `annotations` is a list of per-sentence lists; collapse it into one flat
# list of annotation dictionaries before saving.
flattened_annotations = []
for per_sentence in annotations:
    flattened_annotations.extend(per_sentence)

# Output file path (the commented line is the alternative run configuration)
annotation_predict_path = '/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/TOPIC_w2v/annotation_otherpredict.json'
#annotation_predict_path = '/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/TOPIC_w2v/annotation_labelpredict.json'

# Write the flat annotation list to the JSON file
try:
    with open(annotation_predict_path, 'w', encoding='utf-8') as file:
        json.dump(flattened_annotations, file, ensure_ascii=False, indent=4)
    print("All annotations have been processed and saved successfully.")
except Exception as exc:
    print(f"An error occurred while writing to the file: {exc}")


In [None]:
##### KhanQ.csv topic ids (annotation)
# Step 2: Load the topic column from the CSV file
df = pd.read_csv('/home/qiyu/Dev/ziqing/T5/combined_KhanQ.csv')
topics = df['topic']

# Step 3: Entity-link every topic that is not NA. The result of each call is
# kept as its own sub-list, so `annotations` is a list of lists.
annotations = []
topic_progress = tqdm(enumerate(topics), total=len(topics), desc="Processing Topics")
for idx, topic in topic_progress:
    if pd.isna(topic):
        print(f"Skipping NA topic at index {idx}")
        continue
    annotations.append(wat_entity_linking(topic, idx))

# Step 4: Save the (nested) annotations to a new file
annotation_reference_path = '/home/qiyu/Dev/ziqing/T5/train/annotation_referencetopic_combinedKhanQ.json'
try:
    with open(annotation_reference_path, 'w', encoding='utf-8') as file:
        json.dump(annotations, file, ensure_ascii=False, indent=4)
    print("All topics have been processed and annotations have been saved successfully.")
except Exception as exc:
    print(f"An error occurred while writing to the file: {exc}")

In [6]:
# Path of the reference-topic annotation JSON; same value as the one the
# KhanQ annotation cell writes to. Presumably re-declared here so the later
# cells can run without re-executing that (slow) annotation cell.
annotation_reference_path = '/home/qiyu/Dev/ziqing/T5/train/annotation_referencetopic_combinedKhanQ.json'

In [7]:

def compute_relatedness(token, ids, relatedness='jaccard', timeout=60):
    """Query the WAT relatedness-graph endpoint for a set of entity ids.

    Parameters:
        token (str): gCube token sent as the 'gcube-token' query parameter.
        ids: Entity ids sent as the 'ids' query parameter.
        relatedness (str): Name of the relatedness measure to request.
        timeout (float): Seconds to wait for the server before requests raises
            a timeout error; without it a stalled connection blocks forever.

    Returns:
        The decoded JSON body when the server answers with status 200,
        otherwise the raw response text (a ``str``). Callers must therefore
        check the type of the result before indexing into it.
    """
    base_url = 'https://wat.d4science.org/wat/relatedness/graph'
    params = {
        'gcube-token': token,
        'lang': 'en',
        'ids': ids,
        'relatedness': relatedness
    }
    response = requests.get(base_url, params=params, timeout=timeout)
    if response.status_code == 200:
        return response.json()
    else:
        return response.text
    
def default_converter(o):
    """``default`` hook for ``json.dump``: turn NumPy values into plain Python.

    NumPy integers become ``int``, NumPy floats become ``float`` and arrays
    become (nested) lists. Anything else raises ``TypeError``.
    """
    conversions = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
    )
    for numpy_type, to_builtin in conversions:
        if isinstance(o, numpy_type):
            return to_builtin(o)
    raise TypeError(f"Object of type '{type(o).__name__}' is not JSON serializable")


In [None]:
## Relatedness after annotation
# Step 2: Load the JSON annotation files

# Predicted-sentence annotations are stored as a flat list of records.
with open(annotation_predict_path, 'r', encoding='utf-8') as predict_file:
    predict_data = json.load(predict_file)

predict_df = pd.DataFrame(predict_data)

# Reference-topic annotations are stored as one sub-list per topic and have to
# be flattened before they can be turned into a DataFrame.
with open(annotation_reference_path, 'r', encoding='utf-8') as reference_file:
    reference_data = json.load(reference_file)

flattened_reference_data = []
for topic_annotations in reference_data:
    flattened_reference_data.extend(topic_annotations)
reference_df = pd.DataFrame(flattened_reference_data)


print(predict_df.head())
print(reference_df.head())

In [None]:
# gCube token for the relatedness requests. Set the GCUBE_TOKEN environment
# variable to override it; the literal is only a backward-compatible fallback.
# NOTE(review): a credential committed inside a notebook should be rotated and
# removed from the file once every user has the environment variable set.
MY_GCUBE_TOKEN = os.environ.get('GCUBE_TOKEN', 'c7f14730-4c9b-4da5-aec2-d9af2579d505-843339462')

# Step 3: Process each sentence index from annotation_predict.json
results = []
for index in predict_df['sentence_index'].unique():
    # Find matching reference data; indices without any reference annotation
    # are skipped and get no entry in `results`.
    ref_entry = reference_df[reference_df['sentence_index'] == index]
    if ref_entry.empty:
        continue

    # Collect all wiki_ids from both datasets for the current index
    pred_ids = predict_df[predict_df['sentence_index'] == index]['wiki_id'].tolist()
    ref_ids = ref_entry['wiki_id'].tolist()
    entity_ids = pred_ids + ref_ids

    # Compute relatedness. On a failed request compute_relatedness returns the
    # raw response text instead of a dict; it is stored as-is.
    relatedness_result = compute_relatedness(MY_GCUBE_TOKEN, entity_ids)
    results.append({
        'sentence_index': index,
        'relatedness_result': relatedness_result
    })

#original_relatedness_path = '/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/TOPIC_w2v/original_relate_labeltopic.json'
original_relatedness_path = '/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/TOPIC_w2v/original_relate_othertopic.json'


# `index` values come from a DataFrame column, so default_converter is needed
# to serialise NumPy scalars.
with open(original_relatedness_path, 'w', encoding='utf-8') as file:
    json.dump(results, file, indent=4, default=default_converter)

print("Results have been successfully saved")


In [10]:
def select_max_relatedness(input_path):
    """Reduce each saved relatedness graph to its maximum pair score.

    Parameters:
        input_path (str): Path to a JSON file holding a list of
            ``{'sentence_index': ..., 'relatedness_result': ...}`` records.

    Returns:
        list: One ``{'sentence_index', 'relation', 'relatedness_score'}``
        dictionary per usable record. The score is the largest 'relatedness'
        among the record's 'pairs', or 1 when the list of pairs is empty.
        Records whose 'relatedness_result' is not a dictionary (the raw error
        text stored for a failed request) are reported and skipped instead of
        raising ``TypeError``.
    """
    with open(input_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    results = []
    for item in data:
        graph = item['relatedness_result']
        if not isinstance(graph, dict):
            # A failed relatedness request was saved as plain text.
            print(f"Skipping sentence {item['sentence_index']}: no relatedness graph ({str(graph)[:200]!r})")
            continue

        pairs = graph['pairs']
        max_relatedness = max(pair['relatedness'] for pair in pairs) if pairs else 1
        results.append({
            "sentence_index": item['sentence_index'],
            "relation": graph['relation'],
            "relatedness_score": max_relatedness
        })
    return results


def update_and_save_scores(predict_df, reference_df, results, output_path):
    """Force exact entity matches to a score of 1 and save the scores as JSON.

    Parameters:
        predict_df (DataFrame): Predicted annotations; 'sentence_index' and
            'wiki_id' columns are read.
        reference_df (DataFrame): Reference annotations; same two columns.
        results (list): Score records with 'sentence_index' and
            'relatedness_score' keys.
        output_path (str): Destination of the JSON records file.
    """
    scores_df = pd.DataFrame(results)

    for index, predicted_rows in predict_df.groupby('sentence_index'):
        predicted_ids = set(predicted_rows['wiki_id'])
        reference_rows = reference_df[reference_df['sentence_index'] == index]
        reference_ids = set(reference_rows['wiki_id'])

        # A shared wiki_id means prediction and reference name the same entity.
        if predicted_ids & reference_ids:
            same_sentence = scores_df['sentence_index'] == index
            scores_df.loc[same_sentence, 'relatedness_score'] = 1

    scores_df.to_json(output_path, orient='records', lines=False, force_ascii=False, indent=4)
    print(f"Updated scores have been successfully saved to '{output_path}'.")


In [None]:
# Reduce every saved relatedness graph to one maximum score per sentence.
select_max = select_max_relatedness(original_relatedness_path)
# Destination of the final per-sentence scores (the commented line is the
# alternative run configuration).
#final_output_path = '/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/TOPIC_w2v/relate_labeltopic.json'
final_output_path = '/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/TOPIC_w2v/relate_othertopic.json'

# Set exact entity matches to 1 and write the scores to final_output_path.
update_and_save_scores(predict_df, reference_df, select_max, final_output_path)

In [None]:
def save_results_to_file(results, file_path):
    """
    Write each result string on its own line of a text file.

    Parameters:
        results (list): Strings to write, one per line.
        file_path (str): Path of the text file; it is overwritten.
    """
    with open(file_path, 'w', encoding='utf-8') as output:
        output.writelines(line + "\n" for line in results)

def calculate_and_report_relatedness(file_path, output_text_path=None):
    """
    Load score data, compute summary statistics, and save them to a text file.

    Three lines are reported: the mean relatedness score, the mean over the
    entries with a score >= 0.01, and the percentage of entries whose score is
    exactly 1.0.

    Parameters:
        file_path (str): Path to the JSON file containing score records with a
            'relatedness_score' field.
        output_text_path (str, optional): Where to write the report. When
            omitted, the notebook's original hard-coded location is used.
    """
    if output_text_path is None:
        #output_text_path = '/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/TOPIC_w2v/relatedness_labeltopic.txt'
        output_text_path = '/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/TOPIC_w2v/relatedness_othertopic.txt'

    # Load JSON data from the file
    with open(file_path, 'r', encoding='utf-8') as file:
        score_data = json.load(file)

    # Convert the loaded JSON data to a pandas DataFrame. An empty file gives
    # a frame without columns, so fall back to an empty score series instead
    # of failing with KeyError.
    score_df = pd.DataFrame(score_data)
    if score_df.empty:
        scores = pd.Series(dtype=float)
    else:
        scores = score_df['relatedness_score']

    # Average relatedness score over all entries (nan when there are none)
    average_relatedness_score = scores.mean()
    average_score_str = f"Average Relatedness Score: {average_relatedness_score}"

    # Average over the entries with 'relatedness_score' >= 0.01
    average_relatedness_score_con = scores[scores >= 0.01].mean()
    average_score_con_str = f"Average Relatedness Score (Con): {average_relatedness_score_con}"

    # Share of entries with a relatedness score of exactly 1.0; reported as 0
    # for empty input rather than dividing by zero.
    count_score_one = int((scores == 1.0).sum())
    total_count = len(scores)
    percentage = (count_score_one / total_count) * 100 if total_count else 0.0
    percentage_str = f"Percentage of 'relatedness_score' equal to 1.0: {percentage:.2f}%"

    # Save results to text file
    results = [average_score_str, average_score_con_str, percentage_str]
    save_results_to_file(results, output_text_path)
    print(f"Results have been successfully saved to '{output_text_path}'.")

# JSON file with the final per-sentence relatedness scores to summarise (the
# commented line is the alternative run configuration).
#final_output_path= '/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/TOPIC_w2v/relate_labeltopic.json'
final_output_path= '/home/qiyu/Dev/ziqing/T5/train/eval_squad_once/TOPIC_w2v/relate_othertopic.json'

# Compute the summary statistics for that file and write the text report.
calculate_and_report_relatedness(final_output_path)
