Step 1: Extract data into a CSV file

In [None]:
import csv
import os
import json
from collections import defaultdict

# Set the root directory where the JSON file is located
json_dir = 'document_parses/pdf_json'

# Used to store processing results
cord_uid_to_text = []


def _read_introduction(json_path):
    """Return the introduction paragraphs of one parsed-paper JSON file.

    A ``body_text`` entry counts as introduction text when its ``section``
    name contains "intro" (case-insensitive).

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The matching paragraph texts in file order; an empty list when no
        section name matches.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid UTF-8 encoded JSON.
    """
    with open(json_path, encoding='utf-8') as f_json:
        full_text_dict = json.load(f_json)

    # `or ''` also covers explicit nulls, which a .get() default does not.
    # Building the list in one expression means a failure half-way through
    # a file never leaves partial paragraphs behind.
    return [
        paragraph_dict.get('text') or ''
        for paragraph_dict in full_text_dict.get('body_text', [])
        if 'intro' in (paragraph_dict.get('section') or '').lower()
    ]


# Open metadata.csv
with open('metadata.csv', encoding='utf-8') as f_in:
    for row in csv.DictReader(f_in):
        introduction = []

        # Get the JSON file path field (if it exists). Several paths may be
        # listed, separated by '; '; empty fragments are ignored.
        pdf_json_files = row.get('pdf_json_files') or ''
        for rel_path in filter(None, pdf_json_files.split('; ')):
            json_path = os.path.join(json_dir, os.path.basename(rel_path))

            try:
                introduction = _read_introduction(json_path)
            except FileNotFoundError:
                continue  # Skip if the file does not exist
            except Exception as e:
                # Best-effort extraction: one unreadable or malformed file
                # must not abort the whole run, so report it and move on.
                print(f"Skipping error file: {json_path} Error: {e}")
                continue

            if introduction:
                break  # The first file that yields an introduction wins

        # Rows without any introduction text are left out of the output.
        if introduction:
            cord_uid_to_text.append({
                'cord_uid': row.get('cord_uid', ''),
                'title': row.get('title', ''),
                'abstract': row.get('abstract', ''),
                'introduction': ' '.join(introduction),
            })

# Writing to a CSV file
with open('output.csv', 'w', encoding='utf-8', newline='') as f_out:
    writer = csv.DictWriter(f_out, fieldnames=['cord_uid', 'title', 'abstract', 'introduction'])
    writer.writeheader()
    writer.writerows(cord_uid_to_text)

In [None]:
import pandas as pd

# Load the extracted rows from output.csv into a DataFrame for inspection.
df = pd.read_csv('output.csv')

In [None]:
# Display the table dimensions as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the table.
df.head()

Step 2: Calculate ROUGE precision

In [None]:
from rouge_score import rouge_scorer
import re

# Read output.csv; the abstract and introduction columns are scored below.
df = pd.read_csv("output.csv")  

# Clean text function
def clean_text(text):
    """Normalize whitespace in *text*.

    Missing values (anything ``pd.isna`` reports as NA) become an empty
    string. Any other value is converted with ``str``, each run of
    whitespace is collapsed to a single space, and leading/trailing
    whitespace is removed.
    """
    if not pd.isna(text):
        collapsed = re.sub(r'\s+', ' ', str(text))
        return collapsed.strip()
    return ""

# Initialize ROUGE scorer
scorer = rouge_scorer.RougeScorer(['rouge2', 'rougeL'], use_stemmer=True)

# Calculate ROUGE precision
rouge2_precisions = []
rougeL_precisions = []

# Walk the two cleaned text columns in lockstep, one pair per row.
cleaned_pairs = zip(df['abstract'].map(clean_text), df['introduction'].map(clean_text))
for abstract, intro in cleaned_pairs:
    # The introduction is passed first and the abstract second, so the
    # abstract is the text being scored (hypothesis = abstract).
    scores = scorer.score(intro, abstract)
    rouge2_precisions.append(scores['rouge2'].precision)
    rougeL_precisions.append(scores['rougeL'].precision)

# Save back to CSV
df['rouge2_precision'] = rouge2_precisions
df['rougeL_precision'] = rougeL_precisions

df.to_csv("output_with_rouge.csv", index=False)

In [None]:
# Load output_with_rouge.csv into a DataFrame for inspection.
df = pd.read_csv('output_with_rouge.csv')

In [None]:
# Display the table dimensions as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the table.
df.head()

Step 3: Filter out suspicious data

In [None]:
df = pd.read_csv("output_with_rouge.csv")

# Keep rows where both ROUGE precision columns are at most 0.5.
rouge2_ok = df['rouge2_precision'] <= 0.5
rougeL_ok = df['rougeL_precision'] <= 0.5
df = df[rouge2_ok & rougeL_ok]

# Keep only the first remaining row for each cord_uid.
# NOTE(review): this runs before the text filters below, so a cord_uid whose
# first row is later filtered out is dropped entirely — confirm intended.
df = df.drop_duplicates(subset='cord_uid')

# Drop rows with a missing or whitespace-only abstract or introduction.
df = df.dropna(subset=['abstract', 'introduction'])
abstract_present = df['abstract'].str.strip() != ''
introduction_present = df['introduction'].str.strip() != ''
df = df[abstract_present & introduction_present]

# Keep rows whose introduction is at least twice as long as the abstract.
abstract_length = df['abstract'].str.len()
introduction_length = df['introduction'].str.len()
df = df[introduction_length >= 2 * abstract_length]

df.to_csv("output_cleaned.csv", index=False)