# 1. create a detectron2 environment

# 2. Scraping PDFs

In [2]:
import os
os.environ['PATH'] = "/opt/homebrew/bin:" + os.environ['PATH']
!echo $PATH


/opt/homebrew/bin:/Users/billionaire/opt/anaconda3/envs/detectron2/bin:/Users/billionaire/opt/anaconda3/condabin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/System/Cryptexes/App/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/local/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/bin:/var/run/com.apple.security.cryptexd/codex.system/bootstrap/usr/appleinternal/bin:/opt/X11/bin


In [3]:
!/opt/homebrew/bin/tesseract -v


tesseract 5.4.1
 leptonica-1.84.1
  libgif 5.2.1 : libjpeg 8d (libjpeg-turbo 3.0.0) : libpng 1.6.43 : libtiff 4.6.0 : zlib 1.2.12 : libwebp 1.4.0 : libopenjp2 2.5.2
 Found NEON
 Found libarchive 3.7.4 zlib/1.2.12 liblzma/5.4.6 bz2lib/1.0.8 liblz4/1.9.4 libzstd/1.5.6
 Found libcurl/8.1.2 SecureTransport (LibreSSL/3.3.6) zlib/1.2.12 nghttp2/1.55.1


In [4]:
import layoutparser as lp
import cv2

# Load the model with the local path
model = lp.Detectron2LayoutModel(
    config_path='model/config.yaml',
    model_path='model/model_final.pth',
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]
)

  from .autonotebook import tqdm as notebook_tqdm
sysctl: unknown oid 'machdep.cpu.leaf7_features'


In [None]:
import fitz  # PyMuPDF
import os
import layoutparser as lp
import cv2
import pytesseract
from PIL import Image, ImageDraw, ImageFont
import re
import numpy as np
import pickle
import pandas as pd
from fuzzywuzzy import fuzz

# Convert PDFs to images
def render_pdf_pages_to_images(pdf_path, image_folder, zoom=2):
    if not os.path.exists(image_folder):
        os.makedirs(image_folder)

    document = fitz.open(pdf_path)
    for page_number, page in enumerate(document):
        mat = fitz.Matrix(zoom, zoom)
        pix = page.get_pixmap(matrix=mat)
        image_filename = f"{image_folder}/output_page_{page_number + 1}.png"
        pix.save(image_filename)
        print(f"Saved {image_filename}")

    document.close()

# Extract font information
def extract_font_info(page):
    font_info = []
    blocks = page.get_text("dict")["blocks"]
    for block in blocks:
        if "lines" in block:
            for line in block["lines"]:
                for span in line["spans"]:
                    font_info.append({
                        "bold": span["flags"] & 2 != 0,
                        "italic": span["flags"] & 1 != 0,
                        "size": span["size"],
                        "text": span["text"]
                    })
    return font_info

# Check captions

def is_potential_caption(block, figure_blocks, distance=20, word_limit=50):
    text = block.text
    if "©" in text:
        return True

    # Check useless title
    title_keywords = ["funding", "conflict of interest", "supplementary materials", "declaration",
                      "acknowledgments", "Data availability", "Author contributions", "Publisher's note", "appendix"]


    if block.type == 'Title':
        for keyword in title_keywords:
            if fuzz.partial_ratio(text.lower(), keyword) > 50:  
                return False

    if re.match(r'^(Fig\.|Figure|Table|Box)\s*\d+|^(supplementary\w*|appendix\w*|fund\w*|conflict of interest\w*|Data availability\w*|Publi\w*|Abbreviation\w*|Author\w*|The Author|Copyright|Correspond\w*|Assess\w*|Email\w*|Tel\w*|Open access|Keywords|Address\w*|Receive\w*|Review\w*)', text, re.IGNORECASE):
        return True

    for figure_block in figure_blocks:
        x0, y0, x1, y1 = figure_block.coordinates
        bx0, by0, bx1, by1 = block.coordinates

        # Check if the block is within 20 pixels of figure_block
        if (
            abs(y0 - by1) <= distance or abs(y1 - by0) <= distance or
            abs(x0 - bx1) <= distance or abs(x1 - bx0) <= distance
        ):
            # Check if the text block has less than 50 characters
            if len(block.text.strip().split()) < word_limit:
                return True
    return False 



TOLERANCE = 50

# Sort by x coordinate first, then by y coordinate within a certain tolerance
def sort_blocks_by_coordinates(blocks, tolerance=TOLERANCE):
    if not blocks:  
        return []

    blocks = sorted(blocks, key=lambda b: b.coordinates[0])
    
    sorted_blocks = []
    current_line = []
    current_x = blocks[0].coordinates[0]
    
    for block in blocks:
        if abs(block.coordinates[0] - current_x) <= tolerance:
            current_line.append(block)
        else:
            current_line.sort(key=lambda b: b.coordinates[1])
            sorted_blocks.extend(current_line)
            current_line = [block]
            current_x = block.coordinates[0]
    

    current_line.sort(key=lambda b: b.coordinates[1])
    sorted_blocks.extend(current_line)
    
    return sorted_blocks

def draw_box(image, layout, box_width=3, show_element_id=True):

    pil_image = Image.fromarray(image)

    draw = ImageDraw.Draw(pil_image)

    for element in layout:
        box = element.coordinates
        draw.rectangle(box, outline="red", width=box_width)

        if show_element_id and hasattr(element, 'id'):
            font = ImageFont.load_default()
            text = f"ID: {element.id}"
            text_bbox = font.getbbox(text)
            text_width = text_bbox[2] - text_bbox[0]
            text_height = text_bbox[3] - text_bbox[1]
            draw.rectangle([box[0], box[1] - text_height, box[0] + text_width, box[1]], fill="red")
            draw.text((box[0], box[1] - text_height), text, fill="white", font=font)

    image = cv2.cvtColor(np.array(pil_image), cv2.COLOR_RGB2BGR)
    return image

# Extract abstract and title

def extract_abstract_from_first_pages(images, model):
    ocr_agent = lp.TesseractAgent(languages='eng')
    abstract_text = ""
    first_title = None

    for image in images:
        layout = model.detect(image)

        for idx, block in enumerate(layout):
            block.id = idx
            segment_image = block.pad(left=5, right=5, top=5, bottom=5).crop_image(image)
            text = ocr_agent.detect(segment_image).strip()
            block.set(text=text, inplace=True)
        
        abstract_block = None

        # Step 1: Look for an "Abstract" title block
        for block in layout:
            if block.type == 'Title' and block.text.strip().lower() == "abstract":
                abstract_block = block
                break

        if abstract_block and abstract_block.id is not None:
            # Collect text blocks below the "Abstract" title block
            for block in layout:
                if block.id is not None and block.id > abstract_block.id:
                    if block.type == 'Title' and block.text.strip().lower() == "introduction":
                        break  # Stop at the "Introduction" title block
                    abstract_text += block.text.strip() + " "
                    if len(block.text.strip()) == 0:
                        break

        if not abstract_text.strip():
            # Step 2: Keyword matching
            keywords = ["aim", "aims", "background", "purpose", "purposes", "introduction", "objective", "objectives", 
                        "method", "methods", "material","materials", "materials and methods", 
                        "result","results", "conclusion", "conclusions", "discussion"]
            keyword_blocks = []
            
            for block in layout:
                text = block.text.strip().lower()
                if any(keyword in text for keyword in keywords):
                    keyword_blocks.append(block)
            
            if keyword_blocks:
                # Sort keyword blocks by y-coordinate and collect nearby blocks within 20 pixels
                keyword_blocks = sorted(keyword_blocks, key=lambda b: b.coordinates[1])
                for keyword_block in keyword_blocks:
                    for block in layout:
                        if abs(block.coordinates[1] - keyword_block.coordinates[1]) <= 20:
                            abstract_text += block.text.strip() + " "

        if not first_title:
            # Look for the first title that is not "Abstract"
            for block in layout:
                if block.type == 'Title' and block.text.strip().lower() != "abstract":
                    first_title = block.text.strip().replace('\n', ' ')
                    break

    if not abstract_text.strip():
        # Step 3: Heuristic method
        text_blocks = [block.text.strip() for page_layout in layout for block in page_layout if block.type == 'Text']
        for text in text_blocks[:5]:  # Assuming the abstract appears in the first five blocks
            if len(text.split()) > 50:
                abstract_text = text
                break

    return abstract_text.strip(), first_title


# Generate title information tables
def generate_title_info_table(sorted_blocks, font_info_dict, output_folder):
    title_data = []
    for block in sorted_blocks:
        if block.type == 'Title':
            page_number = block.page_number
            text = block.text.strip()
            font_info = next((info for info in font_info_dict[page_number] if info['text'] in text), None)
            title_data.append({
                "Text": text,
                "Bold": font_info.get("bold", False) if font_info else False,
                "Italic": font_info.get("italic", False) if font_info else False,
                "Size": font_info.get("size", 0) if font_info else 0
            })

    df = pd.DataFrame(title_data)
    df.to_csv(os.path.join(output_folder, 'title_blocks_info.csv'), index=False)
    return df

# Find reference titles
def find_reference_title(df):
    reference_titles = ["introduction", "background", "conclusion","conclusions", "references"]
    for title in reference_titles:
        for index, row in df.iterrows():
            if title in row['Text'].lower():
                print(f"Reference title found: {row['Text']}")
                return row
    return None


# Label similar section headings
def mark_similar_titles(df, reference_title, paper_title):
    def is_similar_row(row1, row2):
        return (
            row1['Bold'] == row2['Bold'] and
            row1['Italic'] == row2['Italic'] and
            abs(row1['Size'] - row2['Size']) <= 1
        )

    if reference_title is not None:
        df['Similar'] = df.apply(lambda x: "title" if x['Text'] == paper_title else ("Ref" if x.equals(reference_title) else is_similar_row(x, reference_title)), axis=1)
    else:
        df['Similar'] = df.apply(lambda x: "title" if x['Text'] == paper_title else False, axis=1)

    for index, row in df.iterrows():
        if row['Similar'] != False:
            print(f"Row {index}: {row['Text']} | Bold: {row['Bold']} | Italic: {row['Italic']} | Size: {row['Size']} | Similar: {row['Similar']}")

    return df


# Extract text from images

def extract_text_from_images(image_folder, output_folder, model, pdf_path):
    ocr_agent = lp.TesseractAgent(languages='eng')
    all_text_blocks = []
    all_captions = []
    font_info_dict = {}
    abstract_text = None
    first_title = None
    document = fitz.open(pdf_path)
    first_page_processed = False

    first_image_path = os.path.join(image_folder, 'output_page_1.png')
    second_image_path = os.path.join(image_folder, 'output_page_2.png')
    images = [cv2.imread(first_image_path)[..., ::-1], cv2.imread(second_image_path)[..., ::-1]]

    # Extract abstract and title
    if not first_page_processed:
        abstract_text, first_title = extract_abstract_from_first_pages(images, model)
        first_page_processed = True

    for image_name in sorted(os.listdir(image_folder)):
        if image_name.endswith('.png'):
            page_number = int(re.search(r'\d+', image_name).group())
            image_path = os.path.join(image_folder, image_name)
            image = cv2.imread(image_path)
            image = image[..., ::-1]

            page = document.load_page(page_number - 1)
            font_info_dict[page_number] = extract_font_info(page)

            layout = model.detect(image)

            for block in layout:
                segment_image = block.pad(left=5, right=5, top=5, bottom=5).crop_image(image)
                text = ocr_agent.detect(segment_image)
                block.set(text=text, inplace=True)
                block.page_number = page_number

            text_blocks = lp.Layout([b for b in layout if b.type in ['Text', 'Title', 'List']])
            figure_blocks = lp.Layout([b for b in layout if b.type in ['Figure', 'Table']])
            text_blocks = lp.Layout([b for b in text_blocks if not any(b.is_in(b_fig) for b_fig in figure_blocks)])
            
            sorted_blocks = sort_blocks_by_coordinates(text_blocks)

            if sorted_blocks:  
                for idx, block in enumerate(sorted_blocks):
                    block.id = idx  # set id 

            captions = []
            filtered_text_blocks = []

            for block in sorted_blocks:
                if is_potential_caption(block, figure_blocks):
                    captions.append(block)
                else:
                    filtered_text_blocks.append(block)

            all_captions.extend(captions)
            all_text_blocks.extend(filtered_text_blocks)

            image_with_boxes = draw_box(image, sorted_blocks)
            output_image_filename = os.path.join(output_folder, 'images_with_boxes', f"{os.path.splitext(image_name)[0]}_with_boxes.png")
            cv2.imwrite(output_image_filename, image_with_boxes)

    os.makedirs(os.path.join(output_folder, 'text'), exist_ok=True)
    os.makedirs(os.path.join(output_folder, 'pickle'), exist_ok=True)

    with open(os.path.join(output_folder, 'text', 'captions.txt'), 'w', encoding='utf-8') as caption_file:
        for caption in all_captions:
            caption_file.write(caption.text + "\n")
    
    all_text_blocks.sort(key=lambda b: (b.page_number, b.id))

    
    if abstract_text:
        with open(os.path.join(output_folder, 'text', 'Abstract.txt'), 'w', encoding='utf-8') as abstract_file:
            abstract_file.write(abstract_text)
        with open(os.path.join(output_folder, 'pickle', 'Abstract.pkl'), 'wb') as abstract_pkl_file:
            pickle.dump(abstract_text, abstract_pkl_file)

    return all_text_blocks, font_info_dict, abstract_text, first_title



# Extract text from images
def extract_and_save_sections(sorted_blocks, df, output_folder):
    sections = {}
    current_section = None

    for block in sorted_blocks:
        # Step 1: Check if block is a numbered title
        if block.type == 'Title':
            text = block.text.strip()
            if re.match(r'^\d+(\.\d+)?', text):  # Check if the title starts with a number or a decimal
                if re.match(r'^\d+\.\d+', text):  # Skip subheadings like "1.1", "2.1"
                    if current_section:
                        sections[current_section].append(text)
                else:
                    current_section = text
                    sections[current_section] = []
            elif text.lower() in df[df['Similar'].isin([True, 'Ref'])]['Text'].str.lower().tolist():
                current_section = text
                sections[current_section] = []

        # Step 2: Collect text under the current section
        if current_section:
            sections[current_section].append(block.text.strip())

    section_order = []
    for section, texts in sections.items():
        section_text = " ".join(texts).replace('\n', ' ')
        filename = f"{section}.pkl".replace(" ", "_").replace("/", "_")
        section_order.append((section, filename))
        with open(os.path.join(output_folder, 'pickle', filename), 'wb') as f:
            pickle.dump(section_text, f)
        with open(os.path.join(output_folder, 'text', f"{section}.txt".replace(" ", "_").replace("/", "_")), 'w', encoding='utf-8') as section_file:
            section_file.write(section_text)
        print(f"Saved {section} to {filename}")

    return section_order

# Generate csv
def generate_csv(output_root_folder, df_data):
    rows = []
    for data in df_data:
        row = {'PDF name': data['pdf_name'], 'article': data['paper_title'], 'abstract': data['abstract']}
        for idx, (section, filename) in enumerate(data['section_order'], start=1):
            if filename == 'Abstract.pkl':
                continue
            row[f'section {idx}'] = filename
        rows.append(row)

    df = pd.DataFrame(rows)
    df.to_csv(os.path.join(output_root_folder, 'all_papers.csv'), index=False)

# process the whole folder

def process_pdfs_in_folder(pdf_folder, output_root_folder, model):
    df_data = []

    for pdf_file in os.listdir(pdf_folder):
        if pdf_file.endswith('.pdf'):
            pdf_path = os.path.join(pdf_folder, pdf_file)
            pdf_name = os.path.splitext(pdf_file)[0]
            pdf_output_folder = os.path.join(output_root_folder, pdf_name)

            # create folders
            for folder in ['images', 'images_with_boxes', 'text', 'pickle']:
                os.makedirs(os.path.join(pdf_output_folder, folder), exist_ok=True)

            # Convert PDFs into images
            render_pdf_pages_to_images(pdf_path, os.path.join(pdf_output_folder, 'images'))

            # Extract text and save them
            all_text_blocks, font_info_dict, abstract_text, first_title = extract_text_from_images(os.path.join(pdf_output_folder, 'images'), pdf_output_folder, model, pdf_path)

            # Find reference titles
            df = generate_title_info_table(all_text_blocks, font_info_dict, pdf_output_folder)
            reference_title = find_reference_title(df)

            # Label the similar headings
            df = mark_similar_titles(df, reference_title, first_title)
            df.to_csv(os.path.join(pdf_output_folder, 'title_blocks_info.csv'), index=False)

            # Save sections
            section_order = extract_and_save_sections(all_text_blocks, df, pdf_output_folder)
            
            df_data.append({
                'pdf_name': pdf_name,
                'paper_title': first_title,
                'abstract': 'Abstract.pkl',
                'section_order': section_order
            })

    generate_csv(output_root_folder, df_data)



pdf_folder = 'Diabetes PDFs'
output_root_folder = 'Diabetes PDFs Outputs'

process_pdfs_in_folder(pdf_folder, output_root_folder, model)

# 3. Combine sections 

In [56]:
import pandas as pd
from fuzzywuzzy import fuzz

# Load the CSV file
file_path = '/Diabetes PDFs Outputs/all_papers.csv'
df = pd.read_csv(file_path)

# Define the keywords for section1
section1_keywords = ["introduction", "background", "aim", "objective", "introduction and background"]

def fuzzy_match(section, keywords, threshold=80):
    for keyword in keywords:
        if fuzz.partial_ratio(section.lower(), keyword.lower()) >= threshold:
            return keyword.lower()
    return None

def reorder_sections(row):
    # Extract fixed columns
    fixed_columns = row[['PDF name', 'article', 'abstract']].tolist()
    # Extract all other columns and drop NaN values
    sections = row.drop(['PDF name', 'article', 'abstract']).dropna().tolist()
    
    # Create a list to hold the reordered sections
    reordered_sections = []
    abstract_idx = sections.index('Abstract.pkl') if 'Abstract.pkl' in sections else -1
    
    if abstract_idx != -1:
        reordered_sections = sections[:abstract_idx+1]
        remaining_sections = sections[abstract_idx+1:]
    else:
        remaining_sections = sections
    
    # Move specific sections to immediately follow the abstract section
    sections_to_move = []
    objective_sections = []
    for sec in remaining_sections:
        match = fuzzy_match(sec, section1_keywords)
        if match in ['introduction', 'background']:
            sections_to_move.append(sec)
        elif match == 'objective':
            objective_sections.append(sec)
    
    # Append moved sections in order
    reordered_sections.extend(sections_to_move)
    reordered_sections.extend(objective_sections)
    
    # Append the remaining sections after the moved sections
    reordered_sections.extend([sec for sec in remaining_sections if sec not in sections_to_move + objective_sections])
    
    # Combine fixed columns and reordered sections
    final_row = fixed_columns + reordered_sections
    
    # Ensure the final row has the same number of columns as the original
    final_row_dict = {f'section {i+1}': final_row[i+3] if i+3 < len(final_row) else None for i in range(len(row) - 3)}
    final_row_dict.update({'PDF name': fixed_columns[0], 'article': fixed_columns[1], 'abstract': fixed_columns[2]})
    
    return pd.Series(final_row_dict)

# Apply the function to each row
new_df = df.apply(reorder_sections, axis=1)

# Ensure columns are ordered correctly
new_df = new_df[['PDF name', 'article', 'abstract'] + [col for col in new_df.columns if col.startswith('section')]]

# Save the new dataframe to a CSV file
output_path = '/Diabetes PDFs Outputs/reordered_all_papers.csv'
new_df.to_csv(output_path, index=False)

# new_df.head()


Unnamed: 0,PDF name,article,abstract,section 1,section 2,section 3,section 4,section 5,section 6,section 7,section 8,section 9,section 10,section 11,section 12,section 13,section 14,section 15
0,s12889-024-18580-0,Prevalence of metabolic syndrome and associat...,Abstract.pkl,Introduction.pkl,Methods.pkl,Result.pkl,Prevalence_of_metabolic_syndrome_among_type_2_...,Association_between_BMI_and_prevalence_of_meta...,Conclusion.pkl,,,,,,,,,
1,Evaluation of the Lifetime Benefits of Metform...,Evaluation of the Lifetime Benefits of Metform...,Abstract.pkl,1_Introduction.pkl,2_Methods.pkl,3_Results.pkl,4_Discussion.pkl,5_Conclusion.pkl,,,,,,,,,,
2,regulatory_patterns_of_chinese_patent_medicine...,Regulatory patterns of Chinese patent medicine...,Abstract.pkl,1._Background.pkl,2._Methods.pkl,3._Results.pkl,4._Conclusion.pkl,5._Conclusion.pkl,Author_contributions.pkl,References.pkl,,,,,,,,
3,Association of Common Genetic Variants in Mito...,Association of Common Genetic Variants in Mito...,Abstract.pkl,INTRODUCTION.pkl,MetHops.pkl,Resutts.pkl,Discussion.pkl,REFERENCES.pkl,,,,,,,,,,
4,s00125-024-06144-1,Subcutaneously administered tirzepatide vs sem...,Abstract.pkl,Introduction.pkl,Methods.pkl,Results.pkl,Discussion.pkl,References.pkl,,,,,,,,,,
5,Evidence that tirzepatide protects against dia...,Evidence that tirzepatide protects against dia...,Abstract.pkl,Background.pkl,Methods.pkl,"Meta-analysis:_search_strategy,_selection_crit...",Results.pkl,Meta-analysis_of_clinical_results.pkl,Evidence_that_tirzepatide_protects\nagainst_di...,Cardioprotective_effect_of_TZT_by_meta-analysi...,In_vitro_results_in_AC16_cell_line_results_in_...,Protective_Effects_of_TZT_on_cell_proliferatio...,Conclusions.pkl,Supplementary_Information.pkl,Publisher’s_Note.pkl,,,
6,fendo-14-1285147,Efficacy and safety of insulin glargine 300 un...,Abstract.pkl,Background.pkl,Objectives.pkl,Methods.pkl,Results.pkl,Results_of_the_search.pkl,Discussion.pkl,Conclusion.pkl,Data_availability_statement.pkl,Author_contributions.pkl,References.pkl,Funding.pkl,Acknowledgments.pkl,Conflict_of_interest.pkl,Publisher's_note.pkl,Supplementary_material.pkl
7,fcdhc-03-947552,Unpacking determinants and consequences of foo...,Abstract.pkl,Introduction.pkl,Objectives.pkl,Diving_downstream:_Food_insecurity_as_a\npoten...,Implications_for_nutrition_equity.pkl,Overview_of_study_setting_and_design.pkl,"Blood_sampling,_processing,_and_analysis.pkl",Variables.pkl,Exit_interview_and_participant_follow-up_calls...,Variable.pkl,Data_storage_and_management.pkl,Statistical_analysis.pkl,Insulin_sensitivity.pkl,,,
8,Visceral adiposity index as a predictor of typ...,Visceral adiposity index as a predictor of typ...,Abstract.pkl,1._Introduction.pkl,2._Materials_and_methods.pkl,3._Results.pkl,4._Discussion.pkl,5._Conclusion.pkl,Authors’_contributions.pkl,Funding.pkl,Disclosure_of_potential_conflicts_of_interest.pkl,Appendix_A._Supplementary_data.pkl,References.pkl,,,,,


In [None]:
import os
import pandas as pd
import pickle
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import requests

# 读取all_papers.csv
all_papers_path = '/Diabetes PDFs Outputs/reordered_all_papers.csv'
all_papers_df = pd.read_csv(all_papers_path)

section1_keywords = ["introduction", "background", "aim", "objective", "introduction and background"]
section2_keywords = ["results", "conclusion", "discussion", "limitations"]
section3_keywords = ["methods", "study design"]
remove_section_keywords = ["acknowledgement", "conflict of interest", "funding", "references", "appendix", "contributors", "author", "data source", "data availability"]

def load_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def save_combined_content(content, filename, folder):
    txt_path = os.path.join(folder, filename + '.txt')
    pkl_path = os.path.join(folder, filename + '.pkl')
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(content)
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(content, pkl_file)

def match_keyword(text, keywords):
    best_match, score = process.extractOne(text, keywords, scorer=fuzz.partial_ratio)
    return best_match if score >= 80 else None

def classify_sections(df):
    new_rows = []
    for _, row in df.iterrows():
        pdf_name = row['PDF name']
        article = row['article']
        abstract = row['abstract']
        
        sections = [col for col in row.index if col.startswith('section') and pd.notna(row[col])]
        
        section1 = []
        section2 = []
        section3 = []
        
        # Step 1: Remove all sections with remove_keywords
        sections = [sec for sec in sections if not match_keyword(row[sec].lower(), remove_section_keywords)]
        
        # Step 2: Collect section1
        section1_collecting = True
        for section in sections:
            section_title = row[section].lower()
            if match_keyword(section_title, section1_keywords):
                section1.append(section)
            elif match_keyword(section_title, section2_keywords):
                section1_collecting = False
                break
        
        # Step 3: Collect section2
        section2_collecting = False
        section2_start = None
        section2_end = None
        for section in sections:
            section_title = row[section].lower()
            if match_keyword(section_title, section2_keywords):
                if section2_start is None:
                    section2_start = section
                section2_end = section
                section2_collecting = True
        
        if section2_start and section2_end:
            section2_start_index = sections.index(section2_start)
            section2_end_index = sections.index(section2_end)
            section2 = sections[section2_start_index:section2_end_index + 1]
        
        # Step 4: Collect section3 (remaining sections between section1 and section2)
        if section1 and section2:
            section1_end_index = sections.index(section1[-1])
            section2_start_index = sections.index(section2[0])
            section3 = sections[section1_end_index + 1 : section2_start_index]
            section3 = [s for s in section3 if not match_keyword(row[s].lower(), remove_section_keywords)]
        else:
            section3 = []

        # Ensure section3 has content if empty
        if not section3:
            section3 = [sec for sec in sections if match_keyword(row[sec].lower(), section3_keywords)]
        
        new_row = {
            'PDF name': pdf_name,
            'article': article,
            'abstract': abstract,
            'section1': ','.join(section1),
            'section2': ','.join(section2),
            'section3': ','.join(section3)
        }
        new_rows.append(new_row)
    
    return pd.DataFrame(new_rows)

def remove_useless_info(text):
    lines = text.split('\n')
    useful_lines = []
    for line in lines:
        if any(keyword in line.lower() for keyword in remove_section_keywords):
            break
        useful_lines.append(line)
    return '\n'.join(useful_lines)


classified_df = classify_sections(all_papers_df)

rows = []

# Process each pdf folder
for _, row in classified_df.iterrows():
    pdf_name = row['PDF name']
    article = row['article']
    abstract_filename = row['abstract']
    
    section1_sections = row['section1'].split(',')
    section2_sections = row['section2'].split(',')
    section3_sections = row['section3'].split(',')
    
    pdf_folder = os.path.join('Diabetes PDFs Outputs', pdf_name)
    text_folder = os.path.join(pdf_folder, 'text')
    pickle_folder = os.path.join(pdf_folder, 'pickle')
    combined_folder = os.path.join(pdf_folder, 'combined')
    os.makedirs(combined_folder, exist_ok=True)
    
    abstract_text = load_text(os.path.join(text_folder, abstract_filename.replace('.pkl', '.txt')))
    
    section1_text = ""
    section2_text = ""
    section3_text = ""
    
    for section in section1_sections:
        if section:
            section1_text += load_text(os.path.join(text_folder, all_papers_df.loc[all_papers_df['PDF name'] == pdf_name, section].values[0].replace('.pkl', '.txt')))
    
    for section in section2_sections:
        if section:
            section2_text += load_text(os.path.join(text_folder, all_papers_df.loc[all_papers_df['PDF name'] == pdf_name, section].values[0].replace('.pkl', '.txt')))
    
    for section in section3_sections:
        if section:
            section3_text += load_text(os.path.join(text_folder, all_papers_df.loc[all_papers_df['PDF name'] == pdf_name, section].values[0].replace('.pkl', '.txt')))
    

    # print(f"PDF: {pdf_name}")
    # print(f"Section 1 Content: {section1_text[:500]}")  
    # print(f"Section 2 Content: {section2_text[:500]}")  
    # print(f"Section 3 Content: {section3_text[:500]}") 
    # print("\n")
    
    # Save
    save_combined_content(abstract_text, 'Abstract', combined_folder)
    save_combined_content(section1_text, 'Section1', combined_folder)
    save_combined_content(section2_text, 'Section2', combined_folder)
    save_combined_content(section3_text, 'Section3', combined_folder)

    # Get citation
    citation = get_citation(article)
    
    # Collect row data 
    rows.append({
        'PDF name': pdf_name,
        'article': article,
        'abstract': os.path.join(combined_folder, 'Abstract.pkl'),
        'section1': os.path.join(combined_folder, 'Section1.pkl'),
        'section2': os.path.join(combined_folder, 'Section2.pkl'),
        'section3': os.path.join(combined_folder, 'Section3.pkl'),
        'citation': citation
    })

new_df = pd.DataFrame(rows)
new_df.to_csv(os.path.join('Diabetes PDFs Outputs', 'combined_papers_info.csv'), index=False)
# print("successfully")


In [63]:
import os
import pandas as pd
import pickle
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import requests

# 读取all_papers.csv
all_papers_path = '/Users/billionaire/Desktop/LLM/OCR/Diabetes PDFs Outputs/reordered_all_papers.csv'
all_papers_df = pd.read_csv(all_papers_path)

section1_keywords = ["introduction", "background", "aim", "objective"]
section2_keywords = ["results", "conclusion", "discussion", "limitations"]
section3_keywords = ["methods", "study design"]
remove_section_keywords = ["acknowledgement", "conflict of interest", "funding", "references", "appendix", "contributors", "author", "data source", "data availability"]

def load_text(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def save_combined_content(content, filename, folder):
    txt_path = os.path.join(folder, filename + '.txt')
    pkl_path = os.path.join(folder, filename + '.pkl')
    with open(txt_path, 'w', encoding='utf-8') as txt_file:
        txt_file.write(content)
    with open(pkl_path, 'wb') as pkl_file:
        pickle.dump(content, pkl_file)

def match_keyword(text, keywords):
    best_match, score = process.extractOne(text, keywords, scorer=fuzz.partial_ratio)
    return best_match if score >= 80 else None

def classify_sections(df):
    new_rows = []
    for _, row in df.iterrows():
        pdf_name = row['PDF name']
        article = row['article']
        abstract = row['abstract']
        
        sections = [col for col in row.index if col.startswith('section') and pd.notna(row[col])]
        
        section1 = []
        section2 = []
        section3 = []
        
        # Step 1: Remove all sections with remove_keywords
        sections = [sec for sec in sections if not match_keyword(row[sec].lower(), remove_section_keywords)]
        
        # Step 2: Collect section1
        section1_collecting = True
        for section in sections:
            section_title = row[section].lower()
            if match_keyword(section_title, section1_keywords):
                section1.append(section)
            elif match_keyword(section_title, section2_keywords):
                section1_collecting = False
                break
        
        # Step 3: Collect section2
        section2_collecting = False
        section2_start = None
        section2_end = None
        for section in sections:
            section_title = row[section].lower()
            if match_keyword(section_title, section2_keywords):
                if section2_start is None:
                    section2_start = section
                section2_end = section
                section2_collecting = True
        
        if section2_start and section2_end:
            section2_start_index = sections.index(section2_start)
            section2_end_index = sections.index(section2_end)
            section2 = sections[section2_start_index:section2_end_index + 1]
        
        # Step 4: Collect section3 (remaining sections between section1 and section2)
        if section1 and section2:
            section1_end_index = sections.index(section1[-1])
            section2_start_index = sections.index(section2[0])
            section3 = sections[section1_end_index + 1 : section2_start_index]
            section3 = [s for s in section3 if not match_keyword(row[s].lower(), remove_section_keywords)]
        else:
            section3 = []

        # Ensure section3 has content if empty
        if not section3:
            section3 = [sec for sec in sections if match_keyword(row[sec].lower(), section3_keywords)]
        
        new_row = {
            'PDF name': pdf_name,
            'article': article,
            'abstract': abstract,
            'section1': ','.join(section1),
            'section2': ','.join(section2),
            'section3': ','.join(section3)
        }
        new_rows.append(new_row)
    
    return pd.DataFrame(new_rows)

def remove_useless_info(text):
    lines = text.split('\n')
    useful_lines = []
    for line in lines:
        if any(keyword in line.lower() for keyword in remove_section_keywords):
            break
        useful_lines.append(line)
    return '\n'.join(useful_lines)

classified_df = classify_sections(all_papers_df)

rows = []

# Process each pdf folder
for _, row in classified_df.iterrows():
    pdf_name = row['PDF name']
    article = row['article']
    abstract_filename = row['abstract']
    
    section1_sections = row['section1'].split(',')
    section2_sections = row['section2'].split(',')
    section3_sections = row['section3'].split(',')
    
    pdf_folder = os.path.join('Diabetes PDFs Outputs', pdf_name)
    text_folder = os.path.join(pdf_folder, 'text')
    pickle_folder = os.path.join(pdf_folder, 'pickle')
    combined_folder = os.path.join(pdf_folder, 'combined')
    os.makedirs(combined_folder, exist_ok=True)
    

    abstract_text = load_text(os.path.join(text_folder, abstract_filename.replace('.pkl', '.txt')))
    
    section1_text = ""
    section2_text = ""
    section3_text = ""
    
    for section in section1_sections:
        if section:
            section1_text += load_text(os.path.join(text_folder, all_papers_df.loc[all_papers_df['PDF name'] == pdf_name, section].values[0].replace('.pkl', '.txt')))
    
    for section in section2_sections:
        if section:
            section2_text += load_text(os.path.join(text_folder, all_papers_df.loc[all_papers_df['PDF name'] == pdf_name, section].values[0].replace('.pkl', '.txt')))
    
    for section in section3_sections:
        if section:
            section3_text += load_text(os.path.join(text_folder, all_papers_df.loc[all_papers_df['PDF name'] == pdf_name, section].values[0].replace('.pkl', '.txt')))
    
    # Remove useless information
    section1_text = remove_useless_info(section1_text)
    section2_text = remove_useless_info(section2_text)
    section3_text = remove_useless_info(section3_text)

    # print(f"PDF: {pdf_name}")
    # print(f"Section 1 Content: {section1_text[:500]}")  
    # print(f"Section 2 Content: {section2_text[:500]}")  
    # print(f"Section 3 Content: {section3_text[:500]}")  
    # print("\n")
    
    save_combined_content(abstract_text, 'Abstract', combined_folder)
    save_combined_content(section1_text, 'Section1', combined_folder)
    save_combined_content(section2_text, 'Section2', combined_folder)
    save_combined_content(section3_text, 'Section3', combined_folder)

    # get citation
    citation = get_citation(article)
    
    rows.append({
        'PDF name': pdf_name,
        'article': article,
        'abstract': os.path.join(combined_folder, 'Abstract.pkl'),
        'section1': os.path.join(combined_folder, 'Section1.pkl'),
        'section2': os.path.join(combined_folder, 'Section2.pkl'),
        'section3': os.path.join(combined_folder, 'Section3.pkl'),
        'citation': citation
    })

new_df = pd.DataFrame(rows)

new_df.to_csv(os.path.join('Diabetes PDFs Outputs', 'combined_papers_info.csv'), index=False)

print("successfully。")


PDF: s12889-024-18580-0
Section 1 Content: Introduction The complicated pathophysiologic condition known as the metabolic syndrome is characterised by insulin resis- tance, hypertension, hyperlipidaemia, and abdominal obesity and which originate primarily from an imbalance between energy expenditure and calorie intake [1]. Even though the NCEP-ATPIII, IDF, and WHO criteria are the most often utilised clinical criteria for the diagnosis of metabolic syndrome, there are numerous similarities between them, there are also notable differences
Section 2 Content: 
Section 3 Content: 






PDF: Evaluation of the Lifetime Benefits of Metformin and SGLT2 Inhibitors in Type 2 Diabetes Mellitus Patients with Cardiovascular Disease A Systematic Review and Two-Stage Meta-Analysis
Section 1 Content: 1 Introduction The incidence of type 2 diabetes mellitus (T2DM) is on the rise globally. There are an estimated 537 million diagnosed cases, and by 2045, this number will rise to 783 million [1]. Patients with T2DM and established cardiovascular disease (CVD) are particularly susceptible to recurrent major adverse cardiovascular events (MACE), with a 1.7-fold increase in risk [2]. These patients are also at a higher risk for cardiovascular and non-cardiovascular complications. Optimizing glycaem
Section 2 Content: 
Section 3 Content: 






PDF: regulatory_patterns_of_chinese_patent_medicine_for.26
Section 1 Content: 1. Background The World Health Organization defines “comorbidity” as the coexistence of 2 or more health conditions that necessitate ongoing and diverse treatments and can mutually influence each other, In China, the prevalence of chronic diseases is 69.13%, and the comorbidity rate is 43.65%. Among these, multiple dis- eases commonly coexist among elderly patients with type 2 dia- betes.!") Recent data from the British Medical Journal indicates that the prevalence of comorbidity in high-income 
Section 2 Content: 3. Results 3.1. Study identification 3.1. Study identification In this study, a total of 80 English articles and 792 Chinese articles were initially identified through the screening process. Articles that did not meet the criteria were excluded based on a review of their titles and abstracts. Additionally, CPMs with <2 relevant articles were excluded. Ultimately, 12 RCTs!*! were included for analysi



PDF: Association of Common Genetic Variants in Mitogen‑activated Protein Kinase Kinase Kinase Kinase 4 with Type 2 Diabetes Mellitus in a Chinese Han Population
Section 1 Content: 
Section 2 Content: Resutts Clinical and biochemical characteristics of study subjects There were four subjects in T2DM group and 24 subjects in controls have not detected the genotypes, so finally the T2DM group includes 996 subjects, and the control group includes 976 subjects. The clinical characteristics of the participants are shown in Table 1. There were 612 males and 383 females (mean age, 46.1 + 12.6 years) in the patients and 568 males and 399 females (mean age, 42.9 + 11.7 years) in the control subjects. 
Section 3 Content: MetHops Study subjects and phenotypic definitions Using a case-control approach, a total of 2000 unrelated subjects from Chinese Han population were recruited from October 2010 to September 2013, comprising 1000 T2DM patients and 1000 normoglycemic control subjects. Inclusion cri



PDF: s00125-024-06144-1
Section 1 Content: 
Section 2 Content: 
Section 3 Content: Methods The protocol of this systematic review and meta- analysis is registered in PROSPERO (registration no. CRD42022382594) [9]. We report our methods and result: in accordance with the Preferred Reporting Items for Sys tematic reviews and Meta-Analyses (PRISMA) statement for network meta-analyses [10]. ty criteria We included RCTs published in English sed s.c. tirzepatide at maintenance doses of 5 mg, 10 mg or 15 mg once weekly, or s.c. semaglutide at main- tenance doses of 0.5 mg, 1.0 mg or 






PDF: Evidence that tirzepatide protects against diabetes-related cardiac damages
Section 1 Content: Background Glucagon-like peptide-1 receptor agonists (GLP- 1RAs), widely used antidiabetic drugs, are approved and recommended in several treatment guidelines for reducing the risk of major adverse cardiovascular events (MACE), such as cardiovascular death, non-fatal myocardial infarction (MI), and non-fatal stroke [1-6]. However, unlike sodium-glucose co-transporter 2 (SGLT2) inhibitors [7], evidence of a benefit for GLP-1 RAs in heart failure (HF) is controversial and remains to be fully estab
Section 2 Content: ResultsMeta-analysis of clinical resultsEvidence that tirzepatide protects against diabetes-related cardiac damages Fatemeh Taktaz'*, Lucia Scisciola'”’, Rosaria Anna Fontanella', Ada Pesapane', Puja Ghosh’, Martina Franzese', Giovanni Tortorella', Armando Puocci!, Eduardo Sommella?, Giuseppe Signoriello*, Fabiola Olivieri*®, Michelangela Barbieri'* and Giuseppe Paolisso'**Card



PDF: fendo-14-1285147
Section 1 Content: Background Description of the condition More than 500 million people are living with diabetes. Furthermore, the number of people with diabetes is expected to reach 643 million by 2030 and 783 million by 2045. In 2021, 6.7 million deaths were attributed to diabetes (1). Diabetes is currently the greatest pandemic of the 21st century according to many epidemiologists (2, 3). The high global prevalence of diabetes has a vexing impact on individuals, healthcare systems, and countries all over the gl
Section 2 Content: 
Section 3 Content: 






PDF: fcdhc-03-947552
Section 1 Content: Introduction The fourth decade of the HIV epidemic marks many milestones in the treatment and management of HIV as a chronic disease. These medical advances have paved the way for new questions about how to optimize the health span for people living with HIV (PLWH), including chronic co- morbidity risk reduction and management. PLWH experience higher rates of diabetes compared to the general US population (1), with one in ten PLWH having diabetes (1) and another three in ten PLWH having prediabe
Section 2 Content: 
Section 3 Content: 






PDF: Visceral adiposity index as a predictor of type 2 diabetes mellitus risk- A systematic review and doseeresponse meta-analysis
Section 1 Content: 1. Introduction Diabetes mellitus remains a serious and growing challenge to public health and places a huge burden on individuals affected and their families. According to the International Diabetes Federation, in 2021, it is estimated that 537 million (10.5% of the population) people have diabetes, and this number is projected to reach 643 million (11.3%) by 2030, and 783 million (12.2%) by 2045 [1]. While Type 2 diabetes mellitus (T2DM) accounts for the vast majority (over 90%) of diabetes wo
Section 2 Content: 3. Results 3.1. Study selection and characteristics of included studies 3.1. Study selection and characteristics of included studies A total of 508 potentially relevant records were identified in our literature. According to the pre-specified inclusion criteria, 14 studies [14,29—41] included in our analysis (Fig. 1). Overall, A



PDF: Prognostic Significance of Diastolic Dysfunction in Type 2 Diabetes Mellitus Patients With Sepsis and Septic Shock- Insights From a Longitudinal Tertiary Care Study
Section 1 Content: Introduction Sepsis affects 48.9 million people per year globally, of which 11 million die, representing a mortality rate of 19.7% [1]. According to the Third International Consensus Definitions for Sepsis and Septic Shock (Sepsis-  is defined as a life-threatening dysfunction caused by the dysregulated host response to  sociated with a >10% in-hospital mortality. "Septic shock is defined as the subset of sepsis in which particularly profound circulatory, cellular, and metabolic abnormalities ar
Section 2 Content: Results A total of 132 patients were enrolled after the exclusion of 12 patients with poor echocardiography images, 10 patients with known cases of coronary artery disease showing regional wall abnormalities, and five patients with moderate to severe valvular heart disease. The main source 



整合和保存完成。
