In [1]:
pip install PyMuPDF


Collecting PyMuPDF
  Downloading PyMuPDF-1.23.23-cp38-none-macosx_11_0_arm64.whl.metadata (3.4 kB)
Collecting PyMuPDFb==1.23.22 (from PyMuPDF)
  Downloading PyMuPDFb-1.23.22-py3-none-macosx_11_0_arm64.whl.metadata (1.4 kB)
Downloading PyMuPDF-1.23.23-cp38-none-macosx_11_0_arm64.whl (3.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading PyMuPDFb-1.23.22-py3-none-macosx_11_0_arm64.whl (29.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m51.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: PyMuPDFb, PyMuPDF
Successfully installed PyMuPDF-1.23.23 PyMuPDFb-1.23.22
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dotenv import load_dotenv
import openai
import csv
import pandas as pd
from openai import OpenAI
import matplotlib.pyplot as plt
import fitz

In [3]:
# load_dotenv() runs before the lookup; the error message below suggests it is
# expected to supply OPENAI_API_KEY from a .env file.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

# Hand a non-empty key to the openai module; a missing or empty key is fatal.
if openai_api_key:
    openai.api_key = openai_api_key
else:
    raise ValueError("No OpenAI API key found. Check your .env file.")

In [4]:
def read_pdf(pdf_path):
    """Return the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file, passed unchanged to ``fitz.open``.

    Returns:
        str: The ``get_text()`` output of each page, concatenated in page
        order with no separator added. An empty string if the document
        yields no pages.
    """
    # The context manager closes the document (and its underlying file
    # handle) even when text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        return "".join(page.get_text() for page in doc)

In [43]:
# Shared client used by summarize_text. It is built with no arguments, so it
# presumably picks up the API key from the environment (confirm against the
# openai client documentation).
client = OpenAI()

# Running tally of successful chat-completion requests; summarize_text
# increments it. (A `global` statement is meaningless at module level, so none
# is needed here.)
api_call_count = 0

def summarize_text(text, model="gpt-3.5-turbo", temperature=0.5, max_tokens=500):
    """Ask the chat model for a five-part structured summary of a paper.

    Args:
        text: Full text of the paper; sent as the user message.
        model: Chat model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Upper bound on the length of the reply. A low value can
            cut the summary off mid-section; raise it if sections come back
            truncated.

    Returns:
        str: The model's reply with surrounding whitespace removed, or an
        empty string when ``text`` is blank or the request fails.

    Side effects:
        Increments the module-level ``api_call_count`` after each successful
        request, and prints a message when the call is skipped or fails.
    """
    global api_call_count

    # Blank input (e.g. a PDF with no extractable text) cannot produce a
    # meaningful summary, so do not spend an API call on it.
    if not text or not text.strip():
        print("No text to summarize; skipping API call.")
        return ""

    system_prompt = """
    Summarize the following scientific paper with each piece of information on a separate line and in the following order: 
    1. Authors: [List of authors]
    2. Date of Publication: [Publication date]
    3. Title: [Title of the paper]
    4. Most Important Findings: [Key findings]
    5. Methods: [Methods used]
    """

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": text}
    ]

    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )

        # Only requests that returned a response are counted.
        api_call_count += 1

        # The message content may be None; treat that as an empty reply.
        content = response.choices[0].message.content
        return (content or "").strip()

    except Exception as e:
        # Best effort: report the failure and let the caller carry on with an
        # empty summary instead of aborting a whole batch.
        print(f"An error occurred: {e}")
        return ""



In [47]:
def parse_structured_summary(summary):
    """Split a numbered five-section summary into a dict of section texts.

    A line that starts (after optional leading whitespace) with one of the
    literal headers ``"1. Authors:"``, ``"2. Date of Publication:"``,
    ``"3. Title:"``, ``"4. Most Important Findings:"`` or ``"5. Methods:"``
    opens that section. Any text after the header on the same line, and
    every following non-blank line up to the next header, belongs to the
    section. Lines that appear before the first header are ignored.

    Args:
        summary: The summary text; ``None`` is treated like an empty string.

    Returns:
        dict: Keys ``'Authors'``, ``'Date of Publication'``, ``'Title'``,
        ``'Most Important Findings'`` and ``'Methods'`` (in that order), each
        mapped to the section's lines joined by single spaces, or ``''`` when
        the section is absent.
    """
    # Literal header prefix -> output key, in output order.
    section_prefixes = (
        ("1. Authors:", 'Authors'),
        ("2. Date of Publication:", 'Date of Publication'),
        ("3. Title:", 'Title'),
        ("4. Most Important Findings:", 'Most Important Findings'),
        ("5. Methods:", 'Methods'),
    )

    # Collect the pieces of each section and join them once at the end, so
    # consecutive lines are always separated by exactly one space.
    collected = {key: [] for _, key in section_prefixes}
    current_section = None

    for raw_line in (summary or "").split('\n'):
        line = raw_line.strip()
        if not line:
            continue

        for prefix, key in section_prefixes:
            if line.startswith(prefix):
                current_section = key
                # Keep whatever follows the header on the same line; slicing
                # removes only the leading header, not later occurrences.
                line = line[len(prefix):].strip()
                break

        if current_section and line:
            collected[current_section].append(line)

    return {key: " ".join(parts) for key, parts in collected.items()}



In [37]:
def create_summary_dataframe(pdf_paths):
    """Summarize each PDF and collect the parsed summaries in a DataFrame.

    Each path goes through ``read_pdf`` -> ``summarize_text`` ->
    ``parse_structured_summary``; every parsed dict becomes one row, in the
    order the paths were given.

    Args:
        pdf_paths: Iterable of PDF file paths.

    Returns:
        pandas.DataFrame: One row per path with the columns ``Authors``,
        ``Date of Publication``, ``Title``, ``Most Important Findings`` and
        ``Methods``. The columns are present even when ``pdf_paths`` is
        empty, so downstream column access and CSV export keep working.
    """
    # Fixing the schema here keeps an empty batch from producing a frame
    # with no columns at all.
    columns = [
        'Authors',
        'Date of Publication',
        'Title',
        'Most Important Findings',
        'Methods',
    ]

    parsed_summaries = [
        parse_structured_summary(summarize_text(read_pdf(path)))
        for path in pdf_paths
    ]

    return pd.DataFrame(parsed_summaries, columns=columns)

In [40]:
#single PDF test
#pdf_paths = ['/Users/Zantana/repo/Dailys/24_02_19/test_paper1.pdf']
#df = create_summary_dataframe(pdf_paths)
#print(df)

Summary before parsing: 1. Authors: Paul F. Simmering, Paavo Huoviala
2. Date of Publication: October 30, 2023
3. Title: Large Language Models for Aspect-Based Sentiment Analysis
4. Most Important Findings:
   - GPT-3.5 achieved a state-of-the-art F1 score of 83.8 on the joint aspect term extraction and polarity classification task of the SemEval-2014 Task 4.
   - Fine-tuned GPT-3.5 outperformed the previous state-of-the-art model, InstructABSA, even without in-context examples.
   - Detailed prompts improved performance in zero-shot and few-shot settings but were not necessary for fine-tuned models.
   - Fine-tuning the model was the most efficient option for achieving superior performance in aspect-based sentiment analysis.
   - Errors made by the models were related to discrepancies between the model's predictions and the annotation rules of the benchmark dataset.
5. Methods:
   - Evaluation was done on the SemEval-2014 dataset for aspect-based sentiment analysis.
   - Models tested

In [41]:
#df.head()

Unnamed: 0,Authors,Date of Publication,Title,Most Important Findings,Methods
0,,,,- GPT-3.5 achieved a state-of-the-art F1 score...,- Evaluation was done on the SemEval-2014 data...


In [42]:
#df.to_csv('summary.csv', index=False)

In [44]:
def create_summary_dataframe_from_folder(folder_path):
    """Summarize every PDF file directly inside a folder.

    Args:
        folder_path: Directory to scan. Only its immediate entries are
            considered; sub-folders are not searched.

    Returns:
        pandas.DataFrame: The result of ``create_summary_dataframe`` for the
        PDF files found, one row per file in sorted file-name order.
    """
    pdf_paths = []
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order reproducible between runs and machines.
    for name in sorted(os.listdir(folder_path)):
        path = os.path.join(folder_path, name)
        # Accept any letter case of the extension (.pdf, .PDF, ...) and skip
        # directories that merely happen to be named like a PDF.
        if name.lower().endswith('.pdf') and os.path.isfile(path):
            pdf_paths.append(path)

    # Reuse the list-based pipeline instead of duplicating its loop here.
    return create_summary_dataframe(pdf_paths)

In [48]:
# Folder holding the PDFs to summarize (machine-specific location).
folder_path = '/Users/Zantana/repo/Dailys/24_02_19/test_paper'

# Build one summary row per PDF in that folder, then show the table as text.
df = create_summary_dataframe_from_folder(folder_path=folder_path)
print(df)


                                             Authors Date of Publication  \
0                  Paul F. Simmering, Paavo Huoviala    October 30, 2023   
1  Ashish Vaswani, Noam Shazeer, Niki Parmar, Jak...     2nd August 2023   

                                               Title  \
0  Large Language Models for Aspect-Based Sentime...   
1                          Attention Is All You Need   

                             Most Important Findings  \
0  - GPT-3.5 achieved a state-of-the-art F1 score...   
1  - Proposed the Transformer model based solely ...   

                                             Methods  
0  - Evaluation was done on the SemEval-2014 data...  
1  - Introduced the Transformer architecture base...  


In [49]:
# Write the summary table to summary.csv in the working directory, leaving
# out the row index.
df.to_csv(path_or_buf='summary.csv', index=False)