In [None]:
import os
import json
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer #Used to lemmatize words
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
import pandas as pd

from processing_pdf.processing_pdf import *
from model_summarizer.allenai_summarizer import SummarizationModel, load_data
from data_preparation.gpt_summary import get_summaries
from presentation_generation.generate_ppt import *

# Notebook-wide NLTK helpers: a WordNet lemmatizer and the English stop-word set.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Let pandas show up to 1000 characters per column before truncating.
pd.set_option("display.max_colwidth", 1000)

# Preparation
### Modify the pipeline variables in the cell below:

In [None]:
# Base name of the PDF to process (without the ".pdf" extension).
filename = "1901.00039v2"

# Summarization models the pipeline can use.
summary_models = [
    "allenai/led-large-16384-arxiv",
    "facebook/bart-large-xsum",
    "gpt-3.5-turbo-0125",
]

# Model selected for this run; change the index to pick another entry.
model_name = summary_models[0]

# True: let GPT build the slide JSON. False: build it manually from the summaries.
use_gpt_ppt_parsing = False

# 1. Processing PDF
- Read the PDF file from the /data/paper_pdf directory
- Try to retrieve the table of contents (specify it manually if it is not available)
- Extract text and images at section or subsection level, based on the table of contents
- Save the extracted data in the /processed directory

In [None]:
# Resolve the input PDF name and the working directories relative to the
# current working directory (expected to be the project root).
pdf_file = f"{filename}.pdf"
project_path = os.getcwd()
project_data_path = project_path + "/data/paper_pdf"
project_processed_data_path = project_path + "/processed"

# exist_ok=True makes the cell safe to re-run and avoids the race between an
# explicit existence check and the directory creation.
for directory in (project_data_path, project_processed_data_path):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Open the paper stored in the input directory.
pdf_path = f"{project_data_path}/{pdf_file}"
doc, total_text, total_pages = open_file(pdf_path)

# processing_pdf.auto_find_toc() is used because it cleans up the original
# table of contents; the raw alternative would be doc.get_toc().
table_of_content = auto_find_toc(doc)
display(table_of_content)

In [None]:
# Manual override: uncomment the list below to replace the automatically
# detected table of contents with a hand-written one.
# Entries appear to be [level, heading, page] — TODO confirm against the
# output of auto_find_toc().
# table_of_content = [[1, 'I. INTRODUCTION', 1],
#  [1, 'II. SFC BASED ON IPV6 SEGMENT ROUTING', 2],
#  [1, 'III. DESIGN OF THE SRV6 PROXY', 4],
#  [2, 'A. General Concepts and State-of-the-art', 4],
#  [2, 'B. SRNKv1', 5],
#  [2, 'C. SRNKv2', 7],
#  [2, 'D. Implementation of other SR proxy types', 8],
#  [1, 'IV. TESTING ENVIRONMENT', 8],
#  [1, 'V. PERFORMANCE ANALYSIS', 9],
#  [1, 'VII. CONCLUSIONS', 11]]

# Separate the content into sections.
# The processed folder is cleared first (presumably so output from a previous
# run is not mixed with this one — confirm in clear_processed_folder).
clear_processed_folder(project_processed_data_path)
title, authors, other_info, abstract = find_meta_data(doc, table_of_content)
# Metadata frame with "Title" and "Abstract" columns (a single row, assuming
# title and abstract are scalar values).
df_meta = pd.DataFrame([title, abstract]).T
df_meta.columns = ["Title", "Abstract"]
ds, json_dict = separate_content(total_text, table_of_content)
# The last argument is pdf_file without its extension, used as the base name.
save_dataframe(ds, df_meta, json_dict, project_processed_data_path,  pdf_file.rsplit(".", 1)[0])
# Extract the images into the processed folder.
find_images(doc, table_of_content, total_pages, project_processed_data_path)

# Print the extracted metadata for a quick visual check.
# display(ds)
print(f"\nTitle: {title}")
print(f"\nAuthors:{authors}")
print(f"\nOther info: {other_info}")
print(f"\nAbstract:{abstract}")


In [None]:
# Preview the extracted sections stored in the processed JSON file.
# The encoding is given explicitly, matching the other cells that read this
# file, so the result does not depend on the platform's default encoding.
with open(f"{project_processed_data_path}/{filename}.json", encoding='utf-8') as f:
    data = json.load(f)

# The file is already closed here; only the parsed data is needed for display.
for item in data:
    display(item)

# 2. Model Summarization
- If using GPT summarization (`model_name` is equal to 'gpt-3.5-turbo-0125'): Send query via OpenAI API
- Else, load chosen fine-tuned summarization model and generate section level summaries (this is the default option)

In [None]:
if model_name == "gpt-3.5-turbo-0125":
    # GPT route: load the extracted sections, summarize them via the API,
    # and store the result under gpt_summaries/.
    with open(f"processed/{filename}.json", encoding='utf-8') as f:
        data = json.load(f)
    # The return value is ignored and `data` is what gets saved below, so
    # get_summaries presumably adds the summaries to `data` in place.
    get_summaries(data)
    # Create the output directory on first use; otherwise open(..., 'w') fails.
    os.makedirs("gpt_summaries", exist_ok=True)
    with open(f"gpt_summaries/{filename}.json", 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
else:
    # Local model route: build the summarizer once and reuse that instance,
    # instead of constructing the same model a second time.
    summarizer = SummarizationModel(model_name)

    # Path of the JSON file extracted from the PDF.
    data_file_path = os.path.join('processed', f'{filename}.json')
    # Load the JSON file for summarization.
    pdf_data = load_data(data_file_path)

    # File the final summary is written to; make sure its folder exists.
    output_file = 'model_summarizer/results/model-summary_results.json'
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    summarizer.summarize_pdf(pdf_data, output_file, summarizer)

# 3. Generate PPT
- Obtain the summaries and images
- Caption images with pretrained image captioning model
- Parse PPT JSON either with GPT or manually
- Save JSON information as PowerPoint using python-pptx

In [None]:
section_texts = []
section_names = []
section_image_paths = []

# The two routes differ only in which file is read and which helper extracts
# the text of a section, so pick both up front and share a single loop.
if model_name == "gpt-3.5-turbo-0125":
    summary_path = f"gpt_summaries/{filename}.json"
    extract_text = get_section_groundtruth  # Use GPT generated summaries
else:
    summary_path = f"processed/{filename}.json"
    extract_text = get_section_summary  # Use pretrained model generated summaries

with open(summary_path, encoding='utf-8') as f:
    data = json.load(f)

# One entry per section in each list, in file order.
for section in data:
    section_names.append(section["Section"])
    section_texts.append(extract_text(section))
    section_image_paths.append(get_section_image_paths(section))

Create the slide JSON

In [None]:
if use_gpt_ppt_parsing:
    # GPT route: the helpers build the title, table-of-contents and content slides.
    title_slide_data = get_title_slide_data(filename)
    toc_slide_data = get_toc_slide_data(filename, section_names)
    content_slide_datas = get_content_slide_datas(section_names, section_texts)
else:
    # Manual route: assemble the slide dictionaries directly from the summaries.
    title_slide_data = {'title': filename, 'subtitle': 'Presentation subtitle'}

    # The table of contents starts with the file name at level 0; each section
    # heading is added at level 1.
    toc_entries = [{'text': filename, 'indent_level': 0}]
    content_slide_datas = []

    # NOTE(review): sections skipped below still have an entry in
    # section_image_paths, so that list can be longer than content_slide_datas
    # — confirm generate_section_level_ppt copes with the difference.
    for idx, sec in enumerate(data):
        heading = sec['Section']
        if heading == "No_title":
            continue

        toc_entries.append({'text': heading, 'indent_level': 1})

        # The first tokenized sentence is skipped; every remaining sentence
        # becomes a top-level bullet.
        bullets = [
            {'text': sentence, 'indent_level': 0}
            for sentence in sent_tokenize(section_texts[idx])[1:]
        ]

        # Each section contributes a list holding a single slide.
        content_slide_datas.append([{'title': heading, 'content': bullets}])

    toc_slide_data = {'title': 'Table of Contents', 'content': toc_entries}

The generated presentation slides are saved in the subfolder `<project_path>/data/powerpoints/`.

In [None]:
# Build the presentation from the slide data and the per-section image paths.
theme = "Parcel"
prs = generate_section_level_ppt(theme, title_slide_data, toc_slide_data, content_slide_datas, section_image_paths)

# Make sure the output folder exists before saving; it is not created
# anywhere else in the notebook, so a fresh checkout would fail here.
output_dir = "data/powerpoints"
os.makedirs(output_dir, exist_ok=True)
prs.save(f"{output_dir}/{filename}.pptx")

Done!