# Libraries

In [37]:
import requests
import pdfplumber
from io import BytesIO
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.probability import FreqDist
import re

In [38]:
# Fetch the NLTK data packages this notebook uses: "punkt" (tokenizer models,
# presumably what sent_tokenize/word_tokenize load) and the "stopwords" corpus
# read through stopwords.words("english").
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/juanm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/juanm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# PDF2Text | _Understanding the Formation of Galaxies with Warm Dark Matter_


In [39]:
url = "https://arxiv.org/pdf/2310.06882.pdf"
# Without a timeout, requests waits indefinitely on a stalled connection.
response = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of saving and parsing an error page as
# if it were the PDF.
response.raise_for_status()

In [41]:
# Persist the downloaded bytes unchanged as the raw PDF copy.
with open('../data/v1/paper_raw.pdf', 'wb') as raw_pdf_file:
    raw_pdf_file.write(response.content)

In [42]:
# Parse the downloaded bytes in memory. The context manager closes the PDF
# even when extraction fails part-way through the pages.
with pdfplumber.open(BytesIO(response.content)) as pdf:
    # extract_text() can return None for a page without extractable text;
    # fall back to "" so str.join does not raise TypeError.
    text = "\n".join([page.extract_text() or "" for page in pdf.pages])

In [43]:
# Store the extracted page text as a UTF-8 text file.
with open("../data/v1/paper.txt", "w", encoding="utf-8") as text_file:
    text_file.write(text)

# Summarization | _Word-Frequency Sentence Scoring_

In [44]:
def summarize_text(text, num_sentences, max_sentence_length=30):
    """Build an extractive summary from the highest-scoring sentences of *text*.

    A sentence's score is the sum of the document-wide frequencies of its
    content words (alphanumeric tokens that are not English stopwords,
    compared case-insensitively), plus one point for each math keyword it
    contains and one point for each arithmetic operator token it contains.

    Args:
        text: Document to summarize.
        num_sentences: Maximum number of sentences in the summary.
        max_sentence_length: Sentences with more whitespace-separated words
            than this are never selected.

    Returns:
        The selected sentences joined by single spaces, highest score first.
        An empty string when no sentence qualifies or ``num_sentences`` is
        not positive.
    """
    if num_sentences <= 0:
        # A negative slice bound would otherwise select "all but the last k".
        return ""

    # Splits into alphabetic runs, digit runs, and single symbols so that
    # operators inside mathematical expressions become their own tokens.
    token_pattern = re.compile(r'[A-Za-z]+|[0-9]+|\S')
    stop_words = set(stopwords.words("english"))
    math_keywords = ("function", "equation", "variable")
    math_operators = {"+", "-", "*", "/", "="}

    sentences = sent_tokenize(text)
    tokens_per_sentence = [token_pattern.findall(sentence) for sentence in sentences]

    # Content words are case-folded once here so the frequency table and the
    # per-sentence lookups use the same keys; punctuation is excluded so it
    # cannot dominate the counts.
    words_per_sentence = [
        [
            token.casefold()
            for token in tokens
            if token.isalnum() and token.casefold() not in stop_words
        ]
        for tokens in tokens_per_sentence
    ]

    word_freq = FreqDist(
        word for words in words_per_sentence for word in words
    )

    # Keyed by sentence text, so a sentence repeated in the document is
    # only ever selected once.
    sentence_scores = {}
    for sentence, tokens, words in zip(sentences, tokens_per_sentence, words_per_sentence):
        if not words:
            # Nothing but stopwords/punctuation: not a summary candidate.
            continue
        lowered = sentence.lower()
        keyword_score = sum(1 for keyword in math_keywords if keyword in lowered)
        # Count operators in this sentence only, not in the whole document.
        operator_score = sum(1 for token in tokens if token in math_operators)
        frequency_score = sum(word_freq[word] for word in words)
        sentence_scores[sentence] = frequency_score + keyword_score + operator_score

    # sorted() is stable, so equally scored sentences keep document order.
    ranked = sorted(sentence_scores.items(), key=lambda item: item[1], reverse=True)

    summary_sentences = [
        sentence
        for sentence, _score in ranked
        if len(sentence.split()) <= max_sentence_length
    ][:num_sentences]

    return " ".join(summary_sentences)

In [53]:
# Summarize the extracted paper text, keeping at most ten sentences.
summary = summarize_text(text, num_sentences=10)

In [54]:
# Write the generated summary to disk as UTF-8 text.
with open("../data/v1/summary.txt", "w", encoding="utf-8") as summary_file:
    summary_file.write(summary)