In [75]:
%pip install nltk pdfplumber sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [76]:
%pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Extract Text from the PDF

In [77]:
import PyPDF2

def extract_text_from_pdf(file_path):
    """
    Extract text from a PDF file using PyPDF2.
    :param file_path: Path to the PDF file.
    :return: Extracted text as a single string.
    """
    text = ""
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"  # Extract text page by page
    return text

# file_path = "..\Dataset\Papers\P001.pdf"
file_path = "../Dataset/Reference/Publishable/NeurIPS/R013.pdf"
pdf_text = extract_text_from_pdf(file_path)
print(pdf_text)

Generalization in ReLU Networks via Restricted
Isometry and Norm Concentration
Abstract
Regression tasks, while aiming to model relationships across the entire input space,
are often constrained by limited training data. Nevertheless, if the hypothesis func-
tions can be represented effectively by the data, there is potential for identifying a
model that generalizes well. This paper introduces the Neural Restricted Isometry
Property (NeuRIPs), which acts as a uniform concentration event that ensures all
shallow ReLU networks are sketched with comparable quality. To determine the
sample complexity necessary to achieve NeuRIPs, we bound the covering numbers
of the networks using the Sub-Gaussian metric and apply chaining techniques. As-
suming the NeuRIPs event, we then provide bounds on the expected risk, applicable
to networks within any sublevel set of the empirical risk. Our results show that all
networks with sufficiently small empirical risk achieve uniform generalization.
1 Introd

# Title and Abstract Extraction

In [78]:
# Function to dynamically extract title and abstract
def extract_title_and_abstract(pdf_text):
    """
    Extract the title and abstract dynamically from the PDF text.
    Title: From the start to "Abstract".
    Abstract: From "Abstract" to "Introduction".
    """
    title, abstract = "", ""
    try:
        # Extract Title
        title_end_idx = pdf_text.index("Abstract")
        title = pdf_text[:title_end_idx].strip()

        # Extract Abstract
        abstract_start_idx = pdf_text.index("Abstract") + len("Abstract")
        abstract_end_idx = pdf_text.index("Introduction")
        abstract = pdf_text[abstract_start_idx:abstract_end_idx].strip()
    except ValueError as e:
        print(f"Error extracting title and abstract: {e}")
    
    return title, abstract

title, abstract = extract_title_and_abstract(pdf_text)

print("Title:\n", title)
print("\nAbstract:\n", abstract)

Title:
 Generalization in ReLU Networks via Restricted
Isometry and Norm Concentration

Abstract:
 Regression tasks, while aiming to model relationships across the entire input space,
are often constrained by limited training data. Nevertheless, if the hypothesis func-
tions can be represented effectively by the data, there is potential for identifying a
model that generalizes well. This paper introduces the Neural Restricted Isometry
Property (NeuRIPs), which acts as a uniform concentration event that ensures all
shallow ReLU networks are sketched with comparable quality. To determine the
sample complexity necessary to achieve NeuRIPs, we bound the covering numbers
of the networks using the Sub-Gaussian metric and apply chaining techniques. As-
suming the NeuRIPs event, we then provide bounds on the expected risk, applicable
to networks within any sublevel set of the empirical risk. Our results show that all
networks with sufficiently small empirical risk achieve uniform generalizatio

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform([abstract])

# Extract keywords and their scores
scores = zip(vectorizer.get_feature_names_out(), X.toarray()[0])
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

# Top keywords
keywords = sorted_scores  # Adjust the number as needed

pprint(sorted_scores)

[('networks', np.float64(0.3730019232961255)),
 ('neurips', np.float64(0.27975144247209416)),
 ('risk', np.float64(0.27975144247209416)),
 ('achieve', np.float64(0.18650096164806276)),
 ('data', np.float64(0.18650096164806276)),
 ('empirical', np.float64(0.18650096164806276)),
 ('event', np.float64(0.18650096164806276)),
 ('model', np.float64(0.18650096164806276)),
 ('uniform', np.float64(0.18650096164806276)),
 ('acts', np.float64(0.09325048082403138)),
 ('aiming', np.float64(0.09325048082403138)),
 ('applicable', np.float64(0.09325048082403138)),
 ('apply', np.float64(0.09325048082403138)),
 ('bound', np.float64(0.09325048082403138)),
 ('bounds', np.float64(0.09325048082403138)),
 ('chaining', np.float64(0.09325048082403138)),
 ('comparable', np.float64(0.09325048082403138)),
 ('complexity', np.float64(0.09325048082403138)),
 ('concentration', np.float64(0.09325048082403138)),
 ('constrained', np.float64(0.09325048082403138)),
 ('covering', np.float64(0.09325048082403138)),
 ('determ

# Long Paragraphs with No Full Stops

In [80]:
from nltk.tokenize import sent_tokenize

import nltk
nltk.download('punkt_tab')

# Function to validate paragraphs
def validate_paragraphs(paragraphs, word_threshold=50):
    results = []
    for para in paragraphs:
        sentences = sent_tokenize(para)  # Tokenize into sentences
        if len(sentences) == 0:  # No sentences detected
            results.append((para, False))  # Invalid paragraph
            continue
        avg_words_per_sentence = len(para.split()) / len(sentences)
        results.append((para, avg_words_per_sentence <= word_threshold))
    return results

# Split the extracted text into paragraphs
paragraphs = pdf_text.split("\n\n")  # Assuming double newlines separate paragraphs
validation_results = validate_paragraphs(paragraphs)

# Display validation results
for para, is_valid in validation_results:
    print(f"VALID: {is_valid}\nParagraph: {para}...\n")  # Print first 200 chars for clarity

VALID: True
Paragraph: Generalization in ReLU Networks via Restricted
Isometry and Norm Concentration
Abstract
Regression tasks, while aiming to model relationships across the entire input space,
are often constrained by limited training data. Nevertheless, if the hypothesis func-
tions can be represented effectively by the data, there is potential for identifying a
model that generalizes well. This paper introduces the Neural Restricted Isometry
Property (NeuRIPs), which acts as a uniform concentration event that ensures all
shallow ReLU networks are sketched with comparable quality. To determine the
sample complexity necessary to achieve NeuRIPs, we bound the covering numbers
of the networks using the Sub-Gaussian metric and apply chaining techniques. As-
suming the NeuRIPs event, we then provide bounds on the expected risk, applicable
to networks within any sublevel set of the empirical risk. Our results show that all
networks with sufficiently small empirical risk achieve uniform g

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vikra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Title and Abstract Misalignment

In [81]:
from sentence_transformers import SentenceTransformer, util

# Load pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Check title-abstract alignment
def check_title_abstract_alignment(title, abstract, threshold=0.5):
    title_embedding = model.encode(title, convert_to_tensor=True)
    abstract_embedding = model.encode(abstract, convert_to_tensor=True)
    similarity = util.cos_sim(title_embedding, abstract_embedding)
    return similarity.item() >= threshold, similarity.item()

# Perform the check
is_aligned, similarity_score = check_title_abstract_alignment(title, abstract)
print(f"Title and Abstract are {'aligned' if is_aligned else 'misaligned'} (Similarity Score: {similarity_score:.2f})")

Title and Abstract are misaligned (Similarity Score: 0.48)


# Abstracts having Multiple Paragraphs

In [82]:
paragraphs = abstract.strip().split("\n\n")
multiple_paras = len(paragraphs) > 1
print(multiple_paras)

False
