# KEYWORDS DETECTION FOR CONFERENCES

In [183]:
import PyPDF2

def extract_text_from_pdf(file_path):
    """
    Extract text from a PDF file using PyPDF2.
    :param file_path: Path to the PDF file.
    :return: Extracted text as a single string.
    """
    text = ""
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"  # Extract text page by page
    return text

# File path
file_path = "../Dataset/Reference/Publishable/TMLR/R015.pdf"

# Extract text from PDF
pdf_text = extract_text_from_pdf(file_path)

# Display the extracted text
print(pdf_text)

Examining the Convergence of Denoising Diffusion Probabilistic
Models: A Quantitative Analysis
Abstract
Deep generative models, particularly diffusion models, are a significant family within deep learning. This study
provides a precise upper limit for the Wasserstein distance between a learned distribution by a diffusion model
and the target distribution. In contrast to earlier research, this analysis does not rely on presumptions regarding
the learned score function. Furthermore, the findings are applicable to any data-generating distributions within
restricted instance spaces, even those lacking a density relative to the Lebesgue measure, and the upper limit is not
exponentially dependent on the ambient space dimension. The primary finding expands upon recent research by
Mbacke et al. (2023), and the proofs presented are fundamental.
1 Introduction
Diffusion models, alongside generative adversarial networks and variational autoencoders (V AEs), are among the most influential
families

In [184]:
def extract_abstract(pdf_text):
    """
    Extract the title and abstract dynamically from the PDF text.
    Title: From the start to "Abstract".
    Abstract: From "Abstract" to "Introduction".
    """
    abstract = "", ""
    try:
        # Extract Abstract
        abstract_start_idx = pdf_text.index("Abstract") + len("Abstract") + len("Introdcution")
        abstract_end_idx = pdf_text.index("2")
        print(abstract_start_idx, abstract_end_idx)
        abstract = pdf_text[abstract_start_idx:abstract_end_idx].strip()
    except ValueError as e:
        print(f"Error extracting title and abstract: {e}")
    
    return abstract

abstract = extract_abstract(pdf_text)

print("\nAbstract:\n", abstract)

115 799

Abstract:
 tive models, particularly diffusion models, are a significant family within deep learning. This study
provides a precise upper limit for the Wasserstein distance between a learned distribution by a diffusion model
and the target distribution. In contrast to earlier research, this analysis does not rely on presumptions regarding
the learned score function. Furthermore, the findings are applicable to any data-generating distributions within
restricted instance spaces, even those lacking a density relative to the Lebesgue measure, and the upper limit is not
exponentially dependent on the ambient space dimension. The primary finding expands upon recent research by
Mbacke et al. (


In [185]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample paragraph
paragraph = "Python is a versatile programming language. It is widely used in web development, data science, and machine learning."

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform([abstract])

# Extract keywords and their scores
scores = zip(vectorizer.get_feature_names_out(), X.toarray()[0])
sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)

# Top keywords
keywords = sorted_scores  # Adjust the number as needed
from pprint import pprint
pprint(sorted_scores)


[('diffusion', np.float64(0.22941573387056174)),
 ('distribution', np.float64(0.22941573387056174)),
 ('learned', np.float64(0.22941573387056174)),
 ('limit', np.float64(0.22941573387056174)),
 ('models', np.float64(0.22941573387056174)),
 ('research', np.float64(0.22941573387056174)),
 ('upper', np.float64(0.22941573387056174)),
 ('al', np.float64(0.11470786693528087)),
 ('ambient', np.float64(0.11470786693528087)),
 ('analysis', np.float64(0.11470786693528087)),
 ('applicable', np.float64(0.11470786693528087)),
 ('contrast', np.float64(0.11470786693528087)),
 ('data', np.float64(0.11470786693528087)),
 ('deep', np.float64(0.11470786693528087)),
 ('density', np.float64(0.11470786693528087)),
 ('dependent', np.float64(0.11470786693528087)),
 ('dimension', np.float64(0.11470786693528087)),
 ('distance', np.float64(0.11470786693528087)),
 ('distributions', np.float64(0.11470786693528087)),
 ('does', np.float64(0.11470786693528087)),
 ('earlier', np.float64(0.11470786693528087)),
 ('et', 

In [186]:
KDD_KEYWORDS = [
    "data cleaning",
    "preparation",
    "data transformation",
    "mining",
    "scalability",
    "explainability",
    "data privacy"
]

CVPR_KEYWORDS = [
    "image",
    "3d",
    "detection",
    "video",
    "segmentation",
    "transformer",
    "representation",
    "generation",
    "diffusion",
]

TMLR_KEYWORDS = [
    "Statistical Learning Theory",
    "Optimization Algorithms",
    "Generalization",
    "Adversarial Robustness",
    "Kernel Methods",
    "Probabilistic Graphical Models",
    "Bayesian Inference",
    "Computational Efficiency",
    "Ethics",
    "Fairness",
    "Policy Implications",
    "Benchmarking Studies",
    "Responsible"
]

NEURIPS_KEYWORDS = [
    "Deep Learning Architectures",
    "Transformers",
    "Diffusion Models",
    "Multimodal Networks",
    "Neuroscience-Inspired Models",
    "Non-Convex Optimization",
    "Differential Privacy",
]

EMNLP_KEYWORDS = [
    "Natural Language Understanding",
    "Natural Language Generation",
    "Machine Translation",
    "Speech-to-Text",
    "Sentiment Analysis",
    "Question Answering",
    "Dialogue Systems",
    "Information Retrieval",
    "Summarization",
    "Text Classification",
    "Entity Recognition"
    "semantic parsing",
    "semantic classification",
    "natural language toolkit",
    "shallow parsing",
    "ambiguous language",
    "pragmatic language",
    "morphological language",
    "chunking",
    "syntax processing",
    "role labeling",
    "textual entailment",
    "word sense disambiguation",
    "discourse analysis",
    "information extraction",
    "corereference resolution",
]

## Keyword matching in the abstract of papers

In [187]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Keywords for each conference
keywords = {
    "KDD": KDD_KEYWORDS,
    "CVPR": CVPR_KEYWORDS,
    "TMLR": NEURIPS_KEYWORDS,
    "NEURIPS": TMLR_KEYWORDS,
    "EMNLP": EMNLP_KEYWORDS
}

# Extracted keywords
extracted_keywords = [keyword for keyword, score in sorted_scores]

# Create a list of documents (one per conference) and the extracted keywords
documents = [' '.join(conf_keywords) for conf_keywords in keywords.values()]
documents.append(' '.join(extracted_keywords))

# Compute the vectorized representation of the keywords
vectorizer = CountVectorizer()
vectors = vectorizer.fit_transform(documents)

# Compute cosine similarity between the extracted keywords vector and each conference vector
similarities = cosine_similarity(vectors[-1], vectors[:-1])

# Find the conference with the highest cosine similarity
conference = list(keywords.keys())[similarities.argmax()]

print(f"Predicted Conference: {conference}")


Predicted Conference: TMLR
