In [1]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [2]:
from PyPDF2 import PdfReader
import os

# Path to the dataset folder in Kaggle
dataset_folder = '/kaggle/input/kdsh-datasetheadx/KDSH Dataset'

# List all PDF files in the folder
pdf_files = [file for file in os.listdir(dataset_folder) if file.endswith('.pdf')]
ds_unclassified = [file for file in os.listdir(dataset_folder) if file.endswith('.pdf') and file.startswith('P')]
ds_publishnot = [file for file in pdf_files if 'R001' <= file <= 'R006']
ds_publish = [file for file in pdf_files if 'R006' <= file <= 'R016']

In [3]:
print(ds_publishnot)
print(ds_publish)

['R003.pdf', 'R005.pdf', 'R002.pdf', 'R004.pdf', 'R001.pdf']
['R015.pdf', 'R010.pdf', 'R012.pdf', 'R008.pdf', 'R011.pdf', 'R007.pdf', 'R009.pdf', 'R013.pdf', 'R014.pdf', 'R006.pdf']


In [4]:
import pandas as pd
import re
import os
from PyPDF2 import PdfReader

# Initialize a list to store the extracted data
data = []

# Extract required information from each PDF
for pdf_file in pdf_files:
    file_path = os.path.join(dataset_folder, pdf_file)
    reader = PdfReader(file_path)
    
    # Extract text from all pages of the PDF
    full_text = ""
    for page in reader.pages:
        full_text += page.extract_text()

    # Extract information
    pdf_name = pdf_file
    title = full_text.split('\n')[0]  # Assume the title is the first line
    abstract_start = full_text.lower().find("abstract")
    intro_start = full_text.lower().find("1 introduction")
    abstract = full_text[abstract_start:intro_start].strip() if abstract_start != -1 and intro_start != -1 else "N/A"

    # Define the pattern for Section 2
    pattern2 = r"2\s[A-Z][a-zA-Z\s]+\n"
    match = re.search(pattern2, full_text)
    introend_search = match.start() if match else -1
    introduction_end = introend_search
    introduction = full_text[intro_start:introduction_end].strip() if intro_start != -1 else "N/A"

    # Search for Section 2
    section2_start = introduction_end
    pattern3 = r"3\s[A-Z][a-zA-Z\s]+\n"
    match3 = re.search(pattern3, full_text)
    section2end_search = match3.start() if match3 else -1
    section2_end = section2end_search
    section2 = full_text[section2_start:section2_end].strip() if section2_start != -1 else "N/A"

    # Search for Section 3
    section3_start = section2_end
    pattern4 = r"4\s[A-Z][a-zA-Z\s]+\n"
    match4 = re.search(pattern4, full_text)
    section3end_search = match4.start() if match4 else -1
    section3_end = section3end_search
    section3 = full_text[section3_start:section3_end].strip() if section3_start != -1 else "N/A"
    
    # Extract the rest of the paper
    rest_of_paper = full_text[section3_end:].strip() if section3_end != -1 else "N/A"

    # Append the extracted information to the list
    data.append([pdf_name, title, abstract, introduction, section2, section3, rest_of_paper])

# Create a DataFrame
columns = ["PDF Name", "Title of Paper", "Abstract", "Introduction", "Section 2", "Section 3", "Rest of the Paper"]
df = pd.DataFrame(data, columns=columns)

# Display the dataset
print(df)


     PDF Name                                     Title of Paper  \
0    P063.pdf  Representation Transferability in Neural Networks   
1    P038.pdf  Utilizing Graph Neural Networks to Analyze Esp...   
2    P119.pdf  Entropy Dynamics in Turbulent Flumplenook Systems   
3    P071.pdf  The Significance of Fillers in Textual Represe...   
4    P020.pdf  Deep Learning for 3D Protein Structure Predict...   
..        ...                                                ...   
145  P050.pdf  Interpreting Recurrent and Attention-Based Neural   
146  P134.pdf     Unraveling the Enigmatic Parallels Between DNA   
147  P101.pdf          A Convolutional LSTM Network Approach for   
148  P073.pdf  Exploring Soil Dynamics through a Multidiscipl...   
149  P096.pdf  Volcanic Eruptions in Relation to Quiche Recip...   

                                              Abstract  \
0    Abstract\nDeep neural networks, which are buil...   
1    Abstract\nGraph Neural Networks (GNNs) for Pre...   
2    Abst

In [5]:
df.tail(20)

Unnamed: 0,PDF Name,Title of Paper,Abstract,Introduction,Section 2,Section 3,Rest of the Paper
130,P032.pdf,Exploring the Transcendental Nexus of Water and,Abstract\nThe aquatic nuances of water travers...,1 Introduction\nIn order to fully grasp the im...,2 Related Work\nThe notion of water as a fluid...,3 Methodology\nThe investigation of water nece...,4 Experiments\nThe initialization of our resea...
131,P088.pdf,Analyzing Groups of Neurons in Neural Networks...,"Abstract\nThe concept of a ""modular"" structure...","1 Introduction\nModularity, a principle where ...",2 Related Work\nThe investigation of modularit...,3 Quantifying modularity by clustering similar...,4 Experiments\n4.1 Setup and initial hypothese...
132,P111.pdf,Leveraging Deep Learning for Enhanced Bayesian...,Abstract\nBayesian optimization (BO) is a wide...,1 Introduction\nBayesian optimization (BO) is ...,2 Related Work\nSeveral methods have been deve...,3 Methodology\n3.1 Bayesian Optimization Prere...,"4 Auxiliary Information\nTypically, we assume ..."
133,P039.pdf,RAG Optimization via Galactic Kitten Dynamics and,Abstract\nInvestigating RAG necessitates scrut...,1 Introduction\nRAG is a phenomenon that has b...,2 Related Work\nThe inherent properties of gal...,3 Methodology\nIn order to facilitate a compre...,4 Experiments\nIn order to facilitate a compre...
134,P102.pdf,A Large-Scale Car Dataset for Fine-Grained,Abstract\nThis paper aims to highlight vision ...,1 Introduction\nCars represent a revolution in...,2 Related Work\nMost previous car model resear...,3 Properties of CompCars\nThe CompCars dataset...,"4 Applications\nIn this section, we study thre..."
135,P007.pdf,Joint Syntacto-Discourse Parsing and the,Abstract\nDiscourse parsing has long been trea...,1 Introduction\nDistinguishing the semantic re...,,3 Recurrent Neural Models and Training\nThe sc...,4 Experiments\nWe use the treebank described i...
136,P085.pdf,Privacy Evaluation in Tabular Synthetic Data:,Abstract\nThis paper examines the present meth...,1 Introduction and Relation to Prior Research\...,2 Definitions and Notation\nTo the best of our...,3 Synthetic Data Privacy Risks\nThree signific...,4 Mathematical Privacy Properties\n4.1 Differe...
137,P053.pdf,Microprocessor Architectures and their Interse...,Abstract\nMicroprocessors have been profoundly...,1 Introduction\nThe intersection of microproce...,2 Related Work\nThe advent of microprocessor t...,3 Methodology\nThe elucidation of microprocess...,4 Experiments\nThe experimental design for thi...
138,P036.pdf,Profound Impact on Gravity on the Surface of a,Abstract\nThe study of gravity necessitates a ...,1 Introduction\nThe complexity of gravity and ...,2 Related Work\nThe concept of gravity has bee...,3 Methodology\nTo initiate our inquiry into th...,4 Experiments\nThe notion of gravity was first...
139,P046.pdf,Symbiotic Adversarial Robustness for Graph Neural,Abstract\nDeep learning models are known to be...,1 Introduction\nGraph neural networks (GNNs) a...,2 Preliminaries\nNotation. We denote a graph b...,3 Symbiotic Attacks\nThe Symbiotic Objective. ...,4 Evaluation\n4.1 Setup\nWe compare the symbio...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   PDF Name           150 non-null    object
 1   Title of Paper     150 non-null    object
 2   Abstract           150 non-null    object
 3   Introduction       150 non-null    object
 4   Section 2          150 non-null    object
 5   Section 3          150 non-null    object
 6   Rest of the Paper  150 non-null    object
dtypes: object(7)
memory usage: 8.3+ KB


In [7]:
print(df["Introduction"])

0      1 Introduction\nDeep networks, constructed wit...
1      1 Introduction\nThe realm of Graph Neural Netw...
2      1 Introduction\nThe notion of entropy, a conce...
3      1 Introduction\nThis paper addresses the criti...
4      1 Introduction\nThe prediction of 3D protein s...
                             ...                        
145    1 Introduction\nDeep learning has achieved tre...
146    1 Introduction\nThe intersection of quantum me...
147    1 Introduction\nThis paper addresses the criti...
148    1 Introduction\nThe fledgling discipline of so...
149    1 Introduction\nThe ostensibly unrelated field...
Name: Introduction, Length: 150, dtype: object


In [8]:
print(df["Section 2"][2])

2 Related Work
The concept of entropy has been extensively studied in various fields, including the art of baking
croissants, where the flaky layers of dough are believed to exhibit a high degree of entropy due to
the random arrangement of butter and pastry. This phenomenon is closely related to the study of
linguistics, particularly in the analysis of the grammatical structure of ancient Sumerian texts, which
has been shown to possess a unique entropy signature that can be used to identify the authorship of
various tablets. Furthermore, research has demonstrated that the entropy of a system can be directly
correlated to the number of jellybeans in a jar, with a higher entropy corresponding to a greater
number of jellybeans.
In a related study, scientists discovered that the entropy of a cup of coffee is directly proportional to
the amount of creamer added, with a maximum entropy achieved when the creamer is stirred in a
counterclockwise direction. This finding has significant implicat