In [48]:
pip install nltk pdfplumber sentence_transformers

Note: you may need to restart the kernel to use updated packages.


In [49]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


# Extract Text from the PDF

In [50]:
import PyPDF2

def extract_text_from_pdf(file_path):
    """
    Extract text from a PDF file using PyPDF2.
    :param file_path: Path to the PDF file.
    :return: Extracted text as a single string.
    """
    text = ""
    with open(file_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            text += page.extract_text() + "\n"  # Extract text page by page
    return text

file_path = "..\Dataset\Papers\P001.pdf"
pdf_text = extract_text_from_pdf(file_path)
print(pdf_text)

Leveraging Clustering Techniques for Enhanced
Drone Monitoring and Position Estimation
Abstract
Drone tracking and localization are essential for various applications, including
managing drone formations and implementing anti-drone strategies. Pinpointing
and monitoring drones in three-dimensional space is difficult, particularly when
trying to capture the subtle movements of small drones during rapid maneuvers.
This involves extracting faint signals from varied flight settings and maintaining
alignment despite swift actions. Typically, cameras and LiDAR systems are used
to record the paths of drones. However, they encounter challenges in categorizing
drones and estimating their positions accurately. This report provides an overview
of an approach named CL-Det. It uses a clustering-based learning detection strategy
to track and estimate the position of drones using data from two types of LiDAR
sensors: Livox Avia and LiDAR 360. This method merges data from both LiDAR
sources to accurat

  file_path = "..\Dataset\Papers\P001.pdf"


# Title and Abstract Extraction

In [51]:
# Function to dynamically extract title and abstract
def extract_title_and_abstract(pdf_text):
    """
    Extract the title and abstract dynamically from the PDF text.
    Title: From the start to "Abstract".
    Abstract: From "Abstract" to "Introduction".
    """
    title, abstract = "", ""
    try:
        # Extract Title
        title_end_idx = pdf_text.index("Abstract")
        title = pdf_text[:title_end_idx].strip()

        # Extract Abstract
        abstract_start_idx = pdf_text.index("Abstract") + len("Abstract")
        abstract_end_idx = pdf_text.index("Introduction")
        abstract = pdf_text[abstract_start_idx:abstract_end_idx].strip()
    except ValueError as e:
        print(f"Error extracting title and abstract: {e}")
    
    return title, abstract

title, abstract = extract_title_and_abstract(pdf_text)

print("Title:\n", title)
print("\nAbstract:\n", abstract)

Title:
 Leveraging Clustering Techniques for Enhanced
Drone Monitoring and Position Estimation

Abstract:
 Drone tracking and localization are essential for various applications, including
managing drone formations and implementing anti-drone strategies. Pinpointing
and monitoring drones in three-dimensional space is difficult, particularly when
trying to capture the subtle movements of small drones during rapid maneuvers.
This involves extracting faint signals from varied flight settings and maintaining
alignment despite swift actions. Typically, cameras and LiDAR systems are used
to record the paths of drones. However, they encounter challenges in categorizing
drones and estimating their positions accurately. This report provides an overview
of an approach named CL-Det. It uses a clustering-based learning detection strategy
to track and estimate the position of drones using data from two types of LiDAR
sensors: Livox Avia and LiDAR 360. This method merges data from both LiDAR
sources

# Long Paragraphs with No Full Stops

In [52]:
from nltk.tokenize import sent_tokenize

import nltk
nltk.download('punkt_tab')

# Function to validate paragraphs
def validate_paragraphs(paragraphs, word_threshold=50):
    results = []
    for para in paragraphs:
        sentences = sent_tokenize(para)  # Tokenize into sentences
        if len(sentences) == 0:  # No sentences detected
            results.append((para, False))  # Invalid paragraph
            continue
        avg_words_per_sentence = len(para.split()) / len(sentences)
        results.append((para, avg_words_per_sentence <= word_threshold))
    return results

# Split the extracted text into paragraphs
paragraphs = pdf_text.split("\n\n")  # Assuming double newlines separate paragraphs
validation_results = validate_paragraphs(paragraphs)

# Display validation results
for para, is_valid in validation_results:
    print(f"VALID: {is_valid}\nParagraph: {para}...\n")  # Print first 200 chars for clarity

VALID: True
Paragraph: Leveraging Clustering Techniques for Enhanced
Drone Monitoring and Position Estimation
Abstract
Drone tracking and localization are essential for various applications, including
managing drone formations and implementing anti-drone strategies. Pinpointing
and monitoring drones in three-dimensional space is difficult, particularly when
trying to capture the subtle movements of small drones during rapid maneuvers.
This involves extracting faint signals from varied flight settings and maintaining
alignment despite swift actions. Typically, cameras and LiDAR systems are used
to record the paths of drones. However, they encounter challenges in categorizing
drones and estimating their positions accurately. This report provides an overview
of an approach named CL-Det. It uses a clustering-based learning detection strategy
to track and estimate the position of drones using data from two types of LiDAR
sensors: Livox Avia and LiDAR 360. This method merges data from both L

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vikra\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Title and Abstract Misalignment

In [53]:
from sentence_transformers import SentenceTransformer, util

# Load pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Check title-abstract alignment
def check_title_abstract_alignment(title, abstract, threshold=0.5):
    title_embedding = model.encode(title, convert_to_tensor=True)
    abstract_embedding = model.encode(abstract, convert_to_tensor=True)
    similarity = util.cos_sim(title_embedding, abstract_embedding)
    return similarity.item() >= threshold, similarity.item()

# Perform the check
is_aligned, similarity_score = check_title_abstract_alignment(title, abstract)
print(f"Title and Abstract are {'aligned' if is_aligned else 'misaligned'} (Similarity Score: {similarity_score:.2f})")

Title and Abstract are aligned (Similarity Score: 0.66)


# Abstracts having Multiple Paragraphs

In [54]:
paragraphs = abstract.strip().split("\n\n")
multiple_paras = len(paragraphs) > 1
print(multiple_paras)

False
