# TEXT **CLASSIFICATION**

In [None]:
!pip install transformers einops




In [None]:
!pip install sentence-transformers



In [None]:
!pip install pdfplumber
# Install the pdfplumber library which is necessary for extracting text from PDF files



In [None]:
from sentence_transformers import CrossEncoder
import pdfplumber

# Load the cross-encoder used below to score (query, text line) pairs for
# relevance to a query.
model = CrossEncoder(
    "jinaai/jina-reranker-v2-base-multilingual",
    # NOTE(review): presumably forwarded to the underlying model loader so the
    # dtype is taken from the checkpoint -- confirm against the installed
    # sentence-transformers version.
    automodel_args={"torch_dtype": "auto"},
    # NOTE(review): presumably permits running model code shipped in the model
    # repository -- confirm that repository is trusted before enabling.
    trust_remote_code=True,
)

def extract_venues_from_pdf(pdf_path: str) -> list:
    """
    Read a single PDF and return its non-blank text lines.

    Each line of text extracted from a page is treated as one venue
    description.

    Args:
    - pdf_path (str): Location of the PDF file to read.

    Returns:
    - List[str]: Whitespace-trimmed, non-empty lines in page order.
    """
    raw_lines = []
    with pdfplumber.open(pdf_path) as document:
        for current_page in document.pages:
            page_text = current_page.extract_text()
            # A page that yields no text contributes nothing.
            if not page_text:
                continue
            raw_lines += page_text.splitlines()
    # Trim each line once, then discard the ones that end up empty.
    trimmed = (line.strip() for line in raw_lines)
    return [line for line in trimmed if line]

def extract_venues_from_multiple_pdfs(pdf_paths: list) -> list:
    """
    Gather venue descriptions from several PDF files into one list.

    Args:
    - pdf_paths (list): Paths of the PDF files to read, in the order they
      should appear in the result.

    Returns:
    - List[str]: Descriptions from every PDF, concatenated in input order.
    """
    return [
        description
        for path in pdf_paths
        for description in extract_venues_from_pdf(path)
    ]

# PDF files whose text lines form the corpus to search.
pdf_paths = [
    "/content/drive/MyDrive/PDF/The Boy Who Cried Wolf.pdf",
    "/content/drive/MyDrive/PDF/The Lion and the Mouse.pdf",
    "/content/drive/MyDrive/PDF/The Tortoise and the Hare.pdf",
    "/content/drive/MyDrive/PDF/DS.pdf",
]  # Replace with the paths to your actual PDF files

# Every non-empty text line from the PDFs becomes one candidate passage.
venues = extract_venues_from_multiple_pdfs(pdf_paths)
# Topic to look for among the extracted lines.
query = "data encoding"

if not venues:
    # Nothing to score: the PDFs yielded no text lines.
    print("No venues found in the provided PDFs.")
else:
    # One [query, passage] pair per extracted line.
    venue_pairs = [[query, venue] for venue in venues]

    # Run the model over the pairs exactly once. The ranking below is built
    # from these scores instead of calling model.rank(), which would score
    # every pair a second time.
    scores = model.predict(venue_pairs, convert_to_tensor=True).tolist()

    # Same record shape as model.rank(..., return_documents=True): the index
    # into `venues`, the relevance score, and the text, best match first.
    rankings = sorted(
        (
            {"corpus_id": corpus_id, "score": score, "text": venue}
            for corpus_id, (venue, score) in enumerate(zip(venues, scores))
        ),
        key=lambda ranking: ranking["score"],
        reverse=True,
    )

    # Drop passages containing characters in U+4E00..U+9FFF (CJK ideographs).
    filtered_rankings = [
        ranking for ranking in rankings
        if not any('\u4e00' <= char <= '\u9fff' for char in ranking['text'])
    ]

    # Keep only passages that mention a keyword (case-insensitive substring
    # match) and score at or above the threshold.
    relevant_keywords = ["encoding", "data encoding", "encoding techniques"]
    threshold = 0.5  # Adjust threshold as needed
    # Lower-case the keywords once instead of once per passage.
    lowered_keywords = [keyword.lower() for keyword in relevant_keywords]
    final_rankings = [
        ranking for ranking in filtered_rankings
        if ranking['score'] >= threshold
        and any(keyword in ranking['text'].lower() for keyword in lowered_keywords)
    ]

    print(f"Query: {query}")
    if not final_rankings:
        # Make an empty result explicit rather than printing only the query.
        print("No venues matched the keyword and score filters.")
    for ranking in final_rankings:
        print(f"ID: {ranking['corpus_id']}, Score: {ranking['score']:.4f}, Venue Description: {ranking['text']}")


Query: data encoding
ID: 45, Score: 0.6992, Venue Description: Encoding is a crucial step in data preprocessing, particularly when dealing with categorical
ID: 56, Score: 0.6523, Venue Description: Importance of Encoding in Data Analysis
ID: 44, Score: 0.6367, Venue Description: What is Encoding in Data Science?
ID: 24, Score: 0.6094, Venue Description: Introduction to Data Science Analysis and Encoding
ID: 47, Score: 0.5781, Venue Description: encoding is the process of transforming categorical variables (such as text or labels) into
ID: 62, Score: 0.5703, Venue Description: Combining Data Science Analysis with effective Encoding techniques enables data
