# Dependencies

## Libraries

In [6]:
# Text Extraction
from pdfminer.high_level import extract_text

# Preprocessing
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

# Embedding Generation
from sentence_transformers import SentenceTransformer

# Semantic Chunking
from sklearn.cluster import KMeans
import numpy as np

# LLM for summarization
import os
from getpass import getpass
from openai import OpenAI

# Warnings
import warnings
warnings.filterwarnings('ignore')

## Module Downloads

### Preprocessing Modules

In [7]:
# Fetch the NLTK data packages for the tokenizers, lemmatizer and stop-word
# list imported above.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# One-time install of the spaCy pipeline loaded below (run it uncommented in a
# notebook cell if 'en_core_web_sm' is not installed yet):
# !python -m spacy download en_core_web_sm
# Module-level spaCy pipeline; _preprocess_text reads token.ent_type_ from it.
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Sentence Embedding Model

In [8]:
# Module-level sentence-embedding model; _embedding_generation calls
# model.encode() on the list of sentences.
model = SentenceTransformer('bert-base-nli-mean-tokens')

## LLM API Key

In [9]:
# Prompt for the key with getpass so it is not echoed into the notebook output;
# _abstractive_summary passes it to the OpenAI client.
OPENAI_API_KEY = getpass('Enter OpenAI API Key:')

Enter OpenAI API Key: ········


# Main

## Helper Functions

### Data Extraction

In [20]:
def _extract_text_from_pdf(filepath):
    """Return the text that pdfminer's ``extract_text`` yields for *filepath*."""
    return extract_text(filepath)

### Data Preparation

In [21]:
def _preprocess_text(text):
    """Normalise raw document text before it is split into sentences.

    Steps, in order: strip runs of non-ASCII characters, lowercase every spaCy
    token whose entity type is not PERSON/ORG/GPE/LOC, re-tokenise with NLTK,
    drop English stop words, lemmatise with WordNet, and re-join the remaining
    tokens with single spaces.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', text)

    # Tokens tagged with one of these entity types keep their original casing.
    keep_case_labels = ('PERSON', 'ORG', 'GPE', 'LOC')
    recased = ' '.join(
        tok.text if tok.ent_type_ in keep_case_labels else tok.text.lower()
        for tok in nlp(ascii_only)
    )
    tokens = word_tokenize(recased)

    english_stops = set(stopwords.words('english'))
    content_tokens = [tok for tok in tokens if tok not in english_stops]

    lemmatize = WordNetLemmatizer().lemmatize
    return ' '.join(lemmatize(tok) for tok in content_tokens)

### Embedding Generation

In [22]:
def _embedding_generation(preprocessed_text):
    """Split the text into sentences and encode them with the module-level model.

    Returns a ``(sentence_embeddings, sentences)`` pair: the result of
    ``model.encode`` on the sentence list, followed by that list itself.
    """
    sentence_list = sent_tokenize(preprocessed_text)
    return model.encode(sentence_list), sentence_list

### Semantic Chunking

In [42]:
def _semantic_chunking(preprocessed_text, n_clusters):
    """Group the sentences of *preprocessed_text* into K-Means clusters.

    Args:
        preprocessed_text: Text to split into sentences and embed.
        n_clusters: Requested number of clusters. It is capped at the number
            of sentences, because K-Means cannot fit more clusters than it
            has samples.

    Returns:
        A dict mapping each K-Means label to the list of sentences assigned
        to that label, in their original order.

    Raises:
        ValueError: If no sentences could be extracted from the text.
    """
    sentence_embeddings, sentences = _embedding_generation(preprocessed_text)
    if not sentences:
        raise ValueError('No sentences found in the text; nothing to cluster.')

    # Short documents may have fewer sentences than the requested cluster count.
    n_clusters = min(n_clusters, len(sentences))
    kmeans = KMeans(n_clusters, n_init='auto', random_state=42)
    kmeans.fit(sentence_embeddings)

    clusters = {}
    for sentence, label in zip(sentences, kmeans.labels_):
        # Append for every sentence, not only the first one seen for a label;
        # otherwise each cluster would contain a single sentence.
        clusters.setdefault(label, []).append(sentence)

    return clusters

### Abstractive Summarization

In [43]:
def _abstractive_summary(clusters):
    """Summarise each cluster with the chat model, then reformat the result.

    One chat-completion request is sent per cluster (its sentences joined with
    spaces). The stripped replies are joined with spaces and sent in a final
    request; the stripped reply to that request is returned.
    """
    client = OpenAI(api_key=OPENAI_API_KEY)

    def ask(prompt):
        # Single user-message request; return the first choice's text, stripped.
        response = client.chat.completions.create(
            messages=[{'role': 'user', 'content': prompt}],
            model='gpt-3.5-turbo',
        )
        return response.choices[0].message.content.strip()

    chunk_summaries = []
    for cluster in clusters.values():
        cluster_text = ' '.join(cluster)
        chunk_summaries.append(
            ask(f"Provide a well detailed technical summary of the following text:\n{cluster_text}")
        )

    summary_text = ' '.join(chunk_summaries)
    return ask(f"Improve the semantic format:\n{summary_text}")

## Executable Function

In [44]:
def summarize_pdf(filepath, n_clusters=5):
    """Run the full pipeline on one PDF and return its summary text.

    The stages are: extract text, preprocess it, cluster its sentences into
    ``n_clusters`` groups, and summarise those groups with the chat model.
    """
    raw_text = _extract_text_from_pdf(filepath)
    clusters = _semantic_chunking(_preprocess_text(raw_text), n_clusters)
    return _abstractive_summary(clusters)

# Testing

In [50]:
# Relative path of the PDF used for this test run.
filepath = 'PDFs/2.pdf'

In [51]:
# Run the whole pipeline with the default n_clusters=5.
summary = summarize_pdf(filepath)

In [52]:
# Show the generated summary in the cell output.
print(summary)

The text provides a summary of a survey conducted by researchers from various universities in the UK and Denmark on attack methodologies in machine learning. The article was received in December 2018, revised in June 2019, and accepted for publication in September 2019. It was made available online in October 2019.

The researchers discuss machine learning methodologies that assume a benign environment and present a taxonomy of attacks in machine learning systems. The study aims to provide an overview of attack techniques and strategies used in the field of machine learning.

Researchers from multiple universities conducted the study, focusing on vulnerabilities and threats faced by machine learning systems. The text addresses potential vulnerabilities in machine learning models and discusses poisoning attacks, where false data is inserted into the training set to bias the model, and evasion attacks, where test data is manipulated to evade detection by the model.

It is crucial for dev