# Data Extraction

In [1]:
# Relative path of the PDF used as the running example in this notebook.
sample_filepath = 'PDFs/1.pdf'

## Text Extraction

In [2]:
from pdfminer.high_level import extract_text

In [3]:
def extract_text_from_pdf(filepath):
    """Return the text that pdfminer's ``extract_text`` produces for a PDF.

    Args:
        filepath: Location of the PDF, passed straight to ``extract_text``.

    Returns:
        Whatever ``extract_text`` returns for ``filepath``.
    """
    return extract_text(filepath)

In [4]:
# Pull the raw text out of the sample PDF.
sample_text = extract_text_from_pdf(sample_filepath)

# Data Preparation

## Preprocessing

In [5]:
import re
import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

In [6]:
# Download the NLTK packages used by the preprocessing step, in the same
# order as before.
for nltk_package in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_package)

# Run the following once if the spaCy model is not installed yet:
# !python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def preprocess_text(text):
    """Normalize raw text into a space-joined string of cleaned tokens.

    Steps, in order:
      1. Drop every run of non-ASCII characters.
      2. Run the spaCy pipeline (``nlp``) and lowercase each token unless its
         entity type is PERSON, ORG, GPE or LOC, in which case the original
         casing is kept.
      3. Re-tokenize the re-joined text with NLTK's ``word_tokenize``.
      4. Remove English stop words (case-sensitive membership test).
      5. Lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        The processed tokens joined by single spaces.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', '', text)

    # Tokens tagged as one of these entity types keep their casing; everything
    # else is lowercased.
    cased_text = ' '.join(
        tok.text if tok.ent_type_ in ('PERSON', 'ORG', 'GPE', 'LOC') else tok.text.lower()
        for tok in nlp(ascii_only)
    )

    tokens = word_tokenize(cased_text)
    english_stops = set(stopwords.words('english'))
    lemmatize = WordNetLemmatizer().lemmatize

    return ' '.join(lemmatize(tok) for tok in tokens if tok not in english_stops)

In [8]:
# Apply the preprocessing function to the extracted sample text.
preprocessed_text = preprocess_text(sample_text)

## Embedding Generation

In [9]:
from sentence_transformers import SentenceTransformer

In [10]:
# Load the sentence-embedding model by name; it is used to encode sentences.
model = SentenceTransformer('bert-base-nli-mean-tokens')



In [11]:
# Split the already-preprocessed text into sentences, then embed each one.
# NOTE(review): sentence boundaries are detected after stop-word removal and
# lowercasing — confirm that this ordering is intended.
sentences = sent_tokenize(preprocessed_text)
sentence_embeddings = model.encode(sentences)

## Semantic Chunking

In [12]:
from sklearn.cluster import KMeans
import numpy as np

In [13]:
# KMeans refuses to fit when asked for more clusters than there are samples,
# so cap the cluster count at the number of embedded sentences. Documents
# with at least five sentences still get exactly five clusters.
kmeans = KMeans(
    n_clusters=min(5, len(sentence_embeddings)),
    n_init='auto',
    random_state=42
)

In [14]:
# Cluster the sentence embeddings; the resulting labels are read from
# kmeans.labels_ afterwards.
kmeans.fit(sentence_embeddings)

In [15]:
# Group the sentences by the cluster label KMeans assigned to each one.
# labels_[i] is the label for sentences[i].
clusters = {}

for position, cluster_label in enumerate(kmeans.labels_):
    clusters.setdefault(cluster_label, []).append(sentences[position])

In [16]:
# Print every cluster with its member sentences.
# The loop variables are deliberately NOT named `sentences`: that name holds
# the notebook-level sentence list, and rebinding it here would leave it
# pointing at the last cluster's members, breaking any later re-run of the
# grouping cell.
for cluster_label, cluster_sentences in clusters.items():
    print(f"Cluster {cluster_label}:")
    for sentence in cluster_sentences:
        print(f" - {sentence}")
    print()

Cluster 4:
 - Journal Parallel distributed computing 130 ( 2019 ) 1223 content list available ScienceDirect J .
 - journal homepage : www.elsevier.com/locate/jpdc security machine learning adversarial setting : survey Xianmin Wang , , Jing Li , Xiaohui Kuang b , Yu - Tan c , Jin Li , School Computer Science , Guangzhou University , Guangzhou 510006 , China b National Key Laboratory Science Technology Information System Security , Beijing , China c School Computer Science , Beijing Institute Technology University , Beijing , China State Key Laboratory Information Security , Institute Information Engineering , Chinese Academy Sciences , Beijing , China r c l e n f b r c article history : received 17 december 2018 received revised form 2 february 2019 accepted 2 march 2019 available online 3 april 2019 keywords : machine learning adversarial setting adversarial attack adversarial example security model machine learning ( ML ) method demonstrated impressive performance many application fie

## Abstractive Summarization

In [29]:
import os
from getpass import getpass

from openai import OpenAI

In [66]:
# Read the API key interactively via getpass instead of hard-coding it.
OPENAI_API_KEY = getpass('Enter OpenAI API Key:')

Enter OpenAI API Key: ········


In [33]:
# Client object used for the chat-completion requests in this notebook.
client = OpenAI(api_key=OPENAI_API_KEY)

In [64]:
def _chat_once(prompt, model):
    """Send ``prompt`` as a single user message and return the stripped reply."""
    response = client.chat.completions.create(
        messages=[
            {
                'role': 'user',
                'content': prompt
            }
        ],
        model=model
    )
    # The reply content may be None; treat that as an empty reply rather than
    # failing on ``None.strip()``.
    return (response.choices[0].message.content or '').strip()


def abstractive_summary(clusters, model='gpt-3.5-turbo'):
    """Summarize each cluster with the chat model, then polish the combined text.

    Every non-empty cluster is joined into one string and summarized with its
    own request. The per-cluster summaries are then joined with spaces and sent
    through a second request that asks the model to improve the formatting.

    Args:
        clusters: Mapping whose values are iterables of sentence strings.
            Keys are ignored.
        model: Name of the chat model to use for every request. Defaults to
            the model this notebook originally hard-coded.

    Returns:
        The stripped text of the final response, or an empty string when there
        is nothing to summarize (no clusters, or only empty ones). In that case
        no request is made.
    """
    chunk_summaries = []
    for cluster in clusters.values():
        cluster_text = ' '.join(cluster)
        # Skip clusters with no text so no request is spent on an empty prompt.
        if not cluster_text.strip():
            continue
        chunk_summaries.append(
            _chat_once(f"Summarize the following text:\n{cluster_text}", model)
        )

    if not chunk_summaries:
        return ''

    summary_text = ' '.join(chunk_summaries)
    return _chat_once(f"Improve the semantic format:\n{summary_text}", model)

In [65]:
# Summarize the clustered sentences and print the result under a heading.
summary = abstractive_summary(clusters)
print("Abstractive Summary:", summary, sep="\n")

Abstractive Summary:
The survey explores security in machine learning within adversarial settings, spanning applications like autopilot systems and facial recognition. It delves into attack and defense methods in machine learning security, crediting various funding sources. The authors, with backgrounds in computer science and information security from Chinese universities, focus on secure protocols and privacy protection in new computing environments. They detail defense approaches against adversarial attacks, referencing researchers like Szegedy, Lowd, Kurakin, and Wittel who have proposed techniques such as feature squeezing and foveation-based methods. Papers by Nelson, Guo, and Moosavi are cited for improving model security and robustness. Emphasizing the significance of parallel distributed computing in machine learning research, the text scrutinizes the traditional training and deployment of ML models in benign environments. It investigates the security properties of ML algorith