# Normalizing the Content

In [1]:
# Warning control
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install unstructured_client



In [3]:
!pip install ipython



In [4]:
!pip install unstructured



In [5]:
!pip install pdf2image



In [6]:
!pip install pdfminer



In [7]:
from IPython.display import JSON

import json

from unstructured_client import UnstructuredClient
from unstructured_client.models import shared
from unstructured_client.models.errors import SDKError

from unstructured.staging.base import dict_to_elements, elements_to_json

In [8]:
import os

# Client for the Unstructured partitioning API.
# The key is read from the UNSTRUCTURED_API_KEY environment variable so that no
# credential is stored in the notebook; a missing variable raises KeyError here
# rather than producing an authentication failure later.
s = UnstructuredClient(
    api_key_auth=os.environ["UNSTRUCTURED_API_KEY"]
)

In [9]:
# Read the PDF into memory and wrap it in the SDK's upload container.
filename = "/content/paper_1.pdf"
with open(filename, "rb") as pdf_handle:
    pdf_bytes = pdf_handle.read()
files = shared.Files(content=pdf_bytes, file_name=filename)

# Partition request: hi_res strategy, table-structure inference enabled,
# English as the only declared language.
req = shared.PartitionParameters(
    files=files,
    strategy='hi_res',
    pdf_infer_table_structure=True,
    languages=["eng"],
)

try:
    resp = s.general.partition(req)
except SDKError as e:
    # Report the API failure; `resp` is not assigned by this cell in that case.
    print(e)
else:
    # Preview the first three parsed elements as indented JSON.
    print(json.dumps(resp.elements[:3], indent=2))

[
  {
    "type": "Title",
    "element_id": "a44d1d6fc092a5c400177ec7f4082212",
    "text": "Arti\ufb01cial Intelligence as an Enabler for Cognitive Self-Organizing Future Networks",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "paper_1.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "1473019572f7ac20ade4148006719a53",
    "text": "Siddique Latif1, Farrukh Pervez1, Muhammad Usama2, Junaid Qadir2",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "parent_id": "a44d1d6fc092a5c400177ec7f4082212",
      "filename": "paper_1.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "c6f621a389cc20b899a410b531a50401",
    "text": "1National University of Science and Technology, Islamabad, 2Information Technology University (ITU), Lahore",
    "metadata": {
      "filetype": "application/pdf

In [10]:
# Pretty-print every parsed element as indented JSON.
all_elements_json = json.dumps(resp.elements, indent=2)
print(all_elements_json)

[
  {
    "type": "Title",
    "element_id": "a44d1d6fc092a5c400177ec7f4082212",
    "text": "Arti\ufb01cial Intelligence as an Enabler for Cognitive Self-Organizing Future Networks",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "paper_1.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "1473019572f7ac20ade4148006719a53",
    "text": "Siddique Latif1, Farrukh Pervez1, Muhammad Usama2, Junaid Qadir2",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "parent_id": "a44d1d6fc092a5c400177ec7f4082212",
      "filename": "paper_1.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "c6f621a389cc20b899a410b531a50401",
    "text": "1National University of Science and Technology, Islamabad, 2Information Technology University (ITU), Lahore",
    "metadata": {
      "filetype": "application/pdf

# Metadata Extraction and Chunking

Finding elements associated with chapters

In [11]:
# Display every element whose type is 'Title'.
list(filter(lambda el: el['type'] == 'Title', resp.elements))

[{'type': 'Title',
  'element_id': 'a44d1d6fc092a5c400177ec7f4082212',
  'text': 'Artiﬁcial Intelligence as an Enabler for Cognitive Self-Organizing Future Networks',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 1,
   'filename': 'paper_1.pdf'}},
 {'type': 'Title',
  'element_id': 'fc7d2de50ceb8e143ec5037e2a865829',
  'text': 'REFERENCES',
  'metadata': {'filetype': 'application/pdf',
   'languages': ['eng'],
   'page_number': 5,
   'filename': 'paper_1.pdf'}}]

In [12]:
# Text of every 'Title' element, in the order the elements appear.
Title = [el['text'] for el in resp.elements if el['type'] == 'Title']

In [13]:
# Print the collected heading texts, one per line.
for heading in Title:
    print(heading)


Artiﬁcial Intelligence as an Enabler for Cognitive Self-Organizing Future Networks
REFERENCES


In [14]:
# Map each heading's element_id to its heading text.
# An element's "type" is a string, so it must be compared with the string
# "Title"; comparing it with the `Title` list of heading texts is never true
# and leaves this dict empty. The values are the (hashable) heading strings so
# the mapping can be inverted into text -> id afterwards.
Title_ids = {
    element["element_id"]: element["text"]
    for element in resp.elements
    if element["type"] == "Title"
}

In [15]:
# Invert the id -> heading mapping so a heading can be looked up by its text.
title_to_id = {v: k for k, v in Title_ids.items()}

# Text of the heading whose child paragraphs should be selected.
# NOTE(review): "Title" is an element *type*; the headings printed earlier in
# this notebook have other texts, so this lookup misses. Set this to the
# intended heading text if a specific section is wanted.
section_title = "Title"
title_id = title_to_id.get(section_title)
if title_id is None:
    # Make the miss visible instead of silent: with title_id None, the filter
    # below matches NarrativeText elements whose metadata has no parent_id.
    print(f"No heading with text {section_title!r} found; selecting NarrativeText elements without a parent_id")

# NarrativeText elements whose parent_id equals the selected title ID.
narrative_texts = [
    x for x in resp.elements
    if x["metadata"].get("parent_id") == title_id and x["type"] == "NarrativeText"
]

# Print each selected element dict.
for element in narrative_texts:
    print(element)

{'type': 'NarrativeText', 'element_id': '337d565fbf1804596e31336072c30592', 'text': 'maximize long term goals has been employed for efﬁcient spectrum utilization [15], minimum power consumption [16] and ﬁlling the spectrum holes dynamically [17]. SVM, a supervised classiﬁcation model, is being utilized for channel selection [18], adaptation of transmission parameters [19] and beam-forming design [20]. In CRNs, HMMs have been widely used to identify spectrum holes detection [21], spectrum handoff [22], and competitive spectrum access [23]. The range of AI-based techniques are not limited to the above mentioned applications, other applications of AI in CRNs are expressed in [3], [4]. By combining increasing spectrum agility, context aware adaptability of CR and AI techniques, CR has become an increasingly important feature of wireless systems. IEEE 802.16h has recommended CR as one of its key features and a lot of efforts are being made to introduce CR features in 3GPP LTE-Advance.', 'me

Load documents into a vector DB

See the elements in Vector DB

In [16]:
from unstructured.chunking.basic import chunk_elements
from unstructured.chunking.title import chunk_by_title

In [17]:
# Convert the selected element dicts with dict_to_elements; the result is what
# gets passed to chunk_elements below.
elements = dict_to_elements(narrative_texts)

In [18]:
# Split the elements into chunks using chunk_elements' default settings.
chunks = chunk_elements(elements)

In [19]:
# Print each chunk's text, JSON-encoded, one chunk per line.
for chunk in chunks:
    print(json.dumps(chunk.to_dict()["text"], indent=2))

"maximize long term goals has been employed for ef\ufb01cient spectrum utilization [15], minimum power consumption [16] and \ufb01lling the spectrum holes dynamically [17]. SVM, a supervised classi\ufb01cation model, is being utilized for channel selection [18], adaptation of transmission parameters [19] and beam-forming design [20]. In CRNs, HMMs have been widely used to identify spectrum holes detection [21], spectrum handoff [22], and competitive spectrum access [23]. The range of AI-based techniques are not"
"limited to the above mentioned applications, other applications of AI in CRNs are expressed in [3], [4]. By combining increasing spectrum agility, context aware adaptability of CR and AI techniques, CR has become an increasingly important feature of wireless systems. IEEE 802.16h has recommended CR as one of its key features and a lot of efforts are being made to introduce CR features in 3GPP LTE-Advance."
"The rapid proliferation of multi-radio access technology-disparate sma

In [20]:
len(elements)

7

In [21]:
len(chunks)

13

In [22]:
!pip install together



In [34]:
import os
from together import Together
import json

# The key is read from the TOGETHER_API_KEY environment variable so that no
# credential is stored in the notebook.
client = Together(api_key=os.environ["TOGETHER_API_KEY"])
ANS = []

# Ask the model for one response per chunk and collect the replies in order.
for chunk in chunks:
    T = chunk.to_dict()["text"]
    # The f-string already interpolates T. Calling .format() on the result
    # would re-parse the chunk text as a format template and raise on any
    # literal '{' or '}' in the source document, so it is not used.
    prompt = f"""Generate the neutral text of the following text in the format of insturction , input output, history. Given Text: {T}"""
    response = client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=[{"role": "user", "content": prompt}],
    )
    ANS.append(response.choices[0].message.content)

# Save responses to a JSON file
output_file = "/content/responses.json"
with open(output_file, "w") as f:
    json.dump(ANS, f, indent=4)

print(f"Responses saved to {output_file}")

Responses saved to /content/responses.json


In [24]:
import os

# End-to-end rerun of the earlier cells: partition the PDF, select paragraphs,
# chunk them, and ask the model for a neutral rewrite of each chunk.
filename = "/content/paper_1.pdf"
with open(filename, "rb") as f:
    files=shared.Files(
        content=f.read(),
        file_name=filename,
    )

req = shared.PartitionParameters(
    files=files,
    strategy='hi_res',
    pdf_infer_table_structure=True,
    languages=["eng"],
)
try:
    resp = s.general.partition(req)
    print(json.dumps(resp.elements[:3], indent=2))
except SDKError as e:
    print(e)

# Heading texts, in the order the elements appear.
Title = [x['text'] for x in resp.elements if x['type'] == 'Title']

# element_id -> heading text. The element type is compared with the string
# "Title"; comparing it with the `Title` list is never true and would leave
# this dict empty.
Title_ids = {
    element["element_id"]: element["text"]
    for element in resp.elements
    if element["type"] == "Title"
}
title_to_id = {v: k for k, v in Title_ids.items()}

# Text of the heading whose child paragraphs should be selected.
# NOTE(review): "Title" is an element *type*; the headings printed earlier in
# this notebook have other texts, so this lookup misses. Set this to the
# intended heading text if a specific section is wanted.
section_title = "Title"
title_id = title_to_id.get(section_title)
if title_id is None:
    # Make the miss visible: with title_id None, the filter below matches
    # NarrativeText elements whose metadata has no parent_id.
    print(f"No heading with text {section_title!r} found; selecting NarrativeText elements without a parent_id")

# NarrativeText elements whose parent_id equals the selected title ID.
narrative_texts = [x for x in resp.elements if x["metadata"].get("parent_id") == title_id and x["type"] == "NarrativeText"]

# Print each selected element dict.
for element in narrative_texts:
    print(element)
elements = dict_to_elements(narrative_texts)
chunks = chunk_elements(
    elements
)
for chunk in chunks:
    print(json.dumps(chunk.to_dict()["text"], indent=2))


# The key is read from the TOGETHER_API_KEY environment variable so that no
# credential is stored in the notebook.
client = Together(api_key=os.environ["TOGETHER_API_KEY"])

ANS = []

for chunk in chunks:
    T = chunk.to_dict()["text"]
    # The f-string already interpolates T. Calling .format() on the result
    # would re-parse the chunk text as a format template and raise on any
    # literal '{' or '}' in the source document, so it is not used.
    prompt = f"""Generate the neutral text of the following text.
        Given Text: {T}"""
    response = client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=[{"role": "user", "content": prompt}],
    )
    ANS.append(response.choices[0].message.content)

# Print every reply followed by a separator line.
for answer in ANS:
    print(answer)
    print("============================================")


[
  {
    "type": "Title",
    "element_id": "a44d1d6fc092a5c400177ec7f4082212",
    "text": "Arti\ufb01cial Intelligence as an Enabler for Cognitive Self-Organizing Future Networks",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "filename": "paper_1.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "1473019572f7ac20ade4148006719a53",
    "text": "Siddique Latif1, Farrukh Pervez1, Muhammad Usama2, Junaid Qadir2",
    "metadata": {
      "filetype": "application/pdf",
      "languages": [
        "eng"
      ],
      "page_number": 1,
      "parent_id": "a44d1d6fc092a5c400177ec7f4082212",
      "filename": "paper_1.pdf"
    }
  },
  {
    "type": "NarrativeText",
    "element_id": "c6f621a389cc20b899a410b531a50401",
    "text": "1National University of Science and Technology, Islamabad, 2Information Technology University (ITU), Lahore",
    "metadata": {
      "filetype": "application/pdf

In [25]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [32]:
import os
import json
from together import Together
import PyPDF2

# Initialize Together client. The key is read from the TOGETHER_API_KEY
# environment variable so that no credential is stored in the notebook.
client = Together(api_key=os.environ["TOGETHER_API_KEY"])

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The file is opened in binary mode and read with ``PyPDF2.PdfReader``.
    Each page's ``extract_text()`` result is concatenated in page order with
    no separator between pages.
    """
    with open(pdf_path, 'rb') as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        page_texts = [page.extract_text() for page in pdf_reader.pages]
        return ''.join(page_texts)

# Directory containing PDF files
pdf_directory = '/content/'

# Initialize list to store results
neutral_texts = []

# Keys of the per-PDF result dict, in the order the reply's parts are assigned.
section_names = ("instruction", "input", "output", "history")

# Iterate over PDF files. os.listdir returns names in arbitrary order, so sort
# them to make the order of entries in the output file deterministic.
for pdf_file in sorted(os.listdir(pdf_directory)):
    if not pdf_file.endswith('.pdf'):
        continue
    pdf_path = os.path.join(pdf_directory, pdf_file)
    # Extract text from PDF
    pdf_text = extract_text_from_pdf(pdf_path)
    # Generate neutral text
    response = client.chat.completions.create(
        model="meta-llama/Llama-3-8b-chat-hf",
        messages=[{"role": "user", "content": f"Generate the neutral text of the following text in the format of instruction, input, output, history for each chunk of the data. Given Text: {pdf_text}"}],
    )
    neutral_text = response.choices[0].message.content
    # Split the reply on blank lines into instruction, input, output, history.
    parts = neutral_text.split("\n\n")
    # The reply may contain fewer than four parts; pad with empty strings so a
    # short reply yields empty fields instead of an IndexError on parts[1] or
    # parts[2]. Parts beyond the fourth are ignored.
    parts += [""] * (len(section_names) - len(parts))
    neutral_text_dict = dict(zip(section_names, parts))
    # Append results to the list
    neutral_texts.append({
        "pdf_file": pdf_file,
        "neutral_text": neutral_text_dict
    })

# Save results to a JSON file
output_file = '/content/instruct.json'
with open(output_file, 'w') as json_file:
    json.dump(neutral_texts, json_file, indent=4)
print("Neutral texts saved to:", output_file)

Neutral texts saved to: /content/instruct.json
