<a href="https://colab.research.google.com/github/Ashis-Palai/Cancer_Information_RAG_GenAI/blob/main/GenAI_Cancer_Information_Powered_by_LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **DATA EXTRACTION:**



## **Sources**

* **Cervical, Breast & Oral Cancers:**


> ```
> https://tmc.gov.in/ncg/docs/PDF/DraftGuidelines/Preventive/3_%20NCG_INDIA_Rev_Preventive%20Oncology_Primary_Care.pdf
> ```


* **Ovarian Cancer:**

> ```
> https://main.icmr.nic.in/sites/default/files/guidelines/Ovarian_Cancer.pdf
> ```


* **Generic Information About Cancer:**

> ```
> https://www.mayoclinic.org/diseases-conditions/cancer/diagnosis-treatment/drc-20370594
> ```



* **Breast Cancer:**

> ```
> https://www.cancer.org/cancer/types/breast-cancer/screening-tests-and-early-detection.html
> ```






In [None]:
!curl -O "https://raw.githubusercontent.com/Ashis-Palai/Cancer_Information_RAG_GenAI/main/helper.py"
!curl -O "https://raw.githubusercontent.com/Ashis-Palai/Cancer_Information_RAG_GenAI/main/requirements.txt"

In [None]:
!pip install -r requirements.txt

In [None]:
# Source documents handed to the extraction helper, in processing order:
# six cancer.org breast-cancer screening pages followed by two PDF guidelines.
all_url = [
    # cancer.org HTML pages
    "https://www.cancer.org/cancer/types/breast-cancer/screening-tests-and-early-detection.html",
    "https://www.cancer.org/cancer/types/breast-cancer/screening-tests-and-early-detection/american-cancer-society-recommendations-for-the-early-detection-of-breast-cancer.html",
    "https://www.cancer.org/cancer/types/breast-cancer/screening-tests-and-early-detection/mammograms.html",
    "https://www.cancer.org/cancer/types/breast-cancer/screening-tests-and-early-detection/breast-ultrasound.html",
    "https://www.cancer.org/cancer/types/breast-cancer/screening-tests-and-early-detection/breast-mri-scans.html",
    "https://www.cancer.org/cancer/types/breast-cancer/screening-tests-and-early-detection/breast-cancer-signs-and-symptoms.html",
    # PDF guidelines
    "https://tmc.gov.in/ncg/docs/PDF/DraftGuidelines/Preventive/3_%20NCG_INDIA_Rev_Preventive%20Oncology_Primary_Care.pdf",
    "https://main.icmr.nic.in/sites/default/files/guidelines/Ovarian_Cancer.pdf",
]

In [None]:
from helper import extract_headings_and_content , word_wrap
# Call the helper's extractor once per source URL, passing the URL's 1-based
# position in all_url as the second argument, and print the message it returns.
# NOTE(review): presumably the helper downloads each source and saves it to
# disk for the loading step further down — confirm in helper.py.
for i, url in enumerate(all_url, start=1):
    result_message = extract_headings_and_content(url, i)
    print(result_message)

# **DATA TRANSFORMATION USING LANGCHAIN**

In [None]:
from langchain_community.document_loaders import PyPDFLoader , UnstructuredHTMLLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter , SentenceTransformersTokenTextSplitter
import glob

In [None]:


# Root folder holding the downloaded sources (Colab-specific absolute path).
# NOTE(review): presumably populated by extract_headings_and_content — confirm in helper.py.
base_path = '/content/data'

# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps the
# document order (and therefore the chunk order downstream) identical on every run.

# List all HTML files
html_files = sorted(glob.glob(f"{base_path}/html*/*.html"))

# List all PDF files
pdf_files = sorted(glob.glob(f"{base_path}/pdf*/*.pdf"))

# Load the PDFs first, then the HTML pages. Each element of all_results is the
# list returned by one loader's .load() call for a single file.
all_results = [
    PyPDFLoader(file).load() if file.endswith(".pdf") else UnstructuredHTMLLoader(file).load()
    for file in pdf_files + html_files
]



In [None]:
# Add up the number of documents returned for each source file.
total_pages = sum(map(len, all_results))
total_pages

In [None]:
# Flatten the per-file results into a single list containing the
# whitespace-stripped text of every loaded document.
all_text_data = [
    document.page_content.strip()
    for file_documents in all_results
    for document in file_documents
]

In [None]:
# Number of extracted texts at this point in the notebook.
len(all_text_data)

In [None]:
# Discard the empty strings left behind by blank pages.
all_text_data = list(filter(None, all_text_data))

In [None]:
# Number of texts currently held in all_text_data.
len(all_text_data)

In [None]:
# First pass: character-based splitter configured for 1000-character chunks
# with a 100-character overlap; separators are listed from longest runs of
# newlines down to single characters.
splitter_1 = RecursiveCharacterTextSplitter(
    separators=[
        '\n\n\n\n\n',
        '\n\n\n\n',
        '\n\n\n',
        '\n\n',
        '\n',
        '.',
        ',',
        ' ',
    ],
    chunk_size=1000,
    chunk_overlap=100,
)

# Second pass: token-based splitter configured for 256 tokens per chunk with
# an overlap of 10.
splitter_2 = SentenceTransformersTokenTextSplitter(
    tokens_per_chunk=256,
    chunk_overlap=10,
)

In [None]:
# Join all page texts into one newline-separated string and run the
# character-based splitter over it.
prefinal_data = splitter_1.split_text(
    "\n".join(all_text_data)
)
len(prefinal_data)

In [None]:
# Re-split every character-level chunk with the token-based splitter and
# flatten the pieces into one list. The list is built directly, so no
# throwaway list of None values is created by calling extend() for its
# side effect.
final_data = [
    token_chunk
    for text in prefinal_data
    for token_chunk in splitter_2.split_text(text)
]

len(final_data)

In [None]:
# Preview the first chunk produced by the character-based splitter.
prefinal_data[0]

In [None]:
# Preview the first chunk produced by the token-based splitter.
final_data[0]

# **DATA LOADING**

## **Embedding: GooglePalmEmbeddings**

In [None]:
!pip install pinecone-client

In [None]:
# Import userdata in this cell so it works on a fresh kernel
# (Restart & Run All); without it the lookups below raise NameError.
from google.colab import userdata

# Read the API keys from Colab's secret store so they are never hard-coded
# in the notebook.
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')

In [None]:
from langchain_community.embeddings import GooglePalmEmbeddings
from google.colab import userdata

# Build the embedding client with the Google API key, then embed a short probe
# sentence and print the length of the returned vector.
# NOTE(review): the Pinecone index is created with dimension=768 — confirm it
# matches the value printed here.
emb = GooglePalmEmbeddings(model_name='models/embedding-gecko-001',google_api_key=GOOGLE_API_KEY)
print(f"Out put dimension: {len(emb.embed_query('Hi How are you?'))}")

## **Database: Pinecone Vector DB**

In [None]:
from pinecone import Pinecone, ServerlessSpec, PodSpec


# Pinecone client constructed with the API key read earlier in the notebook.
p = Pinecone(api_key= PINECONE_API_KEY)

In [None]:
# Create the index only when it does not exist yet, so re-running this cell
# (or the whole notebook) does not fail on an already-existing index.
if 'cancer-retrieval' not in p.list_indexes().names():
    p.create_index('cancer-retrieval',
                   # NOTE(review): should equal the embedding vector length — confirm.
                   dimension=768,
                   metric='cosine',
                   spec=PodSpec(environment='gcp-starter',
                                pod_type='starter',
                                pods=1))

# Handle used by later cells to talk to the index.
index = p.Index('cancer-retrieval')

In [None]:
# Get a handle to the 'cancer-retrieval' index.
index =  p.Index('cancer-retrieval')

In [None]:
# Display the statistics Pinecone reports for the index.
index.describe_index_stats()