### Extract Data from the Unniversity Web Site
 - We are extracting only 20 links for this proptype
 - No filterning is done on the text extracted from the pages

 - Python Packages
     - sentence-transformers ## for embedding model
     - qdrant-client         ## for interacting with qdrant and inmemory qdrant
     - langchain             ## for tokenizing the text
     - beautifulsoup4        ## for extrating the text from HTML pages
     - lxml                  ## used by above package
  

In [1]:
import requests
from bs4 import BeautifulSoup

def get_website_text(url):
    response = requests.get(url)
    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")
    text = soup.get_text(separator="\n", strip=True)
    return text

def get_all_links(site):
    url = site
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    
    urls = []
    for link in soup.find_all('a'):
        site = link.get('href')
        if site is not None:
            if site.startswith("https"):
                urls.append(site)
    return urls
"""
It returns a array of dictionary containing html_text of link and reference of the link
"""
def get_all_data_with_links(urls):
    arr_dict = []
    for url in urls:
        data_dict = {}
        data_dict["reference"] = url
        data_dict["html_text"] = get_website_text(url)
        arr_dict.append(data_dict)
        
    return arr_dict

### Extract only first 20 links

In [2]:
urls = get_all_links("https://lpu.in")
print(len(urls))
all_data = get_all_data_with_links(urls[0:20])


185


In [4]:
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


### Chunk Size is 300 and with a overlap of 50

In [5]:
class ChunkData:
    def __init__(self, chunk_size=300, chunk_overlap=50):
        chunk_size = chunk_size
        chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
                               
                                chunk_size=chunk_size,
                                chunk_overlap=chunk_overlap,
                                length_function=len,
                        )
    def create_chunks(self, text):
        chunks = self.text_splitter.create_documents(
                texts=[text["html_text"]], 
                metadatas=[{"source": text["reference"]
                   }])
        return [{"text": chunk.page_content, "source": chunk.metadata["source"]} for chunk in chunks]

In [6]:
chunkData = ChunkData(chunk_size=500)
chunks = chunkData.create_chunks(all_data[0])
chunk_list = chunkData.create_chunks(all_data[0])
for i in range(1,len(all_data)):
    c_data = chunkData.create_chunks(all_data[i])
    for c in c_data:
        chunk_list.append(c)

In [8]:
chunk_list[0:3]

[{'text': 'CUET - Lovely Professional University\nUnlock your Potential at LPU: Scholarship for CUET Aspirants. Click Here\nToday is the last day to apply for LPUNEST with maximum scholarship! Apply Now\nJobs\nHappenings\nConferences\nStudy Abroad\nLPUNEST\nInternational Admissions\nOnline Education\nDistance Education\nContact\nAdmissions\nPlacements\nExplore Programmes\nAbout\nOverview\nInfrastructure\nAccreditation & Approvals\nRanking\nPlacements\nAlumni\nLeadership\nOrganization Structure\nLocation\nTour LPU\nAdmissions',
  'source': 'https://www.lpu.in/events/cuet/'},
 {'text': "Location\nTour LPU\nAdmissions\nGet Started\nOverview\nWhy LPU?\nScholarship\nStudy Grant\nEducation Loan Assistance\nResidential Facilities\nTransportation Facilities\nMigration\nLPU Buzz (Influencer's Talk)\nLPU's EDUfair (Webinars)\nReporting and Induction\nDISHA (Counselling and Workshops)\nLPU in Your Town\nAfter 12th (Undergraduate) Programmes\nRegular Programmes\nHons. Programmes with Industry Coll

### Load the Data into Qdrant

In [9]:
from qdrant_client import models, QdrantClient
from sentence_transformers import SentenceTransformer

### Use the following model for converting the text into embeddings

In [10]:
client = QdrantClient(":memory:")
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

### Use In memory Qdrant Database (Qdrant is a vector database)

In [11]:
qdrant = QdrantClient(":memory:")
qdrant.recreate_collection(
    collection_name="university_data",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),  # Vector size is defined by used model
        distance=models.Distance.COSINE,
    ),
)

  qdrant.recreate_collection(


True

### Upload records into Qdrant

In [12]:
qdrant.upload_records(
    collection_name="university_data",
    records=[
        models.Record(
            id=idx, vector=encoder.encode(doc["text"]).tolist(), payload=doc
        )
        for idx, doc in enumerate(chunk_list)
    ],
)

  qdrant.upload_records(


### Search similar text and return top 3 elements

In [13]:
hits = qdrant.search(
    collection_name="university_data",
    query_vector=encoder.encode("how to get scholarship").tolist(),
    limit=1,
)
for hit in hits:
    print(hit.payload, "score:", hit.score)

{'text': 'Admission and Scholarship\nOnly Scholarship\nAdmission Process User Guide\nHow to Apply\nImportant Dates\nCovid-19 Initiatives\nFAQs\nSuccess link\n×\nSuccess\nPlease check your inbox we have sent you a link,\nClick on it to reset your password\nOk\nError link\nError\nError\nOk\n×\nConfirm\nDo you want to copy the form Form-name.\nNo\nYes\n×\nConfirm\nDo you want to copy the form Form-name.\nNo\nYes\nConfirmation\nYou have filtered Paid Applicants\nOk\nConfirmation\nResubmission Logic added successfully !\nOk\n×\nRe-assign User', 'source': 'https://admission.lpu.in/'} score: 0.6225961441644048
{'text': 'Overview\nProgramme Offered\nEnglish Language Requirement\nScholarship\nHow to Apply\nOnline Education\nProgrammes\nApply Now\nDistance Education\nAbout\nProgrammes on offer\nEnquire Now\nInternational\nGet Started\nHow to Apply\nApply Online\nHow to Pay\nSchlorship\nAdmission Guidelines\nAdmission Authetication\nEnglish Language Requirement\nOur Global Representatives\nProgra