<div align="center">
<h1> SEIR Project-1 </h1   >
</div>

In [33]:
import sys
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re


In [20]:

def fetch_page(url):
    response = requests.get(url)
    return response.text

In [None]:

def get_title(soup):
    if soup.title:
        return soup.title.string
    return "No title Found"

In [22]:
def get_body_text(soup):
    if soup.body:
        for tag in soup(["script", "style"]):
            tag.decompose()
        return soup.body.get_text()
    return ""

In [23]:

def get_links(soup, base_url):
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if href:
            full_url = urljoin(base_url, href)
            if full_url.startswith('http'):
                links.append(full_url)
    return links

In [7]:
def getDetailByURL(url):
    response = fetch_page(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    
    print(f"Title: {get_title(soup)}\n")
    
    body = get_body_text(soup)
    print(f"Body: {body}\n")
    
    print("Links:")
    for link in get_links(soup, url):
        print(link)

In [17]:
getDetailByURL("https://www.oshoworld.com/")

Title: Osho World

Body: OSHO'S LIFE Osho Dham Meditation osho pearls StoreOshodham ProgramsUpcoming ProgramsView AllOsho Meditation RetreatFebruary 27th, 2026Click HereBreath : The Key to MeditationMarch 6th, 2026Click HereBorn Again GroupMarch 9th, 2026Click HereVideosVIEW ALLOsho DiscourseshindienglishVIEW ALLReadshindienglishOsho MeditationsDynamic MeditationKundalini MeditationNataraj MeditationMandala MeditationNadabrahma MeditationNo Dimensions MeditationShopBooksExplore MoreAudio DiscourseExplore MoreOsho PhotosExplore MoreOsho MagazineExplore MorePearlsNews

Links:
https://www.oshoworld.com/osho-biography
https://www.oshoworld.com/osho-dham
https://www.oshoworld.com/meditation
https://www.oshoworld.com/pearls
https://www.oshoworld.com/shop
https://www.oshoworld.com/osho-dham
https://www.oshoworld.com/events
https://www.oshoworld.com/osho-meditation-retreat
https://www.oshoworld.com/breath-the-key-to-meditation
https://www.oshoworld.com/born-again-group
https://www.oshoworld.co

In [9]:
getDetailByURL("https://oshoworld.com/")

Title: Osho World

Body: OSHO'S LIFE Osho Dham Meditation osho pearls StoreOshodham ProgramsUpcoming ProgramsView AllOsho Meditation RetreatFebruary 27th, 2026Click HereBreath : The Key to MeditationMarch 6th, 2026Click HereBorn Again GroupMarch 9th, 2026Click HereVideosVIEW ALLOsho DiscourseshindienglishVIEW ALLReadshindienglishOsho MeditationsDynamic MeditationKundalini MeditationNataraj MeditationMandala MeditationNadabrahma MeditationNo Dimensions MeditationShopBooksExplore MoreAudio DiscourseExplore MoreOsho PhotosExplore MoreOsho MagazineExplore MorePearlsNews

Links:
https://oshoworld.com/osho-biography
https://oshoworld.com/osho-dham
https://oshoworld.com/meditation
https://oshoworld.com/pearls
https://oshoworld.com/shop
https://oshoworld.com/osho-dham
https://oshoworld.com/events
https://oshoworld.com/osho-meditation-retreat
https://oshoworld.com/breath-the-key-to-meditation
https://oshoworld.com/born-again-group
https://oshoworld.com/video
https://oshoworld.com/video
https://

In [24]:
def count_word_frequency(text):
    words = re.findall(r'[a-zA-Z0-9]+', text.lower())
    word_count = {}
    for word in words:
        if word in word_count:
            word_count[word] = word_count[word] + 1
        else:
            word_count[word] = 1
    return word_count

In [25]:
def polynomial_rolling_hash(word):
    p = 53
    hash_value = 0
    for char in word:
        ascii_val = ord(char)
        hash_value = hash_value * p + ascii_val
        hash_value = hash_value % (2**64)
    return hash_value

In [26]:
def simhash(word_frequencies):
    boxes = [0] * 64
    for word, count in word_frequencies.items():
        word_hash = polynomial_rolling_hash(word)
        for position in range(64):
            bit_is_1 = (word_hash >> position) & 1
            if bit_is_1 == 1:
                boxes[position] = boxes[position] + count
            else:
                boxes[position] = boxes[position] - count
    
    fingerprint = 0
    for position in range(64):
        if boxes[position] > 0:
            fingerprint = fingerprint | (1 << position)
    return fingerprint

In [28]:
def countCommonBits(fp1, fp2):
    different_count = 0
    for position in range(64):
        bit1 = (fp1 >> position) & 1
        bit2 = (fp2 >> position) & 1
        if bit1 != bit2:
            different_count = different_count + 1
    return 64 - different_count



In [29]:
def process_url(url):
    html = fetch_page(url)
    soup = BeautifulSoup(html, 'html.parser')
    title = get_title(soup)
    body = get_body_text(soup)
    links = get_links(soup, url)
    word_freq = count_word_frequency(body)
    fingerprint = simhash(word_freq)
    return title, body, links, word_freq, fingerprint

In [34]:

def compare_two_urls(url1, url2):
    print("\n" + "="*50)
    print("FIRST WEBSITE:", url1)
    print("="*50)
    title1, body1, links1, freq1, fp1 = process_url(url1)
    
    print("Title:", title1)
    print("Body length:", len(body1), "characters")
    print("Number of links found:", len(links1))
    print("Unique words found:", len(freq1))
    
    print("Top 5 most used words:")
    word_list = []
    for word, count in freq1.items():
        word_list.append((word, count))
    
    for i in range(len(word_list)):
        for j in range(i + 1, len(word_list)):
            if word_list[i][1] < word_list[j][1]:
                word_list[i], word_list[j] = word_list[j], word_list[i]
    
    for i in range(min(5, len(word_list))):
        print("  ", word_list[i][0], ":", word_list[i][1], "times")
    
    print("Fingerprint (64 bits):", bin(fp1))
    
    print("\n" + "="*50)
    print("SECOND WEBSITE:", url2)
    print("="*50)
    title2, body2, links2, freq2, fp2 = process_url(url2)
    
    print("Title:", title2)
    print("Body length:", len(body2), "characters")
    print("Number of links found:", len(links2))
    print("Unique words found:", len(freq2))
    
    print("Top 5 most used words:")
    word_list = []
    for word, count in freq2.items():
        word_list.append((word, count))
    
    for i in range(len(word_list)):
        for j in range(i + 1, len(word_list)):
            if word_list[i][1] < word_list[j][1]:
                word_list[i], word_list[j] = word_list[j], word_list[i]
    
    for i in range(min(5, len(word_list))):
        print("  ", word_list[i][0], ":", word_list[i][1], "times")
    
    print("Fingerprint (64 bits):", bin(fp2))
    
    print("\n" + "="*50)
    print("COMPARING THE TWO WEBSITES")
    print("="*50)
    common = count_common_bits(fp1, fp2)
    print("Same bit positions:", common, "out of 64")
    print("Similarity:", (common/64)*100, "%")
    
    return common

In [35]:
compare_two_urls("https://oshoworld.com/", "https://www.oshoworld.com/")


FIRST WEBSITE: https://oshoworld.com/
Title: Osho World
Body length: 548 characters
Number of links found: 30
Unique words found: 40
Top 5 most used words:
   osho : 3 times
   2026click : 3 times
   allosho : 2 times
   meditation : 2 times
   moreosho : 2 times
Fingerprint (64 bits): 0b100000100000000000001000010100110011100101

SECOND WEBSITE: https://www.oshoworld.com/
Title: Osho World
Body length: 548 characters
Number of links found: 30
Unique words found: 40
Top 5 most used words:
   osho : 3 times
   2026click : 3 times
   allosho : 2 times
   meditation : 2 times
   moreosho : 2 times
Fingerprint (64 bits): 0b100000100000000000001000010100110011100101

COMPARING THE TWO WEBSITES
Same bit positions: 64 out of 64
Similarity: 100.0 %


64