<a href="https://colab.research.google.com/github/AbdAllAh950/Algorithms/blob/main/T7_%D0%98%D1%81%D1%81%D0%B0_%D0%90%D0%B1%D0%B4%D0%B0%D0%BB%D0%BB%D0%B0(Abdallah_Essa).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required libraries
!pip install beautifulsoup4 bitarray

# Import libraries
import requests
from bs4 import BeautifulSoup
from collections import Counter, defaultdict
import heapq
from bitarray import bitarray

Collecting bitarray
  Downloading bitarray-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (32 kB)
Downloading bitarray-3.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (278 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.3/278.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitarray
Successfully installed bitarray-3.0.0


In [None]:
# Step 1: Fetch and prepare text
# Excerpt used whenever the poem cannot be downloaded or located in the page.
_FALLBACK_TEXT = (
    "Fallback text: By the shores of Gitche Gumee, By the shining Big-Sea-Water,"
    " Stood the wigwam of Nokomis, Daughter of the Moon, Nokomis."
)


def fetch_text_from_url(url, timeout=10):
    """Download a page and return the text of its ``<div class="poem-text">``.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely.

    Returns:
        The whitespace-stripped text of the poem element, or a short built-in
        fallback excerpt when the request fails or the element is missing.
        A message is printed in both fallback cases so the substitution is
        never silent.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        poem_div = soup.find("div", class_="poem-text")
        if poem_div:
            return poem_div.get_text(separator=" ").strip()
    except Exception as e:
        # Best-effort by design: any failure degrades to the fallback text.
        print(f"Error fetching text: {e}")
        return _FALLBACK_TEXT
    # The page loaded but did not contain the expected element.
    print(f"No <div class=\"poem-text\"> found at {url}; using fallback text.")
    return _FALLBACK_TEXT

In [None]:
# URL of the poem
url = "https://www.hwlongfellow.org/poems_poem.php?pid=62"
# fetch_text_from_url returns a built-in fallback excerpt when the download
# fails or the page has no <div class="poem-text">, so `text` is set either way.
# `text` is the input for both the Huffman and the LZW cells below.
text = fetch_text_from_url(url)

In [None]:
# Step 2: Huffman Encoding
def build_huffman_tree(freq_table):
    """Derive a Huffman code table from symbol frequencies.

    Args:
        freq_table: Mapping of symbol -> weight (for example a ``Counter``).

    Returns:
        A list of ``[symbol, code]`` pairs, where ``code`` is a string of
        ``'0'``/``'1'`` characters. The list is sorted by code length and then
        by the pair itself. An empty mapping yields an empty list, and a
        mapping with a single symbol assigns that symbol the one-bit code
        ``'0'`` so that every symbol always costs at least one bit.
    """
    if not freq_table:
        # Nothing to encode; popping from an empty heap would raise IndexError.
        return []

    # Each heap entry is [total_weight, [symbol, code], [symbol, code], ...].
    heap = [[weight, [symbol, ""]] for symbol, weight in freq_table.items()]
    if len(heap) == 1:
        # With one symbol no merge ever happens, which would leave its code
        # empty and make the encoded output zero bits long.
        heap[0][1][1] = "0"
    heapq.heapify(heap)

    while len(heap) > 1:
        # Merge the two lightest subtrees; the lighter one takes the '0' branch.
        lo = heapq.heappop(heap)
        hi = heapq.heappop(heap)
        for pair in lo[1:]:
            pair[1] = '0' + pair[1]
        for pair in hi[1:]:
            pair[1] = '1' + pair[1]
        heapq.heappush(heap, [lo[0] + hi[0]] + lo[1:] + hi[1:])

    return sorted(heapq.heappop(heap)[1:], key=lambda p: (len(p[-1]), p))

def huffman_encode(text, huffman_code):
    """Concatenate the code of every symbol in ``text`` into one bitarray.

    Args:
        text: Iterable of symbols to encode.
        huffman_code: Mapping of symbol -> bit sequence to append for it.

    Returns:
        A ``bitarray`` holding the codes in input order. A symbol missing
        from ``huffman_code`` raises ``KeyError``.
    """
    bits = bitarray()
    append_code = bits.extend
    # map() is lazy, so each lookup happens right before its append.
    for code in map(huffman_code.__getitem__, text):
        append_code(code)
    return bits

# Count character occurrences, derive the code table, and encode the text.
freq_table = Counter(text)
huffman_tree = build_huffman_tree(freq_table)  # list of [symbol, code-string] pairs
huffman_code = {symbol: bitarray(code) for symbol, code in huffman_tree}
encoded_text_huffman = huffman_encode(text, huffman_code)
# Baseline size is the UTF-8 byte length: fetched web text is not guaranteed
# to be ASCII, so one character is not always 8 bits.
original_size = len(text.encode("utf-8")) * 8
compressed_size_huffman = len(encoded_text_huffman)
# The ratio is undefined when no bits were produced (e.g. empty text);
# report NaN instead of raising ZeroDivisionError.
compression_ratio_huffman = (
    original_size / compressed_size_huffman
    if compressed_size_huffman
    else float("nan")
)

In [None]:
# Step 3: LZW Compression
def lzw_compress(text):
    """Compress ``text`` with LZW and return the codes plus the final dictionary.

    The dictionary starts with the 256 single characters ``chr(0)``..``chr(255)``.
    Any other character that occurs in ``text`` is added to the initial
    alphabet (in sorted order, starting at code 256) before compression
    begins. Without this, a character above U+00FF would be looked up as a
    single-character phrase that was never registered and raise ``KeyError``.
    Text made only of characters below U+0100 gets exactly the same codes as
    it would without the extra seeding.

    Args:
        text: The string to compress.

    Returns:
        A tuple ``(compressed, dictionary)`` where ``compressed`` is the list
        of integer codes and ``dictionary`` maps every known phrase to its
        code. Empty input yields an empty code list.
    """
    dictionary = {chr(i): i for i in range(256)}
    next_code = 256

    # Seed characters outside the default alphabet so that every single
    # character is always a known phrase.
    for symbol in sorted(ch for ch in set(text) if ch not in dictionary):
        dictionary[symbol] = next_code
        next_code += 1

    current = ""
    compressed = []
    for char in text:
        combined = current + char
        if combined in dictionary:
            # Keep growing the phrase while it is still known.
            current = combined
        else:
            # Emit the longest known phrase and register the new, longer one.
            compressed.append(dictionary[current])
            dictionary[combined] = next_code
            next_code += 1
            current = char
    if current:
        # Flush the phrase still pending at end of input.
        compressed.append(dictionary[current])
    return compressed, dictionary

compressed_data_lzw, lzw_dict = lzw_compress(text)
# Codes run from 0 to len(lzw_dict) - 1. Assume 16-bit codes, but widen the
# estimate if the dictionary has grown past what 16 bits can address.
lzw_code_bits = max(16, (len(lzw_dict) - 1).bit_length())
compressed_size_lzw = len(compressed_data_lzw) * lzw_code_bits
# The ratio is undefined when no codes were emitted (empty text);
# report NaN instead of raising ZeroDivisionError.
compression_ratio_lzw = (
    original_size / compressed_size_lzw if compressed_size_lzw else float("nan")
)

In [None]:
# Step 4: Country Name Embedding (Explanation)
# Static explanatory text only; nothing is computed in this cell. The string is
# printed verbatim by the results cell, so its leading and trailing newlines
# show up as blank lines in the output.
embedding_explanation = """
To embed a country name (e.g., "France") in a dataset for a machine learning task:
1. Replace random words in the dataset with country names from a list.
2. Use embeddings for natural language processing tasks (e.g., Named Entity Recognition).
3. Example: Replace "X is a beautiful place" with "France is a beautiful place."
This creates a labeled dataset for models to learn recognizing geographic entities.
"""

In [None]:
# Step 5: Display Results
# Assemble the report entries in display order, then emit them one per line.
report_entries = (
    f"Original Size (bits): {original_size}",
    f"Huffman Compressed Size (bits): {compressed_size_huffman}",
    f"Huffman Compression Ratio: {compression_ratio_huffman:.2f}",
    f"LZW Compressed Size (bits): {compressed_size_lzw}",
    f"LZW Compression Ratio: {compression_ratio_lzw:.2f}",
    f"Huffman Code Table Size: {len(huffman_code)}",
    f"LZW Dictionary Size: {len(lzw_dict)}",
    "\nCountry Name Embedding Explanation:",
    embedding_explanation,
)
print(*report_entries, sep="\n")

Original Size (bits): 1080
Huffman Compressed Size (bits): 613
Huffman Compression Ratio: 1.76
LZW Compressed Size (bits): 1664
LZW Compression Ratio: 0.65
Huffman Code Table Size: 34
LZW Dictionary Size: 359

Country Name Embedding Explanation:

To embed a country name (e.g., "France") in a dataset for a machine learning task:
1. Replace random words in the dataset with country names from a list.
2. Use embeddings for natural language processing tasks (e.g., Named Entity Recognition).
3. Example: Replace "X is a beautiful place" with "France is a beautiful place."
This creates a labeled dataset for models to learn recognizing geographic entities.

