<a href="https://colab.research.google.com/github/Alaa-f-Abdalaal/Machine-Projectss/blob/main/Search_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re
import requests


In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    tokens = word_tokenize(text)
    tokens = [
        lemmatizer.lemmatize(t)
        for t in tokens
        if t not in stop_words and t.isalpha()
    ]
    return tokens


In [5]:
import requests

def load_gutenberg_book(url):
    r = requests.get(url)
    r.encoding = "utf-8"
    return r.text

def clean_gutenberg_text(text):
    start = "*** START OF THIS PROJECT GUTENBERG EBOOK"
    end = "*** END OF THIS PROJECT GUTENBERG EBOOK"

    if start in text and end in text:
        text = text.split(start)[1].split(end)[0]
    return text


In [6]:
book_links = [
    "https://www.gutenberg.org/files/1342/1342-0.txt",  # Pride and Prejudice
    "https://www.gutenberg.org/files/11/11-0.txt",      # Alice in Wonderland
    "https://www.gutenberg.org/files/98/98-0.txt"       # A Tale of Two Cities
]


In [7]:
docs = []

for link in book_links:
    raw_text = load_gutenberg_book(link)
    clean_text = clean_gutenberg_text(raw_text)
    docs.append(clean_text)


In [8]:
def build_index(docs):
    index = {}
    for doc_id, doc in enumerate(docs):
        tokens = preprocess(doc)
        for pos, token in enumerate(tokens):
            if token not in index:
                index[token] = {}
            if doc_id not in index[token]:
                index[token][doc_id] = []
            index[token][doc_id].append(pos)
    return index


In [9]:
index = build_index(docs)
print("Index built for Project Gutenberg book!")


Index built for Project Gutenberg book!


In [10]:
def search(query, index, docs):
    q_tokens = preprocess(query)
    if not q_tokens:
        return []

    if q_tokens[0] not in index:
        return []

    result_docs = set(index[q_tokens[0]].keys())

    for term in q_tokens[1:]:
        if term not in index:
            return []
        result_docs &= set(index[term].keys())

    return [(i, docs[i]) for i in result_docs]


In [None]:
while True:
    query = input("\nSearch: ").strip()
    if query == "quit":
        break

    results = search(query, index, docs)

    print(f"\nFound {len(results)} document(s):")
    for doc_id, _ in results:
        print(f"Document [{doc_id}]")
        print(f"Link: {book_links}\n")



Search: half whisper

Found 3 document(s):
Document [0]
Link: ['https://www.gutenberg.org/files/1342/1342-0.txt', 'https://www.gutenberg.org/files/11/11-0.txt', 'https://www.gutenberg.org/files/98/98-0.txt']

Document [1]
Link: ['https://www.gutenberg.org/files/1342/1342-0.txt', 'https://www.gutenberg.org/files/11/11-0.txt', 'https://www.gutenberg.org/files/98/98-0.txt']

Document [2]
Link: ['https://www.gutenberg.org/files/1342/1342-0.txt', 'https://www.gutenberg.org/files/11/11-0.txt', 'https://www.gutenberg.org/files/98/98-0.txt']


Search: boisterousness

Found 1 document(s):
Document [0]
Link: ['https://www.gutenberg.org/files/1342/1342-0.txt', 'https://www.gutenberg.org/files/11/11-0.txt', 'https://www.gutenberg.org/files/98/98-0.txt']


Search: Blazing strange

Found 1 document(s):
Document [2]
Link: ['https://www.gutenberg.org/files/1342/1342-0.txt', 'https://www.gutenberg.org/files/11/11-0.txt', 'https://www.gutenberg.org/files/98/98-0.txt']

