In [5]:
import os
import re
import nltk
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

class IndexFileCreator:
    """Build a meta file and an inverted index from a folder of crawled pages.

    Every regular file in ``crawled_pages_folder`` is read as UTF-8 text and
    searched for ``URL: ``, ``Title: ``, ``Headings:`` and ``Page Body:``
    sections. The page ID is taken from a ``page<N>.txt`` file name; files
    with any other name get the page ID ``None``.

    Attributes:
        crawled_pages_folder: Directory that holds the crawled page files.
        inverted_index: Mapping ``keyword -> {'page_ids': set, 'frequency': int}``
            produced by the most recent ``generate_inverted_index`` call.

    NOTE(review): keyword extraction relies on NLTK's tokenizer, English
    stop-word list and WordNet data. Only 'wordnet' is downloaded in this
    file -- confirm the other NLTK resources are installed.
    """

    # Section patterns, compiled once instead of once per page.
    _URL_PATTERN = re.compile(r"URL: (.+)")
    _TITLE_PATTERN = re.compile(r"Title: (.+)")
    _BODY_PATTERN = re.compile(r"Page Body:(.+?)(?=Headings:|$)", re.DOTALL)
    _HEADING_PATTERN = re.compile(r"Headings:(.+?)Page Body:", re.DOTALL)
    _PAGE_ID_PATTERN = re.compile(r'page(\d+)\.txt')

    # Number of leading body words kept as the page preview in the meta file.
    _BODY_PREVIEW_WORDS = 200

    # Lazily created NLP helpers (see _extract_keywords). Class-level defaults
    # keep the lazy lookup working even on instances built without __init__.
    _stop_words = None
    _lemmatizer = None

    def __init__(self, crawled_pages_folder):
        """Remember the pages folder and start with an empty inverted index."""
        self.crawled_pages_folder = crawled_pages_folder
        self.inverted_index = {}

    def _iter_page_meta(self):
        """Yield ``extract_meta_info`` results for every regular file in the folder.

        File names are visited in sorted order so that both output files are
        reproducible; ``os.listdir`` alone returns names in arbitrary order.
        """
        for file_name in sorted(os.listdir(self.crawled_pages_folder)):
            file_path = os.path.join(self.crawled_pages_folder, file_name)
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as page_file:
                content = page_file.read()
            yield self.extract_meta_info(content, file_name)

    def _extract_keywords(self, text):
        """Return the lemmatized, lower-cased keyword tokens of ``text``.

        Tokens that are not purely alphanumeric and English stop words are
        dropped. Duplicates are kept, in order, because callers count them.
        """
        # Load the stop-word list and lemmatizer once per instance rather than
        # once per page.
        if self._stop_words is None:
            self._stop_words = set(stopwords.words('english'))
        if self._lemmatizer is None:
            self._lemmatizer = WordNetLemmatizer()

        stop_words = self._stop_words
        lemmatize = self._lemmatizer.lemmatize
        return [
            lemmatize(token)
            for token in word_tokenize(text.lower())
            if token.isalnum() and token not in stop_words
        ]

    @staticmethod
    def _page_id_sort_key(page_id):
        """Sort key that orders integer page IDs first and ``None`` last."""
        return (page_id is None, 0 if page_id is None else page_id)

    def generate_meta_file(self, meta_file_path):
        """Write one meta record per crawled page to ``meta_file_path``.

        Each record lists the page ID, URI, title, body preview and the
        comma-separated keywords, followed by a blank line.

        Args:
            meta_file_path: Destination file; created or overwritten (UTF-8).
        """
        with open(meta_file_path, 'w', encoding='utf-8') as meta_file:
            for meta_info in self._iter_page_meta():
                meta_file.write(f"Web Page ID: {meta_info['page_id']}\n")
                meta_file.write(f"URI: {meta_info['url']}\n")
                meta_file.write(f"Title: {meta_info['title']}\n")
                meta_file.write(f"Page Body: {meta_info['body']}\n")
                meta_file.write(f"Description Keywords: {', '.join(meta_info['keywords'])}\n\n")

    def extract_meta_info(self, content, file_name):
        """Parse one crawled page into its meta fields and keywords.

        Args:
            content: Full text of the crawled page file.
            file_name: Name of the page file; a ``page<N>.txt`` name supplies
                the page ID.

        Returns:
            dict with keys ``page_id`` (int, or None when the file name does
            not match), ``url``, ``title``, ``keywords`` (list of lemmatized
            tokens from headings plus the full body) and ``body`` (first 200
            whitespace-separated words of the body). Missing sections yield
            empty strings / no headings.
        """
        url_match = self._URL_PATTERN.search(content)
        title_match = self._TITLE_PATTERN.search(content)
        body_match = self._BODY_PATTERN.search(content)
        heading_match = self._HEADING_PATTERN.search(content)
        page_id_match = self._PAGE_ID_PATTERN.search(file_name)

        page_id = int(page_id_match.group(1)) if page_id_match else None
        url = url_match.group(1).strip() if url_match else ""
        title = title_match.group(1).strip() if title_match else ""
        body_text = body_match.group(1).strip() if body_match else ""
        body = ' '.join(body_text.split()[:self._BODY_PREVIEW_WORDS])

        headings = heading_match.group(1).strip().split("\n-") if heading_match else []

        # Keywords come from the headings plus the *full* body, not the preview.
        all_text = " ".join(headings) + " " + body_text
        keywords = self._extract_keywords(all_text)

        return {'page_id': page_id, 'url': url, 'title': title, 'keywords': keywords, 'body': body}

    def generate_inverted_index(self, inverted_index_file_path):
        """Build the inverted index and write it to ``inverted_index_file_path``.

        The index is rebuilt from scratch on every call, so re-running (for
        example re-executing a notebook cell) does not inflate frequencies.
        ``frequency`` is the total number of occurrences of the keyword over
        all pages; ``page_ids`` is the set of pages that contain it.

        Args:
            inverted_index_file_path: Destination file; created or overwritten
                (UTF-8) with a ``Keyword|Frequency|Document`` header line.
        """
        index = {}
        for meta_info in self._iter_page_meta():
            page_id = meta_info['page_id']
            for keyword in meta_info['keywords']:
                if keyword not in index:
                    index[keyword] = {'page_ids': set(), 'frequency': 0}
                index[keyword]['page_ids'].add(page_id)
                index[keyword]['frequency'] += 1
        self.inverted_index = index

        with open(inverted_index_file_path, 'w', encoding='utf-8') as inverted_index_file:
            inverted_index_file.write("Keyword|Frequency|Document\n")
            for keyword, data in index.items():
                # A page without a numeric ID contributes None; a plain sorted()
                # would raise TypeError when None is mixed with integers.
                page_ids = sorted(data['page_ids'], key=self._page_id_sort_key)
                frequency = data['frequency']
                inverted_index_file.write(f"{keyword}|{frequency}| {' '.join(map(str, page_ids))}\n")



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
if __name__ == "__main__":
    # Input folder with the crawled pages and the two output files.
    crawled_pages_folder = "crawled_pages"
    meta_file_path = "meta.txt"
    inverted_index_file_path = "inverted_index.txt"

    # One IndexFileCreator produces both artefacts: first the per-page
    # meta file, then the keyword inverted index.
    meta_creator = IndexFileCreator(crawled_pages_folder)
    meta_creator.generate_meta_file(meta_file_path)
    meta_creator.generate_inverted_index(inverted_index_file_path)

    # Confirm completion once both files have been written.
    print("Meta file and inverted index file have been generated successfully.")


Meta file and inverted index file have been generated successfully.
