**CollectionStats class**
Implement the appropriate methods of the CollectionStats class so that the statistics returned by the already-implemented accessor methods have the correct values.

Also implement the two persistence methods: one writes the stats to a file, and the other loads stats that were previously written.

In [None]:
import os
import pickle
from collections import defaultdict
import string

class CollectionStats:
    """Running statistics over a collection of tokenized documents.

    Attributes:
        N: number of documents added so far.
        TF: total number of tokens over all documents.
        M: number of distinct terms seen in the collection.
        L: average number of tokens per document.
        C: average length, in characters, of the distinct terms.
    """

    def __init__(self):
        self.N = 0 # number of documents
        self.TF = 0 # number of tokens in the collection
        self.M = 0 # number of unique terms in the collection
        self.L = 0 # average length of the documents in the collection
        self.C = 0 # average length of the terms in the collection
        # Bookkeeping needed to keep M, C and the per-token average exact.
        self._terms = set() # distinct terms seen so far
        self._term_chars = 0 # summed length of the distinct terms
        self._token_chars = 0 # summed length of every token

    def addDocument(self, document):
        """Fold one document into the statistics.

        Args:
            document: iterable of tokens (strings) making up the document.
                An empty document is counted in ``N`` and contributes
                nothing else.
        """
        tokens = list(document)
        self.N += 1
        self.TF += len(tokens)
        self._token_chars += sum(len(token) for token in tokens)
        self.L = self.TF / self.N

        # Only terms never seen before may grow the vocabulary; a term that
        # occurs in several documents must be counted once.
        new_terms = set(tokens) - self._terms
        self._terms |= new_terms
        self._term_chars += sum(len(term) for term in new_terms)
        self.M = len(self._terms)
        # Guard the division: the vocabulary is empty until a non-empty
        # document has been added.
        self.C = self._term_chars / self.M if self.M else 0

    def numberOfDocuments(self):
        """Return the number of documents added."""
        return self.N

    def numberOfTokens(self):
        """Return the total number of tokens over all documents."""
        return self.TF

    def numberOfTerms(self):
        """Return the number of distinct terms in the collection."""
        return self.M

    def averageTokensPerDoc(self):
        """Return tokens per document, or 0 when no document was added."""
        if self.N == 0:
            return 0
        return self.TF / self.N

    def averageCharsPerToken(self):
        """Return characters per token, or 0 when there are no tokens."""
        if self.TF == 0:
            return 0
        return self._token_chars / self.TF

    def write(self, index_path):
        """Pickle the statistics to ``collection_stats.pkl`` in ``index_path``."""
        with open(os.path.join(index_path, 'collection_stats.pkl'), 'wb') as f:
            pickle.dump(self.__dict__, f)

    def load(self, index_path):
        """Replace the current state with the one stored in ``index_path``.

        Fields missing from the stored state keep their initial values.
        """
        # NOTE: unpickling can execute arbitrary code; only load files that
        # were produced by write() from a trusted location.
        with open(os.path.join(index_path, 'collection_stats.pkl'), 'rb') as f:
            stored = pickle.load(f)
        state = CollectionStats().__dict__
        state.update(stored)
        self.__dict__ = state



## `Lexicon` Class

> Implement the `Lexicon` class for **Term Dictionary**

> This Lexicon class, for each unique term in the collection, stores 
1. An integer valued index for the term
2. The Document Frequency (DF) for the term<br>

That is, it stores $<\text{term: (idx, DF)}>$ pairs.

In [None]:
from collections import defaultdict

class Lexicon:
    """Term dictionary keeping a counter per term.

    A term's index is its position among all known terms in sorted order,
    so indices are recomputed from the current vocabulary on every lookup.
    """

    def __init__(self):
        # term -> number of addToken calls made for that term
        self.lexicon = defaultdict(int)
        # number of addToken calls made overall
        self.totalDF = 0

    def addToken(self, token):
        """Increase the counter of ``token`` (and the overall total) by one."""
        self.lexicon[token] = self.lexicon[token] + 1
        self.totalDF = self.totalDF + 1

    def size(self):
        """Return how many distinct terms are stored."""
        return len(self.lexicon)

    def getDF(self, term):
        """Return the counter of ``term``; 0 for an unknown term.

        The lookup does not insert the term into the dictionary.
        """
        if term in self.lexicon:
            return self.lexicon[term]
        return 0

    def getTotalDF(self):
        """Return the sum of the counters of all terms."""
        total = 0
        for count in self.lexicon.values():
            total += count
        return total

    def getIdx(self, term):
        """Return the position of ``term`` among the sorted terms.

        Raises:
            ValueError: if ``term`` is not in the lexicon.
        """
        ordered_terms = sorted(self.lexicon)
        return ordered_terms.index(term)

    def getTerm(self, idx):
        """Return the term found at position ``idx`` of the sorted terms."""
        ordered_terms = sorted(self.lexicon)
        return ordered_terms[idx]

    def write(self, index_path):
        """Pickle the lexicon state to ``lexicon.pkl`` inside ``index_path``."""
        target = os.path.join(index_path, 'lexicon.pkl')
        with open(target, 'wb') as f:
            pickle.dump(vars(self), f)

    def load(self, index_path):
        """Replace the state with the one pickled in ``index_path``."""
        source = os.path.join(index_path, 'lexicon.pkl')
        with open(source, 'rb') as f:
            restored = pickle.load(f)
        self.__dict__ = restored



## `Documents` Class

> Similar to the `Lexicon` class, the `Documents` class stores, for each document in the collection:
1. An integer valued index for the document
2. Length of the document in terms of tokens, i.e. Document Size

That is, it stores $<\text{document: (idx, length)}>$ pairs.

In [None]:
class Documents:
    """Ordered registry of document identifiers.

    Attributes:
        documents: identifiers in insertion order; the list position is the
            document index.
        doc_idx_map: identifier -> index. When an identifier is added more
            than once it maps to its most recent position.
    """

    def __init__(self):
        self.documents = []
        self.doc_idx_map = {}

    def __iter__(self):
        """Iterate over the identifiers in index order."""
        return iter(self.documents)

    def addDocument(self, document):
        """Append ``document`` and remember its index."""
        self.documents.append(document)
        self.doc_idx_map[document] = len(self.documents) - 1

    def getIdx(self, document):
        """Return the index of ``document``, or -1 if it is unknown."""
        return self.doc_idx_map.get(document, -1)

    def getDocument(self, idx):
        """Return the identifier at ``idx``, or None if ``idx`` is out of range.

        Negative indices are treated as out of range.
        """
        if 0 <= idx < len(self.documents):
            return self.documents[idx]
        return None

    def write(self, index_path):
        """Write the identifiers, one per line, to ``documents.txt``.

        Identifiers that themselves contain a newline cannot be represented
        in this format.
        """
        with open(os.path.join(index_path, "documents.txt"), "w", encoding="utf-8") as f:
            f.writelines(doc + "\n" for doc in self)

    def load(self, index_path):
        """Read the identifiers back from ``documents.txt``."""
        with open(os.path.join(index_path, "documents.txt"), "r", encoding="utf-8") as f:
            # Remove only the line terminator: stripping all whitespace would
            # alter identifiers that start or end with spaces, so they would
            # no longer match what was added.
            self.documents = [line.rstrip("\n") for line in f]
        self.doc_idx_map = {doc: i for i, doc in enumerate(self.documents)}


## `Index` Class

> This class implements a BSB (blocked sort-based) indexer.

In [None]:
import os
from nltk.tokenize import word_tokenize
from nltk.tokenize import TreebankWordTokenizer
from collections import Counter
import re
import json
import shutil
from collections import defaultdict
import nltk
nltk.download('punkt')  # download punkt tokenizer


def get_tokens(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens

class Index:
    """Block-based indexer over a directory tree of ``.txt`` files.

    Attributes:
        index_path: directory every index file is written to / read from.
        lexicon: ``Lexicon`` fed once per distinct term of each document.
        documents: ``Documents`` registry of the indexed files, stored as
            paths relative to the collection root.
        collectionStats: ``CollectionStats`` fed with each document's tokens.
        inverted_index: in-memory map token -> {block_id: occurrences}.
        invertedIndex: the final merged block (a dict with the keys
            ``block_id``, ``docs`` and ``tokens``), or None until ``create``
            or ``load`` has run.
    """

    def __init__(self, index_path):
        self.index_path = index_path
        self.lexicon = Lexicon()
        self.documents = Documents()
        self.collectionStats = CollectionStats()
        self.inverted_index = defaultdict(lambda: defaultdict(int))
        # Defined up front so write() never hits a missing attribute.
        self.invertedIndex = None

    def add_block_to_index(self, block_path, block_id, block_docs, block_tokens):
        """Serialize one block to ``block_<id>.json`` and return the file path."""
        block = {
            'block_id': block_id,
            'docs': block_docs,
            'tokens': block_tokens
        }
        block_file = os.path.join(block_path, f'block_{block_id}.json')
        with open(block_file, 'w', encoding='utf-8') as f:
            json.dump(block, f)
        return block_file

    def flush_block(self, block_id, block_docs, block_tokens):
        """Register a finished block's documents and token occurrences.

        Collection statistics and the lexicon are not touched here: both need
        per-document token lists, which a flat block no longer has, so
        ``create`` updates them while it still holds each document's tokens.
        """
        for doc in block_docs:
            self.documents.addDocument(doc)
        for token in block_tokens:
            self.inverted_index[token][block_id] += 1

    def merge_blocks(self, block1, block2):
        """Return a new block holding block1's content followed by block2's.

        The merged block keeps the ``block_id`` of ``block1``.
        """
        merged_docs = block1['docs'] + block2['docs']
        merged_tokens = block1['tokens'] + block2['tokens']
        merged_block = {
            'block_id': block1['block_id'],
            'docs': merged_docs,
            'tokens': merged_tokens
        }
        return merged_block

    def _record_document(self, tokens):
        """Update collection statistics and the lexicon for one document."""
        self.collectionStats.addDocument(tokens)
        # One addToken call per distinct term, so the lexicon counter is the
        # number of documents containing the term. dict.fromkeys removes
        # duplicates while keeping first-seen order.
        for term in dict.fromkeys(tokens):
            self.lexicon.addToken(term)

    def create(self, collection_path):
        """Index every ``.txt`` file under ``collection_path``.

        Files are read as UTF-8 and tokenized with ``get_tokens``. Tokens are
        gathered into blocks that are written to ``<index_path>/blocks``, and
        the blocks are then merged pairwise into a single block. Finally the
        statistics, lexicon, document list and merged block are written to
        ``index_path``. An existing ``blocks`` directory is removed first.
        """
        # Create Inverted Index using BSB Indexing
        BLOCK_SIZE = 10
        block_path = os.path.join(self.index_path, "blocks")
        if os.path.exists(block_path):
            shutil.rmtree(block_path)
        os.makedirs(block_path)

        block_id = 0
        block_docs = []
        block_tokens = []
        # Block files in creation order. Relying on a lexicographic sort of
        # the directory listing would place block_10 before block_2.
        block_files = []

        for root, dirs, files in os.walk(collection_path):
            # Sorting in place fixes the traversal order, which keeps the
            # document indices reproducible between runs.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith(".txt"):
                    continue
                file_path = os.path.join(root, file)
                with open(file_path, "r", encoding="utf-8") as f:
                    content = f.read()
                tokens = get_tokens(content)
                self._record_document(tokens)
                block_docs.append(os.path.relpath(file_path, collection_path))
                block_tokens.extend(tokens)

                if len(block_tokens) > BLOCK_SIZE:
                    block_files.append(
                        self.add_block_to_index(block_path, block_id, block_docs, block_tokens))
                    self.flush_block(block_id, block_docs, block_tokens)
                    block_docs = []
                    block_tokens = []
                    block_id += 1

        # Test for pending documents rather than pending tokens, otherwise a
        # trailing run of empty files would never be registered.
        if block_docs:
            block_files.append(
                self.add_block_to_index(block_path, block_id, block_docs, block_tokens))
            self.flush_block(block_id, block_docs, block_tokens)

        # Merge Blocks
        merged_blocks = []
        for block_file in block_files:
            with open(block_file, "r", encoding="utf-8") as f:
                merged_blocks.append(json.load(f))

        while len(merged_blocks) > 1:
            next_round = []
            for i in range(0, len(merged_blocks), 2):
                pair = merged_blocks[i:i + 2]
                if len(pair) == 2:
                    next_round.append(self.merge_blocks(pair[0], pair[1]))
                else:
                    # Odd block out is carried unchanged into the next round.
                    next_round.append(pair[0])
            merged_blocks = next_round

        # After index is created
        self.collectionStats.write(self.index_path)
        self.lexicon.write(self.index_path)
        self.documents.write(self.index_path)

        if merged_blocks:
            self.invertedIndex = merged_blocks[0]
            self.write(self.index_path)
        else:
            print("No blocks were created during indexing.")

    def write(self, index_path):
        """Write the merged block and the document list as JSON files."""
        with open(os.path.join(index_path, "inverted_index.json"), "w", encoding="utf-8") as f:
            json.dump(self.invertedIndex, f)

        with open(os.path.join(index_path, "documents.json"), "w", encoding="utf-8") as f:
            json.dump(self.documents.documents, f)

    def load(self):
        """Load the merged block, statistics, lexicon and documents from disk."""
        with open(os.path.join(self.index_path, "inverted_index.json"), "r", encoding="utf-8") as f:
            self.invertedIndex = json.load(f)

        self.collectionStats.load(self.index_path)
        self.lexicon.load(self.index_path)
        self.documents.load(self.index_path)



## Index creation

In [None]:


# Where the raw collection lives and where the index files are written.
collection_path = "Your Collection file Path"
index_path = "Your Indexed file Path"

# Build the index over the collection.
milliyet_index = Index(index_path)
milliyet_index.create(collection_path)

# Components holding the figures reported below.
collectionStats = milliyet_index.collectionStats
lexicon = milliyet_index.lexicon

# Figures taken from the collection statistics.
N = collectionStats.numberOfDocuments()     # number of documents
L = collectionStats.averageTokensPerDoc()   # average number of tokens per document
C = collectionStats.averageCharsPerToken()  # average number of characters per token

# Figures taken from the term dictionary.
M = lexicon.size()            # number of terms in the dictionary
NPP = lexicon.getTotalDF()    # "total DF" summed over all terms

# Report: a heading, then one "label<TAB>value" line per figure.
print("\nCollection Statistics")
report_rows = (("N", N), ("L", L), ("C", C), ("M", M), ("NPP", NPP))
print("\n" + "\n".join(f"{label}\t{value}" for label, value in report_rows))


## Index Loading

In [None]:
# Directory holding a previously created index.
index_path = "Your indexed file path"

# Restore the index from the files in that directory.
milliyet_index = Index(index_path)
milliyet_index.load()

# Components holding the figures reported below.
collectionStats = milliyet_index.collectionStats
lexicon = milliyet_index.lexicon

# Figures taken from the collection statistics.
N = collectionStats.numberOfDocuments()     # number of documents
L = collectionStats.averageTokensPerDoc()   # average number of tokens per document
C = collectionStats.averageCharsPerToken()  # average number of characters per token

# Figures taken from the term dictionary.
M = lexicon.size()            # number of terms in the dictionary
NPP = lexicon.getTotalDF()    # "total DF" summed over all terms

# Total term frequency over all documents.
TF = collectionStats.numberOfTokens()

# Report: a heading, then one "label<TAB>value" line per figure.
print("\nCollection Statistics")
report_rows = (("N", N), ("L", L), ("C", C), ("M", M), ("TF", TF), ("NPP", NPP))
print("\n" + "\n".join(f"{label}\t{value}" for label, value in report_rows))