# Crawling

In [16]:
from pathlib import Path
# Map each document's display title to the full text of its file.
data = {}
for doc_path in Path('src/data/documents').iterdir():
    # Only entries with a ".txt" suffix are treated as documents.
    if doc_path.suffix == '.txt':
        # File stem "some_file_name" becomes the title "Some File Name".
        doc_name = doc_path.stem.replace('_', ' ').title()
        data[doc_name] = doc_path.read_text()

# Text Preprocessing

In [17]:
from abc import ABC, abstractmethod
import string

class TextProcessor(ABC):
    """Interface for a single text-transformation step.

    Subclasses must override ``transform``; because that method is abstract,
    the base class itself cannot be instantiated.
    """

    @abstractmethod
    def transform(self, text):
        """Return the transformed version of ``text`` (implemented by subclasses)."""
    
class ConvertCase(TextProcessor):
    """Pipeline step that changes the letter case of the text.

    Args:
        casing: One of ``'lower'`` (default), ``'upper'`` or ``'title'``.

    Raises:
        ValueError: If ``casing`` is not one of the supported modes.
    """

    SUPPORTED_CASINGS = ('lower', 'upper', 'title')

    def __init__(self, casing='lower'):
        # Fail at construction time rather than letting transform() quietly
        # return None later and break the next step of a pipeline.
        if casing not in self.SUPPORTED_CASINGS:
            raise ValueError(
                f'Unsupported casing {casing!r}; expected one of {self.SUPPORTED_CASINGS}'
            )
        self.casing = casing

    def transform(self, text):
        """Return ``text`` converted according to ``self.casing``.

        Raises:
            ValueError: If ``self.casing`` was reassigned to an unsupported mode.
        """
        if self.casing == 'lower':
            return text.lower()
        if self.casing == 'upper':
            return text.upper()
        if self.casing == 'title':
            return text.title()
        # ``casing`` is a public attribute, so it can be changed after __init__.
        raise ValueError(
            f'Unsupported casing {self.casing!r}; expected one of {self.SUPPORTED_CASINGS}'
        )
    
class RemoveDigit(TextProcessor):
    """Pipeline step that blanks out digit characters."""

    def transform(self, text):
        """Return ``text`` with every digit character replaced by one space."""
        pieces = []
        for symbol in text:
            pieces.append(' ' if symbol.isdigit() else symbol)
        return ''.join(pieces)
    
class RemoveSpace(TextProcessor):
    """Pipeline step that normalises whitespace."""

    def transform(self, text):
        """Collapse each whitespace run to one space and trim both ends."""
        words = text.split()
        return ' '.join(words)
    
class RemovePunkt(TextProcessor):
    """Pipeline step that blanks out ASCII punctuation."""

    def transform(self, text):
        """Return ``text`` with each ``string.punctuation`` character replaced by a space."""
        cleaned = []
        for symbol in text:
            if symbol in string.punctuation:
                cleaned.append(' ')
            else:
                cleaned.append(symbol)
        return ''.join(cleaned)
    
class TextPipeline:
    """Chain of text processors applied one after another."""

    def __init__(self, *args):
        # Stored as the tuple of steps exactly as passed, in call order.
        self.transformers = args

    def transform(self, text):
        """Run ``text`` through every step; each step receives the previous step's output."""
        result = text
        for step in self.transformers:
            result = step.transform(result)
        return result

    def __str__(self):
        """Describe the pipeline as its step class names joined by arrows."""
        chain = ' -> '.join(step.__class__.__name__ for step in self.transformers)
        return f'Pipeline: [{chain}]'
    


In [18]:
# Step order matters: lowercase first, then blank out digits and punctuation,
# and only then collapse the extra spaces those replacements leave behind.
pipe = TextPipeline(
    ConvertCase(),
    RemoveDigit(),
    RemovePunkt(),
    RemoveSpace(),
)

# Indexing

In [25]:
# Load the stop words and normalise them with the same pipeline as the documents.
# The pipeline replaces punctuation and digits with spaces, so one stop-word line
# can normalise to several tokens (e.g. "don't" -> "don t"); every token is kept.
# A set gives constant-time membership checks, and the `with` block closes the file.
with open('src/data/stop_words.txt') as stop_words_file:
    stop_words = {
        token
        for line in stop_words_file
        for token in pipe.transform(line).split()
    }

# Inverted index: token -> set of names of the documents that contain it.
index = {}

for doc_name, content in data.items():
    # Normalise the whole document BEFORE tokenising. Transforming each
    # whitespace-separated word on its own turned a word with inner punctuation
    # or digits into a multi-word key, which a single-token search can never hit.
    # Splitting after the transform also never yields empty tokens.
    for word in pipe.transform(content).split():
        # Ignore stop words
        if word in stop_words:
            continue

        # Add to index
        index.setdefault(word, set()).add(doc_name)



# Search

In [26]:
from termcolor import colored
from collections import Counter

def print_success(text):
    """Print ``text`` to standard output, coloured green via ``colored``."""
    message = colored(text, 'green')
    print(message)
    
TOP_N = 3


def rank_documents(query, top_n=None):
    """Return the names of the documents that best match ``query``.

    The query is normalised with ``pipe`` and split into tokens. A document
    scores one point per query token whose index entry contains it.

    Args:
        query: Raw search text.
        top_n: Maximum number of names to return; ``None`` means ``TOP_N``.

    Returns:
        Document names ordered by score (highest first). Ties are ordered by
        name, so the result does not depend on set iteration order.
    """
    limit = TOP_N if top_n is None else top_n

    # Count how many query tokens hit each document
    hits = Counter()
    for token in pipe.transform(query).split():
        hits.update(index.get(token, ()))

    ranked = sorted(hits.items(), key=lambda item: (-item[1], item[0]))
    return [name for name, score in ranked[:limit]]


while True:
    # Get user input
    search_input = input('Search to find a doc (q to quit):')
    # Strip first so that " q " also quits
    if search_input.strip().lower() == 'q':
        break

    # Get and rank relevant documents
    docs = rank_documents(search_input)

    # Print the results; say so explicitly when nothing matched
    if not docs:
        print(colored('No matching documents found.', 'yellow'))
        continue
    for doc in docs:
        print_success(f'- {doc}')

Search to find a doc (q to quit): q
