In [58]:
from concurrent.futures import ThreadPoolExecutor
import json
import multiprocessing
import os
import pickle
from queue import Empty, Queue
from urllib.parse import urljoin, urlparse
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from scipy import sparse
from nltk.stem import PorterStemmer
import re
from pathlib import Path
from bs4 import BeautifulSoup, Comment
from nltk.tokenize import word_tokenize
from ordered_set import OrderedSet
import requests
import numpy as np

In [92]:
def custom_preprocessor(s): 
    ps = PorterStemmer() 
    s = s.lower()
    s = re.sub(r'[^A-Za-z]', ' ', s) 
    s = re.sub(r'\s+', ' ', s) 
    s = word_tokenize(s) 
    s = [word for word in s if len(word)>2] 
    s = ' '.join(s) 
    return s 

In [49]:
class BM25(object):
    def __init__(self, vectorizer, b=0.75, k1=1.6):
        self.vectorizer = vectorizer
        self.b = b
        self.k1 = k1

    def fit(self, X):
        """ Fit IDF to documents X """
        self.vectorizer.fit(X) 
        self.y = super(TfidfVectorizer, self.vectorizer).transform(X)
        self.avdl = self.y.sum(1).mean()

    def transform(self, q):
        """ Calculate BM25 between query q and documents X """
        b, k1, avdl = self.b, self.k1, self.avdl

         # apply CountVectorizer
        len_y = self.y.sum(1).A1
        q, = super(TfidfVectorizer, self.vectorizer).transform([q])
        assert sparse.isspmatrix_csr(q)

        # convert to csc for better column slicing
        y = self.y.tocsc()[:, q.indices]
        denom = y + (k1 * (1 - b + b * len_y / avdl))[:, None]
        idf = self.vectorizer._tfidf.idf_[None, q.indices] - 1.
        numer = y.multiply(np.broadcast_to(idf, y.shape)) * (k1 + 1)
        return (numer / denom).sum(1).A1


# (1) Develop a simple multithreaded web crawler (pg 36-41)

In [50]:
class MultiThreadCrawler:
    def __init__(self, base_url, depth):
        self.base_url = base_url
        extracted_url = urlparse(base_url)
        parent = extracted_url.path[:extracted_url.path.rfind("/") + 1]
        self.root_url = '{}://{}{}'.format(extracted_url.scheme, extracted_url.netloc, parent)
        self.pool = ThreadPoolExecutor(max_workers=multiprocessing.cpu_count() - 1)
        self.to_crawl = multiprocessing.Queue()
        self.to_crawl.put({self.base_url: depth})
        self.stored_folder = Path(os.path.abspath('')).parent / 'crawled/'

        if not Path(self.stored_folder).exists():
            Path.mkdir(self.stored_folder)

        if Path(self.stored_folder / 'url_list.pickle').exists():
            with open(self.stored_folder / 'url_list.pickle', 'rb') as f:
                self.crawled_pages = pickle.load(f)
            print(self.crawled_pages)
        else:
            self.crawled_pages = set([])
    
    def extract_page(self, obj): 
        if obj.result(): 
            result, url, depth = obj.result() 
        if result and result.status_code == 200: 
            url_lists = self.parse_links(result.text, depth) 
            self.parse_contents(url, result.text, url_lists)
        
    def get_page(self,url, depth): 
        try: 
            result = requests.get(url, timeout=(3,30)) 
            return result, url, depth 
        except requests.RequestException: 
            return
         
    def parse_links(self, html, depth): 
        soup = BeautifulSoup(html, 'html.parser') 
        links = soup.find_all('a', href=True) 
        url_lists = [] 
        for link in links: 
            url = link['href'] 
            url = urljoin(self.root_url, url) 
            if depth >= 0 and '..' not in url and url not in self.crawled_pages: 
                print("Adding {}".format(url)) 
                self.to_crawl.put({url: depth}) 
            url_lists.append(url) 
        return url_lists 
    
    def parse_contents(self, url, html, url_lists): 
        def tag_visible(element): 
            if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']: 
                return False 
            if isinstance(element, Comment): 
                return False 
            return True 

        try: 
            soup = BeautifulSoup(html, 'html.parser') 
            texts = soup.findAll(string=True) 
            visible_texts = filter(tag_visible, texts) 

            title = soup.find('title').string.strip() 
            text = u" ".join(t.strip() for t in visible_texts).strip() 

            with open(self.stored_folder / (str(hash(url)) + '.txt'), 'w', encoding='utf-8') as f: 
                json.dump({'url': url, 'title': title, 'text': text, 'url_lists': url_lists}, f, ensure_ascii=False) 
        except: 
            pass 
        
    def run_scraper(self): 
        while True: 
            try: 
                target = self.to_crawl.get(timeout=10) 
                url, depth = [(k, target[k]) for k in target][0] 
                if url not in self.crawled_pages: 
                    self.crawled_pages.add(url) 
                    job = self.pool.submit(self.get_page, url, depth - 1) 
                    job.add_done_callback(self.extract_page) 
            except Empty: 
                with open(self.stored_folder / 'url_list.pickle', 'wb') as f: 
                    pickle.dump(self.crawled_pages, f, pickle.HIGHEST_PROTOCOL) 
                with open(self.stored_folder / 'url_list.pickle', 'rb') as f: 
                    print(pickle.load(f)) 
                break 
            except Exception as e: 
                print(e) 
                continue 
            
    
        

In [27]:
if __name__ == '__main__': 
    s = MultiThreadCrawler("https://camt.cmu.ac.th/index.php/en/", 2) 
    s.run_scraper()  

{'https://camt.cmu.ac.th/index.php/th/payment.html', 'https://camt.cmu.ac.th/index.php/en/all-news-groups/17-ข่าวบริการการศึกษา.html', 'https://camt.cmu.ac.th/images/gallery_in_article/2024012604/DSC00070.jpg', 'https://camt.cmu.ac.th/index.php/th/หัวข้อกลุ่มข่าวทั้งหมด.html', 'https://camt.cmu.ac.th/attachments/article/871/20230704102010786.pdf', 'https://camt.cmu.ac.th/index.php/th/หัวข้อกลุ่มข่าวทั้งหมด/17-ข่าวบริการการศึกษา.html', 'https://camt.cmu.ac.th/index.php/en/student/download-documents-for-reimbursement-for-education-fees-for-children-of-civil-servants/category/23-dii-หลักสูตรบูรณาการอุตสาหกรรมดิจิทัล.html', 'https://camt.cmu.ac.th/images/gallery_in_article/2024012604/DSC00041.jpg', 'https://account.cmu.ac.th/Forget/', 'https://camt.cmu.ac.th/index.php/en/day.php?year=2024&month=3&day=23&area=1&room=1', 'https://camt.cmu.ac.th/index.php/en/?p=profile_catalog', 'https://go.camt.cmu.ac.th/index.php/th/major/graduate/graduate-dtm', 'https://twitter.com/intent/tweet', 'https://

# (2) Develop a simple web indexer (pg 43)

In [93]:
class Indexer():
    def __init__ (self):
        self.crawled_folder = Path(os.path.abspath('')).parent / 'crawled/' 
        self.stored_file = 'src/resources/manual_indexer.pkl'
        if os.path.isfile(self.stored_file):
            with open(self.stored_file, 'rb') as f:
                cached_dict = pickle.load(f)
            self.__dict__.update(cached_dict)
        else:
            self.run_indexer()
            
    def run_indexer(self):
        documents = []
        for  file in os.listdir(self.crawled_folder):
            if file.endswith('.txt'):
                j =json .load(open(os.path.join(self.crawled_folder, file)))
                documents.append(j)
        self.documents = pd.DataFrame.from_dict(documents)
        tfidf_verctorizer = TfidfVectorizer(preprocessor=custom_preprocessor, stop_words=stopwords.words('english'))
        self.bm25 = BM25(tfidf_verctorizer)
        self.bm25.fit(self.documents.apply(lambda s: ''.join(s[['title','text']]), axis=1))
        with open(self.stored_file, 'wb') as f:
            pickle.dump(self.__dict__, f) 
        
    def search(self, query): 
        score = self.bm25.transform(query)
        return self.documents.join(pd.DataFrame(score, columns=['score'])).sort_values(by='score', ascending=False)


In [96]:
i = Indexer()

# (3) Quick workout #1: Search using a query ‘school’ with BM25. (pg 44)

In [97]:
i.search('school')

Unnamed: 0,url,title,text,url_lists,score
2,https://go.camt.cmu.ac.th/index.php/th/2019-05...,Gifted School 2020,"Choose your language ไทย li dir=""ltr"" ...","[http://www.go-camt.com/index.php/th/, http://...",5.124833
80,https://www.grad.cmu.ac.th/index.php?lang=en,"Graduate School, Chiang Mai University",MIdS : Multidisciplinary and Interdisciplinary...,"[https://cmu.to/admission/, https://w3.grad.cm...",5.044423
15,https://service.camt.cmu.ac.th/gifted,Gift School 2023,<< คลิกที่นี่ >> ระบบรับสมัคร Gifted School | ...,[https://service.camt.cmu.ac.th/gifted/gifted/...,4.869457
212,https://go.camt.cmu.ac.th/index.php/th/major/g...,การจัดการความรู้และนวัตกรรม ป.โท,Choose your language ไทย English (UK)...,"[https://go.camt.cmu.ac.th/index.php/th/, http...",4.021525
114,https://go.camt.cmu.ac.th/index.php/th/major/g...,การจัดการความรู้และนวัตกรรม ป.เอก,Choose your language ไทย English (UK)...,"[https://go.camt.cmu.ac.th/index.php/th/, http...",3.953882
...,...,...,...,...,...
83,https://admission.grad.cmu.ac.th/admissions/in...,"Application for Graduate study, Chiang Mai Uni...",à¹à¸à¸£à¸à¸ªà¸£à¹à¸²à¸à¸«à¸¥à¸±à¸à¸ªà¸¹à...,"[https://camt.cmu.ac.th/index.php/en/?p=101, h...",0.000000
84,https://smartoffice.camt.cmu.ac.th/v1r,CAMT Smart Office,CMU Account Personal Account เข้าสู...,"[https://camt.cmu.ac.th/v1r/authen, https://ca...",0.000000
85,https://camt.cmu.ac.th/index.php/en/all-news-g...,รับสมัครบุคคลเพื่อคัดเลือกเข้าร่วมรับทุนสนับสน...,Home About us Back Visio...,"[https://camt.cmu.ac.th/index.php/en/, https:/...",0.000000
86,https://go.camt.cmu.ac.th/index.php/en/,"College of Arts, Media and Technology, Chiang ...",Choose your language Thailand(TH) Eng...,"[http://www.go-camt.com/index.php/th/, http://...",0.000000
