In [91]:
import requests
from bs4 import BeautifulSoup 
import re
import unicodedata
import pythainlp.util
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from pythainlp.tokenize import word_tokenize
from pythainlp.util import find_keyword
from pythainlp.util import rank
#from pythainlp.summarize import extract_keywords
from pythainlp.summarize import summarize
import itertools
import sqlite3
from urllib.parse import urljoin
import time
from pythainlp.tag import tag_provinces
from datetime import datetime


In [92]:
class spyder:
    """Depth-limited link crawler plus helpers that classify the links it collects.

    ``base_url`` is the page the crawl starts from. ``target_links`` is a dict
    mapping URL prefixes to counters; ``check_ref`` increments a counter for
    every collected off-site link that starts with that prefix.

    Class attributes (override on an instance or subclass to tune the crawl):
        MAX_DEPTH: ``crawl`` does nothing once ``depth`` reaches this value.
        REQUEST_DELAY: seconds slept before every HTTP request.
        REQUEST_TIMEOUT: seconds after which an unresponsive request is abandoned.
        USER_AGENT: value sent in the ``User-Agent`` request header.
    """

    MAX_DEPTH = 3
    REQUEST_DELAY = 0.3
    REQUEST_TIMEOUT = 10
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"

    def __init__( self ,target_links,base_url ):
        self.base_url = base_url
        self.target_links = target_links

    # NOTE: every get_* accessor below starts a brand-new crawl from base_url;
    # nothing is cached between calls.

    def get_crawler(self):
        """Crawl from ``base_url`` and return the set of every link seen."""
        self.result_crawler = self.crawl(self.base_url, 0, set())
        return self.result_crawler

    def get_check_domain(self):
        """Crawl, then return only the links that start with ``base_url``."""
        self.check_domain_result = self.check_domain(self.base_url, self.get_crawler())
        return self.check_domain_result

    def get_check_not_domain(self):
        """Crawl, then return only the links that do NOT start with ``base_url``."""
        self.check_not_domain_result = self.check_not_domain(self.base_url, self.get_crawler())
        return self.check_not_domain_result

    def get_check_ref(self):
        """Crawl, then count off-site links against ``target_links`` (updated in place)."""
        self.check_ref_result = self.check_ref(self.get_check_not_domain(), self.target_links)
        return self.check_ref_result

    def get_all(self):
        """Run a single crawl and return ``(same_site_links, target_links_counts)``."""
        crawled = self.crawl(self.base_url, 0, set())
        same_site = self.check_domain(self.base_url, crawled)
        off_site = self.check_not_domain(self.base_url, crawled)
        return same_site, self.check_ref(off_site, self.target_links)

    def crawl(self,url, depth,visited):
        """Recursively collect links reachable from ``url``.

        Args:
            url: page to download.
            depth: current recursion depth; nothing happens at ``MAX_DEPTH`` or deeper.
            visited: set that is updated in place with every URL seen.

        Returns:
            The same ``visited`` set.

        A page that cannot be downloaded is recorded in ``visited`` and skipped,
        so a single dead link no longer aborts the whole crawl.
        """
        if depth >= self.MAX_DEPTH:
            return visited

        visited.add(url)
        time.sleep(self.REQUEST_DELAY)  # be polite to the remote server
        try:
            response = requests.get(
                url,
                headers={'User-Agent': self.USER_AGENT},
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException:
            return visited

        try:
            soup = BeautifulSoup(response.text, 'html.parser')
        except Exception:
            # Fall back to lxml if the built-in parser rejects the document.
            soup = BeautifulSoup(response.text, 'lxml')

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        links = [urljoin(url, href) for href in hrefs if href and not href.startswith('#')]

        for link in links:
            # Strip spaces BEFORE the membership test; otherwise the raw form is
            # never found in ``visited`` and the same page is crawled repeatedly.
            link = link.replace(' ', '')
            if link in visited:
                continue
            visited.add(link)
            # Only descend into links located underneath the current page URL.
            if link.startswith(url):
                self.crawl(link, depth=depth + 1, visited=visited)
        return visited

    def check_domain(self,base_url,links):
        """Return the subset of ``links`` that start with ``base_url``."""
        return {link for link in links if link.startswith(base_url)}

    def check_not_domain(self,base_url,links):
        """Return the subset of ``links`` that do not start with ``base_url``."""
        return {link for link in links if not link.startswith(base_url)}

    def check_ref(self,links,target_links):
        """Count, per key of ``target_links``, how many ``links`` start with that key.

        ``target_links`` is modified in place (callers in this notebook rely on
        that) and the same dict is returned.
        """
        for link in links:
            for prefix in target_links:
                if link.startswith(prefix):
                    target_links[prefix] += 1
        return target_links

In [104]:
def scrape_tags(url, timeout=10):
    """Download ``url`` and extract its title and body text.

    Args:
        url: address of the page to fetch.
        timeout: seconds to wait for the server before ``requests`` gives up.

    Returns:
        ``(body_list, title)`` where ``body_list`` is a one-element list holding
        the text found under ``<body>`` (an empty string when the page has no
        body) and ``title`` is the ``<title>`` text, or ``None`` when the page
        has no title.

    Raises:
        Whatever ``requests.get`` raises on a network failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    title_node = soup.find('title')
    title_tag = title_node.text if title_node is not None else None

    # Pages without a <body> used to crash here with AttributeError.
    body_node = soup.find('body')
    text_below_body = body_node.get_text() if body_node is not None else ''

    return ([text_below_body], title_tag)


In [94]:
def cleansing(body):
    """Collapse scraped text fragments into one whitespace-normalised string.

    Newlines, non-breaking spaces, ``®`` and ``;`` are turned into spaces and
    every run of whitespace is reduced to a single space.

    Args:
        body: a list of text fragments (as returned by ``scrape_tags``) or a
            single string.

    Returns:
        The cleaned text of ALL fragments joined by single spaces; an empty
        string when ``body`` is empty. (Previously only the last fragment was
        returned and an empty list raised ``UnboundLocalError``.)
    """
    if isinstance(body, str):
        # Iterating a bare string would process it character by character.
        body = [body]

    words = []
    for fragment in body:
        text = fragment.replace('\n', '  ').replace('\xa0', '  ').replace('®', ' ').replace(';', ' ')
        words.extend(text.split())
    return " ".join(words)

In [98]:
def remove_punctuations(normalized_tokens):
    """Drop stand-alone punctuation tokens from a list of string tokens.

    The list is filtered in place and the same list object is returned, so
    callers holding a reference to it see the result.

    Args:
        normalized_tokens: list of token strings.

    Returns:
        ``normalized_tokens`` with every token equal to one of the listed
        punctuation marks removed.
    """
    punctuations = ['?', ':', '!', ',', '.', ';', '|', '(', ')', '--', '\n']
    # Rebuild via slice assignment: calling list.remove() while iterating the
    # same list skips the element that follows each removal, which left
    # adjacent punctuation tokens behind.
    normalized_tokens[:] = [word for word in normalized_tokens if word not in punctuations]
    return normalized_tokens

In [96]:
# Lazily created spaCy pipeline shared by every normalized_text() call.
_SPACY_PIPELINE = None


def _get_spacy_pipeline():
    """Load ``en_core_web_sm`` on first use and hand back the cached pipeline afterwards."""
    global _SPACY_PIPELINE
    if _SPACY_PIPELINE is None:
        _SPACY_PIPELINE = spacy.load('en_core_web_sm', disable=['parser', 'ner',
                                                                'tok2vec', 'attribute_ruler'])
    return _SPACY_PIPELINE


def normalized_text(doc:str):
    """Lemmatise ``doc`` and return its non-stop-word, non-punctuation lemmas.

    Args:
        doc: raw text to normalise.

    Returns:
        List of lemma strings, in document order, with spaCy stop words removed
        and punctuation tokens filtered out by ``remove_punctuations``.

    The spaCy model is loaded once and reused: reloading it for every page made
    each call needlessly expensive. The unused stand-alone ``English()``
    tokenizer pass of the earlier version was dropped as well.
    """
    nlp = _get_spacy_pipeline()
    lemma_list = [token.lemma_ for token in nlp(doc)]
    # Stop-word status is looked up on the lemma, not on the surface form.
    normalized_tokens = [word for word in lemma_list if not nlp.vocab[word].is_stop]
    return remove_punctuations(normalized_tokens)

In [97]:
def get_word(body):
    """Count how often each normalised token occurs in ``body``.

    The text is first passed through ``normalized_text``; the result is a plain
    dict mapping every token to its number of occurrences.
    """
    word_freq = {}
    for token in normalized_text(body):
        word_freq[token] = word_freq.get(token, 0) + 1
    return word_freq

In [None]:
# Smoke test of the scraping helpers on a single page (needs network access).
# scrape_tags returns (list holding the body text, title).
body,title = scrape_tags('https://en.wikipedia.org/wiki/Main_Page')
# `body` is rebound here: from the list returned by scrape_tags to the value
# returned by cleansing.
body=cleansing(body)

In [None]:
# Seed dictionary: URL prefix -> counter that spyder.check_ref increments.
target_links = {'https://www.bbc.com/news':0}
for i in target_links:
    # Each key is used as the base URL of its own crawl.
    get_link = spyder(target_links,i)
    # get_check_ref() hands back the dict it was given (check_ref updates it in
    # place and returns it), so this rebinding still refers to the same object
    # that the for-loop is iterating over.
    target_links = get_link.get_check_ref()
    print(target_links)

In [105]:
def get_doc(target_links, verbose=True):
    """Crawl every seed in ``target_links`` and build one record per same-site page.

    Args:
        target_links: dict mapping seed URL prefixes to reference counts.
        verbose: when true (default, matching the earlier behaviour) print each
            link before it is scraped and each finished record.

    Returns:
        List of dicts with the keys ``link``, ``title``, ``body``, ``location``,
        ``word`` and ``ref``. ``ref`` is the count stored in ``target_links``
        for the last prefix the link starts with, or 0 if none matches.

    Pages whose download fails with a ``requests`` error are skipped so that one
    bad link does not discard everything collected so far.
    """
    doc = []
    for seed in target_links:
        web_spyder = spyder(target_links, seed)
        for raw_link in web_spyder.get_check_domain():
            # str.replace returns a new string; the result used to be thrown away.
            link = raw_link.replace(" ", "")
            if verbose:
                print(link)

            try:
                body, title = scrape_tags(link)
            except requests.RequestException as error:
                if verbose:
                    print('skipped', link, error)
                continue

            body = cleansing(body)
            record = {
                'link': link,
                'title': title,
                'body': body,
                'location': 'location',  # placeholder value, as before
                'word': get_word(body),
                'ref': 0,  # guarantees the key exists for insert_to_database
            }
            for prefix in target_links:
                if link.startswith(prefix):
                    record['ref'] = target_links[prefix]

            doc.append(record)
            if verbose:
                print(record)
    return doc

In [106]:
# Crawl every seed in target_links and build one record per same-site page.
# This is network-bound: each crawled page is preceded by a short sleep.
doc=get_doc(target_links)

https://www.bbc.com/news/world-africa-59517501
{'link': 'https://www.bbc.com/news/world-africa-59517501', 'title': 'Why France faces so much anger in West Africa - BBC News', 'body': 'BBC HomepageSkip to contentAccessibility HelpYour accountHomeNewsSportReelWorklifeTravelFutureMore menuMore menuSearch BBCHomeNewsSportReelWorklifeTravelFutureCultureMusicTVWeatherSoundsClose menuBBC NewsMenuHomeWar in UkraineCoronavirusClimateVideoWorldAsiaUKBusinessTechMoreScienceStoriesEntertainment & ArtsHealthWorld News TVIn PicturesReality CheckNewsbeatLong ReadsWorldAfricaAustraliaEuropeLatin AmericaMiddle EastUS & CanadaWhy France faces so much anger in West AfricaPublished5 December 2021Shareclose panelShare pageCopy linkAbout sharingRelated TopicsSahel Islamist insurgencyImage source, Getty ImagesImage caption, Mali has witnessed several protests against FranceBy Paul MellyAfrica analystIt all started so positively. Where have things gone wrong? Why does France now appear so unpopular in Africa?

KeyboardInterrupt: 

In [None]:
import sqlite3

# Connect to SQLite3 database
conn = sqlite3.connect('inverted_index2.db')
cursor = conn.cursor()

# Create tables for words, documents, and word frequencies.
# IF NOT EXISTS keeps this cell re-runnable: without it, a second run against
# the existing database file fails because the tables are already there.
conn.execute('''
CREATE TABLE IF NOT EXISTS words (
    ID INTEGER PRIMARY KEY,
    Word TEXT NOT NULL UNIQUE
);
''')

conn.execute('''
CREATE TABLE IF NOT EXISTS documents (
    ID INTEGER PRIMARY KEY,
    Link TEXT NOT NULL UNIQUE ,
    Title TEXT,
    Body TEXT,
    Location TEXT,
    Ref INTEGER,
    Time TEXT
);
''')

# One row per (word, document) pair; TF_IDF is left NULL by the insert code.
conn.execute('''
CREATE TABLE IF NOT EXISTS word_frequencies (
    Word_ID INTEGER ,
    Doc_ID INTEGER ,
    Frequency INTEGER NOT NULL,
    TF_IDF REAL ,
    PRIMARY KEY (word_id, doc_id),
    FOREIGN KEY (word_id) REFERENCES words(id),
    FOREIGN KEY (doc_id) REFERENCES documents(id)
);
''')

In [None]:
def insert_to_database(doc, connection=None):
    """Store scraped documents and their word counts in the inverted-index tables.

    Args:
        doc: iterable of dicts with the keys ``link``, ``title``, ``body``,
            ``location`` and ``word`` (a dict of word -> frequency); ``ref`` is
            optional and stored as NULL when absent.
        connection: ``sqlite3`` connection to write to. Defaults to the
            module-level ``conn``.

    Raises:
        sqlite3.IntegrityError: e.g. when a link is already stored. The whole
            batch is rolled back in that case, so no half-inserted document
            stays pending on the connection.
    """
    db = conn if connection is None else connection

    # The connection context manager commits on success and rolls back on error.
    with db:
        for record in doc:
            # Store an explicit "YYYY-MM-DD HH:MM:SS.ffffff" string (the same text
            # the implicit datetime adapter produced) instead of relying on that
            # adapter, which is deprecated in recent Python versions.
            timestamp = datetime.now().isoformat(sep=' ')
            doc_id = db.execute(
                '''INSERT INTO documents (Link, Title, Body, Location, Ref,Time) VALUES (?, ?, ?, ?, ?, ?);''',
                (record['link'], record['title'], record['body'],
                 record['location'], record.get('ref'), timestamp),
            ).lastrowid

            for word, frequency in record['word'].items():
                row = db.execute("SELECT id FROM words WHERE word = ?", (word,)).fetchone()
                if row:
                    word_id = row[0]
                else:
                    word_id = db.execute("INSERT INTO words (word) VALUES (?)", (word,)).lastrowid

                db.execute(
                    '''INSERT INTO word_frequencies (word_id, doc_id, Frequency) VALUES (?, ?, ?);''',
                    (word_id, doc_id, frequency),
                )
#conn.close()

In [None]:
# Persist the scraped records in `doc` into the SQLite tables via the
# module-level `conn`.
insert_to_database(doc)

In [None]:
def update_data(link):
    