In [None]:
import requests
from bs4 import BeautifulSoup 
import re
import unicodedata
import pythainlp.util
import spacy
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
from pythainlp.tokenize import word_tokenize
from pythainlp.util import find_keyword
from pythainlp.util import rank
#from pythainlp.summarize import extract_keywords
from pythainlp.summarize import summarize
import itertools
import sqlite3
from urllib.parse import urljoin
import time
from pythainlp.tag import tag_provinces
from datetime import datetime
import math


In [None]:
class spyder:
    """Depth-limited recursive crawler rooted at ``base_url``.

    ``target_links`` is a dict that maps URL prefixes to integer counters.
    :meth:`check_ref` increments a counter for every link that starts with
    the corresponding prefix.
    """

    # Pages at this depth or deeper are not fetched (the seed page is depth 0).
    MAX_DEPTH = 3
    # Seconds to pause before every request.
    REQUEST_DELAY = 0.3
    # Seconds a single request may block before it is abandoned.
    REQUEST_TIMEOUT = 10
    # Browser-like User-Agent sent with every request.
    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"

    def __init__(self, target_links, base_url):
        self.base_url = base_url
        self.target_links = target_links

    def get_crawler(self):
        """Crawl from ``base_url`` and return the set of every link seen.

        Each call performs a fresh crawl; the result is also stored on
        ``self.result_crawler``.
        """
        self.result_crawler = self.crawl(self.base_url, 0, set())
        return self.result_crawler

    def get_check_domain(self):
        """Crawl, then return the links that start with ``base_url``."""
        self.check_domain_result = self.check_domain(self.base_url, self.get_crawler())
        return self.check_domain_result

    def get_check_not_domain(self):
        """Crawl, then return the links that do not start with ``base_url``."""
        self.check_not_domain_result = self.check_not_domain(self.base_url, self.get_crawler())
        return self.check_not_domain_result

    def get_check_ref(self):
        """Crawl, then tally the off-site links against ``target_links``."""
        self.check_ref_result = self.check_ref(self.get_check_not_domain(), self.target_links)
        return self.check_ref_result

    def get_all(self):
        """Crawl once and return ``(on_site_links, reference_counts)``."""
        crawled = self.crawl(self.base_url, 0, set())
        on_site = self.check_domain(self.base_url, crawled)
        off_site = self.check_not_domain(self.base_url, crawled)
        return on_site, self.check_ref(off_site, self.target_links)

    def crawl(self, url, depth, visited):
        """Fetch ``url``, record its links in ``visited`` and recurse.

        Args:
            url: Page to fetch.
            depth: Current recursion depth; nothing is fetched once it
                reaches ``MAX_DEPTH``.
            visited: Set of links seen so far. It is updated in place.

        Returns:
            The same ``visited`` set.

        A page that cannot be fetched is reported on stdout and skipped so
        that a single bad URL does not abort the whole crawl.
        """
        if depth >= self.MAX_DEPTH:
            return visited

        visited.add(url)
        time.sleep(self.REQUEST_DELAY)
        try:
            response = requests.get(
                url,
                headers={'User-Agent': self.USER_AGENT},
                timeout=self.REQUEST_TIMEOUT,
            )
        except requests.RequestException as exc:
            print('skipping', url, '-', exc)
            return visited

        try:
            soup = BeautifulSoup(response.text, 'html.parser')
        except Exception:
            # Best-effort fallback to the lxml parser if html.parser fails.
            soup = BeautifulSoup(response.text, 'lxml')

        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if not href or href.startswith('#'):
                continue
            # Strip spaces BEFORE the visited check, otherwise a link that
            # contains a space never matches its stored form and is crawled
            # again every time it is encountered.
            link = urljoin(url, href).replace(' ', '')
            if link in visited:
                continue
            visited.add(link)
            # Only links underneath the current page are followed further.
            if link.startswith(url):
                self.crawl(link, depth=depth + 1, visited=visited)
        return visited

    def check_domain(self, base_url, links):
        """Return the subset of ``links`` that start with ``base_url``."""
        return {link for link in links if link.startswith(base_url)}

    def check_not_domain(self, base_url, links):
        """Return the subset of ``links`` that do not start with ``base_url``."""
        return {link for link in links if not link.startswith(base_url)}

    def check_ref(self, links, target_links):
        """Count how many of ``links`` start with each key of ``target_links``.

        ``target_links`` is updated in place (counters are incremented, not
        reset) and the same dict is returned.
        """
        for link in links:
            for prefix in target_links:
                if link.startswith(prefix):
                    target_links[prefix] += 1
        return target_links

In [None]:
def scrape_tags(url):
  """Download ``url`` and return its body text and title.

  Args:
    url: Address of the page to fetch.

  Returns:
    A ``(body_list, title_tag)`` tuple. ``body_list`` is a one-element list
    holding the text of the ``body`` element (an empty string when the page
    has none). ``title_tag`` is the text of the ``title`` element, or
    ``None`` when the page has no title.

  Raises:
    requests.RequestException: if the page cannot be fetched in time.
  """
  # Same browser-like User-Agent the crawler sends, so a page that could be
  # crawled is requested the same way when it is scraped.
  headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
  response = requests.get(url, headers=headers, timeout=10)
  soup = BeautifulSoup(response.text, 'html.parser')

  title_node = soup.find('title')
  title_tag = title_node.text if title_node is not None else None

  # Some responses (redirect stubs, error pages, non-HTML) have no body.
  body_tag = soup.find('body')
  text_below_body = body_tag.get_text() if body_tag is not None else ''
  return ([text_below_body], title_tag)


In [None]:
def cleansing(body):
    """Collapse a list of raw text chunks into one whitespace-normalised string.

    Newlines, non-breaking spaces, the registered-trademark sign and
    semicolons are turned into spaces, then every run of whitespace is
    squeezed to a single space.

    Args:
        body: A list of strings. A bare string is treated as a one-element
            list.

    Returns:
        The cleaned chunks joined by single spaces; ``''`` for empty input.
    """
    if isinstance(body, str):
        body = [body]

    cleaned_chunks = []
    for chunk in body:
        text = chunk.replace('\n', '  ').replace('\xa0', '  ').replace('®', ' ').replace(';', ' ')
        cleaned_chunks.append(" ".join(text.split()))

    # Keep every chunk (not only the last one) and drop the ones that became empty.
    return " ".join(chunk for chunk in cleaned_chunks if chunk)

In [None]:
# spaCy exposes Token.is_punct, but by this point the tokens are plain lemma
# strings, so punctuation is filtered against an explicit list instead.
def remove_punctuations(normalized_tokens):
    """Remove punctuation-only tokens from ``normalized_tokens``.

    The list is filtered in place and the same list object is returned.
    """
    punctuations = ('?', ':', '!', ',', '.', ';', '|', '(', ')', '--', '\n')
    # Build the filtered list first, then swap the contents: removing items
    # while iterating over the same list skips the element after each removal.
    normalized_tokens[:] = [word for word in normalized_tokens if word not in punctuations]
    return normalized_tokens

In [None]:
# Loaded spaCy pipelines, keyed by model name, so the model is read from disk
# once per kernel instead of once per document.
_SPACY_PIPELINES = {}


def _get_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        # NOTE(review): 'tok2vec' and 'attribute_ruler' are disabled here as in
        # the existing configuration; confirm the lemmatizer still gets the
        # tags it needs with these components off.
        _SPACY_PIPELINES[model_name] = spacy.load(
            model_name,
            disable=['parser', 'ner', 'tok2vec', 'attribute_ruler'],
        )
    return _SPACY_PIPELINES[model_name]


def normalized_text(doc:str):
    """Lemmatise ``doc`` and drop stop words and punctuation tokens.

    Args:
        doc: Text to normalise.

    Returns:
        A list of lemma strings in document order.
    """
    nlp = _get_pipeline()
    lemma_list = [token.lemma_ for token in nlp(doc)]
    # Stop-word status is looked up on the lemma, not on the surface form.
    normalized_tokens = [word for word in lemma_list if not nlp.vocab[word].is_stop]
    return remove_punctuations(normalized_tokens)

In [None]:
def get_word(body):
    """Return a dict mapping each normalised token of ``body`` to its count."""
    word_freq = {}
    for token in normalized_text(body):
        word_freq[token] = word_freq.get(token, 0) + 1
    return word_freq

In [None]:
def make_doc(link,target_links):
    """Scrape one page and package it as a document dict.

    Args:
        link: URL of the page.
        target_links: Dict mapping URL prefixes to reference counts.

    Returns:
        A dict with the keys ``link``, ``title``, ``body``, ``location``,
        ``word`` (token -> frequency) and ``ref``. ``ref`` is the count of
        the last prefix in ``target_links`` that ``link`` starts with, or
        ``0`` when none matches.

    The link is printed as a progress marker.
    """
    print(link)
    # str.replace returns a new string; keep the result.
    link = link.replace(" ", "")
    body, title = scrape_tags(link)
    body = cleansing(body)
    d = {
        'link': link,
        'title': title,
        'body': body,
        'location': 'location',  # placeholder value; no location is extracted here
        'word': get_word(body),
        'ref': 0,  # always present so consumers can read d['ref'] safely
    }
    for k in target_links:
        if link.startswith(k):
            d['ref'] = target_links[k]
    return d


In [None]:
def get_doc(target_links):
    """Crawl every seed in ``target_links`` and build a document per page.

    Returns a list with one ``make_doc`` result for each on-site link found,
    in crawl order. Progress (link count per seed, running document number)
    is printed along the way.
    """
    collected = []
    counter = 0
    for seed in target_links:
        # get_all() also returns the reference tally, which replaces the
        # local mapping used for the remaining seeds.
        domain_links, target_links = spyder(target_links, seed).get_all()
        print('all link =', len(domain_links))
        for page_url in domain_links:
            counter += 1
            collected.append(make_doc(page_url, target_links))
            print(counter)
    return collected

In [None]:
# Seed URLs to crawl, each mapped to a reference counter that starts at zero.
target_links = {'https://www.bbc.com/news':0}

In [None]:
# Crawl every seed and build one document dict per page found (network-bound).
doc=get_doc(target_links)

In [None]:
import sqlite3

# Connect to SQLite3 database
conn = sqlite3.connect('inverted_index2.db')
cursor = conn.cursor()

# Create tables for words, documents, and word frequencies.
# IF NOT EXISTS keeps this cell re-runnable against an existing database file.
conn.execute('''
CREATE TABLE IF NOT EXISTS words (
    ID INTEGER PRIMARY KEY,
    Word TEXT NOT NULL UNIQUE
);
''')

conn.execute('''
CREATE TABLE IF NOT EXISTS documents (
    ID INTEGER PRIMARY KEY,
    Link TEXT NOT NULL UNIQUE ,
    Title TEXT,
    Body TEXT,
    Location TEXT,
    Ref INTEGER,
    Time TEXT
);
''')

# One row per (word, document) pair: raw frequency plus the derived TF-IDF weight.
conn.execute('''
CREATE TABLE IF NOT EXISTS word_frequencies (
    Word_ID INTEGER ,
    Doc_ID INTEGER ,
    Frequency INTEGER NOT NULL,
    TF_IDF REAL ,
    PRIMARY KEY (word_id, doc_id),
    FOREIGN KEY (word_id) REFERENCES words(id),
    FOREIGN KEY (doc_id) REFERENCES documents(id)
);
''')


In [None]:
def update_tf_idf(db_path='inverted_index2.db'):
    """Recompute the ``TF_IDF`` column of every ``word_frequencies`` row.

    For each word, ``idf = log(N / df)`` where ``N`` is the number of rows
    in ``documents`` and ``df`` is the number of documents that contain the
    word. Each row then gets ``TF_IDF = Frequency * idf``.

    Args:
        db_path: SQLite database file to update.

    Words without any frequency rows are skipped, and nothing is written
    when the ``documents`` table is empty.
    """
    conn = sqlite3.connect(db_path, timeout=3)
    try:
        total_docs = conn.execute('SELECT COUNT(*) FROM documents').fetchone()[0]
        if total_docs == 0:
            return

        # One aggregate query yields the document frequency of every word
        # that actually occurs, so df is never zero.
        doc_counts = conn.execute(
            'SELECT Word_ID, COUNT(*) FROM word_frequencies GROUP BY Word_ID'
        ).fetchall()
        updates = [(math.log(total_docs / df), word_id) for word_id, df in doc_counts]

        conn.executemany(
            'UPDATE word_frequencies SET TF_IDF = Frequency * ? WHERE Word_ID = ?',
            updates,
        )
        conn.commit()
    finally:
        conn.close()



In [None]:
def insert_to_database(doc):
  """Store documents and their word frequencies, then refresh TF-IDF.

  Args:
    doc: Iterable of dicts with the keys ``link``, ``title``, ``body``,
      ``location`` and ``word`` (token -> frequency). ``ref`` is optional
      and defaults to ``0``.

  Raises:
    sqlite3.IntegrityError: if a link is already stored. The whole batch is
      rolled back in that case, so no partial insert is left behind.
  """
  conn = sqlite3.connect('inverted_index2.db')
  try:
    for i in doc:
      # Store the timestamp as explicit text ('YYYY-MM-DD HH:MM:SS.ffffff')
      # rather than relying on sqlite3's implicit datetime adapter.
      inserted_at = datetime.now().isoformat(sep=' ')
      doc_id = conn.execute(
          '''INSERT INTO documents (Link, Title, Body, Location, Ref,Time) VALUES (?, ?, ?, ?, ?, ?);''',
          (i['link'], i['title'], i['body'], i['location'], i.get('ref', 0), inserted_at),
      ).lastrowid

      for word, frequency in i['word'].items():
        row = conn.execute("SELECT id FROM words WHERE word = ?", (word,)).fetchone()
        if row is None:
          word_id = conn.execute("INSERT INTO words (word) VALUES (?)", (word,)).lastrowid
        else:
          word_id = row[0]

        conn.execute(
            '''INSERT INTO word_frequencies (word_id, doc_id, Frequency) VALUES (?, ?, ?);''',
            (word_id, doc_id, frequency),
        )
    conn.commit()
  except Exception:
    conn.rollback()
    raise
  finally:
    # Release the file lock before update_tf_idf opens its own connection.
    conn.close()
  update_tf_idf()


In [None]:
# Re-crawl the seeds; get_doc needs the seed mapping as its argument.
# The result is kept in `doc` so the insert cell below stores the fresh crawl.
doc = get_doc(target_links)

In [None]:
def delete_data(link):
    """Remove the document stored under ``link`` and everything only it used.

    Deletes the ``documents`` row, its ``word_frequencies`` rows and any
    ``words`` rows that no longer have a frequency entry, then refreshes
    TF-IDF. An unknown ``link`` is a no-op.

    Raises:
        sqlite3.OperationalError: if the database is still locked after
            three attempts, or for any other operational error.
    """
    attempts = 3
    conn = sqlite3.connect('inverted_index2.db', timeout=10)
    try:
        # retry up to 3 times if the database is locked
        for attempt in range(1, attempts + 1):
            try:
                row = conn.execute('''
                    SELECT id FROM documents WHERE link = ?; ''', (link,)).fetchone()
                if row is None:
                    return  # nothing stored under this link
                doc_id = row[0]

                conn.execute('''
                    DELETE FROM documents WHERE link = ?; ''', (link,))
                conn.execute('''
                    DELETE FROM word_frequencies WHERE Doc_ID = ?;''', (doc_id,))
                conn.execute('''
                    DELETE FROM words
                    WHERE NOT EXISTS (SELECT 1 FROM word_frequencies WHERE word_frequencies.word_id = words.id );''')
                conn.commit()
                break  # exit the loop if commit is successful

            except sqlite3.OperationalError as e:
                # Undo the partial delete so the next attempt starts clean.
                conn.rollback()
                if 'database is locked' not in str(e) or attempt == attempts:
                    raise
                print('Database is locked, retrying...')
                time.sleep(1)  # wait for 1 second before retrying
    finally:
        conn.close()

    # Runs after the connection is closed: update_tf_idf opens its own.
    update_tf_idf()
    


In [None]:
# Persist the crawled documents and their word counts into the index database.
insert_to_database(doc)

In [None]:
# NOTE: delete_data is defined twice in this notebook; this later definition
# is the one in effect once all cells have run.
def delete_data(link):
    """Remove the document stored under ``link`` and everything only it used.

    Deletes the ``documents`` row, its ``word_frequencies`` rows and any
    ``words`` rows left without a frequency entry, then refreshes TF-IDF.
    An unknown ``link`` is a no-op.
    """
    conn = sqlite3.connect('inverted_index2.db',timeout=10)
    try:
        row = conn.execute('''
        SELECT id FROM documents WHERE link = ?; ''', (link,)).fetchone()
        if row is None:
            return  # nothing stored under this link
        doc_id = row[0]

        conn.execute('''
            DELETE FROM documents WHERE link = ?; ''', (link,))
        conn.execute('''
            DELETE FROM word_frequencies WHERE Doc_ID = ?;''', (doc_id,))
        conn.execute('''
            DELETE FROM words
            WHERE NOT EXISTS (SELECT 1 FROM word_frequencies WHERE word_frequencies.word_id = words.id );''')
        conn.commit()
    except Exception:
        # Leave no half-deleted document behind.
        conn.rollback()
        raise
    finally:
        conn.close()

    # Runs after the connection is closed: update_tf_idf opens its own.
    update_tf_idf()


In [None]:
# Remove a single page from the index by its stored link.
delete_data('https://www.bbc.com/news/uk')

In [None]:
# Recompute the TF_IDF column for the current contents of the index.
update_tf_idf()

In [None]:
def update_data(target_links):
    """Re-crawl every seed and refresh the stored copy of each page found.

    A page that is already indexed is deleted and inserted again; a new page
    is simply inserted. Each link is printed as it is processed.

    Args:
        target_links: Dict mapping seed URLs to reference counts. An empty
            dict is a no-op.
    """
    conn = sqlite3.connect('inverted_index2.db')
    try:
        for i in target_links:
            get_link = spyder(target_links,i)
            domain_link,target_links = get_link.get_all()
            # Process the links of THIS seed before moving on to the next
            # one, so every seed is refreshed and not only the last.
            for j in domain_link:
                # fetchall() drains the cursor so no read statement stays
                # open on this connection while the helpers below write
                # through connections of their own.
                already_indexed = bool(conn.execute('''SELECT  documents.link
                                            FROM documents
                                            WHERE documents.link = ?
                                            ''',(j,)).fetchall())
                doc = [make_doc(j,target_links)]
                print(j)
                if already_indexed:
                    delete_data(j)
                insert_to_database(doc)
    finally:
        conn.close()

In [None]:
# Reset the seed mapping (seed URL -> reference counter) before the refresh run.
target_links = {'https://www.bbc.com/news':0}

In [None]:
# Re-crawl the seeds and refresh the stored copy of every page found.
update_data(target_links)

In [None]:

def search(search_term, db_path='inverted_index2.db'):
    """Print the documents whose indexed words contain ``search_term``.

    Matching is a substring ``LIKE`` on the ``words`` table, so ``%`` and
    ``_`` inside the term act as SQL wildcards. Results are ``(link, title)``
    tuples ordered by their summed TF-IDF, highest first. ``not found`` is
    printed when nothing matches.

    Args:
        search_term: Text to look for.
        db_path: SQLite database file to query.
    """
    conn = sqlite3.connect(db_path)
    try:
        matches = conn.execute('''
            SELECT documents.link, documents.title
            FROM documents
            JOIN word_frequencies ON word_frequencies.doc_id = documents.id
            JOIN words ON words.id = word_frequencies.word_id
            WHERE words.word LIKE ?
            GROUP BY documents.id
            ORDER BY SUM(word_frequencies.TF_IDF) DESC;
        ''', ('%'+search_term+'%',)).fetchall()
    finally:
        conn.close()

    print('Search Term:',search_term)
    print('Search results:')
    if matches:
        for match in matches:
            print(match)
    else:
        print('not found')


In [None]:
# Read a query from the user and print the matching documents.
word = input()
search(word)

In [None]:

# Print the link of every document currently stored in the index.
conn = sqlite3.connect('inverted_index2.db')
cursor = conn.cursor()
rows = cursor.execute('SELECT Link FROM documents').fetchall()
for row in rows:
    print(row[0])

In [None]:
import sqlite3
from bs4 import BeautifulSoup
import re

def update_backlinks():
    """Count, per target URL, how often indexed pages link to it.

    Every stored document is fetched again, each ``<a href>`` is stripped of
    its fragment and query string, resolved against the page URL and tallied
    in the ``backlinks`` table. Counts are added to whatever the table
    already holds, so running this twice doubles them.

    Pages that cannot be fetched are reported on stdout and skipped.
    """
    headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
    conn = sqlite3.connect('inverted_index2.db')
    try:
        # Each row is a 1-tuple; unpack it to get the URL string itself.
        documents = [row[0] for row in conn.execute('SELECT Link FROM documents').fetchall()]

        # Loop over each document
        for url in documents:
            try:
                response = requests.get(url, headers=headers, timeout=10)
            except requests.RequestException as exc:
                print('skipping', url, '-', exc)
                continue

            # Parse the HTML body and extract all links
            soup = BeautifulSoup(response.text, 'html.parser')

            # Loop over each backlink and update the count
            for anchor in soup.find_all('a', href=True):
                # Remove any anchors or query strings from the link
                backlink = re.sub(r'[#?].*$', '', anchor['href'])
                if not backlink:
                    continue  # pure fragment / query link back to the same page
                # Make relative hrefs absolute so they are comparable with
                # the links stored in the documents table.
                backlink = urljoin(url, backlink)

                # Check if the backlink already exists in the database
                row = conn.execute('SELECT Count FROM backlinks WHERE Link = ?', (backlink,)).fetchone()
                if row:
                    conn.execute('UPDATE backlinks SET Count = ? WHERE Link = ?', (row[0] + 1, backlink))
                else:
                    conn.execute('INSERT INTO backlinks (Link, Count) VALUES (?, ?)', (backlink, 1))

        conn.commit()
    finally:
        conn.close()


In [None]:
# Table of link targets and how often indexed pages point at them.
# IF NOT EXISTS keeps this cell re-runnable against an existing database file.
conn.execute('''CREATE TABLE IF NOT EXISTS backlinks (
    ID INTEGER PRIMARY KEY,
    Link TEXT NOT NULL UNIQUE ,
    Count INTEGER NOT NULL);'''
)


In [None]:
# Fetch every indexed page again and tally its outgoing links (network-bound).
update_backlinks()

In [None]:
# Demo of the cleaning + normalisation pipeline on a fixed sample sentence
# (swap the list for [input()] to try it interactively).
sentence = ['an elephant is eating banana and swiming in a Pool']
clean_sentence = cleansing(sentence)
word = normalized_text(clean_sentence)
print(clean_sentence, word, sep='\n')


In [None]:
# Baseline for comparison: plain lower-casing and whitespace tokenisation of
# the sample sentence, without lemmatisation or stop-word removal.
'an elephant is eating banana and swiming in a Pool'.lower().split()