In [None]:
import requests
from bs4 import BeautifulSoup 
import re
import unicodedata
import pythainlp.util
from pythainlp.tokenize import word_tokenize
from pythainlp.util import find_keyword
from pythainlp.util import rank
#from pythainlp.summarize import extract_keywords
from pythainlp.summarize import summarize
import itertools

from urllib.parse import urljoin

In [None]:
def get_all_links(url, depth=0, visited=None, max_depth=3):
  """Crawl on-site links reachable from ``url`` and count how often each is seen.

  The page at ``url`` is fetched and every ``<a href>`` is examined.  A link is
  kept when it is site-relative (starts with ``/``) or starts with ``url``
  itself; fragment-only links (``#...``) are ignored.  Kept links are resolved
  against ``url`` and their counter in ``visited`` is incremented.  A link that
  is seen for the first time is crawled recursively with ``depth + 1``.

  Args:
    url: Page to fetch.
    depth: Current recursion level; top-level callers pass 0.
    visited: Dict mapping absolute link -> number of times it was encountered.
      It is updated in place.  A new dict is created when omitted, so separate
      calls never share counts.
    max_depth: Links are recorded and followed only while ``depth < max_depth``.

  Returns:
    The ``visited`` dict.

  Raises:
    requests.RequestException: if ``url`` itself cannot be fetched.  Failures
      on pages discovered during the crawl are reported and skipped.
  """
  if visited is None:
    visited = {}

  # Links found on a page at the depth limit are never recorded or followed,
  # so there is no point in downloading that page at all.
  if depth >= max_depth:
    return visited

  headers = {'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"}
  # A timeout keeps one unresponsive server from hanging the whole crawl.
  response = requests.get(url, headers=headers, timeout=30)
  soup = BeautifulSoup(response.text, 'html.parser')

  links = []
  for anchor in soup.find_all('a'):
    href = anchor.get('href')
    if not href or href.startswith('#'):
      continue
    # '//host/path' is protocol-relative: urljoin would resolve it to another
    # host, so it must not be mistaken for a site-relative '/path' link.
    if href.startswith('//'):
      continue
    if href.startswith(url) or href.startswith('/'):
      links.append(urljoin(url, href))

  for link in links:
    if link in visited:
      # Already crawled (or being crawled): only bump the reference count.
      visited[link] += 1
      continue
    visited[link] = 1
    try:
      get_all_links(link, depth=depth + 1, visited=visited, max_depth=max_depth)
    except requests.RequestException as error:
      # One broken page should not throw away everything collected so far.
      print(f'Skipping {link}: {error}')

  return visited

# Entry point of the crawl: start at the site's front page.
base_url = 'https://www.thairath.co.th'
# A new dict is passed explicitly as the visit counter. The returned dict
# (link -> times seen) is kept in website_dict; the database cell later iterates
# its keys and stores each value in the REF column.
website_dict = get_all_links(base_url, depth=0, visited={})
# Prints the complete dict of crawled links.
print(website_dict)



In [None]:
class Thai:
    """Builds a sentence, keyword counts and a summary from a list of text chunks.

    On construction the chunks are filtered and joined (``sentence``), tokenised
    and reduced to keyword counts (``keyword``), and summarised (``summarize``).

    Attributes:
        data_value: The list passed to the constructor (left unmodified).
        sentence: Space-joined chunks that passed the Thai-content filter.
        keyword: Dict of keyword -> count, restricted to purely textual tokens.
        summarize: Result of ``pythainlp.summarize.summarize`` for ``sentence``.
    """

    # Chunks scoring below this with pythainlp.util.countthai are dropped.
    # PyThaiNLP documents the score as the percentage of Thai characters.
    MIN_THAI_PERCENT = 10

    def __init__(self,data:list):
        self.data_value = data
        self.sentence = self.get_sentence()
        self.keyword = self.get_keyword()
        self.summarize = self.get_summarize()

    def make_sentence(self,list_word):
        """Join the chunks of ``list_word`` whose countthai score is high enough.

        The input list is not modified.  (Filtering by calling ``remove`` while
        iterating skips the element after each removal, which let some
        low-scoring chunks through.)

        Returns:
            The kept chunks joined with single spaces; ``''`` if none are kept.
        """
        kept = [
            chunk for chunk in list_word
            if pythainlp.util.countthai(chunk) >= self.MIN_THAI_PERCENT
        ]
        self.sentence_value = ' '.join(kept)
        return self.sentence_value

    def get_sentence(self):
        """Return the filtered sentence built from ``data_value``."""
        self.sentence_result = self.make_sentence(self.data_value)
        return self.sentence_result

    @staticmethod
    def _is_text(token):
        """Return True if ``token`` consists only of letters and combining marks.

        ``str.isalpha`` is not used because it is False for any string that
        contains a combining mark (Unicode category M*), which Thai above/below
        vowels and tone marks are, so it rejects most Thai words.
        """
        return bool(token) and all(
            unicodedata.category(char)[0] in ('L', 'M') for char in token
        )

    def get_keyword(self):
        """Tokenise ``sentence`` and return keyword -> count for textual tokens.

        Tokens such as spaces, punctuation or digits are left out.
        """
        self.keyword_value = word_tokenize(self.sentence, engine="newmm")
        if self.keyword_value:
            self.keyword_dict = find_keyword(self.keyword_value)
        else:
            # NOTE(review): find_keyword appears not to handle an empty token
            # list (rank() seems to return None for it) - confirm against the
            # installed PyThaiNLP version.  No tokens means no keywords anyway.
            self.keyword_dict = {}
        self.keyword_result = {
            key: count
            for key, count in self.keyword_dict.items()
            if self._is_text(key)
        }
        return self.keyword_result

    def get_summarize(self):
        """Return the summary of ``sentence`` (``summarize(..., n=5)``).

        An empty sentence yields an empty list without calling the summariser.
        """
        if self.sentence:
            self.summarize_result = summarize(self.sentence,n=5)
        else:
            self.summarize_result = []
        return self.summarize_result


In [None]:
def scrape_tags(url):
  """Download ``url`` and extract its body text, title and top keywords.

  The body is the concatenation of the NFKD-normalised ``.string`` of every
  ``<p>`` tag that has one; a placeholder sentence is used when there is none.
  Keywords come from ``Thai(...).keyword`` and are cut down to the five with
  the highest counts.

  Args:
    url: Address of the page to scrape.

  Returns:
    A tuple ``(p_tag, title_tag, keyword)``: the body text, the page title
    (falling back to the first summary sentence, or ``''``, when the page has
    no usable ``<title>``), and a dict of at most five keyword -> count pairs
    ordered by descending count.

  Raises:
    requests.RequestException: if the page cannot be fetched.
  """
  # A timeout keeps one unresponsive page from blocking the caller forever.
  response = requests.get(url, timeout=30)
  soup = BeautifulSoup(response.text, 'html.parser')

  # soup.find returns None when the tag is absent, so .text cannot be read
  # unconditionally.
  title_node = soup.find('title')
  title_tag = title_node.text if title_node is not None else ''

  p_list = [
    unicodedata.normalize("NFKD", p.string)
    for p in soup.find_all('p')
    if p.string is not None
  ]
  if not p_list:
    p_list.append('ไม่พบข้อความในเว็บนี้')

  p_tag = "".join(p_list)
  thai_nlp = Thai(p_list)

  # Keep the five most frequent keywords, highest count first.
  ranked = sorted(thai_nlp.keyword.items(), key=lambda item: item[1], reverse=True)
  keyword = dict(ranked[:5])

  if not title_tag.strip():
    summarize_article = thai_nlp.summarize
    title_tag = summarize_article[0] if summarize_article else ''

  return p_tag, title_tag, keyword

import sqlite3
# Persist one row per (news page, keyword) into scraped_data.db.
conn = sqlite3.connect('scraped_data.db')
try:
  # IF NOT EXISTS lets this cell be re-run against an existing database file
  # instead of failing with "table DATA already exists".
  conn.execute('''CREATE TABLE IF NOT EXISTS DATA
             (ID INTEGER PRIMARY KEY AUTOINCREMENT,
             WEBSITE STRING NOT NULL,
             BODY TEXT NOT NULL,
             TITLE TEXT NOT NULL,
             KEYWORD TEXT NOT NULL,
             WORD_FREQUENCY INT NOT NULL,
             REF INT NOT NULL);''')
  for website in website_dict.keys():
    # Only URLs containing 'news' are scraped and stored.
    if 'news' in website:
      p_tag, title, keyword  = scrape_tags(website)
      # keyword maps word -> frequency; REF is the crawl's count for this URL.
      for i in keyword:
        conn.execute("INSERT INTO DATA (WEBSITE, BODY, TITLE, KEYWORD, WORD_FREQUENCY ,REF) VALUES (?, ?, ?, ?, ?, ?)", (website, p_tag, title, i, keyword[i],website_dict[website]))
        print(f'For website {website}\n the p tags is: {p_tag} \n the title tag is: {title}\n  the keyword is:{i}\n  the word frequency is:{keyword[i]}\n  the ref is:{website_dict[website]}')

  # Everything is committed in one go once all pages were processed.
  conn.commit()
finally:
  # Release the database file even when scraping or inserting raised.
  conn.close()

In [1]:
import sqlite3
from tkinter import *
from tkinter import ttk

# Open the SQLite file and keep one cursor at module level; submit_query()
# below runs its searches through this cursor. The connection is not closed
# anywhere in the visible code.
conn = sqlite3.connect('scraped_data.db')
cursor = conn.cursor()

# Create the main Tkinter window
root = Tk()
root.title("Search")

# Set a 24pt Arial font on the root style '.'
style = ttk.Style()
style.configure('.', font=('Arial', 24))

# Heading label. Note: 'Title.TLabel' is referenced here but no
# style.configure('Title.TLabel', ...) call appears in this cell.
title_label = ttk.Label(root, text="Search data", style='Title.TLabel')
title_label.pack()

# Frame that holds the entry and the button side by side (grid layout)
input_frame = ttk.Frame(root)
input_frame.pack()

# Tk variable bound to the entry; submit_query() reads the search text from it
user_input = StringVar()

# Entry widget for the search text
entry = ttk.Entry(input_frame, textvariable=user_input)
entry.grid(row=0, column=0)

# Submit button. The lambda looks up submit_query only when the button is
# clicked, which is why the function can be defined further down.
submit_button = ttk.Button(input_frame, text='Submit', command = lambda: submit_query())
submit_button.grid(row=0, column=1)

# Listbox that submit_query() fills with the matching rows
listbox = Listbox(root, height=40, width=90)
listbox.pack()

def submit_query():
    """Search DATA by title and show the matches in the Listbox.

    Reads the search text from ``user_input``, empties ``listbox``, runs a
    parameterised ``LIKE`` query through ``cursor`` and then appends, for every
    matching row, its website, title and ref as separate entries followed by a
    ``'\\n'`` entry that separates it from the next row.
    """
    search_text = user_input.get()

    # Drop whatever the previous search displayed.
    listbox.delete(0, END)

    # Substring match on the title; the text is bound as a parameter.
    like_pattern = ''.join(('%', search_text, '%'))
    cursor.execute(
        "SELECT DISTINCT website,title,ref FROM DATA WHERE TITLE like ? order by ref desc",
        (like_pattern,),
    )

    for record in cursor.fetchall():
        # One Listbox entry per column, then the separator entry.
        for value in (*record, '\n'):
            listbox.insert(END, value)

# Hand control to Tk's event loop so the window is shown and reacts to input.
root.mainloop()
