In [2]:
!pip install get-chrome-driver
!pip install selenium
!pip install pymysql



In [3]:
# Make sure the NLTK stopword corpus is available locally (the download is
# skipped when the package is already up to date) so that browser() can call
# stopwords.words('english') to strip filler words from the search phrase.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import re
from urllib.parse import quote_plus

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

### WEB scraping with selenium

def browser(engine, term):
    """Search `term` on one engine with headless Chrome and collect the links found.

    English stopwords are dropped from `term`, the remaining words are
    URL-encoded and appended to `engine`, the page is loaded in a headless
    Chrome session, and the href of every anchor tag on the rendered page is
    scanned for an absolute http(s) URL.

    Args:
        engine: Search URL prefix that ends with the query parameter,
            e.g. "https://www.google.com/search?q=".
        term: Search phrase as typed by the user.

    Returns:
        list[str]: The matched URLs in document order. Duplicates are kept.

    Raises:
        Any exception raised while starting Chrome or loading the page.
        Once Chrome has started, the session is shut down before the
        exception propagates.
    """
    # Remove stopwords (a set makes each membership test constant-time)
    sw = set(stopwords.words('english'))
    words = term.split()
    # Fall back to the full phrase when every word is a stopword
    # (e.g. "to be or not to be"); otherwise the query would be empty.
    words_ns = [word for word in words if word.lower() not in sw] or words

    # Construct search URL. quote_plus turns the separating spaces into "+"
    # and percent-escapes characters such as "&", "#" or "+" that would
    # otherwise be read as URL syntax instead of as part of the query.
    search_url = engine + quote_plus(" ".join(words_ns))

    # Set up Selenium options
    options = Options()
    options.add_argument("--headless")  # Run Chrome in headless mode (no GUI)

    # No driver path is passed here: webdriver.Chrome() is left to locate
    # the chromedriver executable on its own.
    driver = webdriver.Chrome(options=options)
    try:
        # Get search results
        driver.get(search_url)
        html = driver.page_source
    finally:
        # Always release the Chrome process, even when navigation fails
        driver.quit()

    # Parse HTML with BeautifulSoup
    soup = BeautifulSoup(html, "html.parser")

    # Regular expression pattern to extract URLs (compiled once, reused per link)
    url_pattern = re.compile(
        'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    )

    search_results = []
    for link in soup.find_all("a"):
        href = link.get("href")
        if not href:
            continue
        # The href may wrap the target in a redirect (e.g. "/url?q=https://...");
        # search() picks the first absolute URL wherever it sits in the value.
        match = url_pattern.search(href)
        if match:
            search_results.append(match.group(0))

    return search_results

In [10]:
from collections import Counter
import pymysql
from tkinter import Tk, Label, Button, Entry, Text



def _store_results(term, url_counts):
    """Insert one (Search, URL, Count) row per URL into the New_Engine table.

    Args:
        term: The search phrase the counts belong to.
        url_counts: Mapping of URL to the number of times it was found.

    Database errors (pymysql.Error) are printed and swallowed so that a
    missing or failing MySQL server does not break the search itself.
    """
    # SQL query with placeholders; the driver escapes the values
    sql = 'INSERT INTO New_Engine (Search, URL, Count) VALUES (%s, %s, %s)'

    # List of tuples containing values to be inserted
    values = [(term, url, count) for url, count in url_counts.items()]

    # Stays None when connect() itself fails, so the cleanup below can tell
    # whether there is a connection to close.
    myConnection = None
    try:
        # Connect to MySQL server on the local host
        myConnection = pymysql.connect(host='localhost', user='root', db='my_custom_bot')

        # The cursor is closed as soon as the batch insert has been sent
        with myConnection.cursor() as cursor:
            cursor.executemany(sql, values)

        # Commit the changes
        myConnection.commit()

    except pymysql.Error as e:
        # Handle any potential errors
        print(f"Error: {e}")

    finally:
        # Close the connection, if one was opened
        if myConnection is not None:
            myConnection.close()


def mysearch():
    """Button callback: run the typed phrase on every engine, show and store the results.

    Reads the phrase from the `txt1` entry, calls browser() once per search
    engine, drops URLs that mention one of the engines themselves, writes the
    remaining URLs to the `txt2` text widget ordered from most to least
    repeated, and finally saves the counts to MySQL.

    A blank phrase is ignored: nothing is searched, displayed or stored.
    """
    term = txt1.get()
    if not term.strip():
        # Nothing to search for; do not launch a browser session per engine
        return

    engines = (
        "https://www.google.com/search?q=",
        "https://www.bing.com/?q=",
        "https://search.yahoo.com/search?p=",
        "https://duckduckgo.com/?q=",
        "https://news.search.yahoo.com/search?p=",
    )

    # Words to exclude from search results.
    # NOTE: this is a substring test against the whole URL, so a URL that
    # merely contains one of these words (e.g. "climbing") is dropped too.
    exclude_words = ["google", "yahoo", "microsoft", "duckduckgo", "bing"]

    # Collect the results of every engine in one list
    all_results = []
    for engine in engines:
        all_results.extend(browser(engine, term))

    # Filter out results containing certain words
    filtered_results = [
        result for result in all_results
        if not any(word in result.lower() for word in exclude_words)
    ]

    # Count occurrences of each URL
    url_counts = Counter(filtered_results)

    # Clear previous results
    txt2.delete('1.0', 'end')

    # Insert results into the Text widget, most repeated URL first
    for url, count in url_counts.most_common():
        txt2.insert('end', f"URL: {url}\nCount: {count}\n\n")

    # Persist the counts to the local MySQL server
    _store_results(term, url_counts)

        
        
                      
        
## GUI ----Creating the browser window #######################################################

def resize_window(window, width_percent, height_percent):
    """Size `window` to a fraction of the screen and centre it.

    Args:
        window: Object exposing winfo_screenwidth(), winfo_screenheight()
            and geometry(), such as a Tk root window.
        width_percent: Fraction of the screen width to occupy (0.8 = 80%).
        height_percent: Fraction of the screen height to occupy.
    """
    screen_w = window.winfo_screenwidth()
    screen_h = window.winfo_screenheight()

    # Window size in whole pixels (fractions are truncated)
    width, height = int(screen_w * width_percent), int(screen_h * height_percent)

    # Top-left corner that leaves equal margins on opposite sides
    x, y = (screen_w - width) // 2, (screen_h - height) // 2

    window.geometry(f"{width}x{height}+{x}+{y}")

# Main window, sized to 80% of the screen width and 70% of its height
window = Tk()
window.title("Browser project DS1E400")
resize_window(window, 0.8, 0.7)

# Caption across the top of the window
lbl = Label(window, text="the medical device development process for today")
lbl.place(relx=0.05, rely=0.02, relwidth=0.9, relheight=0.05)

# Search phrase input; mysearch() reads it
txt1 = Entry(window, bg='white')
txt1.place(relx=0.05, rely=0.1, relwidth=0.4, relheight=0.05)

# Result area; mysearch() clears and refills it on every search
txt2 = Text(window, bg='white')
txt2.place(relx=0.05, rely=0.18, relwidth=0.9, relheight=0.6)

# Search button, placed to the right of the input field
btn = Button(window, text="New Search", command=mysearch, fg="white", bg="blue")
btn.place(relx=0.55, rely=0.1, relwidth=0.2, relheight=0.05)

# Hand control to Tk; this call blocks until the window is closed
window.mainloop()