In [94]:
from bs4 import BeautifulSoup
import io
import os
from openai import OpenAI
import re
import PyPDF2
import requests
requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'DES-CBC3-SHA'
import json
import csv
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait 
from selenium.webdriver.support import expected_conditions as EC
from dotenv import load_dotenv


In [95]:
from nltk.tokenize import sent_tokenize
import nltk

# The Punkt resource loaded below must already be present in NLTK's data
# directory; uncomment the next line once to download it.
# nltk.download('punkt')
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def get_sentences(text):
    """Split ``text`` into sentences using NLTK's English ``sent_tokenize``.

    Args:
        text: The text to segment.

    Returns:
        Whatever ``sent_tokenize`` returns for ``text`` (the sentences, in order).
    """
    sentences = sent_tokenize(text, language='english')
    return sentences

In [96]:
# Load variables from a .env file into the environment, then build the OpenAI
# client from the "KEY" variable. os.environ.get yields None when it is unset.
load_dotenv()
client = OpenAI(api_key=os.environ.get("KEY"))


In [97]:
def get_annotation(text, url):
    """Ask the chat-completions API to pick one topic label for ``text``.

    The prompt embeds both the sentence and the URL it came from, and instructs
    the model to answer `None` when no single label fits.

    Args:
        text: The sentence to classify.
        url: The URL the sentence was scraped from.

    Returns:
        The message content of the first choice in the API response.
    """
    prompt = f"You are a helpful assistant. Classify topic, pick only one of the options: [stories, hard hearing, deaf, learning sign language, identities of deaf, fiction books, what is hearing loss, hearing implantation, loud music, hearing test, ear poisoning, Improve your English, Caring for the Deaf Patient]. You can use a combination of the URL and the text to make a decision. If the text is a page number, date, author name, date, none-english, or words that doesn't make sense, return `None` If you are not able to select only one, then use `None`. always one label:     - text: {text}     - url: {url}"
    # The whole instruction is sent as a single user message.
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model="gpt-4",
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [98]:
def clean_text(string):
    """Normalise scraped text to single-spaced, printable ASCII.

    The previous implementation round-tripped the text through JSON and edited
    the escaped form, which had several defects:
      * newlines/tabs were deleted instead of replaced by a space, fusing the
        last word of one paragraph with the first word of the next;
      * ``strip()`` ran on the quoted JSON text and therefore never stripped;
      * a literal backslash followed by ``n``/``t``/``uXXXX`` in the input left
        a dangling escape and made ``json.loads`` raise;
      * ``\\r``, ``\\f`` and ``\\b`` survived the whitespace collapse.

    Args:
        string: Text to clean. ``None`` is treated as empty text; any other
            non-string value is converted with ``str()``.

    Returns:
        The text with every whitespace run turned into one space, all
        characters outside printable ASCII removed, and no leading/trailing
        spaces.
    """
    if string is None:
        return ""
    if not isinstance(string, str):
        string = str(string)

    # Whitespace first, so separators such as newlines or non-breaking spaces
    # become a space instead of being dropped by the ASCII filter below.
    text = re.sub(r'\s+', ' ', string)
    # Drop everything that is not printable ASCII (curly quotes, dashes,
    # control characters, ...), as the original Unicode-escape removal did.
    text = re.sub(r'[^\x20-\x7E]', '', text)
    # Removing a character that sat between two spaces can leave a double space.
    return re.sub(r' {2,}', ' ', text).strip()

In [99]:
def scrap_webpage(url, timeout=30):
    """Download an HTML page and return the text of its ``<p>`` elements.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        The text of every ``<p>`` element, joined with newlines (an empty
        string when the page has none).

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    # verify=False skips TLS certificate verification, as the original did.
    response = requests.get(url, verify=False, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')
    # find_all is the current name; findAll is a deprecated alias.
    return "\n".join(paragraph.get_text() for paragraph in soup.find_all("p"))

In [100]:
def scrap_pdf(url, timeout=30):
    """Download a PDF and return the extracted text of all its pages.

    Args:
        url: Address of the PDF to fetch. It is printed before the download.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled server would block the notebook indefinitely.

    Returns:
        The text of every page concatenated in page order, with no separator.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
    """
    print(url)
    # verify=False skips TLS certificate verification, as the original did.
    response = requests.get(url, stream=True, verify=False, timeout=timeout)
    pdf_reader = PyPDF2.PdfReader(io.BytesIO(response.content))
    # `or ""` guards against a page whose extraction yields None, which would
    # otherwise raise a TypeError during concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

In [101]:
def scrape_url(url):
    """Scrape ``url`` with the PDF scraper or the HTML scraper, as appropriate.

    A URL is treated as a PDF when it ends in ``.pdf`` either as a whole or
    after removing its query string and fragment. The comparison ignores case,
    so ``.PDF`` and ``file.pdf?download=1`` are both recognised (the original
    exact-suffix check sent those to the HTML scraper).

    Args:
        url: Address of the document to scrape.

    Returns:
        The text returned by ``scrap_pdf`` or ``scrap_webpage``.
    """
    lowered = url.lower()
    # Strip "#fragment" and "?query" so only the path part is inspected.
    path_only = lowered.split('#', 1)[0].split('?', 1)[0]
    if lowered.endswith('.pdf') or path_only.endswith('.pdf'):
        return scrap_pdf(url)
    return scrap_webpage(url)

In [102]:
# Seed documents (PDFs and web pages) to scrape and annotate.
url_list = [
    "https://www.childrensmn.org/references/pfs/rehabpublic/sign-language-for-hearing-children.pdf",
    "https://www.slj.com/story/read-the-signs-deaf-experience",
    "https://www.swarthmore.edu/sites/default/files/assets/documents/faculty-donna-jo-napoli/Fun%20published%20version%20Bilingual%20Bimodal%20Ebooks%20for%20Deaf%20Children%20Developing%20language%20and%20preliteracy%20skills.pdf",
    "https://www.frontiersin.org/journals/psychology/articles/10.3389/fpsyg.2013.00889/full",
    "https://www.scirp.org/journal/paperinformation?paperid=65497",
    "https://www.researchgate.net/publication/301312097_Early_Reading_for_Young_Deaf_and_Hard_of_Hearing_Children_Alternative_Frameworks",
    "https://preply.com/en/blog/22-useful-english-greetings-for-every-day/"
]

# For each URL: scrape, clean, split into sentences, and append one
# (sentence, annotation) row per sentence to sentences.csv. The file is opened
# in append mode, so re-running this cell adds the rows again.
for url in url_list:
    with open('sentences.csv', 'a', encoding="utf-8", newline='') as f:
        writer = csv.writer(f)
        cleaned_text = clean_text(scrape_url(url))
        tokenized_text = get_sentences(cleaned_text)
        # The generator is consumed lazily, so each sentence is annotated and
        # written in order, one row at a time.
        writer.writerows(
            [sentence, get_annotation(sentence, url)]
            for sentence in tokenized_text
        )

https://www.childrensmn.org/references/pfs/rehabpublic/sign-language-for-hearing-children.pdf




https://www.swarthmore.edu/sites/default/files/assets/documents/faculty-donna-jo-napoli/Fun%20published%20version%20Bilingual%20Bimodal%20Ebooks%20for%20Deaf%20Children%20Developing%20language%20and%20preliteracy%20skills.pdf




In [103]:

# Render the site's search results in Chrome, grab the resulting HTML, then
# scrape and annotate every linked document.
driver = webdriver.Chrome()
try:
    driver.get("https://www.childrensmn.org/search?q=deaf&site=")
    # If the browser shows its "connection is not secure" page, the
    # "details-button" element exists: open the details and follow the
    # "proceed-link" to continue to the site.
    if driver.find_elements(By.ID, "details-button"):
        driver.find_element(By.ID, "details-button").click()
        driver.find_element(By.ID, "proceed-link").click()
    # NOTE(review): absolute XPath into the search form; which control this
    # clicks is not evident from the code - confirm against the live page.
    driver.find_element(By.XPATH, "/html/body/div[1]/div/main/div/form/div/div/div/div[7]/a").click()
    # Wait up to 20 seconds for a result link to become clickable before
    # taking the page snapshot.
    WebDriverWait(driver, 20).until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'result-link'))
    )
    page_source = driver.page_source
finally:
    # Always shut the browser down, even when a lookup or the wait above
    # raises; previously a failure left the Chrome process running.
    driver.quit()

soup = BeautifulSoup(page_source, 'html.parser')
box = soup.find(name="div", id="search-results")
links = box.find_all('a')

# Append one (sentence, annotation) row per sentence of every linked document
# to sentences.csv.
for anchor in links:
    link = anchor.get('href')
    if not link:
        # An <a> without an href has nothing to scrape and would otherwise
        # pass None to scrape_url.
        continue
    with open('sentences.csv', 'a', encoding="utf-8", newline='') as f:
        writer = csv.writer(f)
        cleaned_text = clean_text(scrape_url(link))
        tokenized_text = get_sentences(cleaned_text)
        for sentence in tokenized_text:
            annotation = get_annotation(sentence, link)
            writer.writerow([sentence, annotation])


