In [53]:
import os
import pandas as pd
from bs4 import BeautifulSoup
from pypdf import PdfReader
from pptx import Presentation
import gzip
import json
from nltk.tokenize import word_tokenize, TweetTokenizer

In [54]:
def extract_other_texts(html_path):
    with open(html_path, 'rt') as f:
        html_str = f.read()
    soup = BeautifulSoup(html_str, features="html.parser")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()    # rip it out

    # get text
    text = soup.get_text()

    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = '\n'.join(chunk for chunk in chunks if chunk)
    return text

In [55]:
from bs4 import BeautifulSoup

def extract_html_texts(html_path):
    with open(html_path, 'rt', encoding='utf-8') as f:
        html_str = f.read()
    soup = BeautifulSoup(html_str, "html.parser")

    # Remove irrelevant elements
    for element in soup(["script", "style", "header", "footer", "nav", "aside", "form"]):
        element.extract()
    
    # Collect text from main content-bearing tags
    content_texts = []
    
    # Search within common content-bearing tags
    for tag in ['main', 'article', 'section', 'div', 'p']:
        for element in soup.find_all(tag):
            text = element.get_text(separator=" ", strip=True)
            # Filter for significant content (e.g., longer than 50 characters to exclude small chunks)
            if len(text) > 50:
                content_texts.append((text, len(text)))

    # Sort blocks by length to get the most likely main content at the top
    content_texts.sort(key=lambda x: x[1], reverse=True)
    
    # Concatenate the top content blocks
    # You can adjust the number of top blocks to include
    top_content = "\n\n".join([text for text, length in content_texts[:5]])

    # Clean up extra whitespace and line breaks
    lines = (line.strip() for line in top_content.splitlines())
    cleaned_text = '\n'.join(line for line in lines if line)
    
    return cleaned_text



In [56]:
def extract_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    page_texts = []
    for page in reader.pages:
        page_texts.append(page.extract_text())
    return ' '.join(page_texts)

In [57]:
def extract_pptx(pptx_path):
    prs = Presentation(pptx_path)
    text_runs = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                for run in paragraph.runs:
                    text_runs.append(run.text)
    return ' '.join(text_runs)

In [58]:
urls_df = pd.read_csv(r'contents\url_classes2.csv')
htmls_path = r'htmls'
contents_dict = {}

In [72]:
def extract_saved_web_data(index:int):
    filename = os.path.join(htmls_path, f'link_{index}')
    data_type = ''
    if True:
        if os.path.isfile(filename+'.html'):
            data_type = 'html'
            file_path = filename+'.html'
        elif os.path.isfile(filename+'.aspx'):
            data_type = 'aspx'
            file_path = filename+'.aspx'
        elif os.path.isfile(filename+'.x_empty'):
            data_type = ''
            file_path = filename+'.x_empty'
        elif os.path.isfile(filename+'.octet_stream'):
            data_type = 'octet_stream'
            file_path = filename+'.octet_stream'
        elif os.path.isfile(filename+'.xml'):
            data_type = 'xml'
            file_path = filename+'.xml'
        elif os.path.isfile(filename+'.cpp'):
            data_type = 'cpp'
            file_path = filename+'.cpp'
        elif os.path.isfile(filename+'.c'):
            data_type = 'c'
            file_path = filename+'.c'
        elif os.path.isfile(filename+'.cfm'):
            data_type = 'cfm'
            file_path = filename+'.cfm'
        elif os.path.isfile(filename+'.cgi'):
            data_type = 'cgi'
            file_path = filename+'.cgi'
        elif os.path.isfile(filename+'.ashx'):
            data_type = 'ashx'
            file_path = filename+'.ashx'
        elif os.path.isfile(filename+'.asp'):
            data_type = 'asp'
            file_path = filename+'.asp'
        elif os.path.isfile(filename+'.cgi'):
            data_type = 'cgi'
            file_path = filename+'.cgi'
        elif os.path.isfile(filename+'.txt'):
            data_type = 'txt'
            file_path = filename+'.txt'
        elif os.path.isfile(filename+'.x_appleworks3'):
            data_type = 'x_appleworks3'
            file_path = filename+'.x_appleworks3'
        elif os.path.isfile(filename+'.page'):
            data_type = 'page'
            file_path = filename+'.page'
        elif os.path.isfile(filename+'.php'):
            data_type = 'php'
            file_path = filename+'.php'
        elif os.path.isfile(filename+'.rxml'):
            data_type = 'rxml'
            file_path = filename+'.rxml'
        elif os.path.isfile(filename+'.com'):
            data_type = 'com'
            file_path = filename+'.com'
        elif os.path.isfile(filename+'.cat'):
            data_type = 'cat'
            file_path = filename+'.cat'
        elif os.path.isfile(filename+'.txt'):
            data_type = 'txt'
            file_path = filename+'.txt'
        elif os.path.isfile(filename+'.xsp'):
            data_type = 'xsp'
            file_path = filename+'.xsp'
        elif os.path.isfile(filename + '.pptx'):
            data_type = 'pptx'
            file_path = filename+'.pptx'
        elif os.path.isfile(filename+'.pdf'):
            data_type = 'pdf'
            file_path = filename+'.pdf'
    try:
        if data_type=='':
            return ''
        elif data_type == 'pdf':
            return extract_pdf(file_path)
        elif data_type == 'pptx':
            return extract_pptx(file_path)
        elif data_type == 'html':
            output = extract_html_texts(file_path)
            if(len(output) > 20000):
                print(f"length {len(output)} for data_type {data_type} in position {index}")
            return output
        else:
            output = extract_other_texts(file_path)
            if(len(output) > 20000):
                print(f"length {len(output)} for data_type {data_type} in position {index}")
            return output
    except:
        print(f'Error on {index}')
        return ''

In [73]:
urls_df.iloc[134].Address

'https://community.hubspot.com/t5/Lists-Lead-Scoring-Workflows/Lead-Scoring-Cumulative-Form-Submissions/m-p/185948'

In [74]:
cnt = extract_saved_web_data(35485)

In [77]:
cnt[0:500]



In [76]:
len(cnt)

17897

In [78]:
# Loop through all files in the folder
j=0
for i in range(j, 35631):#urls_df.shape[0]):
    if i%1000 == 0:
        print(i)
    content = extract_saved_web_data(i)
    if content == '':
        continue
    contents_dict[i] = content
            
with gzip.open(rf'contents\web_contents33.json.gz', 'wt') as file:
    json.dump(contents_dict, file)

0
length 29265 for data_type html in position 1
length 31885 for data_type html in position 3
length 25912 for data_type html in position 5
length 176023 for data_type html in position 11
length 28564 for data_type html in position 16
length 31097 for data_type html in position 17
length 147699 for data_type html in position 33
length 24557 for data_type html in position 34
length 60958 for data_type html in position 36
length 52028 for data_type html in position 37
length 31344 for data_type html in position 38
length 21416 for data_type html in position 44
length 30150 for data_type html in position 45
length 23857 for data_type html in position 47
length 44280 for data_type html in position 50
length 70121 for data_type html in position 60
length 72896 for data_type html in position 61
length 66704 for data_type html in position 62
length 64684 for data_type html in position 63
length 28843 for data_type html in position 64
length 36248 for data_type html in position 65
length 55429

In [34]:
file_path

'htmls\\link_3065.pdf'

In [39]:
with gzip.open(rf'contents\web_contents.json.gz', 'r') as file:
    web_contents2 = json.load(file)
len(web_contents2['5'])

9599

In [43]:
len(contents_dict[5])

9599

In [19]:
contents_dict[5]

"DUNKING | English meaning - Cambridge Dictionary\nDictionary\nTranslate\nGrammar\nThesaurus\n+Plus\nCambridge Dictionary +Plus\nShop\nCambridge Dictionary +Plus\nMy profile\n+Plus help\nLog out\nCambridge Dictionary +Plus\nMy profile\n+Plus help\nLog out\nLog in\n/\nSign up\nEnglish (UK)\nSearch\nSearch\nEnglish\nMeaning of dunking in English\ndunking\nAdd to word list\nAdd to word list\npresent participle of\ndunk\ndunkverb [ T ] uk\nYour browser doesn't support HTML5 audio\n/dʌŋk/ us\nYour browser doesn't support HTML5 audio\n/dʌŋk/\ndunk verb [T]\n(INTO LIQUID)\nto put a biscuit, piece of bread, etc. into a liquid such as tea, coffee, or soup for a short time before eating it:\nShe dunked a biscuit in her coffee.\ninformal to put something into liquid for a short time:\nDunk the sponge in water every once in a while to stop it from drying out.\nSMART Vocabulary: related words and phrases\nBiting, chewing & swallowing\nbit\nbite\nbiter\nchamp\nchew\nchew on something\nchomp\ncrunch\