In [6]:
# Set up the sql database for the project
import sqlite3
import os
# Database is one folder up
db_path = os.path.join(os.path.dirname(os.getcwd()), 'bookmarks.db')
db = sqlite3.connect(db_path)
cursor = db.cursor()

In [10]:
# Query the database to make sure it's set up correctly
query = """
SELECT * FROM bookmarks
"""

cursor.execute(query)
bookmarks = cursor.fetchall()
bookmarks[0]

(1,
 'Cosmetics bag',
 'https://www.youtube.com/watch?v=TW9t0HTK_7E&feature=youtu.be',
 '2016-04-10 14:32:02',
 'Sewing Patterns')

In [9]:
# Get the descriptions db
# Oh that's right I haven't created it yet lol
query = """
SELECT * FROM descriptions
"""

cursor.execute(query)
results = cursor.fetchall()
results[0]

OperationalError: no such table: descriptions

In [11]:
# Anyway let's give a go at web scraping
url = bookmarks[0][2]
url

'https://www.youtube.com/watch?v=TW9t0HTK_7E&feature=youtu.be'

In [12]:
import requests
from bs4 import BeautifulSoup
r = requests.get(url)
if r == 200:
    print("Success!")

In [14]:
soup = BeautifulSoup(r.text, 'html.parser')

In [15]:
page_content = soup.get_text()

In [16]:
page_content

'Sew an easy cosmetics bag - YouTubeAboutPressCopyrightContact usCreatorAdvertiseDevelopersTermsPrivacyPolicy & SafetyHow YouTube worksTest new features© 2024 Google LLC'

In [19]:
# We don't have much to go on lol but let's use NLTK to get some keywords
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
# Get keywords from page_content
# First tokenize
tokens = word_tokenize(page_content)
# Remove punctuation
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
# Remove non-alphabetic tokens
words = [word for word in stripped if word.isalpha()]
# Remove stopwords
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
# Lemmatize
lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(word) for word in words]
# Get frequency distribution
freq_dist = nltk.FreqDist(words)
freq_dist.most_common(3)

[('Sew', 1), ('easy', 1), ('cosmetic', 1)]

In [20]:
# Super, now let's use a transformers model to get a summary
from transformers import pipeline

# We'll use a small model for speed
summarizer = pipeline("summarization", model="Falconsai/text_summarization")

summary = summarizer(page_content, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
summary

  from .autonotebook import tqdm as notebook_tqdm
config.json: 100%|██████████| 1.49k/1.49k [00:00<00:00, 6.09MB/s]
model.safetensors: 100%|██████████| 242M/242M [00:38<00:00, 6.28MB/s] 
generation_config.json: 100%|██████████| 112/112 [00:00<00:00, 322kB/s]
tokenizer_config.json: 100%|██████████| 2.32k/2.32k [00:00<00:00, 9.73MB/s]
spiece.model: 100%|██████████| 792k/792k [00:00<00:00, 6.92MB/s]
tokenizer.json: 100%|██████████| 2.42M/2.42M [00:00<00:00, 3.84MB/s]
special_tokens_map.json: 100%|██████████| 2.20k/2.20k [00:00<00:00, 6.76MB/s]
Your max_length is set to 130, but your input_length is only 55. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)


'YouTubeAboutPressCopyrightContact usCreatorAdvertiseDevelopersTermsPrivacyPolicy& SafetyHow YouTube worksTest new features 2024 Google LLC.'

In [22]:
# It doesn't work great for the youtube video, but it's a start
# Let's find something better
bookmarks[20]

(21,
 'Time Series Analysis in Python - A Comprehensive Guide with Examples - ML+',
 'https://www.machinelearningplus.com/time-series/time-series-analysis-python/',
 '2023-07-14 12:50:54',
 'Data Science Useful Things')

In [23]:
# Get the content of bookmark 20
url = bookmarks[20][2]
r = requests.get(url)
if r == 200:
    print("Success!")

soup = BeautifulSoup(r.text, 'html.parser')
page_content = soup.get_text()

# Get keywords from page_content
# First tokenize
tokens = word_tokenize(page_content)
# Remove punctuation
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in tokens]
# Remove non-alphabetic tokens
words = [word for word in stripped if word.isalpha()]
# Remove stopwords
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
# Lemmatize
lemmatizer = WordNetLemmatizer()
words = [lemmatizer.lemmatize(word) for word in words]
# Get frequency distribution
freq_dist = nltk.FreqDist(words)
freq_dist.most_common(3)

[('Python', 185), ('series', 135), ('How', 116)]

In [46]:
# Instead of getting all text let's get p tags, headings and list items
p_tags = soup.find_all('p')
headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
list_items = soup.find_all(['li'])
page_content_dump = ''
for tag in p_tags:
    page_content_dump += tag.get_text()
for tag in headings:
    page_content_dump += tag.get_text()
for tag in list_items:
    page_content_dump += tag.get_text()

In [47]:
page_content_dump

"Time series is a sequence of observations recorded at regular time intervals. This guide walks you through the process of analyzing the characteristics of a given time series in python.Time Series Analysis in Python – A Comprehensive Guide. Photo by Daniel Ferrandiz.Time series is a sequence of observations recorded at regular time intervals.Depending on the frequency of observations, a time series may typically be hourly, daily, weekly, monthly, quarterly and annual. Sometimes, you might have seconds and minute-wise time series as well, like, number of clicks and user visits every minute etc.Why even analyze a time series?Because it is the preparatory step before you develop a forecast of the series.Besides, time series forecasting has enormous commercial significance because stuff that is important to a business like demand and sales, number of visitors to a website, stock price etc are essentially time series data.————————————————————————————————————————————-Download Free Resource:

In [34]:
# Now we can get a summary too
# Set a cap on the length of the input
summary = summarizer(page_content_dump[:512], max_length=130, min_length=30, do_sample=False)[0]['summary_text']
summary

Your max_length is set to 130, but your input_length is only 113. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=56)


'Time series is a sequence of observations recorded at regular time intervals . This guide walks you through the process of analyzing the characteristics of a given time series in python.Time Series Analysis in Python – A Comprehensive Guide.Depending on the frequency of observations, a time series may typically be hourly, daily, weekly, monthly, quarterly and annual.'

In [101]:
# Now let's define functions
def get_page_content(url):
    r = requests.get(url)
    if r.status_code != 200:
        return None
    soup = BeautifulSoup(r.text, 'html.parser')
    # Get main content
    soup = soup.find('body')
    p_tags = soup.find_all('p')
    headings = soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
    list_items = soup.find_all(['li'])
    page_content_dump = ' '.join([tag.get_text() for tag in p_tags])
    page_content_dump += ' '
    page_content_dump += ' '.join([tag.get_text() for tag in headings])
    page_content_dump += ' '
    page_content_dump += ' '.join([tag.get_text() for tag in list_items])
    return page_content_dump


def clean_text(text):
    tokens = word_tokenize(text)
    # Lowercase
    tokens = [w.lower() for w in tokens]
    # Remove punctuation
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # Remove non-alphabetic tokens
    words = [word for word in stripped if word.isalpha()]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    return words

def get_tags(words):
    # Get frequency distribution
    freq_dist = nltk.FreqDist(words)
    return freq_dist.most_common(3)

def get_summary(text, max_length = 512):
    summary = summarizer(text, max_length=130, min_length=30, do_sample=False)[0]['summary_text']
    return summary

In [106]:
# Great now we can pick arbitrary bookmarks and get a summary
import random
random_bookmark = random.choice(bookmarks)
random_bookmark

(24,
 'NumPy quickstart — NumPy v1.25.dev0 Manual',
 'https://numpy.org/devdocs/user/quickstart.html',
 '2023-03-23 16:31:22',
 'Data Science Useful Things')

In [107]:
# Now let's get the page content
url = random_bookmark[2]
page_content = get_page_content(url)
if page_content is None:
    print("Couldn't get page content")
else:
    cleaned_content = clean_text(page_content)
    tags = get_tags(cleaned_content)
    summary = get_summary(page_content)

    print(f"Tags: {tags}")
    print(f"Summary: {summary}")

Tags: [('array', 147), ('numpy', 44), ('function', 32)]
Summary: In NumPy, these functions operate elementwise on an array, producing an array as output . The first axis has a length of 2, the second-to-last is printed from top to bottom, with each slice separated from the next by an empty line . For example, if x is an array with 5 axes, then x[1, 2, ...] is equivalent to ndarray.dtype.itemsize . This is also known by the alias array, which is


In [108]:
# Summarisation isn't great because it gets lots of crap from the page
# We could just get main content from the page but that relies on the page being well structured
# The tags seem to generally work well but the summary no
# We might instead consider using the OG meta tags to get a description where availablae
# So let's try that
def get_meta_description(soup):
    meta = soup.find('meta', attrs={'name':'description'})
    if meta:
        return meta['content']
    else:
        return None


In [124]:
from random import choice
# Try it
url = choice(bookmarks)[2]
print(f"Bookmark: {url}")
r = requests.get(url)
if r.status_code != 200:
    print("Couldn't get page")
else:
    soup = BeautifulSoup(r.text, 'html.parser')
    description = get_meta_description(soup)
    print(description)

Bookmark: https://procrastinators.org/
None


In [None]:
# NOTE: Need try excepts for requests because sometimes the URLs are invalid because bookmarks are arbitrary and may not necessarily refer to a valid URL