# Task 1
Write a Python script that loads an article from the internet and generates a summary and title
using a large language model (LLM).

In [1]:
# Importing Libraries that I will be using in the project
import requests
from bs4 import BeautifulSoup #For Parsing HMTL
import re #Manipulating text (e.g. removing words, such as in my case)
from typing import List #Not necessary at all, I just saw it from a book and I though it would be good practice and also look good.
from dotenv import load_dotenv
import os

from langchain_openai import ChatOpenAI
from langchain.schema import(
    AIMessage,
    HumanMessage,
    SystemMessage)

In [5]:
def configure():
    load_dotenv()

In [6]:
configure()
openai_api_key = os.getenv('api_key')
os.environ["OPENAI_API_KEY"] = openai_api_key

# Step 1 - Scraping an article for raw, readable and clean imput text
Methods "fetch_parse" and "clean_text" for fetching the text from the web and transforming it into a nice and readable block of text.

# 1.1 - fetch_parse
Fetches all information in 'p' HTML tags.

In [2]:
def fetch_parse(url: str, headers: dict) -> List[str]:
    page_to_be_scraped = requests.get(url, headers=headers)
    soup = BeautifulSoup(page_to_be_scraped.text, "html.parser")
    p_paragraphs = soup.find_all("p") #Finds paragraph(<p>) tags in the HTML file
    pure_paragraphs = [p.get_text(strip=True) for p in p_paragraphs]

    return pure_paragraphs

# 1.2 - clean_text
Cleans the input in order to ouput clean raw text data.

In [3]:
def clean_text(text_list: list) -> List:
    # Convert list to a single string
    text = "\n".join(text_list)

    # Removing common metadata
    text = re.sub(r"(?i)\b(Share this story:|Tags:|Categories:|Subscribe now|See also:|Want to learn more about .*?)\b.*", "", text)

    # Detect and remove block of short lines at the start (tags)
    lines = text.split("\n")
    filtered_lines = []
    for line in lines:
        if len(line.split()) > 5:  # Keep only the lines that have more than 5 words
            filtered_lines.append(line)
        elif filtered_lines:  # Once we hit real content -> stop filtering
            filtered_lines.append(line)

    text = "\n".join(filtered_lines)

    return text.strip()

# 1.3 - Clean Text Result


In [4]:
url = "https://news.bg/politics/peevski-obeshtava-da-ne-e-pateritsa-za-regulatorite.html" #URL to be scraped
headers = {'User-Agent': 'Mozilla/5.0'} #Here we are using an User-Agent because many websites have protection against bots that are doing the same thing we are


cleaned_text = clean_text(fetch_parse(url, headers))

# Step 2 - Setting up LLM and APIs'
Selecting Model and Configuring Context

In [7]:
chat_messages=[
    SystemMessage(content='You are a text summarization tool. You will be tasked to summarize input text'),
    HumanMessage(content=f'Please provide a short and concise summary of the following speech:\n TEXT: {cleaned_text}')
]
llm=ChatOpenAI(model_name='gpt-4o-mini')

# Step 2.1 - Number of Tokens
It is good practice to know how many tokens the article contains and how many tokens can the model handle, for example the model I chose to work with (GPT-4o-mini) which can handle 128,000-token context window and 16,384 output tokens per request

In [8]:
llm.get_num_tokens(cleaned_text)

971

# Step 2.2 - Summarized Result
Below is the summarized text

In [10]:
print(llm.invoke(chat_messages).content)

Делян Пеевски, лидер на ДПС-Ново начало, заяви, че неговата парламентарна група няма да подкрепи разпределението на регулатори от изтекъл мандат и изключва участие с кандидати. Той остави отворена възможността да сменят регулаторите след предсрочни парламентарни избори, ако ДПС-Ново начало стане първа политическа сила. Пеевски акцентира на необходимостта от контрол над цените на хранителните стоки и призова за обединение на политическите лидери срещу влиянието на Джордж Сорос в България. Той отправи призив към Бойко Борисов да стане национален лидер в усилията за освобождаване на страната от това влияние.
