# Task 1
Write a Python script that loads an article from the internet and generates a summary and title
using a large language model (LLM).

In [1]:
# Importing Libraries that I will be using in the project
import requests
from bs4 import BeautifulSoup #For parsing HTML
import re #Manipulating text (e.g. removing words, such as in my case)
from typing import List #Not necessary at all, I just saw it in a book and I thought it would be good practice and also look good.
from dotenv import load_dotenv
import os

from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain import PromptTemplate
from langchain.schema import(
    AIMessage,
    HumanMessage,
    SystemMessage)

In [2]:
def configure() -> None:
    """Call ``load_dotenv()`` so that later ``os.getenv`` lookups can see values from a ``.env`` file."""
    load_dotenv()

In [3]:
configure()

# The key is read from the environment variable 'api_key' and copied to
# OPENAI_API_KEY for the OpenAI client.
openai_api_key = os.getenv('api_key')
if not openai_api_key:
    # Fail here with a clear message; assigning None to os.environ would
    # otherwise raise an opaque "str expected, not NoneType" TypeError.
    raise RuntimeError(
        "Environment variable 'api_key' is not set. "
        "Add it to your .env file or export it before running this notebook."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

# Step 1 - Scraping an article for raw, readable and clean input text
The functions "fetch_parse" and "clean_text" fetch the text from the web and transform it into a clean, readable block of text.

# 1.1 - fetch_parse
Fetches all information in 'p' HTML tags.

In [4]:
def fetch_parse(url: str, headers: dict, timeout: float = 10.0) -> List[str]:
    """Download a web page and return the text of every ``<p>`` element.

    Args:
        url: Address of the page to scrape.
        headers: HTTP headers sent with the request (e.g. a User-Agent).
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on an
            unresponsive host.

    Returns:
        One whitespace-stripped string per ``<p>`` tag, in document order.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    response.raise_for_status()  # Ensure the fetch was successful
    soup = BeautifulSoup(response.text, "html.parser")

    # Collect the visible text of every paragraph (<p>) tag in the page.
    return [p.get_text(strip=True) for p in soup.find_all("p")]

# 1.2 - clean_text
Cleans the input in order to output clean raw text data.

In [5]:
def clean_text(text_list: List[str]) -> str:
    """Join scraped paragraphs into one block of text and strip page boilerplate.

    Steps:
        1. Join the paragraphs with newlines.
        2. Delete common metadata markers (e.g. "Tags:", "See also:") together
           with the rest of the line they appear on.
        3. Drop the leading run of short lines (five words or fewer); once the
           first longer line is seen, every following line is kept.

    Args:
        text_list: Paragraph strings to clean.

    Returns:
        The cleaned text as a single newline-separated string. The result is
        empty when no line has more than five words.
    """
    # Convert the list to a single string
    text = "\n".join(text_list)

    # Remove common metadata up to the end of the line. The colon-terminated
    # markers must not be followed by "\b": a word boundary never exists
    # between ":" and a space, so "Tags: news" would otherwise be left in.
    text = re.sub(
        r"(?i)\b(?:Share this story:|Tags:|Categories:|See also:"
        r"|Subscribe now\b|Want to learn more about\b).*",
        "",
        text,
    )

    # Skip the block of short lines at the start (tags, menu labels); after
    # the first line with more than 5 words, keep everything.
    filtered_lines = []
    for line in text.split("\n"):
        if filtered_lines or len(line.split()) > 5:
            filtered_lines.append(line)

    return "\n".join(filtered_lines)

# 1.3 - Clean Text Result


In [6]:
# Article to be scraped and summarised.
url = "https://fakti.bg/krimi/947637-nad-11-000-kontrabandni-i-falshivi-parfuma-za-den"
# Send a User-Agent header, because many websites block requests that look
# like they come from a bot.
headers = {'User-Agent': 'Mozilla/5.0'}

# Fetch the raw paragraphs first, then clean them into one block of text.
raw_paragraphs = fetch_parse(url, headers)
cleaned_text = clean_text(raw_paragraphs)
cleaned_text

'11 225 парфюма задържаха митнически служители от ТД Митница София и ТД Митница Русе в рамките на ден при две отделни проверки – една на Митнически пункт Калотина и една в района на Дунав мост – Видин. Една част от задържаните парфюми са контрабандни, а други са иззети по подозрение, че са фалшиви. И двете проверки са извършени на 02.02.2025 г., парфюмите са превозвани с други стоки, тръгнали от България, и били опаковани в необозначени кашони.\n2874 контрабандни парфюма са задържани при проверка на български товарен автомобил, излизащ от страната през Митнически пункт Калотина. Митническите служители от ТД Митница София селектират за проверка излизащ от страната през ГКПП Калотина товарен автомобил, пътуващ със стока към Франция. След като водачът не декларирал друго освен описаната в придружаващите документи стока, товарът е отклонен за щателна митническа проверка и сканиране с рентгенова апаратура. Набелязани са зони с подозрителна плътност, за които при последващата физическа прове

# Step 2 - Setting up the LLM and APIs
Selecting Model and Configuring Context

In [7]:
# Plain (non-f) string on purpose: "{text}" must remain a placeholder that
# PromptTemplate fills in at format/invoke time. Interpolating the article
# here would bake it into the template, leave the declared 'text' variable
# unused, and make any braces in the article parse as template variables.
generic_template = """
Write a summary of the following article and give it a title:
Article: '{text}'"""

prompt = PromptTemplate(
    input_variables=['text'],
    template=generic_template,
)
# Fully rendered prompt, kept for inspection and token counting.
complete_prompt = prompt.format(text=cleaned_text)

In [8]:
# Chat model instance shared by the token-count and summarisation steps.
llm = ChatOpenAI(
    model_name='gpt-4o-mini',
)

# Step 2.1 - Number of Tokens
It is good practice to know how many tokens the article contains and how many tokens the model can handle. For example, the model I chose to work with (GPT-4o-mini) has a 128,000-token context window and can produce up to 16,384 output tokens per request.

In [9]:
# Count tokens for the bare article and for the full prompt, to check that
# the text added by the prompt does not push the request over the limits.
article_token_count = llm.get_num_tokens(cleaned_text)
prompt_token_count = llm.get_num_tokens(complete_prompt)
print(article_token_count)
print(prompt_token_count)

1329
1346


# Step 2.2 - Summarized Result
Below is the summarized text

In [10]:
# Bind the model to the prompt template, then run the chain on the article.
# NOTE(review): the cell output echoes the LLMChain line, which looks like a
# warning (presumably a deprecation notice) - confirm and migrate if so.
llm_chain = LLMChain(
    prompt=prompt,
    llm=llm,
)
summary = llm_chain.invoke(
    {'text': cleaned_text},
)

  llm_chain=LLMChain(llm=llm,prompt=prompt)


In [11]:
# Show the raw chain result; the generated title and summary are stored
# under the 'text' key of the returned dict.
summary

{'text': '**Title: Massive Seizure of Perfumes at Bulgarian Borders: Over 11,000 Confiscated**\n\nSummary: On February 2, 2025, customs officials from the Sofia and Ruse Customs Directorates seized a total of 11,225 perfumes during two separate inspections at the Kalotina and Danube Bridge - Vidin checkpoints. The inspections discovered a mix of contraband and counterfeit perfumes hidden among other goods in unmarked boxes. At Kalotina, authorities confiscated 2,874 counterfeit perfumes destined for France from a Bulgarian truck. Meanwhile, at the Danube Bridge, 8,351 perfumes of well-known brands were found in a shipment mistakenly labeled as "cosmetics" en route to Italy. Additionally, customs officials intercepted a bus carrying over 1,000 counterfeit clothes from Turkey. The inspections are part of measures by the Customs Agency in response to the removal of certain restrictions.'}