# Job Task 1
Write a Python script that loads an article from the internet and generates a summary and title
using a large language model (LLM).

In [1]:
# Importing Libraries that I will be using in the project

import requests                          #GET Request from the server is needed to get HTML Document.
from bs4 import BeautifulSoup            #For Parsing HTML.
import re                                #Manipulating text (e.g. removing words, such as in my case).
from typing import List                  #Not necessary at all, I just saw it in a book and I thought it would be good practice and also look good.
from dotenv import load_dotenv           #Loading .env with API keys.
import os

from langchain_openai import ChatOpenAI  #OpenAI Model
from langchain.prompts import PromptTemplate #Prompt Template element

Define a method for loading the '.env' file.

In [2]:
def configure() -> bool:
    """Load environment variables from a local ``.env`` file.

    Returns:
        Whatever ``load_dotenv()`` reports, so the caller can tell whether
        anything was loaded instead of the outcome being silently discarded.
    """
    return load_dotenv()

In [3]:
configure()  # Load the variables defined in .env into the process environment.

# Prefer the project's 'api_key' entry, but accept an OPENAI_API_KEY that is already set.
openai_api_key = os.getenv('api_key') or os.getenv("OPENAI_API_KEY")

# os.environ only accepts strings: assigning None would fail with an unhelpful
# TypeError, so stop here with a message that says what is actually missing.
if not openai_api_key:
    raise RuntimeError(
        "No OpenAI API key found: define 'api_key' in the .env file "
        "or set the OPENAI_API_KEY environment variable."
    )

os.environ["OPENAI_API_KEY"] = openai_api_key  # Expose the key under the name the OpenAI client reads.

# Step 1 - Scraping an article for raw, readable and clean input text
Methods "fetch_parse" and "clean_text" for fetching the text from the web and transforming it into a nice and readable block of text.

# 1.1 - fetch_parse
Fetches all information in 'p' HTML tags.

In [4]:
def fetch_parse(url: str, headers: dict, timeout: float = 10.0) -> List[str]:
    """Download a web page and return the text of every ``<p>`` element.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the GET request (e.g. a ``User-Agent``).
        timeout: Seconds to wait for the server before giving up.

    Returns:
        One string per ``<p>`` tag, in document order, extracted with
        ``get_text(strip=True)``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or when the timeout expires.
    """
    # Without an explicit timeout the request can wait indefinitely on a silent server.
    response = requests.get(url, headers=headers, timeout=timeout)

    # Fail loudly on error pages instead of scraping (and later summarising) them.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    return [paragraph.get_text(strip=True) for paragraph in soup.find_all("p")]

# 1.2 - clean_text
Cleans the input in order to output clean raw text data.

In [5]:
def clean_text(text_list: list, min_words: int = 5) -> str:
    """Turn scraped paragraphs into one block of article text.

    Args:
        text_list: Paragraph strings, in document order.
        min_words: A line counts as real content once it has more than this
            many whitespace-separated words.

    Returns:
        The paragraphs joined with newlines, with boilerplate phrases removed
        (from the phrase to the end of its line), the leading run of short
        lines dropped, and surrounding whitespace stripped. Returns an empty
        string when no line is long enough to count as content.
    """
    text = "\n".join(text_list)

    # Remove common web-article boilerplate together with the rest of its line.
    # The markers ending in ':' must NOT be followed by \b: a word boundary never
    # exists between ':' and a space, so "Tags: politics" would never match.
    boilerplate = (
        r"(?i)\b(?:Share this story:|Tags:|Categories:|See also:"
        r"|Subscribe now\b|Want to learn more about\b).*"
    )
    text = re.sub(boilerplate, "", text)

    # Skip the block of short lines at the start (tags, bylines, ...); once the
    # first real content line is reached, everything after it is kept as-is.
    lines = text.split("\n")
    first_content = next(
        (index for index, line in enumerate(lines) if len(line.split()) > min_words),
        len(lines),
    )

    return "\n".join(lines[first_content:]).strip()

# 1.3 - Clean Text Result


In [6]:
# Article to be scraped and summarised.
url = "https://news.bg/int-politics/tramp-razgovaryal-s-putin-za-kraya-na-voynata-v-ukrayna.html"

# A browser-like User-Agent is sent because many websites reject requests that look automated.
headers = {'User-Agent': 'Mozilla/5.0'}

# Fetch the article's paragraphs first, then clean them into a single block of text.
article_paragraphs = fetch_parse(url, headers)
cleaned_text = clean_text(article_paragraphs)

# Step 2 - Choosing Model and Creating a Prompt Template
Selecting Model and Configuring Context. 

For this specific task I decided to go with a simpler approach, because the model will be used for summarizing articles on the web and not books, etc.

In [7]:
# Chat model used both for counting tokens and for generating the summary.
llm=ChatOpenAI(model_name='gpt-4o-mini') #LLM model selection

# Prompt template: asks for a short summary of the article and dictates the
# "Title / Summary" layout of the answer. {cleaned_text} is the placeholder
# that is replaced with the article text.
# NOTE: everything between the triple quotes is part of the prompt, including
# the whitespace after the opening quotes and the indentation of each line.
template = """                           
Please provide a short and simple summary of a couple of sentences of the following text:\n TEXT: {cleaned_text}\n in the format:

Title:
    [Insert Title Here]

    Summary:
    [Insert Summary Here]
"""

prompt = PromptTemplate(
    input_variables=["cleaned_text"],   #The single variable injected into the template - "cleaned_text".
    template=template                   #The template string defined a few rows above.
)

formatted_prompt = prompt.format(cleaned_text=cleaned_text) #Fill the placeholder with the scraped article text.

# Step 2.1 - Number of Tokens
It is good practice to know how many tokens the article contains and how many tokens the model can handle. For example, the model I chose to work with (GPT-4o-mini) has a 128,000-token context window and can produce up to 16,384 output tokens per request.

In [8]:
# Count the tokens in the cleaned article text and display the number as the cell output.
article_token_count = llm.get_num_tokens(cleaned_text)
article_token_count

1641

# Step 2.2 - Summarized Result
Below is the summarized text from the article provided

In [9]:
# Send the filled-in prompt to the model and print only the text of its reply.
response = llm.invoke(formatted_prompt)
print(response.content)

Title:  
Trump Discusses Ukraine War with Putin

Summary:  
U.S. President Donald Trump revealed he has spoken with Russian President Vladimir Putin about ending the war in Ukraine, stating, "Putin wants people to stop dying." Trump indicated he has a specific plan to resolve the conflict and expressed hope for a quick resolution, while also mentioning a possible upcoming meeting with Ukrainian President Volodymyr Zelensky to discuss the war's end.
